bio-pangenome 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/bin/pangenome_blast_flanking.rb +32 -13
- data/bin/pangenome_gene_bed_files.rb +125 -0
- data/lib/bio-pangenome.rb +2 -1
- data/lib/bio-pangenome/MultipleGFF3.rb +77 -0
- data/lib/bio-pangenome/gff3_extensions.rb +121 -0
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d1d196a63e35bb86d33ba74ed1465454ffa9d028f57f0603ff90f8708acc352
|
4
|
+
data.tar.gz: 5f00000897b7de9361eba0781d02b49da78dc3d9e0af75fc572283173826224e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f8eb3a8f1b10706cd193dc720b3138460ddd4ceb9c5cc1878779159da8064ec20975a032583f872effc19ac24ea1deab6adbd97186f7789a11fccd6d441f126
|
7
|
+
data.tar.gz: 11e2512016f4b5d36ee1c19f88f47dafe82f6e1995c673d39ca5efcfc836fc2cca4f2a03550fcbb0a0b8ee7ef02c1a52e859b4c4111664dfb0870b2197c009b9
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.2
|
@@ -4,6 +4,13 @@
|
|
4
4
|
# Author:: homonecloco
|
5
5
|
# Copyright:: 2019
|
6
6
|
|
7
|
+
class String
  # Reports whether the whole string is an integer literal that
  # Kernel#Integer accepts ("42", "-7", "0x1A", "1_000"), as opposed to
  # text such as "", "1.5" or "cds".
  #
  # Numbers already answer #integer? (Numeric#integer?), so defining it on
  # String lets a caller ask the same question of either kind of value.
  #
  # @return [Boolean] true when Integer(self) succeeds, false otherwise
  def integer?
    Integer(self)
    true
  rescue ArgumentError, EncodingError
    # Only the failures Integer() raises for unparsable strings are treated
    # as "not an integer"; anything else is a real error and propagates.
    false
  end
end
|
12
|
+
|
13
|
+
|
7
14
|
# Usage line for this script (bin/pangenome_blast_flanking.rb).
USAGE = "pangenome_blast_flanking.rb [options]"
|
8
15
|
|
9
16
|
gempath = File.dirname(File.dirname(__FILE__))
|
@@ -83,19 +90,31 @@ lines = BioPangenome.load_lines(options[:lines])
|
|
83
90
|
|
84
91
|
projected_genes = BioPangenome.load_projected_genes options[:transcript_mapping], genes: genes
|
85
92
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
93
|
+
|
94
|
+
# Load the sequence set that is compared further down.
#
# When options[:distance] answers true to #integer? the sequences are
# obtained through the coordinate mapping (load_mapping_hash followed by
# load_sequences_from_hash). Otherwise the value is used as the name of a
# sequence set: it becomes part of the prefix and suffix handed to
# load_cds_sequences and is also passed as set_id.
seqs =
  if options[:distance].integer?
    variety_coordinates = BioPangenome.load_mapping_hash(
      varieties: lines,
      genes: projected_genes,
      prefix: options[:basepath],
      distance: options[:distance]
    )
    BioPangenome.load_sequences_from_hash(
      coordinates: variety_coordinates,
      prefix: options[:basepath],
      distance: options[:distance],
      projected_genes: projected_genes
    )
  else
    BioPangenome.load_cds_sequences(
      varieties: lines,
      prefix: "#{options[:basepath]}/#{options[:distance]}/",
      suffix: ".#{options[:distance]}.fa.gz",
      set_id: options[:distance],
      genes: projected_genes
    )
  end

puts "Loaded sequence set for #{seqs.size} genes"
|
99
118
|
|
100
119
|
output = options[:output].to_s
|
101
120
|
output = output + "_" + options[:window].to_s if options[:no_windows] > 0
|
@@ -0,0 +1,125 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# BioRuby bio-pangenome Plugin BioPangenome
|
4
|
+
# Author:: homonecloco
|
5
|
+
# Copyright:: 2019
|
6
|
+
|
7
|
+
USAGE = "pangenome_gene_bed_files.rb [options]"

gempath = File.dirname(File.dirname(__FILE__))
$: << File.join(gempath,'lib')

VERSION_FILENAME=File.join(gempath,'VERSION')
# File.read closes the handle as soon as the version has been read.
version = File.read(VERSION_FILENAME).chomp

# print banner
print "pangenome_gene_bed_files #{version} by Ricardo H. Ramirez-Gonzalez 2020\n"

# Without arguments the defaults below are used; show the call syntax as a hint.
puts USAGE if ARGV.empty?

path = gempath + '/lib/bio-pangenome.rb'
require path
#require 'bio-pangenome'
require 'optparse'
require 'tmpdir'

require 'csv'
require 'zlib'
require 'bio'
require 'bio-svgenes'


# Defaults; each can be overridden from the command line except :is_gz.
options = {
  lines: "lines.txt",
  distances: [0,1000,2000,5000],
  path: "./gff/",
  is_gz: true
}

opts = OptionParser.new do |o|
  o.banner = "Usage: #{USAGE}"

  o.on("-l", "--lines PATH", "File containing the lines to be analysed") do |arg|
    options[:lines] = arg
  end

  o.on("-d", "--distances 0,1000,2000", "Comma-separated flanking distances (bp) to write BED files for") do |arg|
    options[:distances] = arg.split(",").map { |e| e.to_i }
  end

  o.on("-p", "--gff_path DIR", "The directory where the gff files are") do |arg|
    options[:path ] = arg
  end

  o.separator ""
  o.on_tail('-h', '--help', 'display this help and exit') do
    options[:show_help] = true
  end

end

opts.parse!(ARGV)

# Honour --help: print the generated option summary and stop before any
# input file is touched.
if options[:show_help]
  puts opts
  exit
end

puts options.inspect

# One line (variety) name per row of the lines file.
lines = File.foreach(options[:lines]).map { |line| line.chomp }
distances = options[:distances]
puts lines



# One GFF3 reader per line, expected at <path>/<line>.gff.gz.
gffs = MultipleGFFs.new(folder: options[:path], lines:lines, suffix:".gff.gz",
 is_gz:options[:is_gz] )

# Write one BED file per line and per distance next to the GFF files.
distances.each do |d|
  gffs.bedAround(distance: d, prefix: options[:path], suffix: ".bed" )
end
|
78
|
+
|
79
|
+
|
80
|
+
# Converts per-line BED files into region lists.
#
# For every entry +k+ of +lines+ the file
# "<prefix><k>_<distance>bp_<suffix_in>" is read and two files are written:
#
# * "<prefix><k>_<distance>bp_<suffix>"     - one "name:first-last" region per
#   row, built from the first three tab-separated columns;
# * "<prefix><k>_<distance>bp_<suffix>.map" - the same region followed by a
#   tab and the fourth column of the row.
#
# The input path is echoed to standard output before it is processed.
#
# @param lines [Array] line names used to build the file names
# @param distance [Integer] distance embedded in the file names
# @param prefix [String] leading part of every path (directory plus any stem)
# @param suffix [String] ending of the region file name
# @param suffix_in [String] ending of the input BED file name
# @return [Array] +lines+, unchanged
def bedAroundToRegions(lines: [], distance: 1000, prefix: "../flanking/filtered/", suffix: ".RefSeqv1.1.reg", suffix_in: ".RefSeqv1.1.bed")
  lines.each do |k|
    stem = "#{prefix}#{k}_#{distance}bp_"
    bed_path = "#{stem}#{suffix_in}"
    puts bed_path

    # Block forms close both outputs even when reading the input fails.
    File.open("#{stem}#{suffix}", "w") do |regions|
      File.open("#{stem}#{suffix}.map", "w") do |region_map|
        File.foreach(bed_path) do |line|
          # chomp (not chomp!) so a final row without a newline still parses:
          # chomp! returns nil when there is nothing to remove.
          fields = line.chomp.split("\t")
          next if fields.empty? # blank rows carry no region

          name, first, last, label = fields
          reg = "#{name}:#{first}-#{last}"
          regions.puts reg
          region_map.puts [reg, label].join("\t")
        end
      end
    end
  end
end
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
# Turn the ".bed" file of every line into its ".reg" region list, once per
# requested distance.
distances.each do |flank|
  bedAroundToRegions(
    lines: lines,
    distance: flank,
    prefix: options[:path],
    suffix_in: ".bed",
    suffix: ".reg"
  )
end
|
120
|
+
|
121
|
+
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
|
data/lib/bio-pangenome.rb
CHANGED
@@ -0,0 +1,77 @@
|
|
1
|
+
# A collection of GFF3 readers, one per line name, that share a folder and a
# file-name suffix. Offers bulk operations over all of them.
class MultipleGFFs
  # Hash mapping each line name to its GFF3 reader, in the order given.
  attr_reader :lines_gffs

  # @param folder [String] directory holding the annotation files
  # @param lines [Array] line names; each file is "<folder>/<line><suffix>"
  # @param suffix [String] file-name ending appended to the line name
  # @param is_gz [Boolean] forwarded to GFF3.new for every file
  def initialize(folder: "../mapping/", lines: [], suffix: ".SM1.cds.sorted.gff", is_gz: false)
    @folder = folder
    @lines = lines
    @suffix = suffix
    @lines_gffs = {}
    @lines.each do |l|
      @lines_gffs[l] = GFF3.new(file: "#{folder}/#{l}#{suffix}", is_gz: is_gz)
    end
  end

  # Yields every (line name, GFF3 reader) pair. Without a block an
  # Enumerator is returned, as the GFF3 iterators do.
  def each_gff
    return enum_for(:each_gff) unless block_given?

    @lines_gffs.each_pair { |k, v| yield k, v }
  end

  # Writes "<prefix><line>_<distance>bp_<suffix>" for every line, filled by
  # GFF3#bedAroundGene. Each path is echoed to standard output first.
  #
  # @param distance [Integer] passed on to bedAroundGene and used in the name
  # @param prefix [String] leading part of every output path
  # @param suffix [String] ending of every output path
  def bedAround(distance: 1000, prefix: "../flanking/releasePGSBV1/", suffix: ".RefSeqv1.1.bed")
    each_gff do |k, v|
      path = "#{prefix}#{k}_#{distance}bp_#{suffix}"
      puts path
      # Block form: the file is closed even if bedAroundGene raises.
      File.open(path, "w") { |out| v.bedAroundGene(distance: distance, out: out) }
    end
  end

  # Collects one Hash per mRNA record of every line.
  #
  # Keys: :line, :id (the "Name" attribute), :chr, :start, :end, :strand,
  # :genomic_length (end - start), the attributes "coverage", "identity",
  # "matches", "mismatches", "indels" and "unknowns" under the matching
  # symbols, plus :cds_count and :cds_max_gap from GFF3#mrna_info.
  #
  # @return [Array<Hash>]
  def summary
    ret = []
    each_gff do |k, v|
      v.each_mrna do |record|
        mrna_stats = v.mrna_info(record.id)
        ret << {
          line: k,
          id: record.get_attribute("Name"),
          chr: record.seqid,
          start: record.start,
          end: record.end,
          strand: record.strand,
          genomic_length: record.end - record.start,
          coverage: record.get_attribute("coverage"),
          identity: record.get_attribute("identity"),
          matches: record.get_attribute("matches"),
          mismatches: record.get_attribute("mismatches"),
          indels: record.get_attribute("indels"),
          unknowns: record.get_attribute("unknowns"),
          cds_count: mrna_stats.cds_count,
          cds_max_gap: mrna_stats.cds_max_gap
        }
      end
    end
    ret
  end

  # Builds a Bio::Graphics::Page with one generic track per line, holding a
  # feature for every CDS returned by GFF3#cds_to_print.
  #
  # @param mrna [String] forwarded to GFF3#cds_to_print
  # @param positions [Boolean] when true each feature's id is the CDS
  #   offset_start; otherwise features get no id
  # @param out accepted for compatibility; not used here
  #   NOTE(review): presumably meant as the SVG destination - confirm and
  #   write the page there if so.
  # @return [Bio::Graphics::Page] the populated page, so the caller can
  #   render or save it (previously the page was built and then dropped)
  def to_svg(mrna: "Sm1_CDS.mrna1", positions: false, out: nil)
    page = Bio::Graphics::Page.new(width: 800,
                                   height: 1000,
                                   number_of_intervals: 10,
                                   background_color: "white")
    each_gff do |k, v|
      generic_track = page.add_track(:glyph => :generic,
                                     :name => k,
                                     :label => true)
      v.cds_to_print(mrna).each do |cds|
        f_id = positions ? cds.offset_start : nil
        feature = Bio::Graphics::MiniFeature.new(start: cds.start,
                                                 end: cds.end,
                                                 fill_color: cds.color,
                                                 id: f_id)
        generic_track.add(feature)
      end
    end
    page
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'bio-gff3'
|
2
|
+
|
3
|
+
# Make parse_line_fast callable directly on the FastLineParser module;
# GFF3#each invokes it as FastLineParser.parse_line_fast(line).
Bio::GFFbrowser::FastLineParser.send(:module_function, :parse_line_fast)

# Per-mRNA counters filled by GFF3#calculate_mrna_stats: the number of CDS
# records and the largest start-minus-previous-end gap between consecutive ones.
MrnaStats = Struct.new(:cds_count, :cds_max_gap)
|
8
|
+
|
9
|
+
# Streaming reader for one GFF3 file (plain or gzip-compressed). Every
# iteration re-reads the file, so no records are kept in memory apart from
# the per-mRNA statistics cache.
class GFF3
  # One CDS prepared for drawing: start/end taken from the "Target"
  # attribute, the fill colour, the record's own seqid/start/end, and the
  # record start relative to the first CDS seen.
  CDS_feature = Struct.new(:start, :end, :color, :ref_chr, :ref_start, :ref_end, :offset_start)

  # @param file [String] path of the GFF3 file
  # @param is_gz [Boolean] true when the file is gzip-compressed
  def initialize(file: "", is_gz: true)
    @file = file
    @is_gz = is_gz
  end

  # Yields a Bio::GFFbrowser::FastLineRecord for every feature line.
  # Blank lines and lines starting with "#" are skipped and reading stops at
  # the "##FASTA" marker. Returns an Enumerator when no block is given.
  #
  # The file is closed when iteration ends, including on early exit or
  # error. A line that cannot be parsed is reported on $stderr and the
  # parser's error is re-raised; errors raised by the caller's block are not
  # intercepted.
  def each
    return enum_for(:each) unless block_given?

    parser = Bio::GFFbrowser::FastLineParser
    open_file do |io|
      io.each_line do |line|
        line.encode!('UTF-8', 'UTF-8', :invalid => :replace)
        line.strip!
        break if line == '##FASTA'
        next if line.empty? || line.start_with?('#')

        begin
          record = Bio::GFFbrowser::FastLineRecord.new(parser.parse_line_fast(line))
        rescue StandardError => e
          $stderr.puts "Unable to parse '#{line}'\n#{e}"
          raise
        end
        # Yield outside the rescue so consumer failures are not mislabelled
        # as parse errors.
        yield record
      end
    end
  end

  # Yields only records whose feature is "gene".
  def each_gene(&block)
    return enum_for(:each_gene) unless block_given?

    each_feature("gene", &block)
  end

  # Yields only records whose feature is "mRNA".
  def each_mrna(&block)
    return enum_for(:each_mrna) unless block_given?

    each_feature("mRNA", &block)
  end

  # Yields only records whose feature is "CDS".
  def each_cds(&block)
    # The enumerator must replay each_cds itself, not each_mrna.
    return enum_for(:each_cds) unless block_given?

    each_feature("CDS", &block)
  end

  # Fills the per-mRNA statistics cache on first use; later calls return
  # immediately. CDS records are grouped by their "Parent" attribute. The
  # gap is measured between a CDS and the one read just before it, and only
  # when both have the same parent.
  #
  # @return [nil]
  def calculate_mrna_stats
    return if @mrna_stats

    # Build in a local first so a failed read does not leave a
    # half-populated cache behind.
    stats = Hash.new { |h, k| h[k] = MrnaStats.new(0, 0) }
    last_mrna = ""
    last_record = nil
    each_cds do |record|
      parent = record.get_attribute "Parent"
      mrna = stats[parent]
      mrna.cds_count += 1
      if last_mrna == parent
        distance = record.start - last_record.end
        mrna.cds_max_gap = distance if distance > mrna.cds_max_gap
      end
      last_record = record
      last_mrna = parent
    end
    @mrna_stats = stats
    nil
  end

  # @param id the "Parent" value the CDS records were grouped under
  # @return [MrnaStats] the counters for +id+; an unseen id gets 0/0
  def mrna_info(id)
    calculate_mrna_stats
    @mrna_stats[id]
  end

  # Writes one tab-separated row per gene to +out+: seqid, start - distance
  # (never below 1), end + distance, "<id>_<source>_<distance>bp", "." and
  # the strand.
  #
  # @param distance [Integer] amount added on both sides of the gene
  # @param out [#puts] destination of the rows
  def bedAroundGene(distance: 1000, out: $stdout)
    each_gene do |record|
      start = [record.start - distance, 1].max
      reg_end = record.end + distance
      out.puts [record.seqid, start, reg_end, "#{record.id}_#{record.source}_#{distance}bp", ".", record.strand].join("\t")
    end
  end

  # Builds a CDS_feature for every CDS record of the file, taking the drawn
  # start and end from the second and third tokens of the "Target" attribute
  # and cycling through +colors+.
  #
  # NOTE(review): +mrna+ and +cannonical_exons+ are accepted but never read,
  # so every CDS in the file is returned - confirm whether filtering by
  # parent mRNA was intended.
  #
  # @return [Array<CDS_feature>]
  def cds_to_print(mrna, cannonical_exons: [], colors: ["#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"])
    cds_features = []
    offset_start = 0
    each_cds do |record|
      target = record.get_attribute "Target"
      _name, target_start, target_end = target.split(" ")
      # The number of features collected so far doubles as the colour index.
      col = colors[cds_features.size % colors.size]
      # Remember where the first CDS starts; later offsets are relative to it.
      offset_start = record.start if offset_start.zero?
      cds_features << CDS_feature.new(target_start.to_i, target_end.to_i, col,
                                      record.seqid, record.start, record.end,
                                      record.start - offset_start)
    end
    cds_features
  end

  private

  # Opens @file (through Zlib when @is_gz) and yields the IO; the block
  # forms guarantee the handle is closed afterwards.
  def open_file(&block)
    if @is_gz
      Zlib::GzipReader.open(@file, &block)
    else
      File.open(@file, &block)
    end
  end

  # Yields the records of #each whose feature column equals +type+.
  def each_feature(type)
    each { |record| yield record if record.feature == type }
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-pangenome
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ricardo H. Ramirez-Gonzalez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-04-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -112,6 +112,7 @@ description: Tools to find similarity between pangenomes.
|
|
112
112
|
email: ricardo.ramirez-gonzalez@jic.ac.uk
|
113
113
|
executables:
|
114
114
|
- pangenome_blast_flanking.rb
|
115
|
+
- pangenome_gene_bed_files.rb
|
115
116
|
extensions: []
|
116
117
|
extra_rdoc_files:
|
117
118
|
- LICENSE.txt
|
@@ -127,7 +128,10 @@ files:
|
|
127
128
|
- Rakefile
|
128
129
|
- VERSION
|
129
130
|
- bin/pangenome_blast_flanking.rb
|
131
|
+
- bin/pangenome_gene_bed_files.rb
|
130
132
|
- lib/bio-pangenome.rb
|
133
|
+
- lib/bio-pangenome/MultipleGFF3.rb
|
134
|
+
- lib/bio-pangenome/gff3_extensions.rb
|
131
135
|
- lib/bio-pangenome/pangenome.rb
|
132
136
|
- test/helper.rb
|
133
137
|
- test/test_bio-pangenome.rb
|
@@ -150,7 +154,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
150
154
|
- !ruby/object:Gem::Version
|
151
155
|
version: '0'
|
152
156
|
requirements: []
|
153
|
-
|
157
|
+
rubyforge_project:
|
158
|
+
rubygems_version: 2.7.7
|
154
159
|
signing_key:
|
155
160
|
specification_version: 4
|
156
161
|
summary: Scripts to analyse pangenomes.
|