bacterial-annotator 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/bacterial-annotator +39 -29
- data/lib/bacterial-annotator/sequence-annotation.rb +209 -30
- data/lib/bacterial-annotator/sequence-fasta.rb +21 -18
- data/lib/bacterial-annotator/sequence-synteny.rb +77 -20
- data/lib/bacterial-annotator.rb +201 -64
- data/lib/bacterial-comparator.rb +42 -26
- data/lib/bacterial-identificator.rb +86 -13
- metadata +3 -3
@@ -6,15 +6,19 @@
|
|
6
6
|
# version: 0.0.1
|
7
7
|
# licence:
|
8
8
|
|
9
|
+
require 'json'
|
10
|
+
require 'zlib'
|
9
11
|
|
10
12
|
class SequenceSynteny
|
11
13
|
|
12
14
|
attr_reader :query_file, :subject_file, :aln_hits, :query_sequences, :subject_sequences
|
13
15
|
|
14
|
-
def initialize query_file, subject_file, name, pidentity, min_coverage, type
|
16
|
+
def initialize root, outdir, query_file, subject_file, name, pidentity, min_coverage, type
|
17
|
+
|
18
|
+
@root = root
|
19
|
+
@outdir = outdir
|
15
20
|
@query_file = query_file
|
16
21
|
@subject_file = subject_file
|
17
|
-
|
18
22
|
@query_sequences = get_sequences(query_file)
|
19
23
|
@subject_sequences = get_sequences(subject_file)
|
20
24
|
|
@@ -28,22 +32,47 @@ class SequenceSynteny
|
|
28
32
|
|
29
33
|
|
30
34
|
# get sequences name with length in hash
|
31
|
-
def get_sequences
|
35
|
+
def get_sequences raw_file
|
32
36
|
|
33
37
|
sequences = {}
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
|
39
|
+
if raw_file.include?(".dmnd")
|
40
|
+
|
41
|
+
seq_info_file = raw_file.gsub(".dmnd",".json.gz")
|
42
|
+
|
43
|
+
json_genes = {}
|
44
|
+
Zlib::GzipReader.open(seq_info_file) {|gz|
|
45
|
+
json_genes = JSON.parse(gz.read)
|
46
|
+
}
|
47
|
+
|
48
|
+
json_genes.each do |gene|
|
49
|
+
|
50
|
+
sequences[gene["cluster_id"]] = {}
|
51
|
+
sequences[gene["cluster_id"]][:length] = gene["consensus_length"].to_f
|
52
|
+
sequences[gene["cluster_id"]][:conserved] = false
|
53
|
+
sequences[gene["cluster_id"]][:contig] = gene["cluster_id"].split("_")[0..-2].join("_") if gene["cluster_id"].include? "_"
|
54
|
+
|
42
55
|
end
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
56
|
+
|
57
|
+
else
|
58
|
+
|
59
|
+
seq_file = raw_file
|
60
|
+
flat = Bio::FlatFile.auto("#{seq_file}")
|
61
|
+
flat.each_entry do |s|
|
62
|
+
s_name = s.definition.chomp.split(" ")[0]
|
63
|
+
sequences[s_name] = {}
|
64
|
+
properties = s.definition.chomp.split(";")
|
65
|
+
partial = false
|
66
|
+
if properties.length >= 2 and properties[1].include? "partial"
|
67
|
+
partial = (properties[1].gsub("partial=","").include? '1')
|
68
|
+
end
|
69
|
+
sequences[s_name][:partial] = partial
|
70
|
+
sequences[s_name][:length] = s.seq.length
|
71
|
+
sequences[s_name][:conserved] = false
|
72
|
+
sequences[s_name][:contig] = s_name.split("_")[0..-2].join("_") if s_name.include? "_"
|
73
|
+
|
74
|
+
end
|
75
|
+
|
47
76
|
end
|
48
77
|
|
49
78
|
sequences
|
@@ -51,14 +80,42 @@ class SequenceSynteny
|
|
51
80
|
end
|
52
81
|
|
53
82
|
# run blat on proteins
|
54
|
-
def run_blat
|
55
|
-
base_cmd = "#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity} > /dev/null 2>&1"
|
83
|
+
def run_blat
|
84
|
+
base_cmd = "#{@root}/blat.linux -out=blast8 -minIdentity=#{@pidentity} > /dev/null 2>&1"
|
56
85
|
if @type == "prot"
|
57
|
-
system("#{base_cmd} -prot #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
|
86
|
+
system("#{base_cmd} -prot #{@subject_file} #{@query_file} #{@outdir}/#{@name}.blat8.tsv")
|
87
|
+
else
|
88
|
+
system("#{base_cmd} #{@subject_file} #{@query_file} #{@outdir}/#{@name}.blat8.tsv")
|
89
|
+
end
|
90
|
+
@aln_file = "#{@outdir}/#{@name}.blat8.tsv"
|
91
|
+
# extract_hits
|
92
|
+
end # end of method
|
93
|
+
|
94
|
+
# run fasta36 on proteins
|
95
|
+
def run_fasta36
|
96
|
+
if @type == "prot"
|
97
|
+
system("#{@root}/fasta36.linux -T 1 -b 3 -E 1e-40 -m 8 #{@query_file} #{@subject_file} > #{@outdir}/#{@name}.fasta36.tsv")
|
98
|
+
else
|
99
|
+
system("#{@root}/glsearch36.linux -T 1 -b 12 -E 1e-40 -m 8 #{@query_file} #{@subject_file} > #{@outdir}/#{@name}.fasta36.tsv")
|
100
|
+
end
|
101
|
+
@aln_file_fasta36 = "#{@outdir}/#{@name}.fasta36.tsv"
|
102
|
+
# extract_hits
|
103
|
+
end # end of method
|
104
|
+
|
105
|
+
# run diamond on proteins
|
106
|
+
def run_diamond
|
107
|
+
if @type == "prot"
|
108
|
+
if subject_file.include? ".dmnd"
|
109
|
+
db_file = subject_file
|
110
|
+
else
|
111
|
+
system("#{@root}/diamond.linux makedb --db #{subject_file} --in #{subject_file} > /dev/null 2>&1")
|
112
|
+
db_file = subject_file
|
113
|
+
end
|
114
|
+
system("#{@root}/diamond.linux blastp --db #{db_file} -q #{query_file} -o #{@outdir}/#{@name}.diamond.tsv -f 6 > /dev/null 2>&1")
|
58
115
|
else
|
59
|
-
system("#{
|
116
|
+
# system("#{@root}/glsearch36.linux -b 3 -E 1e-25 -m 8 #{@subject_file} #{@query_file} > #{@outdir}/#{@name}.fasta36.tsv")
|
60
117
|
end
|
61
|
-
@aln_file = "#{outdir}/#{@name}.
|
118
|
+
@aln_file = "#{@outdir}/#{@name}.diamond.tsv"
|
62
119
|
# extract_hits
|
63
120
|
end # end of method
|
64
121
|
|
data/lib/bacterial-annotator.rb
CHANGED
@@ -23,6 +23,8 @@ class BacterialAnnotator
|
|
23
23
|
@root = root
|
24
24
|
@options = options
|
25
25
|
|
26
|
+
abort if ! @options.has_key? :input
|
27
|
+
|
26
28
|
@minlength = @options[:minlength].to_i
|
27
29
|
@options[:minlength] = @options[:minlength].to_i
|
28
30
|
@options[:pidentity] = @options[:pidentity].to_f
|
@@ -44,12 +46,25 @@ class BacterialAnnotator
|
|
44
46
|
end
|
45
47
|
Dir.mkdir(@options[:outdir])
|
46
48
|
|
47
|
-
@query_fasta = SequenceFasta.new(@
|
49
|
+
@query_fasta = SequenceFasta.new(@root,
|
50
|
+
options[:outdir],
|
51
|
+
@options[:input],
|
52
|
+
@options[:meta])
|
48
53
|
|
49
54
|
@with_refence_genome = false
|
55
|
+
@with_db = false
|
50
56
|
if @options.has_key? :refgenome
|
51
57
|
@with_refence_genome = true
|
52
|
-
@ref_genome = SequenceAnnotation.new(@
|
58
|
+
@ref_genome = SequenceAnnotation.new(@root,
|
59
|
+
@options[:outdir],
|
60
|
+
@options[:refgenome],
|
61
|
+
"refGbk")
|
62
|
+
elsif @options[:mergem]
|
63
|
+
@with_db = true
|
64
|
+
@ref_genome = SequenceAnnotation.new(@root,
|
65
|
+
@options[:outdir],
|
66
|
+
@options[:mergem],
|
67
|
+
"db")
|
53
68
|
end
|
54
69
|
|
55
70
|
@with_external_db = false
|
@@ -76,52 +91,12 @@ class BacterialAnnotator
|
|
76
91
|
|
77
92
|
end # end of method
|
78
93
|
|
79
|
-
# Prepare files for the annotation
|
80
|
-
# Will run prodigal on the query and prepare reference genome files
|
81
|
-
def prepare_files_for_annotation
|
82
|
-
print "# Running Prodigal on your genome.."
|
83
|
-
start_time = Time.now
|
84
|
-
@query_fasta.run_prodigal @root, @options[:outdir]
|
85
|
-
end_time = Time.now
|
86
|
-
c_time = Helper.sec2str(end_time - start_time)
|
87
|
-
print "done (#{c_time})\n"
|
88
|
-
if @with_refence_genome
|
89
|
-
@ref_genome.write_cds_to_file @options[:outdir]
|
90
|
-
@ref_genome.write_rna_to_file @options[:outdir]
|
91
|
-
# puts "Successfully loaded #{@ref_genome.gbk.definition}"
|
92
|
-
end
|
93
|
-
end # end of method
|
94
|
-
|
95
|
-
|
96
|
-
def run_reference_synteny_prot
|
97
|
-
|
98
|
-
ref_synteny_prot = SequenceSynteny.new(@query_fasta.annotation_files[:proteins], @ref_genome.cds_file,
|
99
|
-
"Prot-Ref", @options[:pidentity], @options[:pcoverage], "prot")
|
100
|
-
|
101
|
-
print "# Running alignment with Reference Genome CDS (blat).."
|
102
|
-
start_time = Time.now
|
103
|
-
ref_synteny_prot.run_blat @root, @options[:outdir]
|
104
|
-
end_time = Time.now
|
105
|
-
c_time = Helper.sec2str(end_time - start_time)
|
106
|
-
print "done (#{c_time})\n"
|
107
|
-
|
108
|
-
ref_synteny_prot.extract_hits :refgenome
|
109
|
-
|
110
|
-
ref_synteny_prot.query_sequences.each do |k,v|
|
111
|
-
if v.has_key? :homology
|
112
|
-
@contig_annotations_cds[v[:contig]] = [] if ! @contig_annotations_cds.has_key? v[:contig]
|
113
|
-
@contig_annotations_cds[v[:contig]] << k
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
ref_synteny_prot
|
118
|
-
|
119
|
-
end
|
120
|
-
|
121
94
|
|
122
95
|
# run_alignment of reference genome proteins and the query
|
123
96
|
def run_annotation
|
124
97
|
|
98
|
+
prepare_files_for_annotation
|
99
|
+
|
125
100
|
# process reference genome synteny
|
126
101
|
if @with_refence_genome # Annotation with the Reference Genome
|
127
102
|
|
@@ -153,23 +128,69 @@ class BacterialAnnotator
|
|
153
128
|
dump_ref_synteny_to_file
|
154
129
|
|
155
130
|
# run RNA annotation
|
156
|
-
@rna_synteny = SequenceSynteny.new(@
|
157
|
-
|
131
|
+
@rna_synteny = SequenceSynteny.new(@root,
|
132
|
+
@options[:outdir],
|
133
|
+
@query_fasta.fasta_file,
|
134
|
+
@ref_genome.rna_file,
|
135
|
+
"RNA-Ref",
|
136
|
+
@options[:pidentity],
|
137
|
+
@options[:pcoverage],
|
138
|
+
"dna")
|
139
|
+
|
158
140
|
print "# Running alignment with Reference Genome RNA (blat).."
|
159
141
|
start_time = Time.now
|
160
|
-
@rna_synteny.run_blat
|
142
|
+
@rna_synteny.run_blat
|
161
143
|
end_time = Time.now
|
162
144
|
c_time = Helper.sec2str(end_time-start_time)
|
163
145
|
print "done (#{c_time})\n"
|
146
|
+
|
147
|
+
# # takes too long
|
148
|
+
# print "# Running alignment with Reference Genome RNA (fasta36).."
|
149
|
+
# start_time = Time.now
|
150
|
+
# @rna_synteny.run_fasta36
|
151
|
+
# end_time = Time.now
|
152
|
+
# c_time = Helper.sec2str(end_time-start_time)
|
153
|
+
# print "done (#{c_time})\n"
|
154
|
+
|
164
155
|
@rna_synteny.extract_hits_dna :rna
|
165
156
|
@contig_annotations_rna = {}
|
166
157
|
@query_fasta.annotation_files[:contigs].each_with_index do |contig, contig_index|
|
167
158
|
@contig_annotations_rna[contig] = @rna_synteny.get_annotation_for_contig contig
|
168
159
|
end
|
169
160
|
|
161
|
+
|
162
|
+
elsif @with_db
|
163
|
+
|
164
|
+
@prot_synteny_refgenome = run_mergem_synteny_prot
|
165
|
+
# iterate over each contig
|
166
|
+
# discard short contig
|
167
|
+
# cumulate statistics of homolog CDS
|
168
|
+
@query_fasta.annotation_files[:contigs].each_with_index do |contig, contig_index|
|
169
|
+
|
170
|
+
# Skip short contigs
|
171
|
+
if @query_fasta.annotation_files[:contigs_length][contig_index] < @minlength
|
172
|
+
@annotation_stats[:short_contigs] << contig
|
173
|
+
next
|
174
|
+
end
|
175
|
+
|
176
|
+
remaining_cds = cumulate_annotation_stats_reference contig
|
177
|
+
|
178
|
+
if remaining_cds != []
|
179
|
+
@contig_foreign_cds[contig] = remaining_cds
|
180
|
+
end
|
181
|
+
|
182
|
+
end
|
183
|
+
|
184
|
+
# dump foreign proteins to file
|
185
|
+
foreign_cds_file = dump_cds
|
186
|
+
|
187
|
+
# dump reference CDS synteny to file
|
188
|
+
dump_ref_synteny_to_file
|
189
|
+
|
190
|
+
|
170
191
|
else # no reference genome
|
171
192
|
|
172
|
-
# no reference genome .. will process all the CDS
|
193
|
+
# no reference genome .. will process all the CDS as foreign for the external db
|
173
194
|
foreign_cds_file = @query_fasta.annotation_files[:proteins]
|
174
195
|
|
175
196
|
end
|
@@ -187,6 +208,99 @@ class BacterialAnnotator
|
|
187
208
|
end # end of method
|
188
209
|
|
189
210
|
|
211
|
+
# Prepare files for the annotation
|
212
|
+
# Will run prodigal on the query and prepare reference genome files
|
213
|
+
def prepare_files_for_annotation
|
214
|
+
print "# Running Prodigal on your genome.."
|
215
|
+
start_time = Time.now
|
216
|
+
@query_fasta.run_prodigal
|
217
|
+
end_time = Time.now
|
218
|
+
c_time = Helper.sec2str(end_time - start_time)
|
219
|
+
print "done (#{c_time})\n"
|
220
|
+
end # end of method
|
221
|
+
|
222
|
+
|
223
|
+
def run_mergem_synteny_prot
|
224
|
+
|
225
|
+
|
226
|
+
ref_synteny_prot = SequenceSynteny.new(@root,
|
227
|
+
@options[:outdir],
|
228
|
+
@query_fasta.annotation_files[:proteins],
|
229
|
+
@ref_genome.cds_file,
|
230
|
+
"Prot-Ref",
|
231
|
+
@options[:pidentity],
|
232
|
+
@options[:pcoverage],
|
233
|
+
"prot")
|
234
|
+
|
235
|
+
print "# Running alignment with Reference Genome CDS (diamond).."
|
236
|
+
start_time = Time.now
|
237
|
+
ref_synteny_prot.run_diamond
|
238
|
+
end_time = Time.now
|
239
|
+
c_time = Helper.sec2str(end_time - start_time)
|
240
|
+
print "done (#{c_time})\n"
|
241
|
+
|
242
|
+
ref_synteny_prot.extract_hits :refgenome
|
243
|
+
|
244
|
+
ref_synteny_prot.query_sequences.each do |k,v|
|
245
|
+
if v.has_key? :homology
|
246
|
+
@contig_annotations_cds[v[:contig]] = [] if ! @contig_annotations_cds.has_key? v[:contig]
|
247
|
+
@contig_annotations_cds[v[:contig]] << k
|
248
|
+
end
|
249
|
+
end
|
250
|
+
|
251
|
+
ref_synteny_prot
|
252
|
+
|
253
|
+
|
254
|
+
end
|
255
|
+
|
256
|
+
|
257
|
+
|
258
|
+
def run_reference_synteny_prot
|
259
|
+
|
260
|
+
ref_synteny_prot = SequenceSynteny.new(@root,
|
261
|
+
@options[:outdir],
|
262
|
+
@query_fasta.annotation_files[:proteins],
|
263
|
+
@ref_genome.cds_file,
|
264
|
+
"Prot-Ref",
|
265
|
+
@options[:pidentity],
|
266
|
+
@options[:pcoverage],
|
267
|
+
"prot")
|
268
|
+
|
269
|
+
print "# Running alignment with Reference Genome CDS (diamond).."
|
270
|
+
start_time = Time.now
|
271
|
+
ref_synteny_prot.run_diamond
|
272
|
+
end_time = Time.now
|
273
|
+
c_time = Helper.sec2str(end_time - start_time)
|
274
|
+
print "done (#{c_time})\n"
|
275
|
+
|
276
|
+
# print "# Running alignment with Reference Genome CDS (blat).."
|
277
|
+
# start_time = Time.now
|
278
|
+
# ref_synteny_prot.run_blat
|
279
|
+
# end_time = Time.now
|
280
|
+
# c_time = Helper.sec2str(end_time - start_time)
|
281
|
+
# print "done (#{c_time})\n"
|
282
|
+
|
283
|
+
# print "# Running alignment with Reference Genome CDS (fasta36).."
|
284
|
+
# start_time = Time.now
|
285
|
+
# ref_synteny_prot.run_fasta36
|
286
|
+
# end_time = Time.now
|
287
|
+
# c_time = Helper.sec2str(end_time - start_time)
|
288
|
+
# print "done (#{c_time})\n"
|
289
|
+
|
290
|
+
ref_synteny_prot.extract_hits :refgenome
|
291
|
+
|
292
|
+
ref_synteny_prot.query_sequences.each do |k,v|
|
293
|
+
if v.has_key? :homology
|
294
|
+
@contig_annotations_cds[v[:contig]] = [] if ! @contig_annotations_cds.has_key? v[:contig]
|
295
|
+
@contig_annotations_cds[v[:contig]] << k
|
296
|
+
end
|
297
|
+
end
|
298
|
+
|
299
|
+
ref_synteny_prot
|
300
|
+
|
301
|
+
end
|
302
|
+
|
303
|
+
|
190
304
|
# Finishing the annotation of the remaining CDS
|
191
305
|
def finish_annotation remaining_cds_file
|
192
306
|
|
@@ -194,15 +308,25 @@ class BacterialAnnotator
|
|
194
308
|
if @options.has_key? :external_db # from an external DB
|
195
309
|
|
196
310
|
db_file = @options[:external_db]
|
197
|
-
ref_cds =
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
311
|
+
ref_cds = SequenceAnnotation.new(@root,
|
312
|
+
@options[:outdir],
|
313
|
+
db_file,
|
314
|
+
"fasta")
|
315
|
+
|
316
|
+
# ref_cds = extract_externaldb_prot_info db_file
|
317
|
+
|
318
|
+
@externaldb_synteny = SequenceSynteny.new(@root,
|
319
|
+
@options[:outdir],
|
320
|
+
remaining_cds_file,
|
321
|
+
db_file,
|
322
|
+
"Prot-ExternalDB",
|
323
|
+
@options[:pidentity],
|
324
|
+
@options[:pcoverage],
|
325
|
+
"prot")
|
202
326
|
|
203
327
|
print "# Running BLAT alignment with External Database.."
|
204
328
|
start_time = Time.now
|
205
|
-
@externaldb_synteny.run_blat
|
329
|
+
@externaldb_synteny.run_blat
|
206
330
|
end_time = Time.now
|
207
331
|
c_time = Helper.sec2str(end_time-start_time)
|
208
332
|
print "done (#{c_time})\n"
|
@@ -228,18 +352,18 @@ class BacterialAnnotator
|
|
228
352
|
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
|
229
353
|
cov_query = (v[:homology][:cov_query]*100).round(2)
|
230
354
|
cov_subject = (v[:homology][:cov_subject]*100).round(2)
|
231
|
-
note = "Protein homology (AA identity: #{v[:homology][:pId]}%; coverage (q,s): #{cov_query}%,#{cov_subject}%) with #{ref_cds[hit_gi][:prot_id]}"
|
232
|
-
inference = "similar to AA sequence:#{ref_cds[hit_gi][:db_source]}:#{ref_cds[hit_gi][:prot_id]}"
|
355
|
+
note = "Protein homology (AA identity: #{v[:homology][:pId]}%; coverage (q,s): #{cov_query}%,#{cov_subject}%) with #{ref_cds.coding_seq[hit_gi][:prot_id]}"
|
356
|
+
inference = "similar to AA sequence:#{ref_cds.coding_seq[hit_gi][:db_source]}:#{ref_cds.coding_seq[hit_gi][:prot_id]}"
|
233
357
|
|
234
|
-
if ref_cds[hit_gi][:org] != ""
|
235
|
-
note += " from #{ref_cds[hit_gi][:org]}"
|
358
|
+
if ref_cds.coding_seq[hit_gi][:org] != ""
|
359
|
+
note += " from #{ref_cds.coding_seq[hit_gi][:org]}"
|
236
360
|
end
|
237
361
|
|
238
362
|
@contig_annotations_externaldb[contig_of_protein][v[:homology][:hits][0]] = {
|
239
|
-
product: ref_cds[hit_gi][:product],
|
363
|
+
product: ref_cds.coding_seq[hit_gi][:product],
|
240
364
|
feature: "cds",
|
241
365
|
gene: nil,
|
242
|
-
prot_id: ref_cds[hit_gi][:prot_id],
|
366
|
+
prot_id: ref_cds.coding_seq[hit_gi][:prot_id],
|
243
367
|
locustag: nil,
|
244
368
|
note: note,
|
245
369
|
inference: inference
|
@@ -260,7 +384,10 @@ class BacterialAnnotator
|
|
260
384
|
@contig_annotations_cds.each do |contig, contig_prots|
|
261
385
|
|
262
386
|
gbk_path = @query_fasta.annotation_files[:gbk_path]
|
263
|
-
gbk_to_annotate = SequenceAnnotation.new(
|
387
|
+
gbk_to_annotate = SequenceAnnotation.new(@root,
|
388
|
+
"#{gbk_path}",
|
389
|
+
"#{gbk_path}/#{contig}.gbk",
|
390
|
+
"newGbk")
|
264
391
|
|
265
392
|
if @with_external_db and @with_refence_genome
|
266
393
|
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
@@ -273,6 +400,11 @@ class BacterialAnnotator
|
|
273
400
|
@externaldb_synteny.query_sequences,
|
274
401
|
@contig_annotations_externaldb[contig]
|
275
402
|
)
|
403
|
+
elsif @with_db
|
404
|
+
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
405
|
+
@prot_synteny_refgenome.query_sequences,
|
406
|
+
@ref_genome.coding_seq
|
407
|
+
)
|
276
408
|
else
|
277
409
|
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
278
410
|
@prot_synteny_refgenome.query_sequences,
|
@@ -286,7 +418,7 @@ class BacterialAnnotator
|
|
286
418
|
gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
|
287
419
|
end
|
288
420
|
|
289
|
-
gbk_to_annotate.save_genbank_to_file
|
421
|
+
gbk_to_annotate.save_genbank_to_file
|
290
422
|
|
291
423
|
end
|
292
424
|
end_time = Time.now
|
@@ -579,9 +711,14 @@ class BacterialAnnotator
|
|
579
711
|
partial = ref_annotated[ref_v[:protId]][:partial]
|
580
712
|
end
|
581
713
|
|
714
|
+
_locus_tag = ref_v[:locustag] || ""
|
715
|
+
_seq_len = "NA"
|
716
|
+
# _seq_len = ref_v[:bioseq].seq.length.to_s if ! ref_v[:bioseq].nil?
|
717
|
+
_seq_len = ref_v[:length].to_s if ! ref_v[:length].nil?
|
718
|
+
|
582
719
|
synteny_file.write(ref_v[:protId])
|
583
|
-
synteny_file.write("\t"+
|
584
|
-
synteny_file.write("\t"+
|
720
|
+
synteny_file.write("\t"+_locus_tag)
|
721
|
+
synteny_file.write("\t"+_seq_len)
|
585
722
|
synteny_file.write("\t"+coverage_ref.to_s)
|
586
723
|
synteny_file.write("\t"+pId.to_s)
|
587
724
|
synteny_file.write("\t"+gene)
|
data/lib/bacterial-comparator.rb
CHANGED
@@ -34,15 +34,31 @@ class BacterialComparator
|
|
34
34
|
min_pid = min_pid/100
|
35
35
|
end
|
36
36
|
|
37
|
+
@aln_opt = options[:align].downcase
|
38
|
+
@run_phylo = 0
|
39
|
+
if options[:phylogeny] == 1
|
40
|
+
@bootstrap = options[:bootstrap]
|
41
|
+
@run_phylo = 1
|
42
|
+
end
|
43
|
+
|
37
44
|
@ref_prot = get_ref_prot
|
38
45
|
@synteny = read_prot_synteny
|
39
46
|
@stats = extract_syntenic_fasta min_cov, min_pid
|
40
47
|
|
41
48
|
end
|
42
49
|
|
50
|
+
|
51
|
+
def run_comparison
|
52
|
+
|
53
|
+
run_mafft_aln
|
54
|
+
run_raxml_phylo if @run_phylo != 0
|
55
|
+
|
56
|
+
end
|
57
|
+
|
58
|
+
|
43
59
|
def read_prot_synteny
|
44
60
|
|
45
|
-
|
61
|
+
puts "# Reading genome synteny files START.."
|
46
62
|
start_time = Time.now
|
47
63
|
synteny = {}
|
48
64
|
@genomes_list.each do |g|
|
@@ -65,7 +81,8 @@ class BacterialComparator
|
|
65
81
|
end
|
66
82
|
end_time = Time.now
|
67
83
|
c_time = Helper.sec2str(end_time-start_time)
|
68
|
-
|
84
|
+
|
85
|
+
puts "# Reading genome synteny files [DONE] (in #{c_time})"
|
69
86
|
|
70
87
|
synteny
|
71
88
|
|
@@ -146,7 +163,7 @@ class BacterialComparator
|
|
146
163
|
# extract and dump multifasta for syntenic genes and proteins
|
147
164
|
def extract_syntenic_fasta min_cov, min_pid
|
148
165
|
|
149
|
-
|
166
|
+
puts "# Extracting Proteins and Genes multifasta START.."
|
150
167
|
start_time = Time.now
|
151
168
|
|
152
169
|
nb_of_syntenic = 0
|
@@ -216,14 +233,13 @@ class BacterialComparator
|
|
216
233
|
|
217
234
|
end_time = Time.now
|
218
235
|
c_time = Helper.sec2str(end_time-start_time)
|
219
|
-
|
236
|
+
puts "# Extracting Proteins and Genes multifasta [DONE] (in #{c_time})"
|
220
237
|
|
221
238
|
stats[:nb_of_syntenic] = nb_of_syntenic
|
222
239
|
#puts " Syntenic genes : " + nb_of_syntenic.to_s + " / " + @ref_prot.length.to_s
|
223
240
|
|
224
241
|
end
|
225
242
|
|
226
|
-
|
227
243
|
def mafft_align f
|
228
244
|
|
229
245
|
trying = 0
|
@@ -252,7 +268,7 @@ class BacterialComparator
|
|
252
268
|
|
253
269
|
def mafft_align_all_pep
|
254
270
|
|
255
|
-
|
271
|
+
puts "# Sequence alignments - individual proteins a.a. (MAFFT) START.."
|
256
272
|
start_time = Time.now
|
257
273
|
|
258
274
|
ori_dir = Dir.pwd
|
@@ -277,7 +293,7 @@ class BacterialComparator
|
|
277
293
|
|
278
294
|
end_time = Time.now
|
279
295
|
c_time = Helper.sec2str(end_time-start_time)
|
280
|
-
|
296
|
+
puts "# Sequence alignments - individual proteins a.a. (MAFFT) [DONE] (in #{c_time})"
|
281
297
|
|
282
298
|
# FIXME ugly hack to find out the reference genome
|
283
299
|
ref_id = Dir["#{ori_dir}/#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
|
@@ -290,7 +306,7 @@ class BacterialComparator
|
|
290
306
|
|
291
307
|
def mafft_align_all_dna
|
292
308
|
|
293
|
-
|
309
|
+
puts "# Sequence alignments - individual genes dna (MAFFT) START.."
|
294
310
|
start_time = Time.now
|
295
311
|
|
296
312
|
ori_dir = Dir.pwd
|
@@ -313,12 +329,12 @@ class BacterialComparator
|
|
313
329
|
}
|
314
330
|
end
|
315
331
|
|
316
|
-
# ugly hack to find out the reference genome
|
332
|
+
# ugly hack to find out the reference genome FIXME
|
317
333
|
ref_id = Dir["#{ori_dir}/#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
|
318
334
|
|
319
335
|
end_time = Time.now
|
320
336
|
c_time = Helper.sec2str(end_time-start_time)
|
321
|
-
|
337
|
+
puts "# Sequence alignments - individual genes dna (MAFFT) [DONE] (in #{c_time})"
|
322
338
|
|
323
339
|
concat_alignments "align-genes-dna.all.fasta", ref_id
|
324
340
|
|
@@ -377,21 +393,21 @@ class BacterialComparator
|
|
377
393
|
|
378
394
|
end
|
379
395
|
|
380
|
-
def
|
396
|
+
def run_mafft_aln
|
381
397
|
|
382
|
-
if aln_opt == "both"
|
398
|
+
if @aln_opt == "both"
|
383
399
|
mafft_align_all_pep
|
384
400
|
mafft_align_all_dna
|
385
|
-
elsif aln_opt == "prot"
|
401
|
+
elsif @aln_opt == "prot"
|
386
402
|
mafft_align_all_pep
|
387
|
-
elsif aln_opt == "dna"
|
403
|
+
elsif @aln_opt == "dna"
|
388
404
|
mafft_align_all_dna
|
389
405
|
end
|
390
406
|
|
391
407
|
end
|
392
408
|
|
393
409
|
def raxml_tree_dna bt
|
394
|
-
|
410
|
+
puts "# Genes DNA tree creation (RAXML) START.."
|
395
411
|
start_time = Time.now
|
396
412
|
ori_dir = Dir.pwd
|
397
413
|
Dir.chdir(@outdir)
|
@@ -405,11 +421,11 @@ class BacterialComparator
|
|
405
421
|
Dir.chdir(ori_dir)
|
406
422
|
end_time = Time.now
|
407
423
|
c_time = Helper.sec2str(end_time-start_time)
|
408
|
-
|
424
|
+
puts "# Genes DNA tree creation (RAXML) [DONE] (in #{c_time})"
|
409
425
|
end
|
410
426
|
|
411
427
|
def raxml_tree_pep bt
|
412
|
-
|
428
|
+
puts "# Proteins AA tree creation (RAXML) START.."
|
413
429
|
start_time = Time.now
|
414
430
|
ori_dir = Dir.pwd
|
415
431
|
Dir.chdir(@outdir)
|
@@ -423,18 +439,18 @@ class BacterialComparator
|
|
423
439
|
Dir.chdir(ori_dir)
|
424
440
|
end_time = Time.now
|
425
441
|
c_time = Helper.sec2str(end_time-start_time)
|
426
|
-
|
442
|
+
puts "# Proteins AA tree creation (RAXML) [DONE] (in #{c_time})"
|
427
443
|
end
|
428
444
|
|
429
|
-
def
|
445
|
+
def run_raxml_phylo
|
430
446
|
|
431
|
-
if aln_opt == "both"
|
432
|
-
raxml_tree_dna
|
433
|
-
raxml_tree_pep
|
434
|
-
elsif aln_opt == "prot"
|
435
|
-
raxml_tree_pep
|
436
|
-
elsif aln_opt == "dna"
|
437
|
-
raxml_tree_dna
|
447
|
+
if @aln_opt == "both"
|
448
|
+
raxml_tree_dna @bootstrap
|
449
|
+
raxml_tree_pep @bootstrap
|
450
|
+
elsif @aln_opt == "prot"
|
451
|
+
raxml_tree_pep @bootstrap
|
452
|
+
elsif @aln_opt == "dna"
|
453
|
+
raxml_tree_dna @bootstrap
|
438
454
|
end
|
439
455
|
|
440
456
|
end
|