bacterial-annotator 0.7.0 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/bacterial-annotator +39 -29
- data/lib/bacterial-annotator/sequence-annotation.rb +209 -30
- data/lib/bacterial-annotator/sequence-fasta.rb +21 -18
- data/lib/bacterial-annotator/sequence-synteny.rb +77 -20
- data/lib/bacterial-annotator.rb +201 -64
- data/lib/bacterial-comparator.rb +42 -26
- data/lib/bacterial-identificator.rb +86 -13
- metadata +3 -3
@@ -6,15 +6,19 @@
|
|
6
6
|
# version: 0.0.1
|
7
7
|
# licence:
|
8
8
|
|
9
|
+
require 'json'
|
10
|
+
require 'zlib'
|
9
11
|
|
10
12
|
class SequenceSynteny
|
11
13
|
|
12
14
|
attr_reader :query_file, :subject_file, :aln_hits, :query_sequences, :subject_sequences
|
13
15
|
|
14
|
-
def initialize query_file, subject_file, name, pidentity, min_coverage, type
|
16
|
+
def initialize root, outdir, query_file, subject_file, name, pidentity, min_coverage, type
|
17
|
+
|
18
|
+
@root = root
|
19
|
+
@outdir = outdir
|
15
20
|
@query_file = query_file
|
16
21
|
@subject_file = subject_file
|
17
|
-
|
18
22
|
@query_sequences = get_sequences(query_file)
|
19
23
|
@subject_sequences = get_sequences(subject_file)
|
20
24
|
|
@@ -28,22 +32,47 @@ class SequenceSynteny
|
|
28
32
|
|
29
33
|
|
30
34
|
# get sequences name with length in hash
|
31
|
-
def get_sequences
|
35
|
+
def get_sequences raw_file
|
32
36
|
|
33
37
|
sequences = {}
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
|
39
|
+
if raw_file.include?(".dmnd")
|
40
|
+
|
41
|
+
seq_info_file = raw_file.gsub(".dmnd",".json.gz")
|
42
|
+
|
43
|
+
json_genes = {}
|
44
|
+
Zlib::GzipReader.open(seq_info_file) {|gz|
|
45
|
+
json_genes = JSON.parse(gz.read)
|
46
|
+
}
|
47
|
+
|
48
|
+
json_genes.each do |gene|
|
49
|
+
|
50
|
+
sequences[gene["cluster_id"]] = {}
|
51
|
+
sequences[gene["cluster_id"]][:length] = gene["consensus_length"].to_f
|
52
|
+
sequences[gene["cluster_id"]][:conserved] = false
|
53
|
+
sequences[gene["cluster_id"]][:contig] = gene["cluster_id"].split("_")[0..-2].join("_") if gene["cluster_id"].include? "_"
|
54
|
+
|
42
55
|
end
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
56
|
+
|
57
|
+
else
|
58
|
+
|
59
|
+
seq_file = raw_file
|
60
|
+
flat = Bio::FlatFile.auto("#{seq_file}")
|
61
|
+
flat.each_entry do |s|
|
62
|
+
s_name = s.definition.chomp.split(" ")[0]
|
63
|
+
sequences[s_name] = {}
|
64
|
+
properties = s.definition.chomp.split(";")
|
65
|
+
partial = false
|
66
|
+
if properties.length >= 2 and properties[1].include? "partial"
|
67
|
+
partial = (properties[1].gsub("partial=","").include? '1')
|
68
|
+
end
|
69
|
+
sequences[s_name][:partial] = partial
|
70
|
+
sequences[s_name][:length] = s.seq.length
|
71
|
+
sequences[s_name][:conserved] = false
|
72
|
+
sequences[s_name][:contig] = s_name.split("_")[0..-2].join("_") if s_name.include? "_"
|
73
|
+
|
74
|
+
end
|
75
|
+
|
47
76
|
end
|
48
77
|
|
49
78
|
sequences
|
@@ -51,14 +80,42 @@ class SequenceSynteny
|
|
51
80
|
end
|
52
81
|
|
53
82
|
# run blat on the query/subject pair (adds -prot when @type is "prot")
|
54
|
-
def run_blat
|
55
|
-
base_cmd = "#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity} > /dev/null 2>&1"
|
83
|
+
# Align the query against the subject with BLAT and remember where the
# tabular (blast8) output was written.
#
# Reads @root, @outdir, @name, @pidentity, @type, @query_file and
# @subject_file; sets and returns @aln_file. Hits are not parsed here.
#
# The command is handed to Kernel#system as an argument list instead of one
# interpolated string, so no shell is involved and paths that contain spaces
# or shell metacharacters are passed to BLAT untouched. BLAT's stdout and
# stderr are discarded, exactly as the former "> /dev/null 2>&1" did.
def run_blat
  aln_file = "#{@outdir}/#{@name}.blat8.tsv"

  cmd = ["#{@root}/blat.linux", "-out=blast8", "-minIdentity=#{@pidentity}"]
  # -prot is only added for protein searches
  cmd << "-prot" if @type == "prot"
  # argument order expected by the original command: database, query, output
  cmd.push(@subject_file.to_s, @query_file.to_s, aln_file)

  system(*cmd, out: File::NULL, err: File::NULL)

  @aln_file = aln_file
  # extract_hits
end # end of method
|
93
|
+
|
94
|
+
# run fasta36 on proteins (glsearch36 is used instead when @type is not "prot")
|
95
|
+
# Align the query against the subject with the FASTA36 suite and remember
# where the tabular (-m 8) output was written.
#
# Uses fasta36.linux (-b 3) when @type is "prot" and glsearch36.linux (-b 12)
# otherwise. Reads @root, @outdir, @name, @type, @query_file and
# @subject_file; sets and returns @aln_file_fasta36 (note: not @aln_file).
#
# The program is started from an argument list, without a shell, so paths
# containing spaces or shell metacharacters are safe. The program's stdout is
# written to the output file through the :out redirection (the file is opened
# by Ruby rather than by a shell ">"); stderr is left untouched, as before.
def run_fasta36
  aln_file = "#{@outdir}/#{@name}.fasta36.tsv"

  if @type == "prot"
    program = "fasta36.linux"
    best_hits = "3"
  else
    program = "glsearch36.linux"
    best_hits = "12"
  end

  system("#{@root}/#{program}",
         "-T", "1", "-b", best_hits, "-E", "1e-40", "-m", "8",
         @query_file.to_s, @subject_file.to_s,
         out: aln_file)

  @aln_file_fasta36 = aln_file
  # extract_hits
end # end of method
|
104
|
+
|
105
|
+
# run diamond on proteins
|
106
|
+
# Align the query proteins against the subject with DIAMOND blastp and
# remember where the tabular (-f 6) output is expected.
#
# Reads @root, @outdir, @name, @type, @query_file and @subject_file; sets and
# returns @aln_file.
#
# * When @type is "prot": if the subject path does not contain ".dmnd", a
#   DIAMOND database is first built from it with "makedb" (using the subject
#   path itself as the --db name); a subject that already contains ".dmnd" is
#   used as the database directly. Then "blastp" is run.
# * For any other @type nothing is executed (the original left a glsearch36
#   call disabled here); @aln_file is still set, so the file it names may not
#   exist in that case.
#
# DIAMOND is started from an argument list, without a shell, so paths with
# spaces or shell metacharacters are safe; its stdout/stderr are discarded.
def run_diamond
  aln_file = "#{@outdir}/#{@name}.diamond.tsv"

  if @type == "prot"
    diamond = "#{@root}/diamond.linux"
    db_file = @subject_file.to_s

    # build the database only when we were not handed a prebuilt one
    unless db_file.include?(".dmnd")
      system(diamond, "makedb", "--db", db_file, "--in", db_file,
             out: File::NULL, err: File::NULL)
    end

    system(diamond, "blastp", "--db", db_file, "-q", @query_file.to_s,
           "-o", aln_file, "-f", "6",
           out: File::NULL, err: File::NULL)
  end

  @aln_file = aln_file
  # extract_hits
end # end of method
|
64
121
|
|
data/lib/bacterial-annotator.rb
CHANGED
@@ -23,6 +23,8 @@ class BacterialAnnotator
|
|
23
23
|
@root = root
|
24
24
|
@options = options
|
25
25
|
|
26
|
+
abort if ! @options.has_key? :input
|
27
|
+
|
26
28
|
@minlength = @options[:minlength].to_i
|
27
29
|
@options[:minlength] = @options[:minlength].to_i
|
28
30
|
@options[:pidentity] = @options[:pidentity].to_f
|
@@ -44,12 +46,25 @@ class BacterialAnnotator
|
|
44
46
|
end
|
45
47
|
Dir.mkdir(@options[:outdir])
|
46
48
|
|
47
|
-
@query_fasta = SequenceFasta.new(@
|
49
|
+
@query_fasta = SequenceFasta.new(@root,
|
50
|
+
options[:outdir],
|
51
|
+
@options[:input],
|
52
|
+
@options[:meta])
|
48
53
|
|
49
54
|
@with_refence_genome = false
|
55
|
+
@with_db = false
|
50
56
|
if @options.has_key? :refgenome
|
51
57
|
@with_refence_genome = true
|
52
|
-
@ref_genome = SequenceAnnotation.new(@
|
58
|
+
@ref_genome = SequenceAnnotation.new(@root,
|
59
|
+
@options[:outdir],
|
60
|
+
@options[:refgenome],
|
61
|
+
"refGbk")
|
62
|
+
elsif @options[:mergem]
|
63
|
+
@with_db = true
|
64
|
+
@ref_genome = SequenceAnnotation.new(@root,
|
65
|
+
@options[:outdir],
|
66
|
+
@options[:mergem],
|
67
|
+
"db")
|
53
68
|
end
|
54
69
|
|
55
70
|
@with_external_db = false
|
@@ -76,52 +91,12 @@ class BacterialAnnotator
|
|
76
91
|
|
77
92
|
end # end of method
|
78
93
|
|
79
|
-
# Prepare files for the annotation
|
80
|
-
# Will run prodigal on the query and prepare reference genome files
|
81
|
-
def prepare_files_for_annotation
|
82
|
-
print "# Running Prodigal on your genome.."
|
83
|
-
start_time = Time.now
|
84
|
-
@query_fasta.run_prodigal @root, @options[:outdir]
|
85
|
-
end_time = Time.now
|
86
|
-
c_time = Helper.sec2str(end_time - start_time)
|
87
|
-
print "done (#{c_time})\n"
|
88
|
-
if @with_refence_genome
|
89
|
-
@ref_genome.write_cds_to_file @options[:outdir]
|
90
|
-
@ref_genome.write_rna_to_file @options[:outdir]
|
91
|
-
# puts "Successfully loaded #{@ref_genome.gbk.definition}"
|
92
|
-
end
|
93
|
-
end # end of method
|
94
|
-
|
95
|
-
|
96
|
-
def run_reference_synteny_prot
|
97
|
-
|
98
|
-
ref_synteny_prot = SequenceSynteny.new(@query_fasta.annotation_files[:proteins], @ref_genome.cds_file,
|
99
|
-
"Prot-Ref", @options[:pidentity], @options[:pcoverage], "prot")
|
100
|
-
|
101
|
-
print "# Running alignment with Reference Genome CDS (blat).."
|
102
|
-
start_time = Time.now
|
103
|
-
ref_synteny_prot.run_blat @root, @options[:outdir]
|
104
|
-
end_time = Time.now
|
105
|
-
c_time = Helper.sec2str(end_time - start_time)
|
106
|
-
print "done (#{c_time})\n"
|
107
|
-
|
108
|
-
ref_synteny_prot.extract_hits :refgenome
|
109
|
-
|
110
|
-
ref_synteny_prot.query_sequences.each do |k,v|
|
111
|
-
if v.has_key? :homology
|
112
|
-
@contig_annotations_cds[v[:contig]] = [] if ! @contig_annotations_cds.has_key? v[:contig]
|
113
|
-
@contig_annotations_cds[v[:contig]] << k
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
ref_synteny_prot
|
118
|
-
|
119
|
-
end
|
120
|
-
|
121
94
|
|
122
95
|
# run_alignment of reference genome proteins and the query
|
123
96
|
def run_annotation
|
124
97
|
|
98
|
+
prepare_files_for_annotation
|
99
|
+
|
125
100
|
# process reference genome synteny
|
126
101
|
if @with_refence_genome # Annotation with the Reference Genome
|
127
102
|
|
@@ -153,23 +128,69 @@ class BacterialAnnotator
|
|
153
128
|
dump_ref_synteny_to_file
|
154
129
|
|
155
130
|
# run RNA annotation
|
156
|
-
@rna_synteny = SequenceSynteny.new(@
|
157
|
-
|
131
|
+
@rna_synteny = SequenceSynteny.new(@root,
|
132
|
+
@options[:outdir],
|
133
|
+
@query_fasta.fasta_file,
|
134
|
+
@ref_genome.rna_file,
|
135
|
+
"RNA-Ref",
|
136
|
+
@options[:pidentity],
|
137
|
+
@options[:pcoverage],
|
138
|
+
"dna")
|
139
|
+
|
158
140
|
print "# Running alignment with Reference Genome RNA (blat).."
|
159
141
|
start_time = Time.now
|
160
|
-
@rna_synteny.run_blat
|
142
|
+
@rna_synteny.run_blat
|
161
143
|
end_time = Time.now
|
162
144
|
c_time = Helper.sec2str(end_time-start_time)
|
163
145
|
print "done (#{c_time})\n"
|
146
|
+
|
147
|
+
# # takes too long
|
148
|
+
# print "# Running alignment with Reference Genome RNA (fasta36).."
|
149
|
+
# start_time = Time.now
|
150
|
+
# @rna_synteny.run_fasta36
|
151
|
+
# end_time = Time.now
|
152
|
+
# c_time = Helper.sec2str(end_time-start_time)
|
153
|
+
# print "done (#{c_time})\n"
|
154
|
+
|
164
155
|
@rna_synteny.extract_hits_dna :rna
|
165
156
|
@contig_annotations_rna = {}
|
166
157
|
@query_fasta.annotation_files[:contigs].each_with_index do |contig, contig_index|
|
167
158
|
@contig_annotations_rna[contig] = @rna_synteny.get_annotation_for_contig contig
|
168
159
|
end
|
169
160
|
|
161
|
+
|
162
|
+
elsif @with_db
|
163
|
+
|
164
|
+
@prot_synteny_refgenome = run_mergem_synteny_prot
|
165
|
+
# iterate over each contig
|
166
|
+
# discard short contig
|
167
|
+
# cumulate statistics of homolog CDS
|
168
|
+
@query_fasta.annotation_files[:contigs].each_with_index do |contig, contig_index|
|
169
|
+
|
170
|
+
# Skip short contigs
|
171
|
+
if @query_fasta.annotation_files[:contigs_length][contig_index] < @minlength
|
172
|
+
@annotation_stats[:short_contigs] << contig
|
173
|
+
next
|
174
|
+
end
|
175
|
+
|
176
|
+
remaining_cds = cumulate_annotation_stats_reference contig
|
177
|
+
|
178
|
+
if remaining_cds != []
|
179
|
+
@contig_foreign_cds[contig] = remaining_cds
|
180
|
+
end
|
181
|
+
|
182
|
+
end
|
183
|
+
|
184
|
+
# dump foreign proteins to file
|
185
|
+
foreign_cds_file = dump_cds
|
186
|
+
|
187
|
+
# dump reference CDS synteny to file
|
188
|
+
dump_ref_synteny_to_file
|
189
|
+
|
190
|
+
|
170
191
|
else # no reference genome
|
171
192
|
|
172
|
-
# no reference genome .. will process all the CDS
|
193
|
+
# no reference genome .. will process all the CDS as foreign for the external db
|
173
194
|
foreign_cds_file = @query_fasta.annotation_files[:proteins]
|
174
195
|
|
175
196
|
end
|
@@ -187,6 +208,99 @@ class BacterialAnnotator
|
|
187
208
|
end # end of method
|
188
209
|
|
189
210
|
|
211
|
+
# Prepare files for the annotation
|
212
|
+
# Runs prodigal on the query and prints how long it took
|
213
|
+
# Run Prodigal on the query (via @query_fasta.run_prodigal) and print a
# progress line with the elapsed time formatted by Helper.sec2str.
#
# The duration is taken from the monotonic clock rather than Time.now, so a
# wall-clock adjustment during the run cannot produce a wrong or negative
# elapsed time.
def prepare_files_for_annotation
  print "# Running Prodigal on your genome.."
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  @query_fasta.run_prodigal
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  c_time = Helper.sec2str(elapsed)
  print "done (#{c_time})\n"
end # end of method
|
221
|
+
|
222
|
+
|
223
|
+
# Protein synteny step used when annotating against a database
# (the @with_db branch of run_annotation).
#
# This method used to be a statement-for-statement copy of
# run_reference_synteny_prot: same SequenceSynteny arguments built from
# @ref_genome.cds_file, same diamond run, same extract_hits :refgenome and the
# same filling of @contig_annotations_cds. It now delegates to that method so
# the two code paths cannot drift apart by accident.
#
# Returns the SequenceSynteny object returned by run_reference_synteny_prot.
def run_mergem_synteny_prot
  run_reference_synteny_prot
end
|
255
|
+
|
256
|
+
|
257
|
+
|
258
|
+
# Align the query proteins against the reference CDS and record which query
# proteins found a homolog, grouped by contig.
#
# Steps:
# 1. build a SequenceSynteny ("Prot-Ref", type "prot") from
#    @query_fasta.annotation_files[:proteins] and @ref_genome.cds_file, using
#    @options[:outdir], @options[:pidentity] and @options[:pcoverage];
# 2. run diamond on it, printing a progress line with the elapsed time
#    (measured on the monotonic clock so wall-clock changes cannot skew it);
# 3. call extract_hits :refgenome;
# 4. for every query sequence that carries a :homology key, append its name
#    to @contig_annotations_cds[<its :contig>].
#
# Returns the SequenceSynteny object.
def run_reference_synteny_prot
  ref_synteny_prot = SequenceSynteny.new(@root,
                                         @options[:outdir],
                                         @query_fasta.annotation_files[:proteins],
                                         @ref_genome.cds_file,
                                         "Prot-Ref",
                                         @options[:pidentity],
                                         @options[:pcoverage],
                                         "prot")

  print "# Running alignment with Reference Genome CDS (diamond).."
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  ref_synteny_prot.run_diamond
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  c_time = Helper.sec2str(elapsed)
  print "done (#{c_time})\n"

  ref_synteny_prot.extract_hits :refgenome

  ref_synteny_prot.query_sequences.each do |prot_name, info|
    next unless info.has_key? :homology
    # NOTE(review): info[:contig] may be nil if the sequence name has no "_";
    # such hits are grouped under a nil key, as before - confirm this is intended.
    (@contig_annotations_cds[info[:contig]] ||= []) << prot_name
  end

  ref_synteny_prot
end
|
302
|
+
|
303
|
+
|
190
304
|
# Finishing the annotation of the remaining CDS
|
191
305
|
def finish_annotation remaining_cds_file
|
192
306
|
|
@@ -194,15 +308,25 @@ class BacterialAnnotator
|
|
194
308
|
if @options.has_key? :external_db # from an external DB
|
195
309
|
|
196
310
|
db_file = @options[:external_db]
|
197
|
-
ref_cds =
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
311
|
+
ref_cds = SequenceAnnotation.new(@root,
|
312
|
+
@options[:outdir],
|
313
|
+
db_file,
|
314
|
+
"fasta")
|
315
|
+
|
316
|
+
# ref_cds = extract_externaldb_prot_info db_file
|
317
|
+
|
318
|
+
@externaldb_synteny = SequenceSynteny.new(@root,
|
319
|
+
@options[:outdir],
|
320
|
+
remaining_cds_file,
|
321
|
+
db_file,
|
322
|
+
"Prot-ExternalDB",
|
323
|
+
@options[:pidentity],
|
324
|
+
@options[:pcoverage],
|
325
|
+
"prot")
|
202
326
|
|
203
327
|
print "# Running BLAT alignment with External Database.."
|
204
328
|
start_time = Time.now
|
205
|
-
@externaldb_synteny.run_blat
|
329
|
+
@externaldb_synteny.run_blat
|
206
330
|
end_time = Time.now
|
207
331
|
c_time = Helper.sec2str(end_time-start_time)
|
208
332
|
print "done (#{c_time})\n"
|
@@ -228,18 +352,18 @@ class BacterialAnnotator
|
|
228
352
|
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
|
229
353
|
cov_query = (v[:homology][:cov_query]*100).round(2)
|
230
354
|
cov_subject = (v[:homology][:cov_subject]*100).round(2)
|
231
|
-
note = "Protein homology (AA identity: #{v[:homology][:pId]}%; coverage (q,s): #{cov_query}%,#{cov_subject}%) with #{ref_cds[hit_gi][:prot_id]}"
|
232
|
-
inference = "similar to AA sequence:#{ref_cds[hit_gi][:db_source]}:#{ref_cds[hit_gi][:prot_id]}"
|
355
|
+
note = "Protein homology (AA identity: #{v[:homology][:pId]}%; coverage (q,s): #{cov_query}%,#{cov_subject}%) with #{ref_cds.coding_seq[hit_gi][:prot_id]}"
|
356
|
+
inference = "similar to AA sequence:#{ref_cds.coding_seq[hit_gi][:db_source]}:#{ref_cds.coding_seq[hit_gi][:prot_id]}"
|
233
357
|
|
234
|
-
if ref_cds[hit_gi][:org] != ""
|
235
|
-
note += " from #{ref_cds[hit_gi][:org]}"
|
358
|
+
if ref_cds.coding_seq[hit_gi][:org] != ""
|
359
|
+
note += " from #{ref_cds.coding_seq[hit_gi][:org]}"
|
236
360
|
end
|
237
361
|
|
238
362
|
@contig_annotations_externaldb[contig_of_protein][v[:homology][:hits][0]] = {
|
239
|
-
product: ref_cds[hit_gi][:product],
|
363
|
+
product: ref_cds.coding_seq[hit_gi][:product],
|
240
364
|
feature: "cds",
|
241
365
|
gene: nil,
|
242
|
-
prot_id: ref_cds[hit_gi][:prot_id],
|
366
|
+
prot_id: ref_cds.coding_seq[hit_gi][:prot_id],
|
243
367
|
locustag: nil,
|
244
368
|
note: note,
|
245
369
|
inference: inference
|
@@ -260,7 +384,10 @@ class BacterialAnnotator
|
|
260
384
|
@contig_annotations_cds.each do |contig, contig_prots|
|
261
385
|
|
262
386
|
gbk_path = @query_fasta.annotation_files[:gbk_path]
|
263
|
-
gbk_to_annotate = SequenceAnnotation.new(
|
387
|
+
gbk_to_annotate = SequenceAnnotation.new(@root,
|
388
|
+
"#{gbk_path}",
|
389
|
+
"#{gbk_path}/#{contig}.gbk",
|
390
|
+
"newGbk")
|
264
391
|
|
265
392
|
if @with_external_db and @with_refence_genome
|
266
393
|
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
@@ -273,6 +400,11 @@ class BacterialAnnotator
|
|
273
400
|
@externaldb_synteny.query_sequences,
|
274
401
|
@contig_annotations_externaldb[contig]
|
275
402
|
)
|
403
|
+
elsif @with_db
|
404
|
+
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
405
|
+
@prot_synteny_refgenome.query_sequences,
|
406
|
+
@ref_genome.coding_seq
|
407
|
+
)
|
276
408
|
else
|
277
409
|
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
278
410
|
@prot_synteny_refgenome.query_sequences,
|
@@ -286,7 +418,7 @@ class BacterialAnnotator
|
|
286
418
|
gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
|
287
419
|
end
|
288
420
|
|
289
|
-
gbk_to_annotate.save_genbank_to_file
|
421
|
+
gbk_to_annotate.save_genbank_to_file
|
290
422
|
|
291
423
|
end
|
292
424
|
end_time = Time.now
|
@@ -579,9 +711,14 @@ class BacterialAnnotator
|
|
579
711
|
partial = ref_annotated[ref_v[:protId]][:partial]
|
580
712
|
end
|
581
713
|
|
714
|
+
_locus_tag = ref_v[:locustag] || ""
|
715
|
+
_seq_len = "NA"
|
716
|
+
# _seq_len = ref_v[:bioseq].seq.length.to_s if ! ref_v[:bioseq].nil?
|
717
|
+
_seq_len = ref_v[:length].to_s if ! ref_v[:length].nil?
|
718
|
+
|
582
719
|
synteny_file.write(ref_v[:protId])
|
583
|
-
synteny_file.write("\t"+
|
584
|
-
synteny_file.write("\t"+
|
720
|
+
synteny_file.write("\t"+_locus_tag)
|
721
|
+
synteny_file.write("\t"+_seq_len)
|
585
722
|
synteny_file.write("\t"+coverage_ref.to_s)
|
586
723
|
synteny_file.write("\t"+pId.to_s)
|
587
724
|
synteny_file.write("\t"+gene)
|
data/lib/bacterial-comparator.rb
CHANGED
@@ -34,15 +34,31 @@ class BacterialComparator
|
|
34
34
|
min_pid = min_pid/100
|
35
35
|
end
|
36
36
|
|
37
|
+
@aln_opt = options[:align].downcase
|
38
|
+
@run_phylo = 0
|
39
|
+
if options[:phylogeny] == 1
|
40
|
+
@bootstrap = options[:bootstrap]
|
41
|
+
@run_phylo = 1
|
42
|
+
end
|
43
|
+
|
37
44
|
@ref_prot = get_ref_prot
|
38
45
|
@synteny = read_prot_synteny
|
39
46
|
@stats = extract_syntenic_fasta min_cov, min_pid
|
40
47
|
|
41
48
|
end
|
42
49
|
|
50
|
+
|
51
|
+
# Drive the comparison: the MAFFT alignments are always run; the RAxML
# phylogeny is run afterwards unless @run_phylo is 0.
def run_comparison
  run_mafft_aln
  return if @run_phylo == 0

  run_raxml_phylo
end
|
57
|
+
|
58
|
+
|
43
59
|
def read_prot_synteny
|
44
60
|
|
45
|
-
|
61
|
+
puts "# Reading genome synteny files START.."
|
46
62
|
start_time = Time.now
|
47
63
|
synteny = {}
|
48
64
|
@genomes_list.each do |g|
|
@@ -65,7 +81,8 @@ class BacterialComparator
|
|
65
81
|
end
|
66
82
|
end_time = Time.now
|
67
83
|
c_time = Helper.sec2str(end_time-start_time)
|
68
|
-
|
84
|
+
|
85
|
+
puts "# Reading genome synteny files [DONE] (in #{c_time})"
|
69
86
|
|
70
87
|
synteny
|
71
88
|
|
@@ -146,7 +163,7 @@ class BacterialComparator
|
|
146
163
|
# extract and dump multifasta for syntenic genes and proteins
|
147
164
|
def extract_syntenic_fasta min_cov, min_pid
|
148
165
|
|
149
|
-
|
166
|
+
puts "# Extracting Proteins and Genes multifasta START.."
|
150
167
|
start_time = Time.now
|
151
168
|
|
152
169
|
nb_of_syntenic = 0
|
@@ -216,14 +233,13 @@ class BacterialComparator
|
|
216
233
|
|
217
234
|
end_time = Time.now
|
218
235
|
c_time = Helper.sec2str(end_time-start_time)
|
219
|
-
|
236
|
+
puts "# Extracting Proteins and Genes multifasta [DONE] (in #{c_time})"
|
220
237
|
|
221
238
|
stats[:nb_of_syntenic] = nb_of_syntenic
|
222
239
|
#puts " Syntenic genes : " + nb_of_syntenic.to_s + " / " + @ref_prot.length.to_s
|
223
240
|
|
224
241
|
end
|
225
242
|
|
226
|
-
|
227
243
|
def mafft_align f
|
228
244
|
|
229
245
|
trying = 0
|
@@ -252,7 +268,7 @@ class BacterialComparator
|
|
252
268
|
|
253
269
|
def mafft_align_all_pep
|
254
270
|
|
255
|
-
|
271
|
+
puts "# Sequence alignments - individual proteins a.a. (MAFFT) START.."
|
256
272
|
start_time = Time.now
|
257
273
|
|
258
274
|
ori_dir = Dir.pwd
|
@@ -277,7 +293,7 @@ class BacterialComparator
|
|
277
293
|
|
278
294
|
end_time = Time.now
|
279
295
|
c_time = Helper.sec2str(end_time-start_time)
|
280
|
-
|
296
|
+
puts "# Sequence alignments - individual proteins a.a. (MAFFT) [DONE] (in #{c_time})"
|
281
297
|
|
282
298
|
# FIXME ugly hack to find out the reference genome
|
283
299
|
ref_id = Dir["#{ori_dir}/#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
|
@@ -290,7 +306,7 @@ class BacterialComparator
|
|
290
306
|
|
291
307
|
def mafft_align_all_dna
|
292
308
|
|
293
|
-
|
309
|
+
puts "# Sequence alignments - individual genes dna (MAFFT) START.."
|
294
310
|
start_time = Time.now
|
295
311
|
|
296
312
|
ori_dir = Dir.pwd
|
@@ -313,12 +329,12 @@ class BacterialComparator
|
|
313
329
|
}
|
314
330
|
end
|
315
331
|
|
316
|
-
# ugly hack to find out the reference genome
|
332
|
+
# ugly hack to find out the reference genome FIXME
|
317
333
|
ref_id = Dir["#{ori_dir}/#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
|
318
334
|
|
319
335
|
end_time = Time.now
|
320
336
|
c_time = Helper.sec2str(end_time-start_time)
|
321
|
-
|
337
|
+
puts "# Sequence alignments - individual genes dna (MAFFT) [DONE] (in #{c_time})"
|
322
338
|
|
323
339
|
concat_alignments "align-genes-dna.all.fasta", ref_id
|
324
340
|
|
@@ -377,21 +393,21 @@ class BacterialComparator
|
|
377
393
|
|
378
394
|
end
|
379
395
|
|
380
|
-
def
|
396
|
+
# Run the MAFFT alignments selected by @aln_opt:
#   "both" -> proteins, then genes
#   "prot" -> proteins only
#   "dna"  -> genes only
# Any other value runs nothing.
def run_mafft_aln
  case @aln_opt
  when "both"
    mafft_align_all_pep
    mafft_align_all_dna
  when "prot"
    mafft_align_all_pep
  when "dna"
    mafft_align_all_dna
  end
end
|
392
408
|
|
393
409
|
def raxml_tree_dna bt
|
394
|
-
|
410
|
+
puts "# Genes DNA tree creation (RAXML) START.."
|
395
411
|
start_time = Time.now
|
396
412
|
ori_dir = Dir.pwd
|
397
413
|
Dir.chdir(@outdir)
|
@@ -405,11 +421,11 @@ class BacterialComparator
|
|
405
421
|
Dir.chdir(ori_dir)
|
406
422
|
end_time = Time.now
|
407
423
|
c_time = Helper.sec2str(end_time-start_time)
|
408
|
-
|
424
|
+
puts "# Genes DNA tree creation (RAXML) [DONE] (in #{c_time})"
|
409
425
|
end
|
410
426
|
|
411
427
|
def raxml_tree_pep bt
|
412
|
-
|
428
|
+
puts "# Proteins AA tree creation (RAXML) START.."
|
413
429
|
start_time = Time.now
|
414
430
|
ori_dir = Dir.pwd
|
415
431
|
Dir.chdir(@outdir)
|
@@ -423,18 +439,18 @@ class BacterialComparator
|
|
423
439
|
Dir.chdir(ori_dir)
|
424
440
|
end_time = Time.now
|
425
441
|
c_time = Helper.sec2str(end_time-start_time)
|
426
|
-
|
442
|
+
puts "# Proteins AA tree creation (RAXML) [DONE] (in #{c_time})"
|
427
443
|
end
|
428
444
|
|
429
|
-
def
|
445
|
+
# Build the RAxML trees selected by @aln_opt, passing @bootstrap to each
# tree builder:
#   "both" -> DNA tree, then protein tree
#   "prot" -> protein tree only
#   "dna"  -> DNA tree only
# Any other value builds nothing.
def run_raxml_phylo
  case @aln_opt
  when "both"
    raxml_tree_dna @bootstrap
    raxml_tree_pep @bootstrap
  when "prot"
    raxml_tree_pep @bootstrap
  when "dna"
    raxml_tree_dna @bootstrap
  end
end
|