bacterial-annotator 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/ba_prodigal +1 -1
- data/bin/bacterial-annotator +13 -14
- data/lib/bacterial-annotator/{genbank-manip.rb → sequence-annotation.rb} +128 -16
- data/lib/bacterial-annotator/{fasta-manip.rb → sequence-fasta.rb} +32 -23
- data/lib/bacterial-annotator/{synteny-manip.rb → sequence-synteny.rb} +128 -8
- data/lib/bacterial-annotator.rb +211 -140
- data/lib/bacterial-comparator.rb +1 -0
- metadata +5 -6
- data/lib/bacterial-annotator/remote-ncbi.rb +0 -201
data/lib/bacterial-annotator.rb
CHANGED
@@ -9,95 +9,130 @@
|
|
9
9
|
require 'bio'
|
10
10
|
require 'fileutils'
|
11
11
|
|
12
|
-
require 'bacterial-annotator/
|
13
|
-
require 'bacterial-annotator/
|
14
|
-
require 'bacterial-annotator/synteny-manip'
|
15
|
-
|
12
|
+
require 'bacterial-annotator/sequence-fasta'
|
13
|
+
require 'bacterial-annotator/sequence-annotation'
|
14
|
+
require 'bacterial-annotator/sequence-synteny'
|
15
|
+
|
16
16
|
|
17
17
|
class BacterialAnnotator
|
18
18
|
|
19
19
|
# Initialize BacterialAnnotator
|
20
|
-
# options
|
20
|
+
# options, ROOT (path)
|
21
21
|
def initialize options, root
|
22
22
|
|
23
23
|
@root = root
|
24
24
|
@options = options
|
25
|
-
@outdir = @options[:outdir]
|
26
25
|
|
27
26
|
@minlength = @options[:minlength].to_i
|
28
|
-
@
|
29
|
-
@pidentity = @pidentity
|
27
|
+
@options[:minlength] = @options[:minlength].to_i
|
28
|
+
@options[:pidentity] = @options[:pidentity].to_f
|
29
|
+
@options[:pidentity] = @options[:pidentity] * 100 if @options[:pidentity] <= 1.00
|
30
|
+
@options[:pcoverage] = @options[:pcoverage].to_f
|
31
|
+
@options[:pcoverage] = @options[:pcoverage] / 100 if @options[:pcoverage] > 1.00
|
30
32
|
|
31
|
-
if File.exists? (@outdir)
|
33
|
+
if File.exists? (@options[:outdir])
|
32
34
|
if ! options.has_key? :force
|
33
35
|
abort "Output directory already exist ! Choose another one or use -f to overwrite"
|
34
36
|
else
|
35
|
-
puts "Overwriting output directory #{@outdir}"
|
36
|
-
FileUtils.remove_dir(@outdir, :force=>true)
|
37
|
+
puts "Overwriting output directory #{@options[:outdir]}"
|
38
|
+
FileUtils.remove_dir(@options[:outdir], :force=>true)
|
37
39
|
end
|
38
40
|
end
|
39
|
-
Dir.mkdir(@outdir)
|
41
|
+
Dir.mkdir(@options[:outdir])
|
40
42
|
|
41
|
-
@
|
43
|
+
@query_fasta = SequenceFasta.new(@options[:input], @options[:meta])
|
42
44
|
|
43
45
|
@with_refence_genome = false
|
44
46
|
if @options.has_key? :refgenome
|
45
47
|
@with_refence_genome = true
|
46
|
-
@
|
48
|
+
@ref_genome = SequenceAnnotation.new(@options[:refgenome], @options[:outdir])
|
47
49
|
end
|
48
50
|
|
51
|
+
@with_external_db = false
|
52
|
+
@with_external_db = true if @options.has_key? :external_db
|
53
|
+
|
49
54
|
@prot_synteny = nil
|
50
|
-
@annotation_stats = {
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
55
|
+
@annotation_stats = {
|
56
|
+
by_contigs: {},
|
57
|
+
annotated_cds: 0,
|
58
|
+
flagged_cds: [],
|
59
|
+
total_cds: 0,
|
60
|
+
foreign_contigs: [],
|
61
|
+
synteny_contigs: [],
|
62
|
+
short_contigs: []
|
63
|
+
}
|
56
64
|
|
57
65
|
@contig_foreign_cds = {}
|
66
|
+
|
58
67
|
@contig_annotations = {}
|
59
68
|
|
69
|
+
@contig_annotations_externaldb = {}
|
70
|
+
|
71
|
+
@contig_annotations_cds = {}
|
72
|
+
|
60
73
|
end # end of method
|
61
74
|
|
62
75
|
# Prepare files for the annotation
|
63
76
|
# Will run prodigal on the query and prepare reference genome files
|
64
77
|
def prepare_files_for_annotation
|
65
78
|
puts "\nRunning Prodigal on your genome.."
|
66
|
-
@
|
79
|
+
@query_fasta.run_prodigal @root, @options[:outdir]
|
67
80
|
puts "Prodigal done."
|
68
81
|
if @with_refence_genome
|
69
|
-
@
|
70
|
-
@
|
71
|
-
puts "Successfully loaded #{@
|
82
|
+
@ref_genome.write_cds_to_file @options[:outdir]
|
83
|
+
@ref_genome.write_rna_to_file @options[:outdir]
|
84
|
+
puts "Successfully loaded #{@ref_genome.gbk.definition}"
|
72
85
|
end
|
73
86
|
end # end of method
|
74
87
|
|
88
|
+
|
89
|
+
def run_reference_synteny_prot
|
90
|
+
|
91
|
+
ref_synteny_prot = SequenceSynteny.new(@query_fasta.annotation_files[:proteins], @ref_genome.cds_file,
|
92
|
+
"Prot-Ref", @options[:pidentity], @options[:pcoverage], "prot")
|
93
|
+
|
94
|
+
ref_synteny_prot.run_blat @root, @options[:outdir]
|
95
|
+
|
96
|
+
ref_synteny_prot.extract_hits :refgenome
|
97
|
+
|
98
|
+
fdebug = File.open("debug-synteny.tsv", "w")
|
99
|
+
|
100
|
+
ref_synteny_prot.query_sequences.each do |k,v|
|
101
|
+
if v.has_key? :homology
|
102
|
+
@contig_annotations_cds[v[:contig]] = [] if ! @contig_annotations_cds.has_key? v[:contig]
|
103
|
+
@contig_annotations_cds[v[:contig]] << k
|
104
|
+
fdebug.write("#{v[:contig]}\t#{k}\t#{v[:homology][:pId]}\t#{v[:homology][:cov_query]}\t#{v[:homology][:cov_subject]}\t#{v[:homology][:hits].join(',')}\t#{@ref_genome.coding_seq[v[:homology][:hits][0]][:locustag]}\t#{@ref_genome.coding_seq[v[:homology][:hits][0]][:product]}\t#{v[:homology][:assert_cutoff].join(',')}\n")
|
105
|
+
else
|
106
|
+
fdebug.write("#{v[:contig]} #{k} NONE...\n")
|
107
|
+
end
|
108
|
+
end
|
109
|
+
fdebug.close
|
110
|
+
|
111
|
+
ref_synteny_prot
|
112
|
+
|
113
|
+
end
|
114
|
+
|
115
|
+
|
75
116
|
# run_alignment of reference genome proteins and the query
|
76
117
|
def run_annotation
|
77
118
|
|
78
119
|
# process reference genome synteny
|
79
120
|
if @with_refence_genome # Annotation with the Reference Genome
|
80
121
|
|
81
|
-
|
82
|
-
puts "\nRunning BLAT alignment with Reference Genome CDS.."
|
83
|
-
@prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity, "prot")
|
84
|
-
@prot_synteny.run_blat @root, @outdir
|
85
|
-
@prot_synteny.extract_hits_prodigal :refgenome
|
122
|
+
@prot_synteny_refgenome = run_reference_synteny_prot
|
86
123
|
|
87
|
-
|
124
|
+
# iterate over each contig
|
125
|
+
# discard short contig
|
126
|
+
# cumulate statistics of homolog CDS
|
127
|
+
@query_fasta.annotation_files[:contigs].each_with_index do |contig, contig_index|
|
88
128
|
|
89
129
|
# Skip short contigs
|
90
|
-
if @
|
130
|
+
if @query_fasta.annotation_files[:contigs_length][contig_index] < @minlength
|
91
131
|
@annotation_stats[:short_contigs] << contig
|
92
132
|
next
|
93
133
|
end
|
94
134
|
|
95
|
-
|
96
|
-
# contig_to_annotate = contig_prots[0].split("_")[0..-2].join("_")
|
97
|
-
# contig_prot_annotations = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
|
98
|
-
@contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig, contig_prots, @refgenome.coding_seq
|
99
|
-
|
100
|
-
remaining_cds = cumulate_annotation_stats_reference contig, @contig_annotations[contig]
|
135
|
+
remaining_cds = cumulate_annotation_stats_reference contig
|
101
136
|
|
102
137
|
if ! remaining_cds.empty?
|
103
138
|
@contig_foreign_cds[contig] = remaining_cds
|
@@ -113,18 +148,19 @@ class BacterialAnnotator
|
|
113
148
|
|
114
149
|
# run RNA annotation
|
115
150
|
puts "\nRunning BLAT alignment with Reference Genome RNA.."
|
116
|
-
@rna_synteny =
|
117
|
-
|
151
|
+
@rna_synteny = SequenceSynteny.new(@query_fasta.fasta_file, @ref_genome.rna_file,
|
152
|
+
"RNA-Ref", @options[:pidentity], @options[:pcoverage], "dna")
|
153
|
+
@rna_synteny.run_blat @root, @options[:outdir]
|
118
154
|
@rna_synteny.extract_hits_dna :rna
|
119
155
|
@contig_annotations_rna = {}
|
120
|
-
@
|
156
|
+
@query_fasta.annotation_files[:contigs].each_with_index do |contig, contig_index|
|
121
157
|
@contig_annotations_rna[contig] = @rna_synteny.get_annotation_for_contig contig
|
122
158
|
end
|
123
159
|
|
124
160
|
else # no reference genome
|
125
161
|
|
126
162
|
# no reference genome .. will process all the CDS
|
127
|
-
foreign_cds_file = @
|
163
|
+
foreign_cds_file = @query_fasta.annotation_files[:proteins]
|
128
164
|
|
129
165
|
end
|
130
166
|
|
@@ -135,7 +171,7 @@ class BacterialAnnotator
|
|
135
171
|
parse_genbank_files
|
136
172
|
|
137
173
|
puts "\nPrinting Statistics.."
|
138
|
-
print_stats "#{@outdir}
|
174
|
+
print_stats "#{@options[:outdir]}"
|
139
175
|
|
140
176
|
|
141
177
|
end # end of method
|
@@ -150,84 +186,48 @@ class BacterialAnnotator
|
|
150
186
|
db_file = @options[:external_db]
|
151
187
|
ref_cds = extract_externaldb_prot_info db_file
|
152
188
|
|
153
|
-
externaldb_synteny =
|
189
|
+
@externaldb_synteny = SequenceSynteny.new(remaining_cds_file, db_file,
|
190
|
+
"Prot-ExternalDB", @options[:pidentity],
|
191
|
+
@options[:pcoverage], "prot")
|
192
|
+
|
154
193
|
puts "\nRunning BLAT alignment with External Database.."
|
155
|
-
externaldb_synteny.run_blat @root, @outdir
|
156
|
-
externaldb_synteny.
|
194
|
+
@externaldb_synteny.run_blat @root, @options[:outdir]
|
195
|
+
@externaldb_synteny.extract_hits :externaldb
|
196
|
+
|
197
|
+
@externaldb_synteny.query_sequences.each do |k, v|
|
157
198
|
|
158
|
-
externaldb_synteny.aln_hits.each do |k,v|
|
159
199
|
contig_of_protein = k.split("_")[0..-2].join("_")
|
160
200
|
|
161
|
-
if ! @
|
162
|
-
@
|
201
|
+
if ! @contig_annotations_externaldb.has_key? contig_of_protein
|
202
|
+
@contig_annotations_externaldb[contig_of_protein] = {}
|
163
203
|
end
|
164
204
|
|
165
|
-
|
205
|
+
next if ! v.has_key? :homology
|
206
|
+
|
207
|
+
@contig_annotations_cds[contig_of_protein] << k
|
208
|
+
|
209
|
+
hit_gi = v[:homology][:hits][0]
|
166
210
|
|
167
211
|
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
|
168
|
-
|
212
|
+
cov_query = (v[:homology][:cov_query]*100).round(2)
|
213
|
+
cov_subject = (v[:homology][:cov_subject]*100).round(2)
|
214
|
+
note = "Protein homology (AA identity: #{v[:homology][:pId]}%; coverage (q,s): #{cov_query}%,#{cov_subject}%) with #{ref_cds[hit_gi][:prot_id]}"
|
215
|
+
inference = "similar to AA sequence:#{ref_cds[hit_gi][:db_source]}:#{ref_cds[hit_gi][:prot_id]}"
|
169
216
|
|
170
217
|
if ref_cds[hit_gi][:org] != ""
|
171
218
|
note += " from #{ref_cds[hit_gi][:org]}"
|
172
219
|
end
|
173
|
-
@contig_annotations[contig_of_protein][k] = {product: ref_cds[hit_gi][:product],
|
174
|
-
feature: "cds",
|
175
|
-
gene: nil,
|
176
|
-
locustag: nil,
|
177
|
-
note: note}
|
178
|
-
|
179
|
-
end
|
180
|
-
|
181
|
-
|
182
|
-
elsif @options.has_key? :remote_db # from a remote DB
|
183
|
-
|
184
|
-
# do it by chunk to avoid NCBI CPU exceeding limit
|
185
|
-
cds_files = split_remaining_cds_file remaining_cds_file
|
186
|
-
@remotedb = @options[:remote_db]
|
187
220
|
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
cds_file,
|
198
|
-
"#{cds_file}.#{@remotedb}.xml",
|
199
|
-
@pidentity)
|
200
|
-
rescue
|
201
|
-
valid = false
|
202
|
-
end
|
203
|
-
|
204
|
-
# ncbi blast didn't worked out
|
205
|
-
if !valid
|
206
|
-
puts "Problem NCBI blast for foreign proteins"
|
207
|
-
else
|
208
|
-
ncbiblast.extract_blast_results
|
209
|
-
if ! ncbiblast.aln_hits
|
210
|
-
puts "Didn't produce the annotation for #{cds_file}"
|
211
|
-
next
|
212
|
-
end
|
213
|
-
ncbiblast.aln_hits.each do |k,v|
|
214
|
-
contig_of_protein = k.split("_")[0..-2].join("_")
|
215
|
-
if ! @contig_annotations.has_key? contig_of_protein
|
216
|
-
@contig_annotations[contig_of_protein] = {}
|
217
|
-
end
|
218
|
-
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:gi]}"
|
219
|
-
note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:accession]}"
|
220
|
-
if v[:hits][0][:org] != ""
|
221
|
-
note += " from #{v[:hits][0][:org]}"
|
222
|
-
end
|
223
|
-
@contig_annotations[contig_of_protein][k] = {product: v[:hits][0][:product],
|
224
|
-
feature: "cds",
|
225
|
-
gene: nil,
|
226
|
-
locustag: nil,
|
227
|
-
note: note}
|
228
|
-
end
|
221
|
+
@contig_annotations_externaldb[contig_of_protein][v[:homology][:hits][0]] = {
|
222
|
+
product: ref_cds[hit_gi][:product],
|
223
|
+
feature: "cds",
|
224
|
+
gene: nil,
|
225
|
+
prot_id: ref_cds[hit_gi][:prot_id],
|
226
|
+
locustag: nil,
|
227
|
+
note: note,
|
228
|
+
inference: inference
|
229
|
+
}
|
229
230
|
|
230
|
-
end
|
231
231
|
|
232
232
|
end
|
233
233
|
|
@@ -240,12 +240,24 @@ class BacterialAnnotator
|
|
240
240
|
def parse_genbank_files
|
241
241
|
|
242
242
|
puts "\nParsing annotation into genbank files.."
|
243
|
-
@
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
243
|
+
@contig_annotations_cds.each do |contig, contig_prots|
|
244
|
+
|
245
|
+
gbk_path = @query_fasta.annotation_files[:gbk_path]
|
246
|
+
gbk_to_annotate = SequenceAnnotation.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
|
247
|
+
|
248
|
+
if @with_external_db
|
249
|
+
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
250
|
+
(@prot_synteny_refgenome.query_sequences.merge(@externaldb_synteny.query_sequences)),
|
251
|
+
@contig_annotations_externaldb[contig].merge(@ref_genome.coding_seq),
|
252
|
+
@options[:refgenome].gsub(/.gb.*/,"")
|
253
|
+
)
|
254
|
+
else
|
255
|
+
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
256
|
+
@prot_synteny_refgenome.query_sequences,
|
257
|
+
@ref_genome.coding_seq,
|
258
|
+
@options[:refgenome].gsub(/.gb.*/,"")
|
259
|
+
)
|
260
|
+
end
|
249
261
|
|
250
262
|
if @contig_annotations_rna.has_key? contig
|
251
263
|
# puts "RNA annotation"
|
@@ -261,34 +273,65 @@ class BacterialAnnotator
|
|
261
273
|
|
262
274
|
# cumulate the stats for the synteny
|
263
275
|
# return : unannotated cds array
|
264
|
-
def cumulate_annotation_stats_reference contig, contig_prots_ann
|
276
|
+
# def cumulate_annotation_stats_reference contig, contig_prots_ann
|
277
|
+
def cumulate_annotation_stats_reference contig
|
265
278
|
|
266
279
|
remaining_cds = []
|
267
|
-
contig_prots = @
|
280
|
+
contig_prots = @query_fasta.annotation_files[:prot_ids_by_contig][contig]
|
268
281
|
|
269
282
|
@annotation_stats[:total_cds] += contig_prots.length if contig_prots
|
270
|
-
|
271
|
-
|
272
|
-
|
283
|
+
|
284
|
+
# count contig as foreign if no cds homolog in reference genome
|
285
|
+
if @contig_annotations_cds.has_key? contig and
|
286
|
+
@contig_annotations_cds[contig].length > 0
|
287
|
+
@annotation_stats[:synteny_contigs] << contig
|
288
|
+
else
|
289
|
+
@annotation_stats[:foreign_contigs] << contig
|
290
|
+
return
|
291
|
+
end
|
292
|
+
|
293
|
+
contig_prots.each do |prot|
|
294
|
+
|
295
|
+
if @contig_annotations_cds[contig].include? prot
|
296
|
+
|
297
|
+
if @prot_synteny_refgenome.query_sequences[prot].has_key? :homology and
|
298
|
+
@prot_synteny_refgenome.query_sequences[prot][:homology][:hits].length > 0
|
299
|
+
|
300
|
+
assert_sum = @prot_synteny_refgenome.query_sequences[prot][:homology][:assert_cutoff].inject(:+)
|
301
|
+
if assert_sum > 2
|
302
|
+
@annotation_stats[:annotated_cds] += 1
|
303
|
+
else
|
304
|
+
flag = "#{prot}"
|
305
|
+
flag += "\t#{@prot_synteny_refgenome.query_sequences[prot][:homology][:assert_cutoff].join(',')}"
|
306
|
+
flag += "\t#{@prot_synteny_refgenome.query_sequences[prot][:homology][:pId]}"
|
307
|
+
flag += "\t#{(@prot_synteny_refgenome.query_sequences[prot][:homology][:cov_query]*100).round(2)}"
|
308
|
+
flag += "\t#{(@prot_synteny_refgenome.query_sequences[prot][:homology][:cov_subject]*100).round(2)}"
|
309
|
+
@annotation_stats[:flagged_cds] << flag
|
310
|
+
end
|
311
|
+
|
312
|
+
else
|
313
|
+
|
314
|
+
puts "No " + prot
|
315
|
+
|
316
|
+
end
|
317
|
+
|
273
318
|
else
|
274
|
-
|
319
|
+
|
320
|
+
remaining_cds << prot
|
321
|
+
|
275
322
|
end
|
276
|
-
end
|
277
323
|
|
278
|
-
# Annotated Contigs
|
279
|
-
if contig_prots_ann.keys.length < 1
|
280
|
-
@annotation_stats[:foreign_contigs] << contig
|
281
|
-
else
|
282
|
-
@annotation_stats[:synteny_contigs] << contig
|
283
324
|
end
|
284
325
|
|
285
326
|
remaining_cds
|
327
|
+
|
286
328
|
end # end of method
|
287
329
|
|
288
330
|
|
289
331
|
# print statistics to file
|
290
|
-
def print_stats
|
332
|
+
def print_stats file_dir
|
291
333
|
|
334
|
+
file = file_dir + "/Annotation-Stats.txt"
|
292
335
|
total_nb_contigs = @annotation_stats[:foreign_contigs].length +
|
293
336
|
@annotation_stats[:synteny_contigs].length +
|
294
337
|
@annotation_stats[:short_contigs].length
|
@@ -307,25 +350,34 @@ class BacterialAnnotator
|
|
307
350
|
|
308
351
|
fopen.write("#CDS annotations based on reference genomes\n")
|
309
352
|
fopen.write("Annotated CDS :\t\t\t" + @annotation_stats[:annotated_cds].to_s + "\n")
|
353
|
+
fopen.write("Flagged CDS :\t\t\t" + @annotation_stats[:flagged_cds].length.to_s + "\n")
|
310
354
|
fopen.write("Total CDS :\t\t\t" + @annotation_stats[:total_cds].to_s + "\n")
|
311
355
|
fopen.write("% CDS annotated :\t\t" + (p_cds_annotated*100).round(2).to_s + "\n")
|
312
356
|
fopen.write("\n")
|
313
357
|
|
314
358
|
end
|
315
359
|
|
360
|
+
file_flagged_cds = file_dir + "/Prot-flagged.tsv"
|
361
|
+
File.open(file_flagged_cds, "w") do |fopen|
|
362
|
+
fopen.write("CDS locus\tAssertion-CutOff\tAA Identity\tCovQuery(%)\tCovSubject(%)\n")
|
363
|
+
@annotation_stats[:flagged_cds].each do |fcds|
|
364
|
+
fopen.write("#{fcds}\n")
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
316
368
|
end # end of method
|
317
369
|
|
318
370
|
|
319
371
|
# dump cds to file for blast
|
320
372
|
def dump_cds
|
321
373
|
|
322
|
-
cds_outfile = File.open("#{@outdir}/Proteins-foreign.fa","w")
|
374
|
+
cds_outfile = File.open("#{@options[:outdir]}/Proteins-foreign.fa","w")
|
323
375
|
foreign_cds = []
|
324
376
|
@contig_foreign_cds.each_value do |v|
|
325
377
|
foreign_cds.push(*v)
|
326
378
|
end
|
327
379
|
inprot = false
|
328
|
-
File.open(@
|
380
|
+
File.open(@query_fasta.annotation_files[:proteins]) do |fprot|
|
329
381
|
while l=fprot.gets
|
330
382
|
if l[0] == ">"
|
331
383
|
inprot = false
|
@@ -340,7 +392,7 @@ class BacterialAnnotator
|
|
340
392
|
end
|
341
393
|
end
|
342
394
|
cds_outfile.close
|
343
|
-
return "#{@outdir}/Proteins-foreign.fa"
|
395
|
+
return "#{@options[:outdir]}/Proteins-foreign.fa"
|
344
396
|
|
345
397
|
end # end of method
|
346
398
|
|
@@ -363,25 +415,46 @@ class BacterialAnnotator
|
|
363
415
|
if l[0] == ">"
|
364
416
|
|
365
417
|
lA = l.chomp.split("|")
|
366
|
-
key_gi = lA[1]
|
418
|
+
#key_gi = lA[1]
|
419
|
+
key_gi = l.split(" ")[0][1..-1]
|
367
420
|
product_long = lA[-1]
|
368
421
|
|
369
422
|
organism = ""
|
370
423
|
product = ""
|
424
|
+
db_source = "[DBSource]"
|
371
425
|
|
372
426
|
if product_long.include? " [" and product_long.include? "]" # NCBI
|
373
427
|
organism = product_long[/\[.*?\]/]
|
374
428
|
product = product_long.split(" [")[0].strip
|
375
|
-
elsif product_long.include? "OS="
|
429
|
+
elsif product_long.include? "OS=" # Swissprot / TrEMBL
|
376
430
|
product_tmp = product.split("OS=")
|
377
431
|
organism = product_tmp[1].split(/[A-Z][A-Z]=/)[0].strip
|
378
432
|
product = product_tmp[0].strip
|
379
|
-
elsif product_long.include? "[A-Z][A-Z]="
|
433
|
+
elsif product_long.include? "[A-Z][A-Z]=" # NCBI
|
380
434
|
product = product_long.split(/[A-Z][A-Z]=/)[0].strip
|
435
|
+
else
|
436
|
+
product = product_long
|
381
437
|
end
|
438
|
+
|
382
439
|
org = organism.gsub("[","").gsub("]","")
|
440
|
+
|
383
441
|
product.lstrip!
|
384
|
-
|
442
|
+
prot_id = nil
|
443
|
+
|
444
|
+
if key_gi.count("|") == 4
|
445
|
+
if lA[2] == "ref"
|
446
|
+
db_source = "RefSeq"
|
447
|
+
end
|
448
|
+
prot_id = lA[3]
|
449
|
+
elsif key_gi.count("|") == 2
|
450
|
+
if lA[0].include? == "sp" or
|
451
|
+
lA[0].include? == "tr"
|
452
|
+
db_source = "UniProtKB"
|
453
|
+
end
|
454
|
+
prot_id = lA[1]
|
455
|
+
end
|
456
|
+
|
457
|
+
ref_cds[key_gi] = {product: product, org: org, prot_id: prot_id, db_source: db_source}
|
385
458
|
|
386
459
|
end
|
387
460
|
|
@@ -398,7 +471,7 @@ class BacterialAnnotator
|
|
398
471
|
def split_remaining_cds_file file
|
399
472
|
|
400
473
|
cds_files = []
|
401
|
-
outdir = "#{@outdir}/Protein-foreign.split"
|
474
|
+
outdir = "#{@options[:outdir]}/Protein-foreign.split"
|
402
475
|
|
403
476
|
Dir.mkdir(outdir) if ! Dir.exists? outdir
|
404
477
|
|
@@ -429,22 +502,20 @@ class BacterialAnnotator
|
|
429
502
|
|
430
503
|
end # end of method
|
431
504
|
|
432
|
-
# will reference CDS synteny to file
|
505
|
+
# will dump reference CDS synteny to file
|
433
506
|
def dump_ref_synteny_to_file
|
434
507
|
|
435
508
|
# Iterate over each Ref protein and print syntheny
|
436
|
-
synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
|
509
|
+
synteny_file = File.open("#{@options[:outdir]}/Prot-Synteny.tsv","w")
|
437
510
|
synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
|
438
511
|
ref_annotated = {}
|
439
|
-
@contig_annotations.each do |contig,prot_annotations|
|
512
|
+
@contig_annotations.each do |contig, prot_annotations|
|
440
513
|
prot_annotations.each do |key,prot|
|
441
|
-
# p key
|
442
|
-
# p prot
|
443
514
|
ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
|
444
515
|
end
|
445
516
|
end
|
446
517
|
|
447
|
-
@
|
518
|
+
@ref_genome.coding_seq.each do |ref_k, ref_v|
|
448
519
|
|
449
520
|
gene = ""
|
450
521
|
coverage_ref = ""
|
@@ -454,7 +525,7 @@ class BacterialAnnotator
|
|
454
525
|
if ref_annotated[ref_v[:protId]] != nil
|
455
526
|
gene = ref_annotated[ref_v[:protId]][:key]
|
456
527
|
coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
|
457
|
-
query_length = @
|
528
|
+
query_length = @query_fasta.annotation_files[:prot_ids_length][gene]
|
458
529
|
coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
|
459
530
|
pId = ref_annotated[ref_v[:protId]][:pId]
|
460
531
|
end
|
data/lib/bacterial-comparator.rb
CHANGED
@@ -377,6 +377,7 @@ class BacterialComparator
|
|
377
377
|
cmd = system("#{@root}/raxml.linux -T 3 -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.PepTree -m PROTGAMMAAUTO -n PEP_BS_TREE -w #{tree_dir}")
|
378
378
|
cmd = system("ln -s #{tree_dir}/RAxML_bipartitionsBranchLabels.PEP_BS_TREE #{tree_dir}/../")
|
379
379
|
Dir.chdir(ori_dir)
|
380
|
+
|
380
381
|
end
|
381
382
|
|
382
383
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bacterial-annotator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maxime Deraspe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-05-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -88,10 +88,9 @@ files:
|
|
88
88
|
- bin/ba_raxml
|
89
89
|
- bin/bacterial-annotator
|
90
90
|
- lib/bacterial-annotator.rb
|
91
|
-
- lib/bacterial-annotator/
|
92
|
-
- lib/bacterial-annotator/
|
93
|
-
- lib/bacterial-annotator/
|
94
|
-
- lib/bacterial-annotator/synteny-manip.rb
|
91
|
+
- lib/bacterial-annotator/sequence-annotation.rb
|
92
|
+
- lib/bacterial-annotator/sequence-fasta.rb
|
93
|
+
- lib/bacterial-annotator/sequence-synteny.rb
|
95
94
|
- lib/bacterial-comparator.rb
|
96
95
|
homepage: http://rubygems.org/gems/bacterial-annotator
|
97
96
|
licenses:
|