bacterial-annotator 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ba_prodigal +1 -1
- data/bin/bacterial-annotator +13 -14
- data/lib/bacterial-annotator/{genbank-manip.rb → sequence-annotation.rb} +128 -16
- data/lib/bacterial-annotator/{fasta-manip.rb → sequence-fasta.rb} +32 -23
- data/lib/bacterial-annotator/{synteny-manip.rb → sequence-synteny.rb} +128 -8
- data/lib/bacterial-annotator.rb +211 -140
- data/lib/bacterial-comparator.rb +1 -0
- metadata +5 -6
- data/lib/bacterial-annotator/remote-ncbi.rb +0 -201
data/lib/bacterial-annotator.rb
CHANGED
@@ -9,95 +9,130 @@
|
|
9
9
|
require 'bio'
|
10
10
|
require 'fileutils'
|
11
11
|
|
12
|
-
require 'bacterial-annotator/
|
13
|
-
require 'bacterial-annotator/
|
14
|
-
require 'bacterial-annotator/synteny-manip'
|
15
|
-
|
12
|
+
require 'bacterial-annotator/sequence-fasta'
|
13
|
+
require 'bacterial-annotator/sequence-annotation'
|
14
|
+
require 'bacterial-annotator/sequence-synteny'
|
15
|
+
|
16
16
|
|
17
17
|
class BacterialAnnotator
|
18
18
|
|
19
19
|
# Initialize BacterialAnnotator
|
20
|
-
# options
|
20
|
+
# options, ROOT (path)
|
21
21
|
def initialize options, root
|
22
22
|
|
23
23
|
@root = root
|
24
24
|
@options = options
|
25
|
-
@outdir = @options[:outdir]
|
26
25
|
|
27
26
|
@minlength = @options[:minlength].to_i
|
28
|
-
@
|
29
|
-
@pidentity = @pidentity
|
27
|
+
@options[:minlength] = @options[:minlength].to_i
|
28
|
+
@options[:pidentity] = @options[:pidentity].to_f
|
29
|
+
@options[:pidentity] = @options[:pidentity] * 100 if @options[:pidentity] <= 1.00
|
30
|
+
@options[:pcoverage] = @options[:pcoverage].to_f
|
31
|
+
@options[:pcoverage] = @options[:pcoverage] / 100 if @options[:pcoverage] > 1.00
|
30
32
|
|
31
|
-
if File.exists? (@outdir)
|
33
|
+
if File.exists? (@options[:outdir])
|
32
34
|
if ! options.has_key? :force
|
33
35
|
abort "Output directory already exist ! Choose another one or use -f to overwrite"
|
34
36
|
else
|
35
|
-
puts "Overwriting output directory #{@outdir}"
|
36
|
-
FileUtils.remove_dir(@outdir, :force=>true)
|
37
|
+
puts "Overwriting output directory #{@options[:outdir]}"
|
38
|
+
FileUtils.remove_dir(@options[:outdir], :force=>true)
|
37
39
|
end
|
38
40
|
end
|
39
|
-
Dir.mkdir(@outdir)
|
41
|
+
Dir.mkdir(@options[:outdir])
|
40
42
|
|
41
|
-
@
|
43
|
+
@query_fasta = SequenceFasta.new(@options[:input], @options[:meta])
|
42
44
|
|
43
45
|
@with_refence_genome = false
|
44
46
|
if @options.has_key? :refgenome
|
45
47
|
@with_refence_genome = true
|
46
|
-
@
|
48
|
+
@ref_genome = SequenceAnnotation.new(@options[:refgenome], @options[:outdir])
|
47
49
|
end
|
48
50
|
|
51
|
+
@with_external_db = false
|
52
|
+
@with_external_db = true if @options.has_key? :external_db
|
53
|
+
|
49
54
|
@prot_synteny = nil
|
50
|
-
@annotation_stats = {
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
55
|
+
@annotation_stats = {
|
56
|
+
by_contigs: {},
|
57
|
+
annotated_cds: 0,
|
58
|
+
flagged_cds: [],
|
59
|
+
total_cds: 0,
|
60
|
+
foreign_contigs: [],
|
61
|
+
synteny_contigs: [],
|
62
|
+
short_contigs: []
|
63
|
+
}
|
56
64
|
|
57
65
|
@contig_foreign_cds = {}
|
66
|
+
|
58
67
|
@contig_annotations = {}
|
59
68
|
|
69
|
+
@contig_annotations_externaldb = {}
|
70
|
+
|
71
|
+
@contig_annotations_cds = {}
|
72
|
+
|
60
73
|
end # end of method
|
61
74
|
|
62
75
|
# Prepare files for the annotation
|
63
76
|
# Will run prodigal on the query and prepare reference genome files
|
64
77
|
def prepare_files_for_annotation
|
65
78
|
puts "\nRunning Prodigal on your genome.."
|
66
|
-
@
|
79
|
+
@query_fasta.run_prodigal @root, @options[:outdir]
|
67
80
|
puts "Prodigal done."
|
68
81
|
if @with_refence_genome
|
69
|
-
@
|
70
|
-
@
|
71
|
-
puts "Successfully loaded #{@
|
82
|
+
@ref_genome.write_cds_to_file @options[:outdir]
|
83
|
+
@ref_genome.write_rna_to_file @options[:outdir]
|
84
|
+
puts "Successfully loaded #{@ref_genome.gbk.definition}"
|
72
85
|
end
|
73
86
|
end # end of method
|
74
87
|
|
88
|
+
|
89
|
+
def run_reference_synteny_prot
|
90
|
+
|
91
|
+
ref_synteny_prot = SequenceSynteny.new(@query_fasta.annotation_files[:proteins], @ref_genome.cds_file,
|
92
|
+
"Prot-Ref", @options[:pidentity], @options[:pcoverage], "prot")
|
93
|
+
|
94
|
+
ref_synteny_prot.run_blat @root, @options[:outdir]
|
95
|
+
|
96
|
+
ref_synteny_prot.extract_hits :refgenome
|
97
|
+
|
98
|
+
fdebug = File.open("debug-synteny.tsv", "w")
|
99
|
+
|
100
|
+
ref_synteny_prot.query_sequences.each do |k,v|
|
101
|
+
if v.has_key? :homology
|
102
|
+
@contig_annotations_cds[v[:contig]] = [] if ! @contig_annotations_cds.has_key? v[:contig]
|
103
|
+
@contig_annotations_cds[v[:contig]] << k
|
104
|
+
fdebug.write("#{v[:contig]}\t#{k}\t#{v[:homology][:pId]}\t#{v[:homology][:cov_query]}\t#{v[:homology][:cov_subject]}\t#{v[:homology][:hits].join(',')}\t#{@ref_genome.coding_seq[v[:homology][:hits][0]][:locustag]}\t#{@ref_genome.coding_seq[v[:homology][:hits][0]][:product]}\t#{v[:homology][:assert_cutoff].join(',')}\n")
|
105
|
+
else
|
106
|
+
fdebug.write("#{v[:contig]} #{k} NONE...\n")
|
107
|
+
end
|
108
|
+
end
|
109
|
+
fdebug.close
|
110
|
+
|
111
|
+
ref_synteny_prot
|
112
|
+
|
113
|
+
end
|
114
|
+
|
115
|
+
|
75
116
|
# run_alignment of reference genome proteins and the query
|
76
117
|
def run_annotation
|
77
118
|
|
78
119
|
# process reference genome synteny
|
79
120
|
if @with_refence_genome # Annotation with the Reference Genome
|
80
121
|
|
81
|
-
|
82
|
-
puts "\nRunning BLAT alignment with Reference Genome CDS.."
|
83
|
-
@prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity, "prot")
|
84
|
-
@prot_synteny.run_blat @root, @outdir
|
85
|
-
@prot_synteny.extract_hits_prodigal :refgenome
|
122
|
+
@prot_synteny_refgenome = run_reference_synteny_prot
|
86
123
|
|
87
|
-
|
124
|
+
# iterate over each contig
|
125
|
+
# discard short contig
|
126
|
+
# cumulate statistics of homolog CDS
|
127
|
+
@query_fasta.annotation_files[:contigs].each_with_index do |contig, contig_index|
|
88
128
|
|
89
129
|
# Skip short contigs
|
90
|
-
if @
|
130
|
+
if @query_fasta.annotation_files[:contigs_length][contig_index] < @minlength
|
91
131
|
@annotation_stats[:short_contigs] << contig
|
92
132
|
next
|
93
133
|
end
|
94
134
|
|
95
|
-
|
96
|
-
# contig_to_annotate = contig_prots[0].split("_")[0..-2].join("_")
|
97
|
-
# contig_prot_annotations = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
|
98
|
-
@contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig, contig_prots, @refgenome.coding_seq
|
99
|
-
|
100
|
-
remaining_cds = cumulate_annotation_stats_reference contig, @contig_annotations[contig]
|
135
|
+
remaining_cds = cumulate_annotation_stats_reference contig
|
101
136
|
|
102
137
|
if ! remaining_cds.empty?
|
103
138
|
@contig_foreign_cds[contig] = remaining_cds
|
@@ -113,18 +148,19 @@ class BacterialAnnotator
|
|
113
148
|
|
114
149
|
# run RNA annotation
|
115
150
|
puts "\nRunning BLAT alignment with Reference Genome RNA.."
|
116
|
-
@rna_synteny =
|
117
|
-
|
151
|
+
@rna_synteny = SequenceSynteny.new(@query_fasta.fasta_file, @ref_genome.rna_file,
|
152
|
+
"RNA-Ref", @options[:pidentity], @options[:pcoverage], "dna")
|
153
|
+
@rna_synteny.run_blat @root, @options[:outdir]
|
118
154
|
@rna_synteny.extract_hits_dna :rna
|
119
155
|
@contig_annotations_rna = {}
|
120
|
-
@
|
156
|
+
@query_fasta.annotation_files[:contigs].each_with_index do |contig, contig_index|
|
121
157
|
@contig_annotations_rna[contig] = @rna_synteny.get_annotation_for_contig contig
|
122
158
|
end
|
123
159
|
|
124
160
|
else # no reference genome
|
125
161
|
|
126
162
|
# no reference genome .. will process all the CDS
|
127
|
-
foreign_cds_file = @
|
163
|
+
foreign_cds_file = @query_fasta.annotation_files[:proteins]
|
128
164
|
|
129
165
|
end
|
130
166
|
|
@@ -135,7 +171,7 @@ class BacterialAnnotator
|
|
135
171
|
parse_genbank_files
|
136
172
|
|
137
173
|
puts "\nPrinting Statistics.."
|
138
|
-
print_stats "#{@outdir}
|
174
|
+
print_stats "#{@options[:outdir]}"
|
139
175
|
|
140
176
|
|
141
177
|
end # end of method
|
@@ -150,84 +186,48 @@ class BacterialAnnotator
|
|
150
186
|
db_file = @options[:external_db]
|
151
187
|
ref_cds = extract_externaldb_prot_info db_file
|
152
188
|
|
153
|
-
externaldb_synteny =
|
189
|
+
@externaldb_synteny = SequenceSynteny.new(remaining_cds_file, db_file,
|
190
|
+
"Prot-ExternalDB", @options[:pidentity],
|
191
|
+
@options[:pcoverage], "prot")
|
192
|
+
|
154
193
|
puts "\nRunning BLAT alignment with External Database.."
|
155
|
-
externaldb_synteny.run_blat @root, @outdir
|
156
|
-
externaldb_synteny.
|
194
|
+
@externaldb_synteny.run_blat @root, @options[:outdir]
|
195
|
+
@externaldb_synteny.extract_hits :externaldb
|
196
|
+
|
197
|
+
@externaldb_synteny.query_sequences.each do |k, v|
|
157
198
|
|
158
|
-
externaldb_synteny.aln_hits.each do |k,v|
|
159
199
|
contig_of_protein = k.split("_")[0..-2].join("_")
|
160
200
|
|
161
|
-
if ! @
|
162
|
-
@
|
201
|
+
if ! @contig_annotations_externaldb.has_key? contig_of_protein
|
202
|
+
@contig_annotations_externaldb[contig_of_protein] = {}
|
163
203
|
end
|
164
204
|
|
165
|
-
|
205
|
+
next if ! v.has_key? :homology
|
206
|
+
|
207
|
+
@contig_annotations_cds[contig_of_protein] << k
|
208
|
+
|
209
|
+
hit_gi = v[:homology][:hits][0]
|
166
210
|
|
167
211
|
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
|
168
|
-
|
212
|
+
cov_query = (v[:homology][:cov_query]*100).round(2)
|
213
|
+
cov_subject = (v[:homology][:cov_subject]*100).round(2)
|
214
|
+
note = "Protein homology (AA identity: #{v[:homology][:pId]}%; coverage (q,s): #{cov_query}%,#{cov_subject}%) with #{ref_cds[hit_gi][:prot_id]}"
|
215
|
+
inference = "similar to AA sequence:#{ref_cds[hit_gi][:db_source]}:#{ref_cds[hit_gi][:prot_id]}"
|
169
216
|
|
170
217
|
if ref_cds[hit_gi][:org] != ""
|
171
218
|
note += " from #{ref_cds[hit_gi][:org]}"
|
172
219
|
end
|
173
|
-
@contig_annotations[contig_of_protein][k] = {product: ref_cds[hit_gi][:product],
|
174
|
-
feature: "cds",
|
175
|
-
gene: nil,
|
176
|
-
locustag: nil,
|
177
|
-
note: note}
|
178
|
-
|
179
|
-
end
|
180
|
-
|
181
|
-
|
182
|
-
elsif @options.has_key? :remote_db # from a remote DB
|
183
|
-
|
184
|
-
# do it by chunk to avoid NCBI CPU exceeding limit
|
185
|
-
cds_files = split_remaining_cds_file remaining_cds_file
|
186
|
-
@remotedb = @options[:remote_db]
|
187
220
|
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
cds_file,
|
198
|
-
"#{cds_file}.#{@remotedb}.xml",
|
199
|
-
@pidentity)
|
200
|
-
rescue
|
201
|
-
valid = false
|
202
|
-
end
|
203
|
-
|
204
|
-
# ncbi blast didn't worked out
|
205
|
-
if !valid
|
206
|
-
puts "Problem NCBI blast for foreign proteins"
|
207
|
-
else
|
208
|
-
ncbiblast.extract_blast_results
|
209
|
-
if ! ncbiblast.aln_hits
|
210
|
-
puts "Didn't produce the annotation for #{cds_file}"
|
211
|
-
next
|
212
|
-
end
|
213
|
-
ncbiblast.aln_hits.each do |k,v|
|
214
|
-
contig_of_protein = k.split("_")[0..-2].join("_")
|
215
|
-
if ! @contig_annotations.has_key? contig_of_protein
|
216
|
-
@contig_annotations[contig_of_protein] = {}
|
217
|
-
end
|
218
|
-
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:gi]}"
|
219
|
-
note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:accession]}"
|
220
|
-
if v[:hits][0][:org] != ""
|
221
|
-
note += " from #{v[:hits][0][:org]}"
|
222
|
-
end
|
223
|
-
@contig_annotations[contig_of_protein][k] = {product: v[:hits][0][:product],
|
224
|
-
feature: "cds",
|
225
|
-
gene: nil,
|
226
|
-
locustag: nil,
|
227
|
-
note: note}
|
228
|
-
end
|
221
|
+
@contig_annotations_externaldb[contig_of_protein][v[:homology][:hits][0]] = {
|
222
|
+
product: ref_cds[hit_gi][:product],
|
223
|
+
feature: "cds",
|
224
|
+
gene: nil,
|
225
|
+
prot_id: ref_cds[hit_gi][:prot_id],
|
226
|
+
locustag: nil,
|
227
|
+
note: note,
|
228
|
+
inference: inference
|
229
|
+
}
|
229
230
|
|
230
|
-
end
|
231
231
|
|
232
232
|
end
|
233
233
|
|
@@ -240,12 +240,24 @@ class BacterialAnnotator
|
|
240
240
|
def parse_genbank_files
|
241
241
|
|
242
242
|
puts "\nParsing annotation into genbank files.."
|
243
|
-
@
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
243
|
+
@contig_annotations_cds.each do |contig, contig_prots|
|
244
|
+
|
245
|
+
gbk_path = @query_fasta.annotation_files[:gbk_path]
|
246
|
+
gbk_to_annotate = SequenceAnnotation.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
|
247
|
+
|
248
|
+
if @with_external_db
|
249
|
+
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
250
|
+
(@prot_synteny_refgenome.query_sequences.merge(@externaldb_synteny.query_sequences)),
|
251
|
+
@contig_annotations_externaldb[contig].merge(@ref_genome.coding_seq),
|
252
|
+
@options[:refgenome].gsub(/.gb.*/,"")
|
253
|
+
)
|
254
|
+
else
|
255
|
+
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
256
|
+
@prot_synteny_refgenome.query_sequences,
|
257
|
+
@ref_genome.coding_seq,
|
258
|
+
@options[:refgenome].gsub(/.gb.*/,"")
|
259
|
+
)
|
260
|
+
end
|
249
261
|
|
250
262
|
if @contig_annotations_rna.has_key? contig
|
251
263
|
# puts "RNA annotation"
|
@@ -261,34 +273,65 @@ class BacterialAnnotator
|
|
261
273
|
|
262
274
|
# cumulate the stats for the synteny
|
263
275
|
# return : unannotated cds array
|
264
|
-
def cumulate_annotation_stats_reference contig, contig_prots_ann
|
276
|
+
# def cumulate_annotation_stats_reference contig, contig_prots_ann
|
277
|
+
def cumulate_annotation_stats_reference contig
|
265
278
|
|
266
279
|
remaining_cds = []
|
267
|
-
contig_prots = @
|
280
|
+
contig_prots = @query_fasta.annotation_files[:prot_ids_by_contig][contig]
|
268
281
|
|
269
282
|
@annotation_stats[:total_cds] += contig_prots.length if contig_prots
|
270
|
-
|
271
|
-
|
272
|
-
|
283
|
+
|
284
|
+
# count contig as foreign if no cds homolog in reference genome
|
285
|
+
if @contig_annotations_cds.has_key? contig and
|
286
|
+
@contig_annotations_cds[contig].length > 0
|
287
|
+
@annotation_stats[:synteny_contigs] << contig
|
288
|
+
else
|
289
|
+
@annotation_stats[:foreign_contigs] << contig
|
290
|
+
return
|
291
|
+
end
|
292
|
+
|
293
|
+
contig_prots.each do |prot|
|
294
|
+
|
295
|
+
if @contig_annotations_cds[contig].include? prot
|
296
|
+
|
297
|
+
if @prot_synteny_refgenome.query_sequences[prot].has_key? :homology and
|
298
|
+
@prot_synteny_refgenome.query_sequences[prot][:homology][:hits].length > 0
|
299
|
+
|
300
|
+
assert_sum = @prot_synteny_refgenome.query_sequences[prot][:homology][:assert_cutoff].inject(:+)
|
301
|
+
if assert_sum > 2
|
302
|
+
@annotation_stats[:annotated_cds] += 1
|
303
|
+
else
|
304
|
+
flag = "#{prot}"
|
305
|
+
flag += "\t#{@prot_synteny_refgenome.query_sequences[prot][:homology][:assert_cutoff].join(',')}"
|
306
|
+
flag += "\t#{@prot_synteny_refgenome.query_sequences[prot][:homology][:pId]}"
|
307
|
+
flag += "\t#{(@prot_synteny_refgenome.query_sequences[prot][:homology][:cov_query]*100).round(2)}"
|
308
|
+
flag += "\t#{(@prot_synteny_refgenome.query_sequences[prot][:homology][:cov_subject]*100).round(2)}"
|
309
|
+
@annotation_stats[:flagged_cds] << flag
|
310
|
+
end
|
311
|
+
|
312
|
+
else
|
313
|
+
|
314
|
+
puts "No " + prot
|
315
|
+
|
316
|
+
end
|
317
|
+
|
273
318
|
else
|
274
|
-
|
319
|
+
|
320
|
+
remaining_cds << prot
|
321
|
+
|
275
322
|
end
|
276
|
-
end
|
277
323
|
|
278
|
-
# Annotated Contigs
|
279
|
-
if contig_prots_ann.keys.length < 1
|
280
|
-
@annotation_stats[:foreign_contigs] << contig
|
281
|
-
else
|
282
|
-
@annotation_stats[:synteny_contigs] << contig
|
283
324
|
end
|
284
325
|
|
285
326
|
remaining_cds
|
327
|
+
|
286
328
|
end # end of method
|
287
329
|
|
288
330
|
|
289
331
|
# print statistics to file
|
290
|
-
def print_stats
|
332
|
+
def print_stats file_dir
|
291
333
|
|
334
|
+
file = file_dir + "/Annotation-Stats.txt"
|
292
335
|
total_nb_contigs = @annotation_stats[:foreign_contigs].length +
|
293
336
|
@annotation_stats[:synteny_contigs].length +
|
294
337
|
@annotation_stats[:short_contigs].length
|
@@ -307,25 +350,34 @@ class BacterialAnnotator
|
|
307
350
|
|
308
351
|
fopen.write("#CDS annotations based on reference genomes\n")
|
309
352
|
fopen.write("Annotated CDS :\t\t\t" + @annotation_stats[:annotated_cds].to_s + "\n")
|
353
|
+
fopen.write("Flagged CDS :\t\t\t" + @annotation_stats[:flagged_cds].length.to_s + "\n")
|
310
354
|
fopen.write("Total CDS :\t\t\t" + @annotation_stats[:total_cds].to_s + "\n")
|
311
355
|
fopen.write("% CDS annotated :\t\t" + (p_cds_annotated*100).round(2).to_s + "\n")
|
312
356
|
fopen.write("\n")
|
313
357
|
|
314
358
|
end
|
315
359
|
|
360
|
+
file_flagged_cds = file_dir + "/Prot-flagged.tsv"
|
361
|
+
File.open(file_flagged_cds, "w") do |fopen|
|
362
|
+
fopen.write("CDS locus\tAssertion-CutOff\tAA Identity\tCovQuery(%)\tCovSubject(%)\n")
|
363
|
+
@annotation_stats[:flagged_cds].each do |fcds|
|
364
|
+
fopen.write("#{fcds}\n")
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
316
368
|
end # end of method
|
317
369
|
|
318
370
|
|
319
371
|
# dump cds to file for blast
|
320
372
|
def dump_cds
|
321
373
|
|
322
|
-
cds_outfile = File.open("#{@outdir}/Proteins-foreign.fa","w")
|
374
|
+
cds_outfile = File.open("#{@options[:outdir]}/Proteins-foreign.fa","w")
|
323
375
|
foreign_cds = []
|
324
376
|
@contig_foreign_cds.each_value do |v|
|
325
377
|
foreign_cds.push(*v)
|
326
378
|
end
|
327
379
|
inprot = false
|
328
|
-
File.open(@
|
380
|
+
File.open(@query_fasta.annotation_files[:proteins]) do |fprot|
|
329
381
|
while l=fprot.gets
|
330
382
|
if l[0] == ">"
|
331
383
|
inprot = false
|
@@ -340,7 +392,7 @@ class BacterialAnnotator
|
|
340
392
|
end
|
341
393
|
end
|
342
394
|
cds_outfile.close
|
343
|
-
return "#{@outdir}/Proteins-foreign.fa"
|
395
|
+
return "#{@options[:outdir]}/Proteins-foreign.fa"
|
344
396
|
|
345
397
|
end # end of method
|
346
398
|
|
@@ -363,25 +415,46 @@ class BacterialAnnotator
|
|
363
415
|
if l[0] == ">"
|
364
416
|
|
365
417
|
lA = l.chomp.split("|")
|
366
|
-
key_gi = lA[1]
|
418
|
+
#key_gi = lA[1]
|
419
|
+
key_gi = l.split(" ")[0][1..-1]
|
367
420
|
product_long = lA[-1]
|
368
421
|
|
369
422
|
organism = ""
|
370
423
|
product = ""
|
424
|
+
db_source = "[DBSource]"
|
371
425
|
|
372
426
|
if product_long.include? " [" and product_long.include? "]" # NCBI
|
373
427
|
organism = product_long[/\[.*?\]/]
|
374
428
|
product = product_long.split(" [")[0].strip
|
375
|
-
elsif product_long.include? "OS="
|
429
|
+
elsif product_long.include? "OS=" # Swissprot / TrEMBL
|
376
430
|
product_tmp = product.split("OS=")
|
377
431
|
organism = product_tmp[1].split(/[A-Z][A-Z]=/)[0].strip
|
378
432
|
product = product_tmp[0].strip
|
379
|
-
elsif product_long.include? "[A-Z][A-Z]="
|
433
|
+
elsif product_long.include? "[A-Z][A-Z]=" # NCBI
|
380
434
|
product = product_long.split(/[A-Z][A-Z]=/)[0].strip
|
435
|
+
else
|
436
|
+
product = product_long
|
381
437
|
end
|
438
|
+
|
382
439
|
org = organism.gsub("[","").gsub("]","")
|
440
|
+
|
383
441
|
product.lstrip!
|
384
|
-
|
442
|
+
prot_id = nil
|
443
|
+
|
444
|
+
if key_gi.count("|") == 4
|
445
|
+
if lA[2] == "ref"
|
446
|
+
db_source = "RefSeq"
|
447
|
+
end
|
448
|
+
prot_id = lA[3]
|
449
|
+
elsif key_gi.count("|") == 2
|
450
|
+
if lA[0].include? == "sp" or
|
451
|
+
lA[0].include? == "tr"
|
452
|
+
db_source = "UniProtKB"
|
453
|
+
end
|
454
|
+
prot_id = lA[1]
|
455
|
+
end
|
456
|
+
|
457
|
+
ref_cds[key_gi] = {product: product, org: org, prot_id: prot_id, db_source: db_source}
|
385
458
|
|
386
459
|
end
|
387
460
|
|
@@ -398,7 +471,7 @@ class BacterialAnnotator
|
|
398
471
|
def split_remaining_cds_file file
|
399
472
|
|
400
473
|
cds_files = []
|
401
|
-
outdir = "#{@outdir}/Protein-foreign.split"
|
474
|
+
outdir = "#{@options[:outdir]}/Protein-foreign.split"
|
402
475
|
|
403
476
|
Dir.mkdir(outdir) if ! Dir.exists? outdir
|
404
477
|
|
@@ -429,22 +502,20 @@ class BacterialAnnotator
|
|
429
502
|
|
430
503
|
end # end of method
|
431
504
|
|
432
|
-
# will reference CDS synteny to file
|
505
|
+
# will dump reference CDS synteny to file
|
433
506
|
def dump_ref_synteny_to_file
|
434
507
|
|
435
508
|
# Iterate over each Ref protein and print syntheny
|
436
|
-
synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
|
509
|
+
synteny_file = File.open("#{@options[:outdir]}/Prot-Synteny.tsv","w")
|
437
510
|
synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
|
438
511
|
ref_annotated = {}
|
439
|
-
@contig_annotations.each do |contig,prot_annotations|
|
512
|
+
@contig_annotations.each do |contig, prot_annotations|
|
440
513
|
prot_annotations.each do |key,prot|
|
441
|
-
# p key
|
442
|
-
# p prot
|
443
514
|
ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
|
444
515
|
end
|
445
516
|
end
|
446
517
|
|
447
|
-
@
|
518
|
+
@ref_genome.coding_seq.each do |ref_k, ref_v|
|
448
519
|
|
449
520
|
gene = ""
|
450
521
|
coverage_ref = ""
|
@@ -454,7 +525,7 @@ class BacterialAnnotator
|
|
454
525
|
if ref_annotated[ref_v[:protId]] != nil
|
455
526
|
gene = ref_annotated[ref_v[:protId]][:key]
|
456
527
|
coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
|
457
|
-
query_length = @
|
528
|
+
query_length = @query_fasta.annotation_files[:prot_ids_length][gene]
|
458
529
|
coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
|
459
530
|
pId = ref_annotated[ref_v[:protId]][:pId]
|
460
531
|
end
|
data/lib/bacterial-comparator.rb
CHANGED
@@ -377,6 +377,7 @@ class BacterialComparator
|
|
377
377
|
cmd = system("#{@root}/raxml.linux -T 3 -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.PepTree -m PROTGAMMAAUTO -n PEP_BS_TREE -w #{tree_dir}")
|
378
378
|
cmd = system("ln -s #{tree_dir}/RAxML_bipartitionsBranchLabels.PEP_BS_TREE #{tree_dir}/../")
|
379
379
|
Dir.chdir(ori_dir)
|
380
|
+
|
380
381
|
end
|
381
382
|
|
382
383
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bacterial-annotator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maxime Deraspe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-05-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -88,10 +88,9 @@ files:
|
|
88
88
|
- bin/ba_raxml
|
89
89
|
- bin/bacterial-annotator
|
90
90
|
- lib/bacterial-annotator.rb
|
91
|
-
- lib/bacterial-annotator/
|
92
|
-
- lib/bacterial-annotator/
|
93
|
-
- lib/bacterial-annotator/
|
94
|
-
- lib/bacterial-annotator/synteny-manip.rb
|
91
|
+
- lib/bacterial-annotator/sequence-annotation.rb
|
92
|
+
- lib/bacterial-annotator/sequence-fasta.rb
|
93
|
+
- lib/bacterial-annotator/sequence-synteny.rb
|
95
94
|
- lib/bacterial-comparator.rb
|
96
95
|
homepage: http://rubygems.org/gems/bacterial-annotator
|
97
96
|
licenses:
|