bacterial-annotator 0.3.7 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0f8c89d68e4afea5b49e88e68159f573b8fe44c1
4
- data.tar.gz: a0742812f0e42a3346b63c79f590df774fb5e16c
3
+ metadata.gz: b46c0c48f45550ea7deb1580acb72927435df345
4
+ data.tar.gz: b5511795149b832c27fc8867dc3904c9c96d9561
5
5
  SHA512:
6
- metadata.gz: 0f8d59637356bc752bea3271d9f9d733eee83699928312becf7fbbf319ffc482e976c5dc7927f5ba300aa1b263b8909c63b8d17afb2e0c3d4a0b9497c5c49f80
7
- data.tar.gz: 55857b474d7e88373d295f3bf8ea7d9d99d431b8192e8f24d63dec012a511725bad7f1e43edc6e7b1259688c9c84cc868d08c3add2b2e082efca085cf2636489
6
+ metadata.gz: ac790dd5249a74cf559912dc88ebaad614d3f934a43c297848e1bda2db146c682b11862489170ddc21683fed070509f505cdc6759530ab0e1d2f8c80d8951ac7
7
+ data.tar.gz: 0de96df50d2e3a1ad4274134969007b8efef2f1a63b24e7063c45f102c9020ba3e6c337de2214bc1a2f26ae1d33057d9ec7c6ef0ea182e23c77633b3cd8d5e0a
data/bin/ba_blat CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
data/bin/ba_mafft CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
data/bin/ba_prodigal CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
data/bin/ba_raxml CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
@@ -64,7 +64,6 @@ annotate [OPTIONS]
64
64
  --minlength Minimum contig length for annotation [default=500]
65
65
 
66
66
  --meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
67
- --gff Will also generate gff annotation files [off by default]
68
67
 
69
68
  OEM
70
69
 
@@ -92,8 +91,6 @@ def parseOptions_annotate
92
91
  options[:outdir] = ARGV.shift
93
92
  when "--force", "-f"
94
93
  options[:force] = 1
95
- when "--gff"
96
- options[:gff] = 1
97
94
  when "--minlength"
98
95
  options[:minlength] = ARGV.shift
99
96
  when "--pidentity"
@@ -192,7 +189,10 @@ def parseOptions_compare
192
189
  end
193
190
 
194
191
 
195
- # Main
192
+ ########
193
+ # MAIN #
194
+ ########
195
+
196
196
  if ARGV.size > 1
197
197
 
198
198
  ROOT = File.dirname(__FILE__)
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -10,7 +10,7 @@
10
10
 
11
11
  class GenbankManip
12
12
 
13
- attr_accessor :gbk, :coding_seq, :cds_file
13
+ attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
14
14
 
15
15
  # Initialize then genbank file
16
16
  def initialize gbk_file, outdir
@@ -58,13 +58,6 @@ class GenbankManip
58
58
  protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
59
59
  locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
60
60
 
61
- # if ftH.has_key? "translation"
62
- # pep = ftH["translation"][0] if !ftH["translation"].nil?
63
- # else
64
- # dna = get_DNA(ft,@bioseq)
65
- # pep = dna.translate
66
- # end
67
-
68
61
  dna = get_DNA(ft,@bioseq)
69
62
  pep = dna.translate
70
63
  pepBioSeq = Bio::Sequence.auto(pep)
@@ -89,6 +82,49 @@ class GenbankManip
89
82
 
90
83
  end
91
84
 
85
+ # Prepare rRNA tRNA
86
+ def get_rna
87
+
88
+ if @rna_seq == nil
89
+
90
+ @rna_seq = {}
91
+ @gbk.features do |ft|
92
+
93
+ next if ! ft.feature.to_s.include? "RNA"
94
+
95
+ ftH = ft.to_hash
96
+ loc = ft.locations
97
+ # seqBeg = loc[0].from.to_s
98
+ # seqEnd = loc[0].to.to_s
99
+ # strand = loc[0].strand.to_s
100
+ if ftH.has_key? "pseudo"
101
+ next
102
+ end
103
+ # gene = ftH["gene"] if !ftH["gene"].nil?
104
+ # protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
105
+ product = ""
106
+ product = ftH["product"][0] if !ftH["product"].nil?
107
+ locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
108
+
109
+ # puts "#{@accession}\t#{seqBeg}\t#{seqEnd}\t#{strand}\t#{protId}\t#{locustag}\t#{gene[0]}\t#{product[0]}"
110
+ dna = get_DNA(ft,@bioseq)
111
+ dnaBioSeq = Bio::Sequence.auto(dna)
112
+
113
+ @rna_seq[locustag] = {type: ft.feature.to_s,
114
+ location: loc,
115
+ locustag: locustag,
116
+ product: product,
117
+ bioseq_gene: dnaBioSeq}
118
+
119
+ end
120
+
121
+ end
122
+
123
+ @rna_seq
124
+
125
+ end
126
+
127
+
92
128
 
93
129
  # Print CDS to files
94
130
  # RETURN : cds_file path
@@ -116,79 +152,136 @@ class GenbankManip
116
152
 
117
153
  end
118
154
 
155
+ # Print RNA to files
156
+ # RETURN : rna_file path
157
+ def write_rna_to_file outdir
158
+
159
+ rna_file = "#{@gbk.accession}.rna"
160
+
161
+ if @rna_seq == nil
162
+ get_rna
163
+ end
164
+
165
+ File.open("#{outdir}/#{rna_file}", "w") do |fwrite|
166
+ @rna_seq.each_key do |k|
167
+ seqout_dna = @rna_seq[k][:bioseq_gene].output_fasta("#{k}|#{@rna_seq[k][:type]}|#{@rna_seq[k][:product]}",60)
168
+ fwrite.write(seqout_dna)
169
+ end
170
+ end
171
+
172
+ @rna_file = "#{outdir}/" + rna_file
173
+
174
+ end
175
+
119
176
 
120
177
  # add annotation to a genbank file produced by prodigal
121
- def add_annotation annotations, outdir, mode, reference_locus
178
+ def add_annotations annotations, mode, reference_locus=nil
122
179
 
123
- nb_of_added_ft = 0
180
+ # nb_of_added_ft = 0
124
181
  i = 0
125
182
 
126
183
  contig = @gbk.definition
127
184
 
128
- # iterate through
129
- @gbk.features.each_with_index do |cds, ft_index|
185
+ if mode == "inplace"
186
+
187
+ # iterate through
188
+ @gbk.features.each_with_index do |cds, ft_index|
130
189
 
131
- next if cds.feature != "CDS"
190
+ next if cds.feature != "CDS"
132
191
 
133
- if mode == 0
134
192
  ftArray = []
135
193
  cds.qualifiers = []
136
- else
137
- ftArray = cds.qualifiers
138
- end
139
194
 
140
- i += 1
141
- prot_id = contig+"_"+i.to_s
142
- hit = nil
143
- hit = annotations[prot_id] if annotations.has_key? prot_id
144
-
145
- if hit != nil
146
- locus, gene, product, note = nil
147
- locus = hit[:locustag]
148
- gene = hit[:gene]
149
- product = hit[:product]
150
- note = hit[:note]
151
- pId = hit[:pId]
152
-
153
- if gene != nil
154
- qGene = Bio::Feature::Qualifier.new('gene', gene)
155
- ftArray.push(qGene)
156
- end
195
+ i += 1
196
+ prot_id = contig+"_"+i.to_s
197
+ hit = nil
198
+ hit = annotations[prot_id] if annotations.has_key? prot_id
199
+
200
+ if hit != nil
201
+ locus, gene, product, note = nil
202
+ locus = hit[:locustag]
203
+ gene = hit[:gene]
204
+ product = hit[:product]
205
+ note = hit[:note]
206
+ pId = hit[:pId]
207
+
208
+ if gene != nil
209
+ qGene = Bio::Feature::Qualifier.new('gene', gene)
210
+ ftArray.push(qGene)
211
+ end
212
+
213
+ if product != nil
214
+ qProd = Bio::Feature::Qualifier.new('product', product)
215
+ ftArray.push(qProd)
216
+ end
217
+
218
+ # check if there is a reference genome.. reference_locus shouldn't be nil in that case
219
+ if locus != nil
220
+ qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (#{pId}% identity) from #{reference_locus.entry_id}")
221
+ ftArray.push(qNote)
222
+ end
223
+
224
+ if note != nil
225
+ qNote = Bio::Feature::Qualifier.new('note', note)
226
+ ftArray.push(qNote)
227
+ end
157
228
 
158
- if product != nil
159
- qProd = Bio::Feature::Qualifier.new('product', product)
160
- ftArray.push(qProd)
161
229
  end
230
+ cds.qualifiers = ftArray
162
231
 
163
- # check if there is a reference genome.. reference_locus shouldn't be nil in that case
164
- if locus != nil
165
- qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (#{pId}% identity) from #{reference_locus.entry_id}")
166
- ftArray.push(qNote)
167
- end
232
+ end
233
+
234
+
235
+ elsif mode == "new"
236
+
237
+ sorted_annotations = annotations.sort_by { |k, v| v[:query_location][0][0] }
238
+
239
+ new_features = {}
240
+ annotations_done = {}
241
+
242
+ @gbk.features.each_with_index do |ft, ft_index|
243
+
244
+ sorted_annotations.each do |k,v|
245
+
246
+ next if annotations_done.has_key? k
247
+
248
+ if v[:query_location][0][0] < ft.locations[0].from
249
+
250
+ if v[:subject_location][0][0] > v[:subject_location][0][1]
251
+ location = "complement(#{v[:query_location][0][0]}..#{v[:query_location][0][1]})"
252
+ else
253
+ location = "#{v[:query_location][0][0]}..#{v[:query_location][0][1]}"
254
+ end
255
+
256
+ feature = Bio::Feature.new(v[:feature][0],location)
257
+ feature.qualifiers.push(Bio::Feature::Qualifier.new('product',v[:product][0])) if ! v[:product][0].nil? or v[:product][0] != ""
258
+ new_features[ft_index] = feature
259
+ annotations_done[k] = 1
260
+ break
261
+
262
+ end
168
263
 
169
- if note != nil
170
- qNote = Bio::Feature::Qualifier.new('note', note)
171
- ftArray.push(qNote)
172
264
  end
173
265
 
266
+ end
174
267
 
268
+ new_features.each do |k,v|
269
+ @gbk.features.insert(k,v)
175
270
  end
176
- cds.qualifiers = ftArray
177
271
 
178
272
  end
179
273
 
180
- File.open("#{outdir}/#{contig}.gbk", "w") do |f|
274
+ end
275
+
276
+
277
+ def save_genbank_to_file outdir
278
+
279
+ File.open("#{outdir}/#{@gbk.definition}.gbk", "w") do |f|
181
280
  f.write(@gbk.to_biosequence.output(:genbank))
182
281
  end
183
282
 
184
- # Bioruby doesn't support gff at this point
185
- # File.open("#{outdir}/#{contig}.gff", "w") do |f|
186
- # f.write(@gbk.to_biosequence.output(:gff))
187
- # end
188
-
189
283
  end
190
284
 
191
-
192
285
  ###################
193
286
  # Private Methods #
194
287
  ###################
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -12,34 +12,42 @@ class SyntenyManip
12
12
 
13
13
  attr_reader :query_file, :subject_file, :aln_hits
14
14
 
15
- def initialize query_file, subject_file, name, pidentity
15
+ def initialize query_file, subject_file, name, pidentity, type
16
16
  @query_file = query_file
17
17
  @subject_file = subject_file
18
18
  @name = name
19
19
  @pidentity = pidentity
20
20
  @aln_file = nil
21
+ @type = type
21
22
  end # end of initialize
22
23
 
23
24
  # run blat on proteins
24
25
  def run_blat root, outdir
25
- system("#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity} -prot #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
26
+ base_cmd = "#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity}"
27
+ system("#{base_cmd} #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
28
+ if @type == "prot"
29
+ system("#{base_cmd} -prot #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
30
+ end
26
31
  @aln_file = "#{outdir}/#{@name}.blat8.tsv"
27
32
  # extract_hits
28
33
  end # end of method
29
34
 
30
35
  # Extract Hit from blast8 file and save it in hash
31
36
  # contig-0_1 ABJ71957.1 96.92 65 2 0 1 65 1 65 9.2e-31 131.0
32
- def extract_hits mode
37
+ def extract_hits_prodigal mode, ref_cds=nil
33
38
 
34
39
  @aln_hits = {}
40
+ feature = ""
35
41
  File.open(@aln_file,"r") do |fread|
36
42
  while l = fread.gets
37
43
  lA = l.chomp!.split("\t")
38
44
  key = lA[0]
39
45
  if mode == :refgenome
40
46
  hit = lA[1]
47
+ feature = "cds"
41
48
  elsif mode == :externaldb
42
- hit = lA[1].chomp.split("|")[1]
49
+ hit = lA[1].chomp.split("|")[3]
50
+ feature = "cds"
43
51
  end
44
52
  if ! @aln_hits.has_key? key
45
53
  next if lA[2].to_f < @pidentity
@@ -50,7 +58,8 @@ class SyntenyManip
50
58
  hits: [hit],
51
59
  length: [lA[3].to_i],
52
60
  query_location: [[lA[6].to_i,lA[7].to_i]],
53
- subject_location: [[lA[8].to_i,lA[9].to_i]]
61
+ subject_location: [[lA[8].to_i,lA[9].to_i]],
62
+ feature: feature
54
63
  }
55
64
  elsif lA[11].to_f > @aln_hits[key][:score]
56
65
  @aln_hits[key] = {
@@ -60,7 +69,8 @@ class SyntenyManip
60
69
  hits: [hit],
61
70
  length: [lA[3].to_i],
62
71
  query_location: [[lA[6].to_i,lA[7].to_i]],
63
- subject_location: [[lA[8].to_i,lA[9].to_i]]
72
+ subject_location: [[lA[8].to_i,lA[9].to_i]],
73
+ feature: feature
64
74
  }
65
75
  elsif lA[11].to_f == @aln_hits[key][:score]
66
76
  @aln_hits[key][:hits] << hit
@@ -73,49 +83,116 @@ class SyntenyManip
73
83
 
74
84
  end # end of method
75
85
 
86
+ # Extract Hit from blast8 file and save it in hash
87
+ # prpa PA0668.4|rRNA|23S 99.97 2891 1 0 705042 707932 1 2891 0.0e+00 5671.0
88
+ def extract_hits_dna mode
76
89
 
90
+ @aln_hits = {}
91
+ feature = ""
92
+ File.open(@aln_file,"r") do |fread|
93
+ while l = fread.gets
94
+ lA = l.chomp!.split("\t")
95
+ key = lA[0]+"_"+lA[6]+"_"+lA[7]
96
+ if mode == :rna
97
+ hit_split = lA[1].chomp.split("|")
98
+ hit = hit_split[0]
99
+ feature = hit_split[1]
100
+ product = hit_split[2]
101
+ end
102
+ if ! @aln_hits.has_key? key
103
+ next if lA[2].to_f < @pidentity
104
+ @aln_hits[key] = {
105
+ pId: lA[2].to_f.round(2),
106
+ evalue: lA[10],
107
+ score: lA[11].to_f,
108
+ hits: [hit],
109
+ product: [product],
110
+ length: [lA[3].to_i],
111
+ query_location: [[lA[6].to_i,lA[7].to_i]],
112
+ subject_location: [[lA[8].to_i,lA[9].to_i]],
113
+ feature: [feature]
114
+ }
115
+ elsif lA[11].to_f > @aln_hits[key][:score]
116
+ @aln_hits[key] = {
117
+ pId: lA[2].to_f.round(2),
118
+ evalue: lA[10],
119
+ score: lA[11].to_f,
120
+ hits: [hit],
121
+ product: [product],
122
+ length: [lA[3].to_i],
123
+ query_location: [[lA[6].to_i,lA[7].to_i]],
124
+ subject_location: [[lA[8].to_i,lA[9].to_i]],
125
+ feature: [feature]
126
+ }
127
+ elsif lA[11].to_f == @aln_hits[key][:score]
128
+ @aln_hits[key][:hits] << hit
129
+ @aln_hits[key][:length] << lA[3].to_i
130
+ @aln_hits[key][:query_location] << [lA[6].to_i,lA[7].to_i]
131
+ @aln_hits[key][:subject_location] << [lA[8].to_i,lA[9].to_i]
132
+ @aln_hits[key][:feature] << feature
133
+ @aln_hits[key][:product] << product
134
+ end
135
+ end
136
+ end
77
137
 
78
- # Get the annotations for a contig for RerenceGenome
79
- def get_annotation_for_contig prots_to_annotate, ref_cds
138
+ prune_aln_hits @aln_hits
139
+
140
+ end # end of method
80
141
 
81
- return {} if prots_to_annotate == nil
82
142
 
83
- contig_to_annotate = prots_to_annotate[0].split("_")[0..-2].join("_")
143
+ # Get the annotations for a contig for RerenceGenome
144
+ def get_annotation_for_contig contig_to_annotate, prots_to_annotate=nil, ref_cds=nil
145
+
84
146
  annotations = {}
85
- prots = []
86
147
 
87
- @aln_hits.each_key do |k|
88
- contig = k.split("_")[0..-2].join("_")
89
- if contig == contig_to_annotate
90
- prots << k
148
+ if prots_to_annotate != nil
149
+
150
+ # contig_to_annotate = prots_to_annotate[0].split("_")[0..-2].join("_")
151
+ prots = []
152
+
153
+ @aln_hits.each_key do |k|
154
+ contig = k.split("_")[0..-2].join("_")
155
+ if contig == contig_to_annotate
156
+ prots << k
157
+ end
91
158
  end
92
- end
93
159
 
94
- # sorting the prot by their appearance in the contig
95
- prots.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
160
+ # sorting the prot by their appearance in the contig
161
+ prots.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
96
162
 
97
- i = 0
98
- prots_to_annotate.each do |p|
163
+ i = 0
164
+ prots_to_annotate.each do |p|
99
165
 
100
- if @aln_hits.has_key? p
166
+ if @aln_hits.has_key? p
101
167
 
102
- hit_index = 0
168
+ hit_index = 0
103
169
 
104
- if @aln_hits[p][:hits].length > 1
105
- hit_index = choose_best_hit i, prots, ref_cds
106
- end
170
+ if @aln_hits[p][:hits].length > 1
171
+ hit_index = choose_best_hit i, prots, ref_cds
172
+ end
107
173
 
108
- h = @aln_hits[p][:hits][hit_index]
109
- hit = ref_cds[h]
110
- annotations[p] = hit
111
- annotations[p][:pId] = @aln_hits[p][:pId]
112
- annotations[p][:length] = @aln_hits[p][:length][hit_index]
113
- i+=1
174
+ h = @aln_hits[p][:hits][hit_index]
175
+ hit = ref_cds[h]
176
+ annotations[p] = hit
177
+ annotations[p][:pId] = @aln_hits[p][:pId]
178
+ annotations[p][:length] = @aln_hits[p][:length][hit_index]
179
+ i+=1
114
180
 
115
- else
181
+ else
182
+
183
+ annotations[p] = nil
184
+
185
+ end
116
186
 
117
- annotations[p] = nil
187
+ end
188
+
189
+ elsif ! @aln_hits.empty?
118
190
 
191
+ @aln_hits.each_key do |k|
192
+ contig = k.split("_")[0..-3].join("_")
193
+ if contig == contig_to_annotate
194
+ annotations[k] = @aln_hits[k]
195
+ end
119
196
  end
120
197
 
121
198
  end
@@ -192,6 +269,64 @@ class SyntenyManip
192
269
 
193
270
  end # end of method
194
271
 
272
+ def prune_aln_hits aln_hits
273
+
274
+ # @aln_hits[key] = {
275
+ # pId: lA[2].to_f.round(2),
276
+ # evalue: lA[10],
277
+ # score: lA[11].to_f,
278
+ # hits: [hit],
279
+ # length: [lA[3].to_i],
280
+ # query_location: [[lA[6].to_i,lA[7].to_i]],
281
+ # subject_location: [[lA[8].to_i,lA[9].to_i]],
282
+ # feature: [feature]
283
+ # }
284
+
285
+ keys_to_delete = []
286
+
287
+ aln_hits.each do |key1,val1|
288
+
289
+ aln_hits.each do |key2,val2|
290
+
291
+ next if key1==key2
292
+ next if keys_to_delete.include? key1
293
+ next if keys_to_delete.include? key2
294
+
295
+ if val1[:query_location][0][0] >= val2[:query_location][0][0] and
296
+ val1[:query_location][0][0] < val2[:query_location][0][1]
297
+ overlap_len = val2[:query_location][0][1] - val1[:query_location][0][0]
298
+ val1_len = val1[:query_location][0][1]-val1[:query_location][0][0]
299
+ val2_len = val2[:query_location][0][1]-val2[:query_location][0][0]
300
+ if overlap_len.to_f/val1_len > 0.2 and overlap_len.to_f/val2_len > 0.2
301
+ if val1[:score] < val2[:score]
302
+ keys_to_delete << key1
303
+ else
304
+ keys_to_delete << key2
305
+ end
306
+ end
307
+ elsif val2[:query_location][0][0] >= val1[:query_location][0][0] and
308
+ val2[:query_location][0][0] < val1[:query_location][0][1]
309
+ overlap_len = val1[:query_location][0][1] - val2[:query_location][0][0]
310
+ val1_len = val1[:query_location][0][1]-val1[:query_location][0][0]
311
+ val2_len = val2[:query_location][0][1]-val2[:query_location][0][0]
312
+ if overlap_len.to_f/val1_len > 0.2 and overlap_len.to_f/val2_len > 0.2
313
+ if val1[:score] < val2[:score]
314
+ keys_to_delete << key1
315
+ else
316
+ keys_to_delete << key2
317
+ end
318
+ end
319
+ end
320
+
321
+ end
322
+
323
+ end
324
+
325
+ keys_to_delete.each do |k|
326
+ aln_hits.delete(k)
327
+ end
328
+
329
+ end # end of method
195
330
 
196
331
 
197
332
  end # end of class
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -33,7 +33,7 @@ class BacterialAnnotator
33
33
  abort "Output directory already exist ! Choose another one or use -f to overwrite"
34
34
  else
35
35
  puts "Overwriting output directory #{@outdir}"
36
- FileUtils.remove_dir(@outdir, force=true)
36
+ FileUtils.remove_dir(@outdir, :force=>true)
37
37
  end
38
38
  end
39
39
  Dir.mkdir(@outdir)
@@ -67,6 +67,7 @@ class BacterialAnnotator
67
67
  puts "Prodigal done."
68
68
  if @with_refence_genome
69
69
  @refgenome.write_cds_to_file @outdir
70
+ @refgenome.write_rna_to_file @outdir
70
71
  puts "Successfully loaded #{@refgenome.gbk.definition}"
71
72
  end
72
73
  end # end of method
@@ -77,10 +78,11 @@ class BacterialAnnotator
77
78
  # process reference genome synteny
78
79
  if @with_refence_genome # Annotation with the Reference Genome
79
80
 
80
- @prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity)
81
- puts "\nRunning BLAT alignment with Reference Genome.."
81
+ # run CDS annotation
82
+ puts "\nRunning BLAT alignment with Reference Genome CDS.."
83
+ @prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity, "prot")
82
84
  @prot_synteny.run_blat @root, @outdir
83
- @prot_synteny.extract_hits :refgenome
85
+ @prot_synteny.extract_hits_prodigal :refgenome
84
86
 
85
87
  @fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|
86
88
 
@@ -91,9 +93,9 @@ class BacterialAnnotator
91
93
  end
92
94
 
93
95
  contig_prots = @fasta.prodigal_files[:prot_ids_by_contig][contig]
94
-
96
+ # contig_to_annotate = contig_prots[0].split("_")[0..-2].join("_")
95
97
  # contig_prot_annotations = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
96
- @contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
98
+ @contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig, contig_prots, @refgenome.coding_seq
97
99
 
98
100
  remaining_cds = cumulate_annotation_stats_reference contig, @contig_annotations[contig]
99
101
 
@@ -106,44 +108,20 @@ class BacterialAnnotator
106
108
  # dump foreign proteins to file
107
109
  foreign_cds_file = dump_cds
108
110
 
109
- # Iterate over each Ref protein and print syntheny
110
- synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
111
- synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
112
- ref_annotated = {}
113
- @contig_annotations.each do |contig,prot_annotations|
114
- prot_annotations.each do |key,prot|
115
- # p key
116
- # p prot
117
- ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
118
- end
119
- end
120
-
121
- @refgenome.coding_seq.each do |ref_k, ref_v|
122
- gene = ""
123
- coverage_ref = ""
124
- coverage_query = ""
125
- query_length = ""
126
- pId = ""
127
- if ref_annotated[ref_v[:protId]] != nil
128
- gene = ref_annotated[ref_v[:protId]][:key]
129
- coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
130
- query_length = @fasta.prodigal_files[:prot_ids_length][gene]
131
- coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
132
- pId = ref_annotated[ref_v[:protId]][:pId]
133
- end
134
-
135
- synteny_file.write(ref_v[:protId])
136
- synteny_file.write("\t"+ref_v[:locustag])
137
- synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
138
- synteny_file.write("\t"+coverage_ref.to_s)
139
- synteny_file.write("\t"+pId.to_s)
140
- synteny_file.write("\t"+gene)
141
- synteny_file.write("\t"+query_length.to_s)
142
- synteny_file.write("\t"+coverage_query.to_s)
143
- synteny_file.write("\n")
111
+ # dump reference CDS synteny to file
112
+ dump_ref_synteny_to_file
144
113
 
114
+ # run RNA annotation
115
+ puts "\nRunning BLAT alignment with Reference Genome RNA.."
116
+ @rna_synteny = SyntenyManip.new(@fasta.fasta_file, @refgenome.rna_file, "RNA-Ref", @pidentity, "dna")
117
+ @rna_synteny.run_blat @root, @outdir
118
+ @rna_synteny.extract_hits_dna :rna
119
+ @contig_annotations_rna = {}
120
+ @fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|
121
+ puts "adding rna_annotation for contig #{contig}"
122
+ @contig_annotations_rna[contig] = @rna_synteny.get_annotation_for_contig contig
123
+ p @contig_annotations_rna[contig]
145
124
  end
146
- synteny_file.close
147
125
 
148
126
  else # no reference genome
149
127
 
@@ -156,7 +134,7 @@ class BacterialAnnotator
156
134
  finish_annotation foreign_cds_file
157
135
 
158
136
  # Parse annotations to genbank files
159
- parsing_genbank_files
137
+ parse_genbank_files
160
138
 
161
139
  puts "\nPrinting Statistics.."
162
140
  print_stats "#{@outdir}/Annotation-Stats.txt"
@@ -177,7 +155,7 @@ class BacterialAnnotator
177
155
  externaldb_synteny = SyntenyManip.new(remaining_cds_file, db_file, "Prot-ExternalDB", @pidentity)
178
156
  puts "\nRunning BLAT alignment with External Database.."
179
157
  externaldb_synteny.run_blat @root, @outdir
180
- externaldb_synteny.extract_hits :externaldb
158
+ externaldb_synteny.extract_hits_prodigal :externaldb
181
159
 
182
160
  externaldb_synteny.aln_hits.each do |k,v|
183
161
  contig_of_protein = k.split("_")[0..-2].join("_")
@@ -188,15 +166,14 @@ class BacterialAnnotator
188
166
 
189
167
  hit_gi = v[:hits][0]
190
168
 
191
- note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
192
-
193
- # p v
194
- # p ref_cds[hit_gi]
169
+ # note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
170
+ note = "Protein homology (#{v[:pId]}% identity) with #{hit_gi}"
195
171
 
196
172
  if ref_cds[hit_gi][:org] != ""
197
173
  note += " from #{ref_cds[hit_gi][:org]}"
198
174
  end
199
175
  @contig_annotations[contig_of_protein][k] = {product: ref_cds[hit_gi][:product],
176
+ feature: "cds",
200
177
  gene: nil,
201
178
  locustag: nil,
202
179
  note: note}
@@ -237,17 +214,16 @@ class BacterialAnnotator
237
214
  end
238
215
  ncbiblast.aln_hits.each do |k,v|
239
216
  contig_of_protein = k.split("_")[0..-2].join("_")
240
- # @contig_annotations[contig_of_protein][k][:product] = v[:hits][0][:product]
241
217
  if ! @contig_annotations.has_key? contig_of_protein
242
218
  @contig_annotations[contig_of_protein] = {}
243
219
  end
244
-
245
- note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:gi]}"
246
- # note = "correspond to gi:#{v[:hits][0][:gi]}"
220
+ # note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:gi]}"
221
+ note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:accession]}"
247
222
  if v[:hits][0][:org] != ""
248
223
  note += " from #{v[:hits][0][:org]}"
249
224
  end
250
225
  @contig_annotations[contig_of_protein][k] = {product: v[:hits][0][:product],
226
+ feature: "cds",
251
227
  gene: nil,
252
228
  locustag: nil,
253
229
  note: note}
@@ -263,7 +239,7 @@ class BacterialAnnotator
263
239
 
264
240
 
265
241
  # parse all genbank files
266
- def parsing_genbank_files
242
+ def parse_genbank_files
267
243
 
268
244
  puts "\nParsing annotation into genbank files.."
269
245
  @contig_annotations.each do |contig, contig_prot_annotations|
@@ -271,7 +247,15 @@ class BacterialAnnotator
271
247
  gbk_to_annotate = GenbankManip.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
272
248
  reference_locus = nil
273
249
  reference_locus = @refgenome.gbk.locus if @with_refence_genome
274
- gbk_to_annotate.add_annotation contig_prot_annotations, gbk_path, 0, reference_locus
250
+ gbk_to_annotate.add_annotations contig_prot_annotations, "inplace", reference_locus
251
+
252
+ if @contig_annotations_rna.has_key? contig
253
+ puts "Trying RNA annotation"
254
+ gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
255
+ end
256
+
257
+ gbk_to_annotate.save_genbank_to_file gbk_path
258
+
275
259
  end
276
260
 
277
261
  end # end of method
@@ -314,6 +298,7 @@ class BacterialAnnotator
314
298
  p_cds_annotated = @annotation_stats[:annotated_cds].to_f/@annotation_stats[:total_cds].to_f
315
299
 
316
300
  File.open(file, "w") do |fopen|
301
+
317
302
  fopen.write("#Contigs annotation based on reference genomes\n")
318
303
  fopen.write("Short Contigs (< #{@minlength}) :\t\t" + @annotation_stats[:short_contigs].length.to_s + "\n")
319
304
  fopen.write("Foreign Contigs :\t\t" + @annotation_stats[:foreign_contigs].length.to_s + "\n")
@@ -446,6 +431,51 @@ class BacterialAnnotator
446
431
 
447
432
  end # end of method
448
433
 
449
- private :dump_cds, :split_remaining_cds_file
434
+ # will reference CDS synteny to file
435
+ def dump_ref_synteny_to_file
436
+
437
+ # Iterate over each Ref protein and print syntheny
438
+ synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
439
+ synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
440
+ ref_annotated = {}
441
+ @contig_annotations.each do |contig,prot_annotations|
442
+ prot_annotations.each do |key,prot|
443
+ # p key
444
+ # p prot
445
+ ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
446
+ end
447
+ end
448
+
449
+ @refgenome.coding_seq.each do |ref_k, ref_v|
450
+
451
+ gene = ""
452
+ coverage_ref = ""
453
+ coverage_query = ""
454
+ query_length = ""
455
+ pId = ""
456
+ if ref_annotated[ref_v[:protId]] != nil
457
+ gene = ref_annotated[ref_v[:protId]][:key]
458
+ coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
459
+ query_length = @fasta.prodigal_files[:prot_ids_length][gene]
460
+ coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
461
+ pId = ref_annotated[ref_v[:protId]][:pId]
462
+ end
463
+
464
+ synteny_file.write(ref_v[:protId])
465
+ synteny_file.write("\t"+ref_v[:locustag])
466
+ synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
467
+ synteny_file.write("\t"+coverage_ref.to_s)
468
+ synteny_file.write("\t"+pId.to_s)
469
+ synteny_file.write("\t"+gene)
470
+ synteny_file.write("\t"+query_length.to_s)
471
+ synteny_file.write("\t"+coverage_query.to_s)
472
+ synteny_file.write("\n")
473
+
474
+ end
475
+ synteny_file.close
476
+
477
+ end
478
+
479
+ private :dump_cds, :split_remaining_cds_file, :dump_ref_synteny_to_file
450
480
 
451
481
  end # end of class
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.7
4
+ version: 0.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-01-25 00:00:00.000000000 Z
11
+ date: 2017-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -72,7 +72,7 @@ dependencies:
72
72
  version: 1.9.0
73
73
  description: GEM to annotate bacterial genome sequence based on a reference genome
74
74
  and complete the annotation with an external database or a remote database.
75
- email: maxime@deraspe.net
75
+ email: maximilien1er@gmail.com
76
76
  executables:
77
77
  - bacterial-annotator
78
78
  - ba_prodigal