bacterial-annotator 0.3.7 → 0.3.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0f8c89d68e4afea5b49e88e68159f573b8fe44c1
4
- data.tar.gz: a0742812f0e42a3346b63c79f590df774fb5e16c
3
+ metadata.gz: b46c0c48f45550ea7deb1580acb72927435df345
4
+ data.tar.gz: b5511795149b832c27fc8867dc3904c9c96d9561
5
5
  SHA512:
6
- metadata.gz: 0f8d59637356bc752bea3271d9f9d733eee83699928312becf7fbbf319ffc482e976c5dc7927f5ba300aa1b263b8909c63b8d17afb2e0c3d4a0b9497c5c49f80
7
- data.tar.gz: 55857b474d7e88373d295f3bf8ea7d9d99d431b8192e8f24d63dec012a511725bad7f1e43edc6e7b1259688c9c84cc868d08c3add2b2e082efca085cf2636489
6
+ metadata.gz: ac790dd5249a74cf559912dc88ebaad614d3f934a43c297848e1bda2db146c682b11862489170ddc21683fed070509f505cdc6759530ab0e1d2f8c80d8951ac7
7
+ data.tar.gz: 0de96df50d2e3a1ad4274134969007b8efef2f1a63b24e7063c45f102c9020ba3e6c337de2214bc1a2f26ae1d33057d9ec7c6ef0ea182e23c77633b3cd8d5e0a
data/bin/ba_blat CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
data/bin/ba_mafft CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
data/bin/ba_prodigal CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
data/bin/ba_raxml CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
  # -*- coding: utf-8 -*-
3
3
  # author: maxime déraspe
4
- # email: maxime@deraspe.net
4
+ # email: maximilien1er@gmail.com
5
5
  # review:
6
6
  # date: 15-02-24
7
7
  # version: 0.01
@@ -64,7 +64,6 @@ annotate [OPTIONS]
64
64
  --minlength Minimum contig length for annotation [default=500]
65
65
 
66
66
  --meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
67
- --gff Will also generate gff annotation files [off by default]
68
67
 
69
68
  OEM
70
69
 
@@ -92,8 +91,6 @@ def parseOptions_annotate
92
91
  options[:outdir] = ARGV.shift
93
92
  when "--force", "-f"
94
93
  options[:force] = 1
95
- when "--gff"
96
- options[:gff] = 1
97
94
  when "--minlength"
98
95
  options[:minlength] = ARGV.shift
99
96
  when "--pidentity"
@@ -192,7 +189,10 @@ def parseOptions_compare
192
189
  end
193
190
 
194
191
 
195
- # Main
192
+ ########
193
+ # MAIN #
194
+ ########
195
+
196
196
  if ARGV.size > 1
197
197
 
198
198
  ROOT = File.dirname(__FILE__)
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -10,7 +10,7 @@
10
10
 
11
11
  class GenbankManip
12
12
 
13
- attr_accessor :gbk, :coding_seq, :cds_file
13
+ attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
14
14
 
15
15
  # Initialize then genbank file
16
16
  def initialize gbk_file, outdir
@@ -58,13 +58,6 @@ class GenbankManip
58
58
  protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
59
59
  locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
60
60
 
61
- # if ftH.has_key? "translation"
62
- # pep = ftH["translation"][0] if !ftH["translation"].nil?
63
- # else
64
- # dna = get_DNA(ft,@bioseq)
65
- # pep = dna.translate
66
- # end
67
-
68
61
  dna = get_DNA(ft,@bioseq)
69
62
  pep = dna.translate
70
63
  pepBioSeq = Bio::Sequence.auto(pep)
@@ -89,6 +82,49 @@ class GenbankManip
89
82
 
90
83
  end
91
84
 
85
+ # Prepare rRNA tRNA
86
+ def get_rna
87
+
88
+ if @rna_seq == nil
89
+
90
+ @rna_seq = {}
91
+ @gbk.features do |ft|
92
+
93
+ next if ! ft.feature.to_s.include? "RNA"
94
+
95
+ ftH = ft.to_hash
96
+ loc = ft.locations
97
+ # seqBeg = loc[0].from.to_s
98
+ # seqEnd = loc[0].to.to_s
99
+ # strand = loc[0].strand.to_s
100
+ if ftH.has_key? "pseudo"
101
+ next
102
+ end
103
+ # gene = ftH["gene"] if !ftH["gene"].nil?
104
+ # protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
105
+ product = ""
106
+ product = ftH["product"][0] if !ftH["product"].nil?
107
+ locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
108
+
109
+ # puts "#{@accession}\t#{seqBeg}\t#{seqEnd}\t#{strand}\t#{protId}\t#{locustag}\t#{gene[0]}\t#{product[0]}"
110
+ dna = get_DNA(ft,@bioseq)
111
+ dnaBioSeq = Bio::Sequence.auto(dna)
112
+
113
+ @rna_seq[locustag] = {type: ft.feature.to_s,
114
+ location: loc,
115
+ locustag: locustag,
116
+ product: product,
117
+ bioseq_gene: dnaBioSeq}
118
+
119
+ end
120
+
121
+ end
122
+
123
+ @rna_seq
124
+
125
+ end
126
+
127
+
92
128
 
93
129
  # Print CDS to files
94
130
  # RETURN : cds_file path
@@ -116,79 +152,136 @@ class GenbankManip
116
152
 
117
153
  end
118
154
 
155
+ # Print RNA to files
156
+ # RETURN : rna_file path
157
+ def write_rna_to_file outdir
158
+
159
+ rna_file = "#{@gbk.accession}.rna"
160
+
161
+ if @rna_seq == nil
162
+ get_rna
163
+ end
164
+
165
+ File.open("#{outdir}/#{rna_file}", "w") do |fwrite|
166
+ @rna_seq.each_key do |k|
167
+ seqout_dna = @rna_seq[k][:bioseq_gene].output_fasta("#{k}|#{@rna_seq[k][:type]}|#{@rna_seq[k][:product]}",60)
168
+ fwrite.write(seqout_dna)
169
+ end
170
+ end
171
+
172
+ @rna_file = "#{outdir}/" + rna_file
173
+
174
+ end
175
+
119
176
 
120
177
  # add annotation to a genbank file produced by prodigal
121
- def add_annotation annotations, outdir, mode, reference_locus
178
+ def add_annotations annotations, mode, reference_locus=nil
122
179
 
123
- nb_of_added_ft = 0
180
+ # nb_of_added_ft = 0
124
181
  i = 0
125
182
 
126
183
  contig = @gbk.definition
127
184
 
128
- # iterate through
129
- @gbk.features.each_with_index do |cds, ft_index|
185
+ if mode == "inplace"
186
+
187
+ # iterate through
188
+ @gbk.features.each_with_index do |cds, ft_index|
130
189
 
131
- next if cds.feature != "CDS"
190
+ next if cds.feature != "CDS"
132
191
 
133
- if mode == 0
134
192
  ftArray = []
135
193
  cds.qualifiers = []
136
- else
137
- ftArray = cds.qualifiers
138
- end
139
194
 
140
- i += 1
141
- prot_id = contig+"_"+i.to_s
142
- hit = nil
143
- hit = annotations[prot_id] if annotations.has_key? prot_id
144
-
145
- if hit != nil
146
- locus, gene, product, note = nil
147
- locus = hit[:locustag]
148
- gene = hit[:gene]
149
- product = hit[:product]
150
- note = hit[:note]
151
- pId = hit[:pId]
152
-
153
- if gene != nil
154
- qGene = Bio::Feature::Qualifier.new('gene', gene)
155
- ftArray.push(qGene)
156
- end
195
+ i += 1
196
+ prot_id = contig+"_"+i.to_s
197
+ hit = nil
198
+ hit = annotations[prot_id] if annotations.has_key? prot_id
199
+
200
+ if hit != nil
201
+ locus, gene, product, note = nil
202
+ locus = hit[:locustag]
203
+ gene = hit[:gene]
204
+ product = hit[:product]
205
+ note = hit[:note]
206
+ pId = hit[:pId]
207
+
208
+ if gene != nil
209
+ qGene = Bio::Feature::Qualifier.new('gene', gene)
210
+ ftArray.push(qGene)
211
+ end
212
+
213
+ if product != nil
214
+ qProd = Bio::Feature::Qualifier.new('product', product)
215
+ ftArray.push(qProd)
216
+ end
217
+
218
+ # check if there is a reference genome.. reference_locus shouldn't be nil in that case
219
+ if locus != nil
220
+ qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (#{pId}% identity) from #{reference_locus.entry_id}")
221
+ ftArray.push(qNote)
222
+ end
223
+
224
+ if note != nil
225
+ qNote = Bio::Feature::Qualifier.new('note', note)
226
+ ftArray.push(qNote)
227
+ end
157
228
 
158
- if product != nil
159
- qProd = Bio::Feature::Qualifier.new('product', product)
160
- ftArray.push(qProd)
161
229
  end
230
+ cds.qualifiers = ftArray
162
231
 
163
- # check if there is a reference genome.. reference_locus shouldn't be nil in that case
164
- if locus != nil
165
- qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (#{pId}% identity) from #{reference_locus.entry_id}")
166
- ftArray.push(qNote)
167
- end
232
+ end
233
+
234
+
235
+ elsif mode == "new"
236
+
237
+ sorted_annotations = annotations.sort_by { |k, v| v[:query_location][0][0] }
238
+
239
+ new_features = {}
240
+ annotations_done = {}
241
+
242
+ @gbk.features.each_with_index do |ft, ft_index|
243
+
244
+ sorted_annotations.each do |k,v|
245
+
246
+ next if annotations_done.has_key? k
247
+
248
+ if v[:query_location][0][0] < ft.locations[0].from
249
+
250
+ if v[:subject_location][0][0] > v[:subject_location][0][1]
251
+ location = "complement(#{v[:query_location][0][0]}..#{v[:query_location][0][1]})"
252
+ else
253
+ location = "#{v[:query_location][0][0]}..#{v[:query_location][0][1]}"
254
+ end
255
+
256
+ feature = Bio::Feature.new(v[:feature][0],location)
257
+ feature.qualifiers.push(Bio::Feature::Qualifier.new('product',v[:product][0])) if ! v[:product][0].nil? or v[:product][0] != ""
258
+ new_features[ft_index] = feature
259
+ annotations_done[k] = 1
260
+ break
261
+
262
+ end
168
263
 
169
- if note != nil
170
- qNote = Bio::Feature::Qualifier.new('note', note)
171
- ftArray.push(qNote)
172
264
  end
173
265
 
266
+ end
174
267
 
268
+ new_features.each do |k,v|
269
+ @gbk.features.insert(k,v)
175
270
  end
176
- cds.qualifiers = ftArray
177
271
 
178
272
  end
179
273
 
180
- File.open("#{outdir}/#{contig}.gbk", "w") do |f|
274
+ end
275
+
276
+
277
+ def save_genbank_to_file outdir
278
+
279
+ File.open("#{outdir}/#{@gbk.definition}.gbk", "w") do |f|
181
280
  f.write(@gbk.to_biosequence.output(:genbank))
182
281
  end
183
282
 
184
- # Bioruby doesn't support gff at this point
185
- # File.open("#{outdir}/#{contig}.gff", "w") do |f|
186
- # f.write(@gbk.to_biosequence.output(:gff))
187
- # end
188
-
189
283
  end
190
284
 
191
-
192
285
  ###################
193
286
  # Private Methods #
194
287
  ###################
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -12,34 +12,42 @@ class SyntenyManip
12
12
 
13
13
  attr_reader :query_file, :subject_file, :aln_hits
14
14
 
15
- def initialize query_file, subject_file, name, pidentity
15
+ def initialize query_file, subject_file, name, pidentity, type
16
16
  @query_file = query_file
17
17
  @subject_file = subject_file
18
18
  @name = name
19
19
  @pidentity = pidentity
20
20
  @aln_file = nil
21
+ @type = type
21
22
  end # end of initialize
22
23
 
23
24
  # run blat on proteins
24
25
  def run_blat root, outdir
25
- system("#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity} -prot #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
26
+ base_cmd = "#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity}"
27
+ system("#{base_cmd} #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
28
+ if @type == "prot"
29
+ system("#{base_cmd} -prot #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
30
+ end
26
31
  @aln_file = "#{outdir}/#{@name}.blat8.tsv"
27
32
  # extract_hits
28
33
  end # end of method
29
34
 
30
35
  # Extract Hit from blast8 file and save it in hash
31
36
  # contig-0_1 ABJ71957.1 96.92 65 2 0 1 65 1 65 9.2e-31 131.0
32
- def extract_hits mode
37
+ def extract_hits_prodigal mode, ref_cds=nil
33
38
 
34
39
  @aln_hits = {}
40
+ feature = ""
35
41
  File.open(@aln_file,"r") do |fread|
36
42
  while l = fread.gets
37
43
  lA = l.chomp!.split("\t")
38
44
  key = lA[0]
39
45
  if mode == :refgenome
40
46
  hit = lA[1]
47
+ feature = "cds"
41
48
  elsif mode == :externaldb
42
- hit = lA[1].chomp.split("|")[1]
49
+ hit = lA[1].chomp.split("|")[3]
50
+ feature = "cds"
43
51
  end
44
52
  if ! @aln_hits.has_key? key
45
53
  next if lA[2].to_f < @pidentity
@@ -50,7 +58,8 @@ class SyntenyManip
50
58
  hits: [hit],
51
59
  length: [lA[3].to_i],
52
60
  query_location: [[lA[6].to_i,lA[7].to_i]],
53
- subject_location: [[lA[8].to_i,lA[9].to_i]]
61
+ subject_location: [[lA[8].to_i,lA[9].to_i]],
62
+ feature: feature
54
63
  }
55
64
  elsif lA[11].to_f > @aln_hits[key][:score]
56
65
  @aln_hits[key] = {
@@ -60,7 +69,8 @@ class SyntenyManip
60
69
  hits: [hit],
61
70
  length: [lA[3].to_i],
62
71
  query_location: [[lA[6].to_i,lA[7].to_i]],
63
- subject_location: [[lA[8].to_i,lA[9].to_i]]
72
+ subject_location: [[lA[8].to_i,lA[9].to_i]],
73
+ feature: feature
64
74
  }
65
75
  elsif lA[11].to_f == @aln_hits[key][:score]
66
76
  @aln_hits[key][:hits] << hit
@@ -73,49 +83,116 @@ class SyntenyManip
73
83
 
74
84
  end # end of method
75
85
 
86
+ # Extract Hit from blast8 file and save it in hash
87
+ # prpa PA0668.4|rRNA|23S 99.97 2891 1 0 705042 707932 1 2891 0.0e+00 5671.0
88
+ def extract_hits_dna mode
76
89
 
90
+ @aln_hits = {}
91
+ feature = ""
92
+ File.open(@aln_file,"r") do |fread|
93
+ while l = fread.gets
94
+ lA = l.chomp!.split("\t")
95
+ key = lA[0]+"_"+lA[6]+"_"+lA[7]
96
+ if mode == :rna
97
+ hit_split = lA[1].chomp.split("|")
98
+ hit = hit_split[0]
99
+ feature = hit_split[1]
100
+ product = hit_split[2]
101
+ end
102
+ if ! @aln_hits.has_key? key
103
+ next if lA[2].to_f < @pidentity
104
+ @aln_hits[key] = {
105
+ pId: lA[2].to_f.round(2),
106
+ evalue: lA[10],
107
+ score: lA[11].to_f,
108
+ hits: [hit],
109
+ product: [product],
110
+ length: [lA[3].to_i],
111
+ query_location: [[lA[6].to_i,lA[7].to_i]],
112
+ subject_location: [[lA[8].to_i,lA[9].to_i]],
113
+ feature: [feature]
114
+ }
115
+ elsif lA[11].to_f > @aln_hits[key][:score]
116
+ @aln_hits[key] = {
117
+ pId: lA[2].to_f.round(2),
118
+ evalue: lA[10],
119
+ score: lA[11].to_f,
120
+ hits: [hit],
121
+ product: [product],
122
+ length: [lA[3].to_i],
123
+ query_location: [[lA[6].to_i,lA[7].to_i]],
124
+ subject_location: [[lA[8].to_i,lA[9].to_i]],
125
+ feature: [feature]
126
+ }
127
+ elsif lA[11].to_f == @aln_hits[key][:score]
128
+ @aln_hits[key][:hits] << hit
129
+ @aln_hits[key][:length] << lA[3].to_i
130
+ @aln_hits[key][:query_location] << [lA[6].to_i,lA[7].to_i]
131
+ @aln_hits[key][:subject_location] << [lA[8].to_i,lA[9].to_i]
132
+ @aln_hits[key][:feature] << feature
133
+ @aln_hits[key][:product] << product
134
+ end
135
+ end
136
+ end
77
137
 
78
- # Get the annotations for a contig for RerenceGenome
79
- def get_annotation_for_contig prots_to_annotate, ref_cds
138
+ prune_aln_hits @aln_hits
139
+
140
+ end # end of method
80
141
 
81
- return {} if prots_to_annotate == nil
82
142
 
83
- contig_to_annotate = prots_to_annotate[0].split("_")[0..-2].join("_")
143
+ # Get the annotations for a contig for RerenceGenome
144
+ def get_annotation_for_contig contig_to_annotate, prots_to_annotate=nil, ref_cds=nil
145
+
84
146
  annotations = {}
85
- prots = []
86
147
 
87
- @aln_hits.each_key do |k|
88
- contig = k.split("_")[0..-2].join("_")
89
- if contig == contig_to_annotate
90
- prots << k
148
+ if prots_to_annotate != nil
149
+
150
+ # contig_to_annotate = prots_to_annotate[0].split("_")[0..-2].join("_")
151
+ prots = []
152
+
153
+ @aln_hits.each_key do |k|
154
+ contig = k.split("_")[0..-2].join("_")
155
+ if contig == contig_to_annotate
156
+ prots << k
157
+ end
91
158
  end
92
- end
93
159
 
94
- # sorting the prot by their appearance in the contig
95
- prots.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
160
+ # sorting the prot by their appearance in the contig
161
+ prots.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
96
162
 
97
- i = 0
98
- prots_to_annotate.each do |p|
163
+ i = 0
164
+ prots_to_annotate.each do |p|
99
165
 
100
- if @aln_hits.has_key? p
166
+ if @aln_hits.has_key? p
101
167
 
102
- hit_index = 0
168
+ hit_index = 0
103
169
 
104
- if @aln_hits[p][:hits].length > 1
105
- hit_index = choose_best_hit i, prots, ref_cds
106
- end
170
+ if @aln_hits[p][:hits].length > 1
171
+ hit_index = choose_best_hit i, prots, ref_cds
172
+ end
107
173
 
108
- h = @aln_hits[p][:hits][hit_index]
109
- hit = ref_cds[h]
110
- annotations[p] = hit
111
- annotations[p][:pId] = @aln_hits[p][:pId]
112
- annotations[p][:length] = @aln_hits[p][:length][hit_index]
113
- i+=1
174
+ h = @aln_hits[p][:hits][hit_index]
175
+ hit = ref_cds[h]
176
+ annotations[p] = hit
177
+ annotations[p][:pId] = @aln_hits[p][:pId]
178
+ annotations[p][:length] = @aln_hits[p][:length][hit_index]
179
+ i+=1
114
180
 
115
- else
181
+ else
182
+
183
+ annotations[p] = nil
184
+
185
+ end
116
186
 
117
- annotations[p] = nil
187
+ end
188
+
189
+ elsif ! @aln_hits.empty?
118
190
 
191
+ @aln_hits.each_key do |k|
192
+ contig = k.split("_")[0..-3].join("_")
193
+ if contig == contig_to_annotate
194
+ annotations[k] = @aln_hits[k]
195
+ end
119
196
  end
120
197
 
121
198
  end
@@ -192,6 +269,64 @@ class SyntenyManip
192
269
 
193
270
  end # end of method
194
271
 
272
+ def prune_aln_hits aln_hits
273
+
274
+ # @aln_hits[key] = {
275
+ # pId: lA[2].to_f.round(2),
276
+ # evalue: lA[10],
277
+ # score: lA[11].to_f,
278
+ # hits: [hit],
279
+ # length: [lA[3].to_i],
280
+ # query_location: [[lA[6].to_i,lA[7].to_i]],
281
+ # subject_location: [[lA[8].to_i,lA[9].to_i]],
282
+ # feature: [feature]
283
+ # }
284
+
285
+ keys_to_delete = []
286
+
287
+ aln_hits.each do |key1,val1|
288
+
289
+ aln_hits.each do |key2,val2|
290
+
291
+ next if key1==key2
292
+ next if keys_to_delete.include? key1
293
+ next if keys_to_delete.include? key2
294
+
295
+ if val1[:query_location][0][0] >= val2[:query_location][0][0] and
296
+ val1[:query_location][0][0] < val2[:query_location][0][1]
297
+ overlap_len = val2[:query_location][0][1] - val1[:query_location][0][0]
298
+ val1_len = val1[:query_location][0][1]-val1[:query_location][0][0]
299
+ val2_len = val2[:query_location][0][1]-val2[:query_location][0][0]
300
+ if overlap_len.to_f/val1_len > 0.2 and overlap_len.to_f/val2_len > 0.2
301
+ if val1[:score] < val2[:score]
302
+ keys_to_delete << key1
303
+ else
304
+ keys_to_delete << key2
305
+ end
306
+ end
307
+ elsif val2[:query_location][0][0] >= val1[:query_location][0][0] and
308
+ val2[:query_location][0][0] < val1[:query_location][0][1]
309
+ overlap_len = val1[:query_location][0][1] - val2[:query_location][0][0]
310
+ val1_len = val1[:query_location][0][1]-val1[:query_location][0][0]
311
+ val2_len = val2[:query_location][0][1]-val2[:query_location][0][0]
312
+ if overlap_len.to_f/val1_len > 0.2 and overlap_len.to_f/val2_len > 0.2
313
+ if val1[:score] < val2[:score]
314
+ keys_to_delete << key1
315
+ else
316
+ keys_to_delete << key2
317
+ end
318
+ end
319
+ end
320
+
321
+ end
322
+
323
+ end
324
+
325
+ keys_to_delete.each do |k|
326
+ aln_hits.delete(k)
327
+ end
328
+
329
+ end # end of method
195
330
 
196
331
 
197
332
  end # end of class
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
@@ -33,7 +33,7 @@ class BacterialAnnotator
33
33
  abort "Output directory already exist ! Choose another one or use -f to overwrite"
34
34
  else
35
35
  puts "Overwriting output directory #{@outdir}"
36
- FileUtils.remove_dir(@outdir, force=true)
36
+ FileUtils.remove_dir(@outdir, :force=>true)
37
37
  end
38
38
  end
39
39
  Dir.mkdir(@outdir)
@@ -67,6 +67,7 @@ class BacterialAnnotator
67
67
  puts "Prodigal done."
68
68
  if @with_refence_genome
69
69
  @refgenome.write_cds_to_file @outdir
70
+ @refgenome.write_rna_to_file @outdir
70
71
  puts "Successfully loaded #{@refgenome.gbk.definition}"
71
72
  end
72
73
  end # end of method
@@ -77,10 +78,11 @@ class BacterialAnnotator
77
78
  # process reference genome synteny
78
79
  if @with_refence_genome # Annotation with the Reference Genome
79
80
 
80
- @prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity)
81
- puts "\nRunning BLAT alignment with Reference Genome.."
81
+ # run CDS annotation
82
+ puts "\nRunning BLAT alignment with Reference Genome CDS.."
83
+ @prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity, "prot")
82
84
  @prot_synteny.run_blat @root, @outdir
83
- @prot_synteny.extract_hits :refgenome
85
+ @prot_synteny.extract_hits_prodigal :refgenome
84
86
 
85
87
  @fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|
86
88
 
@@ -91,9 +93,9 @@ class BacterialAnnotator
91
93
  end
92
94
 
93
95
  contig_prots = @fasta.prodigal_files[:prot_ids_by_contig][contig]
94
-
96
+ # contig_to_annotate = contig_prots[0].split("_")[0..-2].join("_")
95
97
  # contig_prot_annotations = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
96
- @contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
98
+ @contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig, contig_prots, @refgenome.coding_seq
97
99
 
98
100
  remaining_cds = cumulate_annotation_stats_reference contig, @contig_annotations[contig]
99
101
 
@@ -106,44 +108,20 @@ class BacterialAnnotator
106
108
  # dump foreign proteins to file
107
109
  foreign_cds_file = dump_cds
108
110
 
109
- # Iterate over each Ref protein and print syntheny
110
- synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
111
- synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
112
- ref_annotated = {}
113
- @contig_annotations.each do |contig,prot_annotations|
114
- prot_annotations.each do |key,prot|
115
- # p key
116
- # p prot
117
- ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
118
- end
119
- end
120
-
121
- @refgenome.coding_seq.each do |ref_k, ref_v|
122
- gene = ""
123
- coverage_ref = ""
124
- coverage_query = ""
125
- query_length = ""
126
- pId = ""
127
- if ref_annotated[ref_v[:protId]] != nil
128
- gene = ref_annotated[ref_v[:protId]][:key]
129
- coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
130
- query_length = @fasta.prodigal_files[:prot_ids_length][gene]
131
- coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
132
- pId = ref_annotated[ref_v[:protId]][:pId]
133
- end
134
-
135
- synteny_file.write(ref_v[:protId])
136
- synteny_file.write("\t"+ref_v[:locustag])
137
- synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
138
- synteny_file.write("\t"+coverage_ref.to_s)
139
- synteny_file.write("\t"+pId.to_s)
140
- synteny_file.write("\t"+gene)
141
- synteny_file.write("\t"+query_length.to_s)
142
- synteny_file.write("\t"+coverage_query.to_s)
143
- synteny_file.write("\n")
111
+ # dump reference CDS synteny to file
112
+ dump_ref_synteny_to_file
144
113
 
114
+ # run RNA annotation
115
+ puts "\nRunning BLAT alignment with Reference Genome RNA.."
116
+ @rna_synteny = SyntenyManip.new(@fasta.fasta_file, @refgenome.rna_file, "RNA-Ref", @pidentity, "dna")
117
+ @rna_synteny.run_blat @root, @outdir
118
+ @rna_synteny.extract_hits_dna :rna
119
+ @contig_annotations_rna = {}
120
+ @fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|
121
+ puts "adding rna_annotation for contig #{contig}"
122
+ @contig_annotations_rna[contig] = @rna_synteny.get_annotation_for_contig contig
123
+ p @contig_annotations_rna[contig]
145
124
  end
146
- synteny_file.close
147
125
 
148
126
  else # no reference genome
149
127
 
@@ -156,7 +134,7 @@ class BacterialAnnotator
156
134
  finish_annotation foreign_cds_file
157
135
 
158
136
  # Parse annotations to genbank files
159
- parsing_genbank_files
137
+ parse_genbank_files
160
138
 
161
139
  puts "\nPrinting Statistics.."
162
140
  print_stats "#{@outdir}/Annotation-Stats.txt"
@@ -177,7 +155,7 @@ class BacterialAnnotator
177
155
  externaldb_synteny = SyntenyManip.new(remaining_cds_file, db_file, "Prot-ExternalDB", @pidentity)
178
156
  puts "\nRunning BLAT alignment with External Database.."
179
157
  externaldb_synteny.run_blat @root, @outdir
180
- externaldb_synteny.extract_hits :externaldb
158
+ externaldb_synteny.extract_hits_prodigal :externaldb
181
159
 
182
160
  externaldb_synteny.aln_hits.each do |k,v|
183
161
  contig_of_protein = k.split("_")[0..-2].join("_")
@@ -188,15 +166,14 @@ class BacterialAnnotator
188
166
 
189
167
  hit_gi = v[:hits][0]
190
168
 
191
- note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
192
-
193
- # p v
194
- # p ref_cds[hit_gi]
169
+ # note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
170
+ note = "Protein homology (#{v[:pId]}% identity) with #{hit_gi}"
195
171
 
196
172
  if ref_cds[hit_gi][:org] != ""
197
173
  note += " from #{ref_cds[hit_gi][:org]}"
198
174
  end
199
175
  @contig_annotations[contig_of_protein][k] = {product: ref_cds[hit_gi][:product],
176
+ feature: "cds",
200
177
  gene: nil,
201
178
  locustag: nil,
202
179
  note: note}
@@ -237,17 +214,16 @@ class BacterialAnnotator
237
214
  end
238
215
  ncbiblast.aln_hits.each do |k,v|
239
216
  contig_of_protein = k.split("_")[0..-2].join("_")
240
- # @contig_annotations[contig_of_protein][k][:product] = v[:hits][0][:product]
241
217
  if ! @contig_annotations.has_key? contig_of_protein
242
218
  @contig_annotations[contig_of_protein] = {}
243
219
  end
244
-
245
- note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:gi]}"
246
- # note = "correspond to gi:#{v[:hits][0][:gi]}"
220
+ # note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:gi]}"
221
+ note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:accession]}"
247
222
  if v[:hits][0][:org] != ""
248
223
  note += " from #{v[:hits][0][:org]}"
249
224
  end
250
225
  @contig_annotations[contig_of_protein][k] = {product: v[:hits][0][:product],
226
+ feature: "cds",
251
227
  gene: nil,
252
228
  locustag: nil,
253
229
  note: note}
@@ -263,7 +239,7 @@ class BacterialAnnotator
263
239
 
264
240
 
265
241
  # parse all genbank files
266
- def parsing_genbank_files
242
+ def parse_genbank_files
267
243
 
268
244
  puts "\nParsing annotation into genbank files.."
269
245
  @contig_annotations.each do |contig, contig_prot_annotations|
@@ -271,7 +247,15 @@ class BacterialAnnotator
271
247
  gbk_to_annotate = GenbankManip.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
272
248
  reference_locus = nil
273
249
  reference_locus = @refgenome.gbk.locus if @with_refence_genome
274
- gbk_to_annotate.add_annotation contig_prot_annotations, gbk_path, 0, reference_locus
250
+ gbk_to_annotate.add_annotations contig_prot_annotations, "inplace", reference_locus
251
+
252
+ if @contig_annotations_rna.has_key? contig
253
+ puts "Trying RNA annotation"
254
+ gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
255
+ end
256
+
257
+ gbk_to_annotate.save_genbank_to_file gbk_path
258
+
275
259
  end
276
260
 
277
261
  end # end of method
@@ -314,6 +298,7 @@ class BacterialAnnotator
314
298
  p_cds_annotated = @annotation_stats[:annotated_cds].to_f/@annotation_stats[:total_cds].to_f
315
299
 
316
300
  File.open(file, "w") do |fopen|
301
+
317
302
  fopen.write("#Contigs annotation based on reference genomes\n")
318
303
  fopen.write("Short Contigs (< #{@minlength}) :\t\t" + @annotation_stats[:short_contigs].length.to_s + "\n")
319
304
  fopen.write("Foreign Contigs :\t\t" + @annotation_stats[:foreign_contigs].length.to_s + "\n")
@@ -446,6 +431,51 @@ class BacterialAnnotator
446
431
 
447
432
  end # end of method
448
433
 
449
- private :dump_cds, :split_remaining_cds_file
434
+ # will reference CDS synteny to file
435
+ def dump_ref_synteny_to_file
436
+
437
+ # Iterate over each Ref protein and print syntheny
438
+ synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
439
+ synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
440
+ ref_annotated = {}
441
+ @contig_annotations.each do |contig,prot_annotations|
442
+ prot_annotations.each do |key,prot|
443
+ # p key
444
+ # p prot
445
+ ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
446
+ end
447
+ end
448
+
449
+ @refgenome.coding_seq.each do |ref_k, ref_v|
450
+
451
+ gene = ""
452
+ coverage_ref = ""
453
+ coverage_query = ""
454
+ query_length = ""
455
+ pId = ""
456
+ if ref_annotated[ref_v[:protId]] != nil
457
+ gene = ref_annotated[ref_v[:protId]][:key]
458
+ coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
459
+ query_length = @fasta.prodigal_files[:prot_ids_length][gene]
460
+ coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
461
+ pId = ref_annotated[ref_v[:protId]][:pId]
462
+ end
463
+
464
+ synteny_file.write(ref_v[:protId])
465
+ synteny_file.write("\t"+ref_v[:locustag])
466
+ synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
467
+ synteny_file.write("\t"+coverage_ref.to_s)
468
+ synteny_file.write("\t"+pId.to_s)
469
+ synteny_file.write("\t"+gene)
470
+ synteny_file.write("\t"+query_length.to_s)
471
+ synteny_file.write("\t"+coverage_query.to_s)
472
+ synteny_file.write("\n")
473
+
474
+ end
475
+ synteny_file.close
476
+
477
+ end
478
+
479
+ private :dump_cds, :split_remaining_cds_file, :dump_ref_synteny_to_file
450
480
 
451
481
  end # end of class
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # author: maxime déraspe
3
- # email: maxime@deraspe.net
3
+ # email: maximilien1er@gmail.com
4
4
  # review:
5
5
  # date: 15-02-24
6
6
  # version: 0.0.1
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.7
4
+ version: 0.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-01-25 00:00:00.000000000 Z
11
+ date: 2017-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -72,7 +72,7 @@ dependencies:
72
72
  version: 1.9.0
73
73
  description: GEM to annotate bacterial genome sequence based on a reference genome
74
74
  and complete the annotation with an external database or a remote database.
75
- email: maxime@deraspe.net
75
+ email: maximilien1er@gmail.com
76
76
  executables:
77
77
  - bacterial-annotator
78
78
  - ba_prodigal