bacterial-annotator 0.3.7 → 0.3.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/ba_blat +1 -1
- data/bin/ba_mafft +1 -1
- data/bin/ba_prodigal +1 -1
- data/bin/ba_raxml +1 -1
- data/bin/bacterial-annotator +5 -5
- data/lib/bacterial-annotator/fasta-manip.rb +1 -1
- data/lib/bacterial-annotator/genbank-manip.rb +147 -54
- data/lib/bacterial-annotator/remote-ncbi.rb +1 -1
- data/lib/bacterial-annotator/synteny-manip.rb +169 -34
- data/lib/bacterial-annotator.rb +86 -56
- data/lib/bacterial-comparator.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b46c0c48f45550ea7deb1580acb72927435df345
|
4
|
+
data.tar.gz: b5511795149b832c27fc8867dc3904c9c96d9561
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ac790dd5249a74cf559912dc88ebaad614d3f934a43c297848e1bda2db146c682b11862489170ddc21683fed070509f505cdc6759530ab0e1d2f8c80d8951ac7
|
7
|
+
data.tar.gz: 0de96df50d2e3a1ad4274134969007b8efef2f1a63b24e7063c45f102c9020ba3e6c337de2214bc1a2f26ae1d33057d9ec7c6ef0ea182e23c77633b3cd8d5e0a
|
data/bin/ba_blat
CHANGED
data/bin/ba_mafft
CHANGED
data/bin/ba_prodigal
CHANGED
data/bin/ba_raxml
CHANGED
data/bin/bacterial-annotator
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# -*- coding: utf-8 -*-
|
3
3
|
# author: maxime déraspe
|
4
|
-
# email:
|
4
|
+
# email: maximilien1er@gmail.com
|
5
5
|
# review:
|
6
6
|
# date: 15-02-24
|
7
7
|
# version: 0.01
|
@@ -64,7 +64,6 @@ annotate [OPTIONS]
|
|
64
64
|
--minlength Minimum contig length for annotation [default=500]
|
65
65
|
|
66
66
|
--meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
|
67
|
-
--gff Will also generate gff annotation files [off by default]
|
68
67
|
|
69
68
|
OEM
|
70
69
|
|
@@ -92,8 +91,6 @@ def parseOptions_annotate
|
|
92
91
|
options[:outdir] = ARGV.shift
|
93
92
|
when "--force", "-f"
|
94
93
|
options[:force] = 1
|
95
|
-
when "--gff"
|
96
|
-
options[:gff] = 1
|
97
94
|
when "--minlength"
|
98
95
|
options[:minlength] = ARGV.shift
|
99
96
|
when "--pidentity"
|
@@ -192,7 +189,10 @@ def parseOptions_compare
|
|
192
189
|
end
|
193
190
|
|
194
191
|
|
195
|
-
|
192
|
+
########
|
193
|
+
# MAIN #
|
194
|
+
########
|
195
|
+
|
196
196
|
if ARGV.size > 1
|
197
197
|
|
198
198
|
ROOT = File.dirname(__FILE__)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
# author: maxime déraspe
|
3
|
-
# email:
|
3
|
+
# email: maximilien1er@gmail.com
|
4
4
|
# review:
|
5
5
|
# date: 15-02-24
|
6
6
|
# version: 0.0.1
|
@@ -10,7 +10,7 @@
|
|
10
10
|
|
11
11
|
class GenbankManip
|
12
12
|
|
13
|
-
attr_accessor :gbk, :coding_seq, :cds_file
|
13
|
+
attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
|
14
14
|
|
15
15
|
# Initialize the genbank file
|
16
16
|
def initialize gbk_file, outdir
|
@@ -58,13 +58,6 @@ class GenbankManip
|
|
58
58
|
protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
|
59
59
|
locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
|
60
60
|
|
61
|
-
# if ftH.has_key? "translation"
|
62
|
-
# pep = ftH["translation"][0] if !ftH["translation"].nil?
|
63
|
-
# else
|
64
|
-
# dna = get_DNA(ft,@bioseq)
|
65
|
-
# pep = dna.translate
|
66
|
-
# end
|
67
|
-
|
68
61
|
dna = get_DNA(ft,@bioseq)
|
69
62
|
pep = dna.translate
|
70
63
|
pepBioSeq = Bio::Sequence.auto(pep)
|
@@ -89,6 +82,49 @@ class GenbankManip
|
|
89
82
|
|
90
83
|
end
|
91
84
|
|
85
|
+
# Prepare rRNA tRNA
|
86
|
+
def get_rna
|
87
|
+
|
88
|
+
if @rna_seq == nil
|
89
|
+
|
90
|
+
@rna_seq = {}
|
91
|
+
@gbk.features do |ft|
|
92
|
+
|
93
|
+
next if ! ft.feature.to_s.include? "RNA"
|
94
|
+
|
95
|
+
ftH = ft.to_hash
|
96
|
+
loc = ft.locations
|
97
|
+
# seqBeg = loc[0].from.to_s
|
98
|
+
# seqEnd = loc[0].to.to_s
|
99
|
+
# strand = loc[0].strand.to_s
|
100
|
+
if ftH.has_key? "pseudo"
|
101
|
+
next
|
102
|
+
end
|
103
|
+
# gene = ftH["gene"] if !ftH["gene"].nil?
|
104
|
+
# protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
|
105
|
+
product = ""
|
106
|
+
product = ftH["product"][0] if !ftH["product"].nil?
|
107
|
+
locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
|
108
|
+
|
109
|
+
# puts "#{@accession}\t#{seqBeg}\t#{seqEnd}\t#{strand}\t#{protId}\t#{locustag}\t#{gene[0]}\t#{product[0]}"
|
110
|
+
dna = get_DNA(ft,@bioseq)
|
111
|
+
dnaBioSeq = Bio::Sequence.auto(dna)
|
112
|
+
|
113
|
+
@rna_seq[locustag] = {type: ft.feature.to_s,
|
114
|
+
location: loc,
|
115
|
+
locustag: locustag,
|
116
|
+
product: product,
|
117
|
+
bioseq_gene: dnaBioSeq}
|
118
|
+
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
@rna_seq
|
124
|
+
|
125
|
+
end
|
126
|
+
|
127
|
+
|
92
128
|
|
93
129
|
# Print CDS to files
|
94
130
|
# RETURN : cds_file path
|
@@ -116,79 +152,136 @@ class GenbankManip
|
|
116
152
|
|
117
153
|
end
|
118
154
|
|
155
|
+
# Print RNA to files
|
156
|
+
# RETURN : rna_file path
|
157
|
+
def write_rna_to_file outdir
|
158
|
+
|
159
|
+
rna_file = "#{@gbk.accession}.rna"
|
160
|
+
|
161
|
+
if @rna_seq == nil
|
162
|
+
get_rna
|
163
|
+
end
|
164
|
+
|
165
|
+
File.open("#{outdir}/#{rna_file}", "w") do |fwrite|
|
166
|
+
@rna_seq.each_key do |k|
|
167
|
+
seqout_dna = @rna_seq[k][:bioseq_gene].output_fasta("#{k}|#{@rna_seq[k][:type]}|#{@rna_seq[k][:product]}",60)
|
168
|
+
fwrite.write(seqout_dna)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
172
|
+
@rna_file = "#{outdir}/" + rna_file
|
173
|
+
|
174
|
+
end
|
175
|
+
|
119
176
|
|
120
177
|
# add annotation to a genbank file produced by prodigal
|
121
|
-
def
|
178
|
+
def add_annotations annotations, mode, reference_locus=nil
|
122
179
|
|
123
|
-
nb_of_added_ft = 0
|
180
|
+
# nb_of_added_ft = 0
|
124
181
|
i = 0
|
125
182
|
|
126
183
|
contig = @gbk.definition
|
127
184
|
|
128
|
-
|
129
|
-
|
185
|
+
if mode == "inplace"
|
186
|
+
|
187
|
+
# iterate through
|
188
|
+
@gbk.features.each_with_index do |cds, ft_index|
|
130
189
|
|
131
|
-
|
190
|
+
next if cds.feature != "CDS"
|
132
191
|
|
133
|
-
if mode == 0
|
134
192
|
ftArray = []
|
135
193
|
cds.qualifiers = []
|
136
|
-
else
|
137
|
-
ftArray = cds.qualifiers
|
138
|
-
end
|
139
194
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
195
|
+
i += 1
|
196
|
+
prot_id = contig+"_"+i.to_s
|
197
|
+
hit = nil
|
198
|
+
hit = annotations[prot_id] if annotations.has_key? prot_id
|
199
|
+
|
200
|
+
if hit != nil
|
201
|
+
locus, gene, product, note = nil
|
202
|
+
locus = hit[:locustag]
|
203
|
+
gene = hit[:gene]
|
204
|
+
product = hit[:product]
|
205
|
+
note = hit[:note]
|
206
|
+
pId = hit[:pId]
|
207
|
+
|
208
|
+
if gene != nil
|
209
|
+
qGene = Bio::Feature::Qualifier.new('gene', gene)
|
210
|
+
ftArray.push(qGene)
|
211
|
+
end
|
212
|
+
|
213
|
+
if product != nil
|
214
|
+
qProd = Bio::Feature::Qualifier.new('product', product)
|
215
|
+
ftArray.push(qProd)
|
216
|
+
end
|
217
|
+
|
218
|
+
# check if there is a reference genome.. reference_locus shouldn't be nil in that case
|
219
|
+
if locus != nil
|
220
|
+
qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (#{pId}% identity) from #{reference_locus.entry_id}")
|
221
|
+
ftArray.push(qNote)
|
222
|
+
end
|
223
|
+
|
224
|
+
if note != nil
|
225
|
+
qNote = Bio::Feature::Qualifier.new('note', note)
|
226
|
+
ftArray.push(qNote)
|
227
|
+
end
|
157
228
|
|
158
|
-
if product != nil
|
159
|
-
qProd = Bio::Feature::Qualifier.new('product', product)
|
160
|
-
ftArray.push(qProd)
|
161
229
|
end
|
230
|
+
cds.qualifiers = ftArray
|
162
231
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
232
|
+
end
|
233
|
+
|
234
|
+
|
235
|
+
elsif mode == "new"
|
236
|
+
|
237
|
+
sorted_annotations = annotations.sort_by { |k, v| v[:query_location][0][0] }
|
238
|
+
|
239
|
+
new_features = {}
|
240
|
+
annotations_done = {}
|
241
|
+
|
242
|
+
@gbk.features.each_with_index do |ft, ft_index|
|
243
|
+
|
244
|
+
sorted_annotations.each do |k,v|
|
245
|
+
|
246
|
+
next if annotations_done.has_key? k
|
247
|
+
|
248
|
+
if v[:query_location][0][0] < ft.locations[0].from
|
249
|
+
|
250
|
+
if v[:subject_location][0][0] > v[:subject_location][0][1]
|
251
|
+
location = "complement(#{v[:query_location][0][0]}..#{v[:query_location][0][1]})"
|
252
|
+
else
|
253
|
+
location = "#{v[:query_location][0][0]}..#{v[:query_location][0][1]}"
|
254
|
+
end
|
255
|
+
|
256
|
+
feature = Bio::Feature.new(v[:feature][0],location)
|
257
|
+
feature.qualifiers.push(Bio::Feature::Qualifier.new('product',v[:product][0])) if ! v[:product][0].nil? or v[:product][0] != ""
|
258
|
+
new_features[ft_index] = feature
|
259
|
+
annotations_done[k] = 1
|
260
|
+
break
|
261
|
+
|
262
|
+
end
|
168
263
|
|
169
|
-
if note != nil
|
170
|
-
qNote = Bio::Feature::Qualifier.new('note', note)
|
171
|
-
ftArray.push(qNote)
|
172
264
|
end
|
173
265
|
|
266
|
+
end
|
174
267
|
|
268
|
+
new_features.each do |k,v|
|
269
|
+
@gbk.features.insert(k,v)
|
175
270
|
end
|
176
|
-
cds.qualifiers = ftArray
|
177
271
|
|
178
272
|
end
|
179
273
|
|
180
|
-
|
274
|
+
end
|
275
|
+
|
276
|
+
|
277
|
+
def save_genbank_to_file outdir
|
278
|
+
|
279
|
+
File.open("#{outdir}/#{@gbk.definition}.gbk", "w") do |f|
|
181
280
|
f.write(@gbk.to_biosequence.output(:genbank))
|
182
281
|
end
|
183
282
|
|
184
|
-
# Bioruby doesn't support gff at this point
|
185
|
-
# File.open("#{outdir}/#{contig}.gff", "w") do |f|
|
186
|
-
# f.write(@gbk.to_biosequence.output(:gff))
|
187
|
-
# end
|
188
|
-
|
189
283
|
end
|
190
284
|
|
191
|
-
|
192
285
|
###################
|
193
286
|
# Private Methods #
|
194
287
|
###################
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
# author: maxime déraspe
|
3
|
-
# email:
|
3
|
+
# email: maximilien1er@gmail.com
|
4
4
|
# review:
|
5
5
|
# date: 15-02-24
|
6
6
|
# version: 0.0.1
|
@@ -12,34 +12,42 @@ class SyntenyManip
|
|
12
12
|
|
13
13
|
attr_reader :query_file, :subject_file, :aln_hits
|
14
14
|
|
15
|
-
def initialize query_file, subject_file, name, pidentity
|
15
|
+
def initialize query_file, subject_file, name, pidentity, type
|
16
16
|
@query_file = query_file
|
17
17
|
@subject_file = subject_file
|
18
18
|
@name = name
|
19
19
|
@pidentity = pidentity
|
20
20
|
@aln_file = nil
|
21
|
+
@type = type
|
21
22
|
end # end of initialize
|
22
23
|
|
23
24
|
# run blat on proteins
|
24
25
|
def run_blat root, outdir
|
25
|
-
|
26
|
+
base_cmd = "#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity}"
|
27
|
+
system("#{base_cmd} #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
|
28
|
+
if @type == "prot"
|
29
|
+
system("#{base_cmd} -prot #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
|
30
|
+
end
|
26
31
|
@aln_file = "#{outdir}/#{@name}.blat8.tsv"
|
27
32
|
# extract_hits
|
28
33
|
end # end of method
|
29
34
|
|
30
35
|
# Extract Hit from blast8 file and save it in hash
|
31
36
|
# contig-0_1 ABJ71957.1 96.92 65 2 0 1 65 1 65 9.2e-31 131.0
|
32
|
-
def
|
37
|
+
def extract_hits_prodigal mode, ref_cds=nil
|
33
38
|
|
34
39
|
@aln_hits = {}
|
40
|
+
feature = ""
|
35
41
|
File.open(@aln_file,"r") do |fread|
|
36
42
|
while l = fread.gets
|
37
43
|
lA = l.chomp!.split("\t")
|
38
44
|
key = lA[0]
|
39
45
|
if mode == :refgenome
|
40
46
|
hit = lA[1]
|
47
|
+
feature = "cds"
|
41
48
|
elsif mode == :externaldb
|
42
|
-
hit = lA[1].chomp.split("|")[
|
49
|
+
hit = lA[1].chomp.split("|")[3]
|
50
|
+
feature = "cds"
|
43
51
|
end
|
44
52
|
if ! @aln_hits.has_key? key
|
45
53
|
next if lA[2].to_f < @pidentity
|
@@ -50,7 +58,8 @@ class SyntenyManip
|
|
50
58
|
hits: [hit],
|
51
59
|
length: [lA[3].to_i],
|
52
60
|
query_location: [[lA[6].to_i,lA[7].to_i]],
|
53
|
-
subject_location: [[lA[8].to_i,lA[9].to_i]]
|
61
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
62
|
+
feature: feature
|
54
63
|
}
|
55
64
|
elsif lA[11].to_f > @aln_hits[key][:score]
|
56
65
|
@aln_hits[key] = {
|
@@ -60,7 +69,8 @@ class SyntenyManip
|
|
60
69
|
hits: [hit],
|
61
70
|
length: [lA[3].to_i],
|
62
71
|
query_location: [[lA[6].to_i,lA[7].to_i]],
|
63
|
-
subject_location: [[lA[8].to_i,lA[9].to_i]]
|
72
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
73
|
+
feature: feature
|
64
74
|
}
|
65
75
|
elsif lA[11].to_f == @aln_hits[key][:score]
|
66
76
|
@aln_hits[key][:hits] << hit
|
@@ -73,49 +83,116 @@ class SyntenyManip
|
|
73
83
|
|
74
84
|
end # end of method
|
75
85
|
|
86
|
+
# Extract Hit from blast8 file and save it in hash
|
87
|
+
# prpa PA0668.4|rRNA|23S 99.97 2891 1 0 705042 707932 1 2891 0.0e+00 5671.0
|
88
|
+
def extract_hits_dna mode
|
76
89
|
|
90
|
+
@aln_hits = {}
|
91
|
+
feature = ""
|
92
|
+
File.open(@aln_file,"r") do |fread|
|
93
|
+
while l = fread.gets
|
94
|
+
lA = l.chomp!.split("\t")
|
95
|
+
key = lA[0]+"_"+lA[6]+"_"+lA[7]
|
96
|
+
if mode == :rna
|
97
|
+
hit_split = lA[1].chomp.split("|")
|
98
|
+
hit = hit_split[0]
|
99
|
+
feature = hit_split[1]
|
100
|
+
product = hit_split[2]
|
101
|
+
end
|
102
|
+
if ! @aln_hits.has_key? key
|
103
|
+
next if lA[2].to_f < @pidentity
|
104
|
+
@aln_hits[key] = {
|
105
|
+
pId: lA[2].to_f.round(2),
|
106
|
+
evalue: lA[10],
|
107
|
+
score: lA[11].to_f,
|
108
|
+
hits: [hit],
|
109
|
+
product: [product],
|
110
|
+
length: [lA[3].to_i],
|
111
|
+
query_location: [[lA[6].to_i,lA[7].to_i]],
|
112
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
113
|
+
feature: [feature]
|
114
|
+
}
|
115
|
+
elsif lA[11].to_f > @aln_hits[key][:score]
|
116
|
+
@aln_hits[key] = {
|
117
|
+
pId: lA[2].to_f.round(2),
|
118
|
+
evalue: lA[10],
|
119
|
+
score: lA[11].to_f,
|
120
|
+
hits: [hit],
|
121
|
+
product: [product],
|
122
|
+
length: [lA[3].to_i],
|
123
|
+
query_location: [[lA[6].to_i,lA[7].to_i]],
|
124
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
125
|
+
feature: [feature]
|
126
|
+
}
|
127
|
+
elsif lA[11].to_f == @aln_hits[key][:score]
|
128
|
+
@aln_hits[key][:hits] << hit
|
129
|
+
@aln_hits[key][:length] << lA[3].to_i
|
130
|
+
@aln_hits[key][:query_location] << [lA[6].to_i,lA[7].to_i]
|
131
|
+
@aln_hits[key][:subject_location] << [lA[8].to_i,lA[9].to_i]
|
132
|
+
@aln_hits[key][:feature] << feature
|
133
|
+
@aln_hits[key][:product] << product
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
77
137
|
|
78
|
-
|
79
|
-
|
138
|
+
prune_aln_hits @aln_hits
|
139
|
+
|
140
|
+
end # end of method
|
80
141
|
|
81
|
-
return {} if prots_to_annotate == nil
|
82
142
|
|
83
|
-
|
143
|
+
# Get the annotations for a contig for ReferenceGenome
|
144
|
+
def get_annotation_for_contig contig_to_annotate, prots_to_annotate=nil, ref_cds=nil
|
145
|
+
|
84
146
|
annotations = {}
|
85
|
-
prots = []
|
86
147
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
148
|
+
if prots_to_annotate != nil
|
149
|
+
|
150
|
+
# contig_to_annotate = prots_to_annotate[0].split("_")[0..-2].join("_")
|
151
|
+
prots = []
|
152
|
+
|
153
|
+
@aln_hits.each_key do |k|
|
154
|
+
contig = k.split("_")[0..-2].join("_")
|
155
|
+
if contig == contig_to_annotate
|
156
|
+
prots << k
|
157
|
+
end
|
91
158
|
end
|
92
|
-
end
|
93
159
|
|
94
|
-
|
95
|
-
|
160
|
+
# sorting the prot by their appearance in the contig
|
161
|
+
prots.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
|
96
162
|
|
97
|
-
|
98
|
-
|
163
|
+
i = 0
|
164
|
+
prots_to_annotate.each do |p|
|
99
165
|
|
100
|
-
|
166
|
+
if @aln_hits.has_key? p
|
101
167
|
|
102
|
-
|
168
|
+
hit_index = 0
|
103
169
|
|
104
|
-
|
105
|
-
|
106
|
-
|
170
|
+
if @aln_hits[p][:hits].length > 1
|
171
|
+
hit_index = choose_best_hit i, prots, ref_cds
|
172
|
+
end
|
107
173
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
174
|
+
h = @aln_hits[p][:hits][hit_index]
|
175
|
+
hit = ref_cds[h]
|
176
|
+
annotations[p] = hit
|
177
|
+
annotations[p][:pId] = @aln_hits[p][:pId]
|
178
|
+
annotations[p][:length] = @aln_hits[p][:length][hit_index]
|
179
|
+
i+=1
|
114
180
|
|
115
|
-
|
181
|
+
else
|
182
|
+
|
183
|
+
annotations[p] = nil
|
184
|
+
|
185
|
+
end
|
116
186
|
|
117
|
-
|
187
|
+
end
|
188
|
+
|
189
|
+
elsif ! @aln_hits.empty?
|
118
190
|
|
191
|
+
@aln_hits.each_key do |k|
|
192
|
+
contig = k.split("_")[0..-3].join("_")
|
193
|
+
if contig == contig_to_annotate
|
194
|
+
annotations[k] = @aln_hits[k]
|
195
|
+
end
|
119
196
|
end
|
120
197
|
|
121
198
|
end
|
@@ -192,6 +269,64 @@ class SyntenyManip
|
|
192
269
|
|
193
270
|
end # end of method
|
194
271
|
|
272
|
+
def prune_aln_hits aln_hits
|
273
|
+
|
274
|
+
# @aln_hits[key] = {
|
275
|
+
# pId: lA[2].to_f.round(2),
|
276
|
+
# evalue: lA[10],
|
277
|
+
# score: lA[11].to_f,
|
278
|
+
# hits: [hit],
|
279
|
+
# length: [lA[3].to_i],
|
280
|
+
# query_location: [[lA[6].to_i,lA[7].to_i]],
|
281
|
+
# subject_location: [[lA[8].to_i,lA[9].to_i]],
|
282
|
+
# feature: [feature]
|
283
|
+
# }
|
284
|
+
|
285
|
+
keys_to_delete = []
|
286
|
+
|
287
|
+
aln_hits.each do |key1,val1|
|
288
|
+
|
289
|
+
aln_hits.each do |key2,val2|
|
290
|
+
|
291
|
+
next if key1==key2
|
292
|
+
next if keys_to_delete.include? key1
|
293
|
+
next if keys_to_delete.include? key2
|
294
|
+
|
295
|
+
if val1[:query_location][0][0] >= val2[:query_location][0][0] and
|
296
|
+
val1[:query_location][0][0] < val2[:query_location][0][1]
|
297
|
+
overlap_len = val2[:query_location][0][1] - val1[:query_location][0][0]
|
298
|
+
val1_len = val1[:query_location][0][1]-val1[:query_location][0][0]
|
299
|
+
val2_len = val2[:query_location][0][1]-val2[:query_location][0][0]
|
300
|
+
if overlap_len.to_f/val1_len > 0.2 and overlap_len.to_f/val2_len > 0.2
|
301
|
+
if val1[:score] < val2[:score]
|
302
|
+
keys_to_delete << key1
|
303
|
+
else
|
304
|
+
keys_to_delete << key2
|
305
|
+
end
|
306
|
+
end
|
307
|
+
elsif val2[:query_location][0][0] >= val1[:query_location][0][0] and
|
308
|
+
val2[:query_location][0][0] < val1[:query_location][0][1]
|
309
|
+
overlap_len = val1[:query_location][0][1] - val2[:query_location][0][0]
|
310
|
+
val1_len = val1[:query_location][0][1]-val1[:query_location][0][0]
|
311
|
+
val2_len = val2[:query_location][0][1]-val2[:query_location][0][0]
|
312
|
+
if overlap_len.to_f/val1_len > 0.2 and overlap_len.to_f/val2_len > 0.2
|
313
|
+
if val1[:score] < val2[:score]
|
314
|
+
keys_to_delete << key1
|
315
|
+
else
|
316
|
+
keys_to_delete << key2
|
317
|
+
end
|
318
|
+
end
|
319
|
+
end
|
320
|
+
|
321
|
+
end
|
322
|
+
|
323
|
+
end
|
324
|
+
|
325
|
+
keys_to_delete.each do |k|
|
326
|
+
aln_hits.delete(k)
|
327
|
+
end
|
328
|
+
|
329
|
+
end # end of method
|
195
330
|
|
196
331
|
|
197
332
|
end # end of class
|
data/lib/bacterial-annotator.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
# author: maxime déraspe
|
3
|
-
# email:
|
3
|
+
# email: maximilien1er@gmail.com
|
4
4
|
# review:
|
5
5
|
# date: 15-02-24
|
6
6
|
# version: 0.0.1
|
@@ -33,7 +33,7 @@ class BacterialAnnotator
|
|
33
33
|
abort "Output directory already exist ! Choose another one or use -f to overwrite"
|
34
34
|
else
|
35
35
|
puts "Overwriting output directory #{@outdir}"
|
36
|
-
FileUtils.remove_dir(@outdir, force
|
36
|
+
FileUtils.remove_dir(@outdir, :force=>true)
|
37
37
|
end
|
38
38
|
end
|
39
39
|
Dir.mkdir(@outdir)
|
@@ -67,6 +67,7 @@ class BacterialAnnotator
|
|
67
67
|
puts "Prodigal done."
|
68
68
|
if @with_refence_genome
|
69
69
|
@refgenome.write_cds_to_file @outdir
|
70
|
+
@refgenome.write_rna_to_file @outdir
|
70
71
|
puts "Successfully loaded #{@refgenome.gbk.definition}"
|
71
72
|
end
|
72
73
|
end # end of method
|
@@ -77,10 +78,11 @@ class BacterialAnnotator
|
|
77
78
|
# process reference genome synteny
|
78
79
|
if @with_refence_genome # Annotation with the Reference Genome
|
79
80
|
|
80
|
-
|
81
|
-
puts "\nRunning BLAT alignment with Reference Genome.."
|
81
|
+
# run CDS annotation
|
82
|
+
puts "\nRunning BLAT alignment with Reference Genome CDS.."
|
83
|
+
@prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity, "prot")
|
82
84
|
@prot_synteny.run_blat @root, @outdir
|
83
|
-
@prot_synteny.
|
85
|
+
@prot_synteny.extract_hits_prodigal :refgenome
|
84
86
|
|
85
87
|
@fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|
|
86
88
|
|
@@ -91,9 +93,9 @@ class BacterialAnnotator
|
|
91
93
|
end
|
92
94
|
|
93
95
|
contig_prots = @fasta.prodigal_files[:prot_ids_by_contig][contig]
|
94
|
-
|
96
|
+
# contig_to_annotate = contig_prots[0].split("_")[0..-2].join("_")
|
95
97
|
# contig_prot_annotations = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
|
96
|
-
@contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
|
98
|
+
@contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig, contig_prots, @refgenome.coding_seq
|
97
99
|
|
98
100
|
remaining_cds = cumulate_annotation_stats_reference contig, @contig_annotations[contig]
|
99
101
|
|
@@ -106,44 +108,20 @@ class BacterialAnnotator
|
|
106
108
|
# dump foreign proteins to file
|
107
109
|
foreign_cds_file = dump_cds
|
108
110
|
|
109
|
-
#
|
110
|
-
|
111
|
-
synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
|
112
|
-
ref_annotated = {}
|
113
|
-
@contig_annotations.each do |contig,prot_annotations|
|
114
|
-
prot_annotations.each do |key,prot|
|
115
|
-
# p key
|
116
|
-
# p prot
|
117
|
-
ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
@refgenome.coding_seq.each do |ref_k, ref_v|
|
122
|
-
gene = ""
|
123
|
-
coverage_ref = ""
|
124
|
-
coverage_query = ""
|
125
|
-
query_length = ""
|
126
|
-
pId = ""
|
127
|
-
if ref_annotated[ref_v[:protId]] != nil
|
128
|
-
gene = ref_annotated[ref_v[:protId]][:key]
|
129
|
-
coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
|
130
|
-
query_length = @fasta.prodigal_files[:prot_ids_length][gene]
|
131
|
-
coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
|
132
|
-
pId = ref_annotated[ref_v[:protId]][:pId]
|
133
|
-
end
|
134
|
-
|
135
|
-
synteny_file.write(ref_v[:protId])
|
136
|
-
synteny_file.write("\t"+ref_v[:locustag])
|
137
|
-
synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
|
138
|
-
synteny_file.write("\t"+coverage_ref.to_s)
|
139
|
-
synteny_file.write("\t"+pId.to_s)
|
140
|
-
synteny_file.write("\t"+gene)
|
141
|
-
synteny_file.write("\t"+query_length.to_s)
|
142
|
-
synteny_file.write("\t"+coverage_query.to_s)
|
143
|
-
synteny_file.write("\n")
|
111
|
+
# dump reference CDS synteny to file
|
112
|
+
dump_ref_synteny_to_file
|
144
113
|
|
114
|
+
# run RNA annotation
|
115
|
+
puts "\nRunning BLAT alignment with Reference Genome RNA.."
|
116
|
+
@rna_synteny = SyntenyManip.new(@fasta.fasta_file, @refgenome.rna_file, "RNA-Ref", @pidentity, "dna")
|
117
|
+
@rna_synteny.run_blat @root, @outdir
|
118
|
+
@rna_synteny.extract_hits_dna :rna
|
119
|
+
@contig_annotations_rna = {}
|
120
|
+
@fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|
|
121
|
+
puts "adding rna_annotation for contig #{contig}"
|
122
|
+
@contig_annotations_rna[contig] = @rna_synteny.get_annotation_for_contig contig
|
123
|
+
p @contig_annotations_rna[contig]
|
145
124
|
end
|
146
|
-
synteny_file.close
|
147
125
|
|
148
126
|
else # no reference genome
|
149
127
|
|
@@ -156,7 +134,7 @@ class BacterialAnnotator
|
|
156
134
|
finish_annotation foreign_cds_file
|
157
135
|
|
158
136
|
# Parse annotations to genbank files
|
159
|
-
|
137
|
+
parse_genbank_files
|
160
138
|
|
161
139
|
puts "\nPrinting Statistics.."
|
162
140
|
print_stats "#{@outdir}/Annotation-Stats.txt"
|
@@ -177,7 +155,7 @@ class BacterialAnnotator
|
|
177
155
|
externaldb_synteny = SyntenyManip.new(remaining_cds_file, db_file, "Prot-ExternalDB", @pidentity)
|
178
156
|
puts "\nRunning BLAT alignment with External Database.."
|
179
157
|
externaldb_synteny.run_blat @root, @outdir
|
180
|
-
externaldb_synteny.
|
158
|
+
externaldb_synteny.extract_hits_prodigal :externaldb
|
181
159
|
|
182
160
|
externaldb_synteny.aln_hits.each do |k,v|
|
183
161
|
contig_of_protein = k.split("_")[0..-2].join("_")
|
@@ -188,15 +166,14 @@ class BacterialAnnotator
|
|
188
166
|
|
189
167
|
hit_gi = v[:hits][0]
|
190
168
|
|
191
|
-
note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
|
192
|
-
|
193
|
-
# p v
|
194
|
-
# p ref_cds[hit_gi]
|
169
|
+
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
|
170
|
+
note = "Protein homology (#{v[:pId]}% identity) with #{hit_gi}"
|
195
171
|
|
196
172
|
if ref_cds[hit_gi][:org] != ""
|
197
173
|
note += " from #{ref_cds[hit_gi][:org]}"
|
198
174
|
end
|
199
175
|
@contig_annotations[contig_of_protein][k] = {product: ref_cds[hit_gi][:product],
|
176
|
+
feature: "cds",
|
200
177
|
gene: nil,
|
201
178
|
locustag: nil,
|
202
179
|
note: note}
|
@@ -237,17 +214,16 @@ class BacterialAnnotator
|
|
237
214
|
end
|
238
215
|
ncbiblast.aln_hits.each do |k,v|
|
239
216
|
contig_of_protein = k.split("_")[0..-2].join("_")
|
240
|
-
# @contig_annotations[contig_of_protein][k][:product] = v[:hits][0][:product]
|
241
217
|
if ! @contig_annotations.has_key? contig_of_protein
|
242
218
|
@contig_annotations[contig_of_protein] = {}
|
243
219
|
end
|
244
|
-
|
245
|
-
note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:
|
246
|
-
# note = "correspond to gi:#{v[:hits][0][:gi]}"
|
220
|
+
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:gi]}"
|
221
|
+
note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:accession]}"
|
247
222
|
if v[:hits][0][:org] != ""
|
248
223
|
note += " from #{v[:hits][0][:org]}"
|
249
224
|
end
|
250
225
|
@contig_annotations[contig_of_protein][k] = {product: v[:hits][0][:product],
|
226
|
+
feature: "cds",
|
251
227
|
gene: nil,
|
252
228
|
locustag: nil,
|
253
229
|
note: note}
|
@@ -263,7 +239,7 @@ class BacterialAnnotator
|
|
263
239
|
|
264
240
|
|
265
241
|
# parse all genbank files
|
266
|
-
def
|
242
|
+
def parse_genbank_files
|
267
243
|
|
268
244
|
puts "\nParsing annotation into genbank files.."
|
269
245
|
@contig_annotations.each do |contig, contig_prot_annotations|
|
@@ -271,7 +247,15 @@ class BacterialAnnotator
|
|
271
247
|
gbk_to_annotate = GenbankManip.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
|
272
248
|
reference_locus = nil
|
273
249
|
reference_locus = @refgenome.gbk.locus if @with_refence_genome
|
274
|
-
gbk_to_annotate.
|
250
|
+
gbk_to_annotate.add_annotations contig_prot_annotations, "inplace", reference_locus
|
251
|
+
|
252
|
+
if @contig_annotations_rna.has_key? contig
|
253
|
+
puts "Trying RNA annotation"
|
254
|
+
gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
|
255
|
+
end
|
256
|
+
|
257
|
+
gbk_to_annotate.save_genbank_to_file gbk_path
|
258
|
+
|
275
259
|
end
|
276
260
|
|
277
261
|
end # end of method
|
@@ -314,6 +298,7 @@ class BacterialAnnotator
|
|
314
298
|
p_cds_annotated = @annotation_stats[:annotated_cds].to_f/@annotation_stats[:total_cds].to_f
|
315
299
|
|
316
300
|
File.open(file, "w") do |fopen|
|
301
|
+
|
317
302
|
fopen.write("#Contigs annotation based on reference genomes\n")
|
318
303
|
fopen.write("Short Contigs (< #{@minlength}) :\t\t" + @annotation_stats[:short_contigs].length.to_s + "\n")
|
319
304
|
fopen.write("Foreign Contigs :\t\t" + @annotation_stats[:foreign_contigs].length.to_s + "\n")
|
@@ -446,6 +431,51 @@ class BacterialAnnotator
|
|
446
431
|
|
447
432
|
end # end of method
|
448
433
|
|
449
|
-
|
434
|
+
# will dump reference CDS synteny to file
|
435
|
+
def dump_ref_synteny_to_file
|
436
|
+
|
437
|
+
# Iterate over each Ref protein and print synteny
|
438
|
+
synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
|
439
|
+
synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
|
440
|
+
ref_annotated = {}
|
441
|
+
@contig_annotations.each do |contig,prot_annotations|
|
442
|
+
prot_annotations.each do |key,prot|
|
443
|
+
# p key
|
444
|
+
# p prot
|
445
|
+
ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
|
446
|
+
end
|
447
|
+
end
|
448
|
+
|
449
|
+
@refgenome.coding_seq.each do |ref_k, ref_v|
|
450
|
+
|
451
|
+
gene = ""
|
452
|
+
coverage_ref = ""
|
453
|
+
coverage_query = ""
|
454
|
+
query_length = ""
|
455
|
+
pId = ""
|
456
|
+
if ref_annotated[ref_v[:protId]] != nil
|
457
|
+
gene = ref_annotated[ref_v[:protId]][:key]
|
458
|
+
coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
|
459
|
+
query_length = @fasta.prodigal_files[:prot_ids_length][gene]
|
460
|
+
coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
|
461
|
+
pId = ref_annotated[ref_v[:protId]][:pId]
|
462
|
+
end
|
463
|
+
|
464
|
+
synteny_file.write(ref_v[:protId])
|
465
|
+
synteny_file.write("\t"+ref_v[:locustag])
|
466
|
+
synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
|
467
|
+
synteny_file.write("\t"+coverage_ref.to_s)
|
468
|
+
synteny_file.write("\t"+pId.to_s)
|
469
|
+
synteny_file.write("\t"+gene)
|
470
|
+
synteny_file.write("\t"+query_length.to_s)
|
471
|
+
synteny_file.write("\t"+coverage_query.to_s)
|
472
|
+
synteny_file.write("\n")
|
473
|
+
|
474
|
+
end
|
475
|
+
synteny_file.close
|
476
|
+
|
477
|
+
end
|
478
|
+
|
479
|
+
private :dump_cds, :split_remaining_cds_file, :dump_ref_synteny_to_file
|
450
480
|
|
451
481
|
end # end of class
|
data/lib/bacterial-comparator.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bacterial-annotator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.7
|
4
|
+
version: 0.3.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maxime Deraspe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -72,7 +72,7 @@ dependencies:
|
|
72
72
|
version: 1.9.0
|
73
73
|
description: GEM to annotate bacterial genome sequence based on a reference genome
|
74
74
|
and complete the annotation with an external database or a remote database.
|
75
|
-
email:
|
75
|
+
email: maximilien1er@gmail.com
|
76
76
|
executables:
|
77
77
|
- bacterial-annotator
|
78
78
|
- ba_prodigal
|