bacterial-annotator 0.3.7 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ba_blat +1 -1
- data/bin/ba_mafft +1 -1
- data/bin/ba_prodigal +1 -1
- data/bin/ba_raxml +1 -1
- data/bin/bacterial-annotator +5 -5
- data/lib/bacterial-annotator/fasta-manip.rb +1 -1
- data/lib/bacterial-annotator/genbank-manip.rb +147 -54
- data/lib/bacterial-annotator/remote-ncbi.rb +1 -1
- data/lib/bacterial-annotator/synteny-manip.rb +169 -34
- data/lib/bacterial-annotator.rb +86 -56
- data/lib/bacterial-comparator.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b46c0c48f45550ea7deb1580acb72927435df345
|
4
|
+
data.tar.gz: b5511795149b832c27fc8867dc3904c9c96d9561
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ac790dd5249a74cf559912dc88ebaad614d3f934a43c297848e1bda2db146c682b11862489170ddc21683fed070509f505cdc6759530ab0e1d2f8c80d8951ac7
|
7
|
+
data.tar.gz: 0de96df50d2e3a1ad4274134969007b8efef2f1a63b24e7063c45f102c9020ba3e6c337de2214bc1a2f26ae1d33057d9ec7c6ef0ea182e23c77633b3cd8d5e0a
|
data/bin/ba_blat
CHANGED
data/bin/ba_mafft
CHANGED
data/bin/ba_prodigal
CHANGED
data/bin/ba_raxml
CHANGED
data/bin/bacterial-annotator
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# -*- coding: utf-8 -*-
|
3
3
|
# author: maxime déraspe
|
4
|
-
# email:
|
4
|
+
# email: maximilien1er@gmail.com
|
5
5
|
# review:
|
6
6
|
# date: 15-02-24
|
7
7
|
# version: 0.01
|
@@ -64,7 +64,6 @@ annotate [OPTIONS]
|
|
64
64
|
--minlength Minimum contig length for annotation [default=500]
|
65
65
|
|
66
66
|
--meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
|
67
|
-
--gff Will also generate gff annotation files [off by default]
|
68
67
|
|
69
68
|
OEM
|
70
69
|
|
@@ -92,8 +91,6 @@ def parseOptions_annotate
|
|
92
91
|
options[:outdir] = ARGV.shift
|
93
92
|
when "--force", "-f"
|
94
93
|
options[:force] = 1
|
95
|
-
when "--gff"
|
96
|
-
options[:gff] = 1
|
97
94
|
when "--minlength"
|
98
95
|
options[:minlength] = ARGV.shift
|
99
96
|
when "--pidentity"
|
@@ -192,7 +189,10 @@ def parseOptions_compare
|
|
192
189
|
end
|
193
190
|
|
194
191
|
|
195
|
-
|
192
|
+
########
|
193
|
+
# MAIN #
|
194
|
+
########
|
195
|
+
|
196
196
|
if ARGV.size > 1
|
197
197
|
|
198
198
|
ROOT = File.dirname(__FILE__)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
# author: maxime déraspe
|
3
|
-
# email:
|
3
|
+
# email: maximilien1er@gmail.com
|
4
4
|
# review:
|
5
5
|
# date: 15-02-24
|
6
6
|
# version: 0.0.1
|
@@ -10,7 +10,7 @@
|
|
10
10
|
|
11
11
|
class GenbankManip
|
12
12
|
|
13
|
-
attr_accessor :gbk, :coding_seq, :cds_file
|
13
|
+
attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
|
14
14
|
|
15
15
|
# Initialize the genbank file
|
16
16
|
def initialize gbk_file, outdir
|
@@ -58,13 +58,6 @@ class GenbankManip
|
|
58
58
|
protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
|
59
59
|
locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
|
60
60
|
|
61
|
-
# if ftH.has_key? "translation"
|
62
|
-
# pep = ftH["translation"][0] if !ftH["translation"].nil?
|
63
|
-
# else
|
64
|
-
# dna = get_DNA(ft,@bioseq)
|
65
|
-
# pep = dna.translate
|
66
|
-
# end
|
67
|
-
|
68
61
|
dna = get_DNA(ft,@bioseq)
|
69
62
|
pep = dna.translate
|
70
63
|
pepBioSeq = Bio::Sequence.auto(pep)
|
@@ -89,6 +82,49 @@ class GenbankManip
|
|
89
82
|
|
90
83
|
end
|
91
84
|
|
85
|
+
# Prepare rRNA tRNA
|
86
|
+
def get_rna
|
87
|
+
|
88
|
+
if @rna_seq == nil
|
89
|
+
|
90
|
+
@rna_seq = {}
|
91
|
+
@gbk.features do |ft|
|
92
|
+
|
93
|
+
next if ! ft.feature.to_s.include? "RNA"
|
94
|
+
|
95
|
+
ftH = ft.to_hash
|
96
|
+
loc = ft.locations
|
97
|
+
# seqBeg = loc[0].from.to_s
|
98
|
+
# seqEnd = loc[0].to.to_s
|
99
|
+
# strand = loc[0].strand.to_s
|
100
|
+
if ftH.has_key? "pseudo"
|
101
|
+
next
|
102
|
+
end
|
103
|
+
# gene = ftH["gene"] if !ftH["gene"].nil?
|
104
|
+
# protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
|
105
|
+
product = ""
|
106
|
+
product = ftH["product"][0] if !ftH["product"].nil?
|
107
|
+
locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
|
108
|
+
|
109
|
+
# puts "#{@accession}\t#{seqBeg}\t#{seqEnd}\t#{strand}\t#{protId}\t#{locustag}\t#{gene[0]}\t#{product[0]}"
|
110
|
+
dna = get_DNA(ft,@bioseq)
|
111
|
+
dnaBioSeq = Bio::Sequence.auto(dna)
|
112
|
+
|
113
|
+
@rna_seq[locustag] = {type: ft.feature.to_s,
|
114
|
+
location: loc,
|
115
|
+
locustag: locustag,
|
116
|
+
product: product,
|
117
|
+
bioseq_gene: dnaBioSeq}
|
118
|
+
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
@rna_seq
|
124
|
+
|
125
|
+
end
|
126
|
+
|
127
|
+
|
92
128
|
|
93
129
|
# Print CDS to files
|
94
130
|
# RETURN : cds_file path
|
@@ -116,79 +152,136 @@ class GenbankManip
|
|
116
152
|
|
117
153
|
end
|
118
154
|
|
155
|
+
# Print RNA to files
|
156
|
+
# RETURN : rna_file path
|
157
|
+
def write_rna_to_file outdir
|
158
|
+
|
159
|
+
rna_file = "#{@gbk.accession}.rna"
|
160
|
+
|
161
|
+
if @rna_seq == nil
|
162
|
+
get_rna
|
163
|
+
end
|
164
|
+
|
165
|
+
File.open("#{outdir}/#{rna_file}", "w") do |fwrite|
|
166
|
+
@rna_seq.each_key do |k|
|
167
|
+
seqout_dna = @rna_seq[k][:bioseq_gene].output_fasta("#{k}|#{@rna_seq[k][:type]}|#{@rna_seq[k][:product]}",60)
|
168
|
+
fwrite.write(seqout_dna)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
172
|
+
@rna_file = "#{outdir}/" + rna_file
|
173
|
+
|
174
|
+
end
|
175
|
+
|
119
176
|
|
120
177
|
# add annotation to a genbank file produced by prodigal
|
121
|
-
def
|
178
|
+
def add_annotations annotations, mode, reference_locus=nil
|
122
179
|
|
123
|
-
nb_of_added_ft = 0
|
180
|
+
# nb_of_added_ft = 0
|
124
181
|
i = 0
|
125
182
|
|
126
183
|
contig = @gbk.definition
|
127
184
|
|
128
|
-
|
129
|
-
|
185
|
+
if mode == "inplace"
|
186
|
+
|
187
|
+
# iterate through
|
188
|
+
@gbk.features.each_with_index do |cds, ft_index|
|
130
189
|
|
131
|
-
|
190
|
+
next if cds.feature != "CDS"
|
132
191
|
|
133
|
-
if mode == 0
|
134
192
|
ftArray = []
|
135
193
|
cds.qualifiers = []
|
136
|
-
else
|
137
|
-
ftArray = cds.qualifiers
|
138
|
-
end
|
139
194
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
195
|
+
i += 1
|
196
|
+
prot_id = contig+"_"+i.to_s
|
197
|
+
hit = nil
|
198
|
+
hit = annotations[prot_id] if annotations.has_key? prot_id
|
199
|
+
|
200
|
+
if hit != nil
|
201
|
+
locus, gene, product, note = nil
|
202
|
+
locus = hit[:locustag]
|
203
|
+
gene = hit[:gene]
|
204
|
+
product = hit[:product]
|
205
|
+
note = hit[:note]
|
206
|
+
pId = hit[:pId]
|
207
|
+
|
208
|
+
if gene != nil
|
209
|
+
qGene = Bio::Feature::Qualifier.new('gene', gene)
|
210
|
+
ftArray.push(qGene)
|
211
|
+
end
|
212
|
+
|
213
|
+
if product != nil
|
214
|
+
qProd = Bio::Feature::Qualifier.new('product', product)
|
215
|
+
ftArray.push(qProd)
|
216
|
+
end
|
217
|
+
|
218
|
+
# check if there is a reference genome.. reference_locus shouldn't be nil in that case
|
219
|
+
if locus != nil
|
220
|
+
qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (#{pId}% identity) from #{reference_locus.entry_id}")
|
221
|
+
ftArray.push(qNote)
|
222
|
+
end
|
223
|
+
|
224
|
+
if note != nil
|
225
|
+
qNote = Bio::Feature::Qualifier.new('note', note)
|
226
|
+
ftArray.push(qNote)
|
227
|
+
end
|
157
228
|
|
158
|
-
if product != nil
|
159
|
-
qProd = Bio::Feature::Qualifier.new('product', product)
|
160
|
-
ftArray.push(qProd)
|
161
229
|
end
|
230
|
+
cds.qualifiers = ftArray
|
162
231
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
232
|
+
end
|
233
|
+
|
234
|
+
|
235
|
+
elsif mode == "new"
|
236
|
+
|
237
|
+
sorted_annotations = annotations.sort_by { |k, v| v[:query_location][0][0] }
|
238
|
+
|
239
|
+
new_features = {}
|
240
|
+
annotations_done = {}
|
241
|
+
|
242
|
+
@gbk.features.each_with_index do |ft, ft_index|
|
243
|
+
|
244
|
+
sorted_annotations.each do |k,v|
|
245
|
+
|
246
|
+
next if annotations_done.has_key? k
|
247
|
+
|
248
|
+
if v[:query_location][0][0] < ft.locations[0].from
|
249
|
+
|
250
|
+
if v[:subject_location][0][0] > v[:subject_location][0][1]
|
251
|
+
location = "complement(#{v[:query_location][0][0]}..#{v[:query_location][0][1]})"
|
252
|
+
else
|
253
|
+
location = "#{v[:query_location][0][0]}..#{v[:query_location][0][1]}"
|
254
|
+
end
|
255
|
+
|
256
|
+
feature = Bio::Feature.new(v[:feature][0],location)
|
257
|
+
feature.qualifiers.push(Bio::Feature::Qualifier.new('product',v[:product][0])) if ! v[:product][0].nil? or v[:product][0] != ""
|
258
|
+
new_features[ft_index] = feature
|
259
|
+
annotations_done[k] = 1
|
260
|
+
break
|
261
|
+
|
262
|
+
end
|
168
263
|
|
169
|
-
if note != nil
|
170
|
-
qNote = Bio::Feature::Qualifier.new('note', note)
|
171
|
-
ftArray.push(qNote)
|
172
264
|
end
|
173
265
|
|
266
|
+
end
|
174
267
|
|
268
|
+
new_features.each do |k,v|
|
269
|
+
@gbk.features.insert(k,v)
|
175
270
|
end
|
176
|
-
cds.qualifiers = ftArray
|
177
271
|
|
178
272
|
end
|
179
273
|
|
180
|
-
|
274
|
+
end
|
275
|
+
|
276
|
+
|
277
|
+
def save_genbank_to_file outdir
|
278
|
+
|
279
|
+
File.open("#{outdir}/#{@gbk.definition}.gbk", "w") do |f|
|
181
280
|
f.write(@gbk.to_biosequence.output(:genbank))
|
182
281
|
end
|
183
282
|
|
184
|
-
# Bioruby doesn't support gff at this point
|
185
|
-
# File.open("#{outdir}/#{contig}.gff", "w") do |f|
|
186
|
-
# f.write(@gbk.to_biosequence.output(:gff))
|
187
|
-
# end
|
188
|
-
|
189
283
|
end
|
190
284
|
|
191
|
-
|
192
285
|
###################
|
193
286
|
# Private Methods #
|
194
287
|
###################
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
# author: maxime déraspe
|
3
|
-
# email:
|
3
|
+
# email: maximilien1er@gmail.com
|
4
4
|
# review:
|
5
5
|
# date: 15-02-24
|
6
6
|
# version: 0.0.1
|
@@ -12,34 +12,42 @@ class SyntenyManip
|
|
12
12
|
|
13
13
|
attr_reader :query_file, :subject_file, :aln_hits
|
14
14
|
|
15
|
-
def initialize query_file, subject_file, name, pidentity
|
15
|
+
def initialize query_file, subject_file, name, pidentity, type
|
16
16
|
@query_file = query_file
|
17
17
|
@subject_file = subject_file
|
18
18
|
@name = name
|
19
19
|
@pidentity = pidentity
|
20
20
|
@aln_file = nil
|
21
|
+
@type = type
|
21
22
|
end # end of initialize
|
22
23
|
|
23
24
|
# run blat on proteins
|
24
25
|
def run_blat root, outdir
|
25
|
-
|
26
|
+
base_cmd = "#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity}"
|
27
|
+
system("#{base_cmd} #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
|
28
|
+
if @type == "prot"
|
29
|
+
system("#{base_cmd} -prot #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
|
30
|
+
end
|
26
31
|
@aln_file = "#{outdir}/#{@name}.blat8.tsv"
|
27
32
|
# extract_hits
|
28
33
|
end # end of method
|
29
34
|
|
30
35
|
# Extract Hit from blast8 file and save it in hash
|
31
36
|
# contig-0_1 ABJ71957.1 96.92 65 2 0 1 65 1 65 9.2e-31 131.0
|
32
|
-
def
|
37
|
+
def extract_hits_prodigal mode, ref_cds=nil
|
33
38
|
|
34
39
|
@aln_hits = {}
|
40
|
+
feature = ""
|
35
41
|
File.open(@aln_file,"r") do |fread|
|
36
42
|
while l = fread.gets
|
37
43
|
lA = l.chomp!.split("\t")
|
38
44
|
key = lA[0]
|
39
45
|
if mode == :refgenome
|
40
46
|
hit = lA[1]
|
47
|
+
feature = "cds"
|
41
48
|
elsif mode == :externaldb
|
42
|
-
hit = lA[1].chomp.split("|")[
|
49
|
+
hit = lA[1].chomp.split("|")[3]
|
50
|
+
feature = "cds"
|
43
51
|
end
|
44
52
|
if ! @aln_hits.has_key? key
|
45
53
|
next if lA[2].to_f < @pidentity
|
@@ -50,7 +58,8 @@ class SyntenyManip
|
|
50
58
|
hits: [hit],
|
51
59
|
length: [lA[3].to_i],
|
52
60
|
query_location: [[lA[6].to_i,lA[7].to_i]],
|
53
|
-
subject_location: [[lA[8].to_i,lA[9].to_i]]
|
61
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
62
|
+
feature: feature
|
54
63
|
}
|
55
64
|
elsif lA[11].to_f > @aln_hits[key][:score]
|
56
65
|
@aln_hits[key] = {
|
@@ -60,7 +69,8 @@ class SyntenyManip
|
|
60
69
|
hits: [hit],
|
61
70
|
length: [lA[3].to_i],
|
62
71
|
query_location: [[lA[6].to_i,lA[7].to_i]],
|
63
|
-
subject_location: [[lA[8].to_i,lA[9].to_i]]
|
72
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
73
|
+
feature: feature
|
64
74
|
}
|
65
75
|
elsif lA[11].to_f == @aln_hits[key][:score]
|
66
76
|
@aln_hits[key][:hits] << hit
|
@@ -73,49 +83,116 @@ class SyntenyManip
|
|
73
83
|
|
74
84
|
end # end of method
|
75
85
|
|
86
|
+
# Extract Hit from blast8 file and save it in hash
|
87
|
+
# prpa PA0668.4|rRNA|23S 99.97 2891 1 0 705042 707932 1 2891 0.0e+00 5671.0
|
88
|
+
def extract_hits_dna mode
|
76
89
|
|
90
|
+
@aln_hits = {}
|
91
|
+
feature = ""
|
92
|
+
File.open(@aln_file,"r") do |fread|
|
93
|
+
while l = fread.gets
|
94
|
+
lA = l.chomp!.split("\t")
|
95
|
+
key = lA[0]+"_"+lA[6]+"_"+lA[7]
|
96
|
+
if mode == :rna
|
97
|
+
hit_split = lA[1].chomp.split("|")
|
98
|
+
hit = hit_split[0]
|
99
|
+
feature = hit_split[1]
|
100
|
+
product = hit_split[2]
|
101
|
+
end
|
102
|
+
if ! @aln_hits.has_key? key
|
103
|
+
next if lA[2].to_f < @pidentity
|
104
|
+
@aln_hits[key] = {
|
105
|
+
pId: lA[2].to_f.round(2),
|
106
|
+
evalue: lA[10],
|
107
|
+
score: lA[11].to_f,
|
108
|
+
hits: [hit],
|
109
|
+
product: [product],
|
110
|
+
length: [lA[3].to_i],
|
111
|
+
query_location: [[lA[6].to_i,lA[7].to_i]],
|
112
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
113
|
+
feature: [feature]
|
114
|
+
}
|
115
|
+
elsif lA[11].to_f > @aln_hits[key][:score]
|
116
|
+
@aln_hits[key] = {
|
117
|
+
pId: lA[2].to_f.round(2),
|
118
|
+
evalue: lA[10],
|
119
|
+
score: lA[11].to_f,
|
120
|
+
hits: [hit],
|
121
|
+
product: [product],
|
122
|
+
length: [lA[3].to_i],
|
123
|
+
query_location: [[lA[6].to_i,lA[7].to_i]],
|
124
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
125
|
+
feature: [feature]
|
126
|
+
}
|
127
|
+
elsif lA[11].to_f == @aln_hits[key][:score]
|
128
|
+
@aln_hits[key][:hits] << hit
|
129
|
+
@aln_hits[key][:length] << lA[3].to_i
|
130
|
+
@aln_hits[key][:query_location] << [lA[6].to_i,lA[7].to_i]
|
131
|
+
@aln_hits[key][:subject_location] << [lA[8].to_i,lA[9].to_i]
|
132
|
+
@aln_hits[key][:feature] << feature
|
133
|
+
@aln_hits[key][:product] << product
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
77
137
|
|
78
|
-
|
79
|
-
|
138
|
+
prune_aln_hits @aln_hits
|
139
|
+
|
140
|
+
end # end of method
|
80
141
|
|
81
|
-
return {} if prots_to_annotate == nil
|
82
142
|
|
83
|
-
|
143
|
+
# Get the annotations for a contig for ReferenceGenome
|
144
|
+
def get_annotation_for_contig contig_to_annotate, prots_to_annotate=nil, ref_cds=nil
|
145
|
+
|
84
146
|
annotations = {}
|
85
|
-
prots = []
|
86
147
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
148
|
+
if prots_to_annotate != nil
|
149
|
+
|
150
|
+
# contig_to_annotate = prots_to_annotate[0].split("_")[0..-2].join("_")
|
151
|
+
prots = []
|
152
|
+
|
153
|
+
@aln_hits.each_key do |k|
|
154
|
+
contig = k.split("_")[0..-2].join("_")
|
155
|
+
if contig == contig_to_annotate
|
156
|
+
prots << k
|
157
|
+
end
|
91
158
|
end
|
92
|
-
end
|
93
159
|
|
94
|
-
|
95
|
-
|
160
|
+
# sorting the prot by their appearance in the contig
|
161
|
+
prots.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
|
96
162
|
|
97
|
-
|
98
|
-
|
163
|
+
i = 0
|
164
|
+
prots_to_annotate.each do |p|
|
99
165
|
|
100
|
-
|
166
|
+
if @aln_hits.has_key? p
|
101
167
|
|
102
|
-
|
168
|
+
hit_index = 0
|
103
169
|
|
104
|
-
|
105
|
-
|
106
|
-
|
170
|
+
if @aln_hits[p][:hits].length > 1
|
171
|
+
hit_index = choose_best_hit i, prots, ref_cds
|
172
|
+
end
|
107
173
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
174
|
+
h = @aln_hits[p][:hits][hit_index]
|
175
|
+
hit = ref_cds[h]
|
176
|
+
annotations[p] = hit
|
177
|
+
annotations[p][:pId] = @aln_hits[p][:pId]
|
178
|
+
annotations[p][:length] = @aln_hits[p][:length][hit_index]
|
179
|
+
i+=1
|
114
180
|
|
115
|
-
|
181
|
+
else
|
182
|
+
|
183
|
+
annotations[p] = nil
|
184
|
+
|
185
|
+
end
|
116
186
|
|
117
|
-
|
187
|
+
end
|
188
|
+
|
189
|
+
elsif ! @aln_hits.empty?
|
118
190
|
|
191
|
+
@aln_hits.each_key do |k|
|
192
|
+
contig = k.split("_")[0..-3].join("_")
|
193
|
+
if contig == contig_to_annotate
|
194
|
+
annotations[k] = @aln_hits[k]
|
195
|
+
end
|
119
196
|
end
|
120
197
|
|
121
198
|
end
|
@@ -192,6 +269,64 @@ class SyntenyManip
|
|
192
269
|
|
193
270
|
end # end of method
|
194
271
|
|
272
|
+
def prune_aln_hits aln_hits
|
273
|
+
|
274
|
+
# @aln_hits[key] = {
|
275
|
+
# pId: lA[2].to_f.round(2),
|
276
|
+
# evalue: lA[10],
|
277
|
+
# score: lA[11].to_f,
|
278
|
+
# hits: [hit],
|
279
|
+
# length: [lA[3].to_i],
|
280
|
+
# query_location: [[lA[6].to_i,lA[7].to_i]],
|
281
|
+
# subject_location: [[lA[8].to_i,lA[9].to_i]],
|
282
|
+
# feature: [feature]
|
283
|
+
# }
|
284
|
+
|
285
|
+
keys_to_delete = []
|
286
|
+
|
287
|
+
aln_hits.each do |key1,val1|
|
288
|
+
|
289
|
+
aln_hits.each do |key2,val2|
|
290
|
+
|
291
|
+
next if key1==key2
|
292
|
+
next if keys_to_delete.include? key1
|
293
|
+
next if keys_to_delete.include? key2
|
294
|
+
|
295
|
+
if val1[:query_location][0][0] >= val2[:query_location][0][0] and
|
296
|
+
val1[:query_location][0][0] < val2[:query_location][0][1]
|
297
|
+
overlap_len = val2[:query_location][0][1] - val1[:query_location][0][0]
|
298
|
+
val1_len = val1[:query_location][0][1]-val1[:query_location][0][0]
|
299
|
+
val2_len = val2[:query_location][0][1]-val2[:query_location][0][0]
|
300
|
+
if overlap_len.to_f/val1_len > 0.2 and overlap_len.to_f/val2_len > 0.2
|
301
|
+
if val1[:score] < val2[:score]
|
302
|
+
keys_to_delete << key1
|
303
|
+
else
|
304
|
+
keys_to_delete << key2
|
305
|
+
end
|
306
|
+
end
|
307
|
+
elsif val2[:query_location][0][0] >= val1[:query_location][0][0] and
|
308
|
+
val2[:query_location][0][0] < val1[:query_location][0][1]
|
309
|
+
overlap_len = val1[:query_location][0][1] - val2[:query_location][0][0]
|
310
|
+
val1_len = val1[:query_location][0][1]-val1[:query_location][0][0]
|
311
|
+
val2_len = val2[:query_location][0][1]-val2[:query_location][0][0]
|
312
|
+
if overlap_len.to_f/val1_len > 0.2 and overlap_len.to_f/val2_len > 0.2
|
313
|
+
if val1[:score] < val2[:score]
|
314
|
+
keys_to_delete << key1
|
315
|
+
else
|
316
|
+
keys_to_delete << key2
|
317
|
+
end
|
318
|
+
end
|
319
|
+
end
|
320
|
+
|
321
|
+
end
|
322
|
+
|
323
|
+
end
|
324
|
+
|
325
|
+
keys_to_delete.each do |k|
|
326
|
+
aln_hits.delete(k)
|
327
|
+
end
|
328
|
+
|
329
|
+
end # end of method
|
195
330
|
|
196
331
|
|
197
332
|
end # end of class
|
data/lib/bacterial-annotator.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
# author: maxime déraspe
|
3
|
-
# email:
|
3
|
+
# email: maximilien1er@gmail.com
|
4
4
|
# review:
|
5
5
|
# date: 15-02-24
|
6
6
|
# version: 0.0.1
|
@@ -33,7 +33,7 @@ class BacterialAnnotator
|
|
33
33
|
abort "Output directory already exist ! Choose another one or use -f to overwrite"
|
34
34
|
else
|
35
35
|
puts "Overwriting output directory #{@outdir}"
|
36
|
-
FileUtils.remove_dir(@outdir, force
|
36
|
+
FileUtils.remove_dir(@outdir, :force=>true)
|
37
37
|
end
|
38
38
|
end
|
39
39
|
Dir.mkdir(@outdir)
|
@@ -67,6 +67,7 @@ class BacterialAnnotator
|
|
67
67
|
puts "Prodigal done."
|
68
68
|
if @with_refence_genome
|
69
69
|
@refgenome.write_cds_to_file @outdir
|
70
|
+
@refgenome.write_rna_to_file @outdir
|
70
71
|
puts "Successfully loaded #{@refgenome.gbk.definition}"
|
71
72
|
end
|
72
73
|
end # end of method
|
@@ -77,10 +78,11 @@ class BacterialAnnotator
|
|
77
78
|
# process reference genome synteny
|
78
79
|
if @with_refence_genome # Annotation with the Reference Genome
|
79
80
|
|
80
|
-
|
81
|
-
puts "\nRunning BLAT alignment with Reference Genome.."
|
81
|
+
# run CDS annotation
|
82
|
+
puts "\nRunning BLAT alignment with Reference Genome CDS.."
|
83
|
+
@prot_synteny = SyntenyManip.new(@fasta.prodigal_files[:proteins], @refgenome.cds_file, "Prot-Ref", @pidentity, "prot")
|
82
84
|
@prot_synteny.run_blat @root, @outdir
|
83
|
-
@prot_synteny.
|
85
|
+
@prot_synteny.extract_hits_prodigal :refgenome
|
84
86
|
|
85
87
|
@fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|
|
86
88
|
|
@@ -91,9 +93,9 @@ class BacterialAnnotator
|
|
91
93
|
end
|
92
94
|
|
93
95
|
contig_prots = @fasta.prodigal_files[:prot_ids_by_contig][contig]
|
94
|
-
|
96
|
+
# contig_to_annotate = contig_prots[0].split("_")[0..-2].join("_")
|
95
97
|
# contig_prot_annotations = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
|
96
|
-
@contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig_prots, @refgenome.coding_seq
|
98
|
+
@contig_annotations[contig] = @prot_synteny.get_annotation_for_contig contig, contig_prots, @refgenome.coding_seq
|
97
99
|
|
98
100
|
remaining_cds = cumulate_annotation_stats_reference contig, @contig_annotations[contig]
|
99
101
|
|
@@ -106,44 +108,20 @@ class BacterialAnnotator
|
|
106
108
|
# dump foreign proteins to file
|
107
109
|
foreign_cds_file = dump_cds
|
108
110
|
|
109
|
-
#
|
110
|
-
|
111
|
-
synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
|
112
|
-
ref_annotated = {}
|
113
|
-
@contig_annotations.each do |contig,prot_annotations|
|
114
|
-
prot_annotations.each do |key,prot|
|
115
|
-
# p key
|
116
|
-
# p prot
|
117
|
-
ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
@refgenome.coding_seq.each do |ref_k, ref_v|
|
122
|
-
gene = ""
|
123
|
-
coverage_ref = ""
|
124
|
-
coverage_query = ""
|
125
|
-
query_length = ""
|
126
|
-
pId = ""
|
127
|
-
if ref_annotated[ref_v[:protId]] != nil
|
128
|
-
gene = ref_annotated[ref_v[:protId]][:key]
|
129
|
-
coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
|
130
|
-
query_length = @fasta.prodigal_files[:prot_ids_length][gene]
|
131
|
-
coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
|
132
|
-
pId = ref_annotated[ref_v[:protId]][:pId]
|
133
|
-
end
|
134
|
-
|
135
|
-
synteny_file.write(ref_v[:protId])
|
136
|
-
synteny_file.write("\t"+ref_v[:locustag])
|
137
|
-
synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
|
138
|
-
synteny_file.write("\t"+coverage_ref.to_s)
|
139
|
-
synteny_file.write("\t"+pId.to_s)
|
140
|
-
synteny_file.write("\t"+gene)
|
141
|
-
synteny_file.write("\t"+query_length.to_s)
|
142
|
-
synteny_file.write("\t"+coverage_query.to_s)
|
143
|
-
synteny_file.write("\n")
|
111
|
+
# dump reference CDS synteny to file
|
112
|
+
dump_ref_synteny_to_file
|
144
113
|
|
114
|
+
# run RNA annotation
|
115
|
+
puts "\nRunning BLAT alignment with Reference Genome RNA.."
|
116
|
+
@rna_synteny = SyntenyManip.new(@fasta.fasta_file, @refgenome.rna_file, "RNA-Ref", @pidentity, "dna")
|
117
|
+
@rna_synteny.run_blat @root, @outdir
|
118
|
+
@rna_synteny.extract_hits_dna :rna
|
119
|
+
@contig_annotations_rna = {}
|
120
|
+
@fasta.prodigal_files[:contigs].each_with_index do |contig, contig_index|
|
121
|
+
puts "adding rna_annotation for contig #{contig}"
|
122
|
+
@contig_annotations_rna[contig] = @rna_synteny.get_annotation_for_contig contig
|
123
|
+
p @contig_annotations_rna[contig]
|
145
124
|
end
|
146
|
-
synteny_file.close
|
147
125
|
|
148
126
|
else # no reference genome
|
149
127
|
|
@@ -156,7 +134,7 @@ class BacterialAnnotator
|
|
156
134
|
finish_annotation foreign_cds_file
|
157
135
|
|
158
136
|
# Parse annotations to genbank files
|
159
|
-
|
137
|
+
parse_genbank_files
|
160
138
|
|
161
139
|
puts "\nPrinting Statistics.."
|
162
140
|
print_stats "#{@outdir}/Annotation-Stats.txt"
|
@@ -177,7 +155,7 @@ class BacterialAnnotator
|
|
177
155
|
externaldb_synteny = SyntenyManip.new(remaining_cds_file, db_file, "Prot-ExternalDB", @pidentity)
|
178
156
|
puts "\nRunning BLAT alignment with External Database.."
|
179
157
|
externaldb_synteny.run_blat @root, @outdir
|
180
|
-
externaldb_synteny.
|
158
|
+
externaldb_synteny.extract_hits_prodigal :externaldb
|
181
159
|
|
182
160
|
externaldb_synteny.aln_hits.each do |k,v|
|
183
161
|
contig_of_protein = k.split("_")[0..-2].join("_")
|
@@ -188,15 +166,14 @@ class BacterialAnnotator
|
|
188
166
|
|
189
167
|
hit_gi = v[:hits][0]
|
190
168
|
|
191
|
-
note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
|
192
|
-
|
193
|
-
# p v
|
194
|
-
# p ref_cds[hit_gi]
|
169
|
+
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{hit_gi}"
|
170
|
+
note = "Protein homology (#{v[:pId]}% identity) with #{hit_gi}"
|
195
171
|
|
196
172
|
if ref_cds[hit_gi][:org] != ""
|
197
173
|
note += " from #{ref_cds[hit_gi][:org]}"
|
198
174
|
end
|
199
175
|
@contig_annotations[contig_of_protein][k] = {product: ref_cds[hit_gi][:product],
|
176
|
+
feature: "cds",
|
200
177
|
gene: nil,
|
201
178
|
locustag: nil,
|
202
179
|
note: note}
|
@@ -237,17 +214,16 @@ class BacterialAnnotator
|
|
237
214
|
end
|
238
215
|
ncbiblast.aln_hits.each do |k,v|
|
239
216
|
contig_of_protein = k.split("_")[0..-2].join("_")
|
240
|
-
# @contig_annotations[contig_of_protein][k][:product] = v[:hits][0][:product]
|
241
217
|
if ! @contig_annotations.has_key? contig_of_protein
|
242
218
|
@contig_annotations[contig_of_protein] = {}
|
243
219
|
end
|
244
|
-
|
245
|
-
note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:
|
246
|
-
# note = "correspond to gi:#{v[:hits][0][:gi]}"
|
220
|
+
# note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:gi]}"
|
221
|
+
note = "Protein homology (#{v[:pId]}% identity) with gi:#{v[:hits][0][:accession]}"
|
247
222
|
if v[:hits][0][:org] != ""
|
248
223
|
note += " from #{v[:hits][0][:org]}"
|
249
224
|
end
|
250
225
|
@contig_annotations[contig_of_protein][k] = {product: v[:hits][0][:product],
|
226
|
+
feature: "cds",
|
251
227
|
gene: nil,
|
252
228
|
locustag: nil,
|
253
229
|
note: note}
|
@@ -263,7 +239,7 @@ class BacterialAnnotator
|
|
263
239
|
|
264
240
|
|
265
241
|
# parse all genbank files
|
266
|
-
def
|
242
|
+
def parse_genbank_files
|
267
243
|
|
268
244
|
puts "\nParsing annotation into genbank files.."
|
269
245
|
@contig_annotations.each do |contig, contig_prot_annotations|
|
@@ -271,7 +247,15 @@ class BacterialAnnotator
|
|
271
247
|
gbk_to_annotate = GenbankManip.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
|
272
248
|
reference_locus = nil
|
273
249
|
reference_locus = @refgenome.gbk.locus if @with_refence_genome
|
274
|
-
gbk_to_annotate.
|
250
|
+
gbk_to_annotate.add_annotations contig_prot_annotations, "inplace", reference_locus
|
251
|
+
|
252
|
+
if @contig_annotations_rna.has_key? contig
|
253
|
+
puts "Trying RNA annotation"
|
254
|
+
gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
|
255
|
+
end
|
256
|
+
|
257
|
+
gbk_to_annotate.save_genbank_to_file gbk_path
|
258
|
+
|
275
259
|
end
|
276
260
|
|
277
261
|
end # end of method
|
@@ -314,6 +298,7 @@ class BacterialAnnotator
|
|
314
298
|
p_cds_annotated = @annotation_stats[:annotated_cds].to_f/@annotation_stats[:total_cds].to_f
|
315
299
|
|
316
300
|
File.open(file, "w") do |fopen|
|
301
|
+
|
317
302
|
fopen.write("#Contigs annotation based on reference genomes\n")
|
318
303
|
fopen.write("Short Contigs (< #{@minlength}) :\t\t" + @annotation_stats[:short_contigs].length.to_s + "\n")
|
319
304
|
fopen.write("Foreign Contigs :\t\t" + @annotation_stats[:foreign_contigs].length.to_s + "\n")
|
@@ -446,6 +431,51 @@ class BacterialAnnotator
|
|
446
431
|
|
447
432
|
end # end of method
|
448
433
|
|
449
|
-
|
434
|
+
# will dump reference CDS synteny to file
|
435
|
+
def dump_ref_synteny_to_file
|
436
|
+
|
437
|
+
# Iterate over each Ref protein and print synteny
|
438
|
+
synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
|
439
|
+
synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
|
440
|
+
ref_annotated = {}
|
441
|
+
@contig_annotations.each do |contig,prot_annotations|
|
442
|
+
prot_annotations.each do |key,prot|
|
443
|
+
# p key
|
444
|
+
# p prot
|
445
|
+
ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
|
446
|
+
end
|
447
|
+
end
|
448
|
+
|
449
|
+
@refgenome.coding_seq.each do |ref_k, ref_v|
|
450
|
+
|
451
|
+
gene = ""
|
452
|
+
coverage_ref = ""
|
453
|
+
coverage_query = ""
|
454
|
+
query_length = ""
|
455
|
+
pId = ""
|
456
|
+
if ref_annotated[ref_v[:protId]] != nil
|
457
|
+
gene = ref_annotated[ref_v[:protId]][:key]
|
458
|
+
coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
|
459
|
+
query_length = @fasta.prodigal_files[:prot_ids_length][gene]
|
460
|
+
coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
|
461
|
+
pId = ref_annotated[ref_v[:protId]][:pId]
|
462
|
+
end
|
463
|
+
|
464
|
+
synteny_file.write(ref_v[:protId])
|
465
|
+
synteny_file.write("\t"+ref_v[:locustag])
|
466
|
+
synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
|
467
|
+
synteny_file.write("\t"+coverage_ref.to_s)
|
468
|
+
synteny_file.write("\t"+pId.to_s)
|
469
|
+
synteny_file.write("\t"+gene)
|
470
|
+
synteny_file.write("\t"+query_length.to_s)
|
471
|
+
synteny_file.write("\t"+coverage_query.to_s)
|
472
|
+
synteny_file.write("\n")
|
473
|
+
|
474
|
+
end
|
475
|
+
synteny_file.close
|
476
|
+
|
477
|
+
end
|
478
|
+
|
479
|
+
private :dump_cds, :split_remaining_cds_file, :dump_ref_synteny_to_file
|
450
480
|
|
451
481
|
end # end of class
|
data/lib/bacterial-comparator.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bacterial-annotator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.7
|
4
|
+
version: 0.3.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maxime Deraspe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -72,7 +72,7 @@ dependencies:
|
|
72
72
|
version: 1.9.0
|
73
73
|
description: GEM to annotate bacterial genome sequence based on a reference genome
|
74
74
|
and complete the annotation with an external database or a remote database.
|
75
|
-
email:
|
75
|
+
email: maximilien1er@gmail.com
|
76
76
|
executables:
|
77
77
|
- bacterial-annotator
|
78
78
|
- ba_prodigal
|