bacterial-annotator 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/ba_prodigal +1 -1
- data/bin/bacterial-annotator +13 -14
- data/lib/bacterial-annotator/{genbank-manip.rb → sequence-annotation.rb} +128 -16
- data/lib/bacterial-annotator/{fasta-manip.rb → sequence-fasta.rb} +32 -23
- data/lib/bacterial-annotator/{synteny-manip.rb → sequence-synteny.rb} +128 -8
- data/lib/bacterial-annotator.rb +211 -140
- data/lib/bacterial-comparator.rb +1 -0
- metadata +5 -6
- data/lib/bacterial-annotator/remote-ncbi.rb +0 -201
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: befd57ce78f0c186da1553c7372c3aa6faeb9d90
|
4
|
+
data.tar.gz: 5e37d6a7e579a1e9e428deb9864e4a9d5ea9f057
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a9a9766113cef56ae7ed35749cd5fbc10d746aa82e403596dccd0c5e7946786b136a69e19c74a4cece73549b0f1a8de077a8c106fc0e2310f7b000dc6cbad962
|
7
|
+
data.tar.gz: 00a0c5cf815252fa45ffae194318f730cf69e27dd53643659fb06d2dac131a3de881cbad2595b43df6ba5014be75cef8b337e6710afb46e42b420fdd1cf9b178
|
data/bin/ba_prodigal
CHANGED
data/bin/bacterial-annotator
CHANGED
@@ -46,22 +46,21 @@ annotate [OPTIONS]
|
|
46
46
|
--force/-f Force to overwrite the output directory
|
47
47
|
|
48
48
|
// Dataset
|
49
|
-
--refgenome/-g
|
50
|
-
--guessref
|
49
|
+
--refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
|
50
|
+
--guessref Will guess the best reference genome to use for the annotation.
|
51
51
|
|
52
|
-
--
|
53
|
-
|
54
|
-
Can be very slow, better to use an external database !
|
55
|
-
|
56
|
-
--externaldb <proteins fasta_file>
|
57
|
-
Complete or do the annotation of remaining CDS with this database (a protein fasta file).
|
52
|
+
--externaldb <proteins fasta_file>
|
53
|
+
Finish or do a complete annotation with this sequence database (a protein fasta file).
|
58
54
|
Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
|
59
55
|
>gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
|
60
56
|
>sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
|
61
57
|
|
62
58
|
// Other options
|
63
|
-
--pidentity Minimum percentage identity to incorporate a CDS annotation [default=0.7]
|
64
|
-
--
|
59
|
+
--pidentity <% identity> Minimum percentage identity to incorporate a CDS annotation [default=0.7]
|
60
|
+
--pcoverage <% identity> Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.7]
|
61
|
+
.. otherwise hint for a non-functional protein
|
62
|
+
|
63
|
+
--minlength <length> Minimum contig length for annotation [default=500]
|
65
64
|
|
66
65
|
--meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
|
67
66
|
|
@@ -77,6 +76,7 @@ def parseOptions_annotate
|
|
77
76
|
# default options
|
78
77
|
options[:outdir] = "BAnnotation"
|
79
78
|
options[:pidentity] = 70
|
79
|
+
options[:pcoverage] = 70
|
80
80
|
options[:minlength] = 500
|
81
81
|
options[:meta] = 0
|
82
82
|
|
@@ -95,10 +95,10 @@ def parseOptions_annotate
|
|
95
95
|
options[:minlength] = ARGV.shift
|
96
96
|
when "--pidentity"
|
97
97
|
options[:pidentity] = ARGV.shift
|
98
|
+
when "--pcoverage"
|
99
|
+
options[:pcoverage] = ARGV.shift
|
98
100
|
when "--meta"
|
99
101
|
options[:meta] = 1
|
100
|
-
when "--remotedb"
|
101
|
-
options[:remote_db] = ARGV.shift
|
102
102
|
when "--externaldb"
|
103
103
|
options[:external_db] = ARGV.shift
|
104
104
|
when "--help", "-h"
|
@@ -204,7 +204,7 @@ if ARGV.size > 1
|
|
204
204
|
system("ba_raxml")
|
205
205
|
|
206
206
|
options = {}
|
207
|
-
genomes_list = []
|
207
|
+
genomes_list = [] # TODO multiple input genomes
|
208
208
|
|
209
209
|
if ARGV[0] == "annotate"
|
210
210
|
|
@@ -217,7 +217,6 @@ if ARGV.size > 1
|
|
217
217
|
|
218
218
|
# Check Options
|
219
219
|
if ! options.has_key? :refgenome and
|
220
|
-
! options.has_key? :remote_db and
|
221
220
|
! options.has_key? :external_db
|
222
221
|
puts "You didn't provide a reference genome or a database for the annotation !"
|
223
222
|
elsif ! options.has_key? :input
|
@@ -1,14 +1,13 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
# author: maxime déraspe
|
3
3
|
# email: maximilien1er@gmail.com
|
4
|
-
# review:
|
5
4
|
# date: 15-02-24
|
6
5
|
# version: 0.0.1
|
7
6
|
# licence:
|
8
7
|
|
9
8
|
|
10
9
|
|
11
|
-
class
|
10
|
+
class SequenceAnnotation
|
12
11
|
|
13
12
|
attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
|
14
13
|
|
@@ -67,13 +66,16 @@ class GenbankManip
|
|
67
66
|
protId = locustag
|
68
67
|
end
|
69
68
|
|
70
|
-
@coding_seq[protId] = {
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
69
|
+
@coding_seq[protId] = {
|
70
|
+
protId: protId,
|
71
|
+
location: loc,
|
72
|
+
locustag: locustag,
|
73
|
+
gene: gene[0],
|
74
|
+
product: product[0],
|
75
|
+
bioseq: pepBioSeq,
|
76
|
+
bioseq_gene: dnaBioSeq,
|
77
|
+
bioseq_len: pepBioSeq.length
|
78
|
+
}
|
77
79
|
end
|
78
80
|
|
79
81
|
end
|
@@ -110,11 +112,13 @@ class GenbankManip
|
|
110
112
|
dna = get_DNA(ft,@bioseq)
|
111
113
|
dnaBioSeq = Bio::Sequence.auto(dna)
|
112
114
|
|
113
|
-
@rna_seq[locustag] = {
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
115
|
+
@rna_seq[locustag] = {
|
116
|
+
type: ft.feature.to_s,
|
117
|
+
location: loc,
|
118
|
+
locustag: locustag,
|
119
|
+
product: product,
|
120
|
+
bioseq_gene: dnaBioSeq
|
121
|
+
}
|
118
122
|
|
119
123
|
end
|
120
124
|
|
@@ -125,7 +129,6 @@ class GenbankManip
|
|
125
129
|
end
|
126
130
|
|
127
131
|
|
128
|
-
|
129
132
|
# Print CDS to files
|
130
133
|
# RETURN : cds_file path
|
131
134
|
def write_cds_to_file outdir
|
@@ -174,12 +177,108 @@ class GenbankManip
|
|
174
177
|
end
|
175
178
|
|
176
179
|
|
180
|
+
# add annotation from reference prot synteny
|
181
|
+
# Annotate the CDS features of @gbk from their best reference-protein hit.
#
# CDS features are numbered in file order starting at 1 and looked up in
# +synteny_prot+ under the id "<@gbk.definition>_<n>".
#
# synteny_prot - Hash: protein id => info hash; only entries carrying a
#                :homology hash (:hits, :pId, :cov_query, :cov_subject) are used.
# annotations  - Hash: reference protein id => hash read for :locustag,
#                :gene, :product, :note and :inference.
# ref_genome   - label interpolated into the generated "corresponds to" note.
#
# Side effects: the qualifiers of EVERY CDS feature are reset to [], including
# those that end up without annotation; "no hit for <id>" is printed when the
# first homology hit has no usable entry in +annotations+.
#
# Returns the feature collection that was iterated.
def add_annotation_ref_synteny_prot synteny_prot, annotations, ref_genome=nil

  contig = @gbk.definition
  cds_count = 0

  @gbk.features.each do |cds|

    next if cds.feature != "CDS"

    cds_count += 1
    prot_id = "#{contig}_#{cds_count}"

    # Previous qualifiers are dropped before we know whether a hit exists.
    cds.qualifiers = []

    homology = synteny_prot[prot_id] && synteny_prot[prot_id][:homology]
    next if homology.nil?

    # Only the first (best) hit is used to transfer the annotation.
    reference_prot_id = homology[:hits][0]
    hit = annotations[reference_prot_id]

    # A missing key and a nil entry are both treated as "no hit"; a nil entry
    # used to crash on hit[:locustag].
    if hit.nil?
      puts "no hit for #{prot_id}"
      next
    end

    qualifiers = [Bio::Feature::Qualifier.new('locus_tag', prot_id.to_s)]

    unless hit[:gene].nil?
      qualifiers.push(Bio::Feature::Qualifier.new('gene', hit[:gene]))
    end

    unless hit[:product].nil?
      qualifiers.push(Bio::Feature::Qualifier.new('product', hit[:product]))
    end

    # A locus tag on the hit means it comes from a reference genome: record
    # where the annotation came from and how good the alignment was.
    locus = hit[:locustag]
    unless locus.nil?
      pId = homology[:pId]
      cov_query = (homology[:cov_query]*100).round(2)
      cov_subject = (homology[:cov_subject]*100).round(2)
      qualifiers.push(Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (AA identity: #{pId}%; coverage(q,s): #{cov_query}%,#{cov_subject}%) from #{ref_genome}"))

      # Ids containing an underscore are labelled RefSeq, all others INSD.
      db_source = reference_prot_id.include?("_") ? "RefSeq" : "INSD"
      qualifiers.push(Bio::Feature::Qualifier.new('inference', "similar to AA sequence:#{db_source}:#{reference_prot_id}"))
    end

    unless hit[:note].nil?
      qualifiers.push(Bio::Feature::Qualifier.new('note', hit[:note]))
    end

    unless hit[:inference].nil?
      qualifiers.push(Bio::Feature::Qualifier.new('inference', hit[:inference]))
    end

    cds.qualifiers = qualifiers

  end

end
|
272
|
+
|
273
|
+
|
177
274
|
# add annotation to a genbank file produced by prodigal
|
178
275
|
def add_annotations annotations, mode, reference_locus=nil
|
179
276
|
|
180
277
|
# nb_of_added_ft = 0
|
181
278
|
i = 0
|
182
279
|
|
280
|
+
fdebug = File.open("debug-add-annotation.txt","w")
|
281
|
+
|
183
282
|
contig = @gbk.definition
|
184
283
|
|
185
284
|
if mode == "inplace"
|
@@ -195,9 +294,19 @@ class GenbankManip
|
|
195
294
|
i += 1
|
196
295
|
prot_id = contig+"_"+i.to_s
|
197
296
|
hit = nil
|
198
|
-
|
297
|
+
|
298
|
+
if annotations.has_key? prot_id
|
299
|
+
hit = annotations[prot_id]
|
300
|
+
else
|
301
|
+
puts "no hit for #{prot_id}"
|
302
|
+
next
|
303
|
+
end
|
199
304
|
|
200
305
|
if hit != nil
|
306
|
+
|
307
|
+
fdebug.write(hit)
|
308
|
+
fdebug.write("\n")
|
309
|
+
|
201
310
|
locus, gene, product, note = nil
|
202
311
|
locus = hit[:locustag]
|
203
312
|
gene = hit[:gene]
|
@@ -271,6 +380,8 @@ class GenbankManip
|
|
271
380
|
|
272
381
|
end
|
273
382
|
|
383
|
+
fdebug.close
|
384
|
+
|
274
385
|
end
|
275
386
|
|
276
387
|
|
@@ -315,3 +426,4 @@ class GenbankManip
|
|
315
426
|
|
316
427
|
|
317
428
|
end # end of Class
|
429
|
+
|
@@ -8,29 +8,35 @@
|
|
8
8
|
|
9
9
|
|
10
10
|
|
11
|
-
class
|
11
|
+
class SequenceFasta
|
12
12
|
|
13
|
-
attr_reader :fasta_flat, :fasta_file, :
|
13
|
+
attr_reader :fasta_flat, :fasta_file, :annotation_files
|
14
14
|
|
15
15
|
# Initialize fasta holder
|
16
16
|
def initialize fasta_file, meta
|
17
17
|
|
18
18
|
@fasta_file = fasta_file
|
19
19
|
@fasta_flat = Bio::FlatFile.auto(@fasta_file)
|
20
|
-
@meta = meta
|
21
|
-
@prodigal_files = nil
|
22
|
-
@single_fasta = nil
|
23
|
-
@seq_info = nil
|
24
20
|
|
25
21
|
if @fasta_flat.dbclass != Bio::FastaFormat
|
26
22
|
abort "Aborting : The input sequence is not a fasta file !"
|
27
23
|
end
|
28
24
|
|
25
|
+
# @contigs = extract_contigs(@fasta_flat)
|
26
|
+
|
27
|
+
@meta = meta
|
28
|
+
|
29
|
+
@annotation_files = nil
|
30
|
+
@single_fasta = nil
|
31
|
+
@seq_info = nil
|
32
|
+
|
29
33
|
end
|
30
34
|
|
35
|
+
|
31
36
|
# Run prodigal on the genome to annotate
|
32
37
|
def run_prodigal root, outdir
|
33
|
-
|
38
|
+
|
39
|
+
@annotation_files = {}
|
34
40
|
Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
|
35
41
|
if @meta
|
36
42
|
system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
@@ -38,30 +44,34 @@ class FastaManip
|
|
38
44
|
system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
39
45
|
end
|
40
46
|
|
41
|
-
@
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
47
|
+
@annotation_files = {
|
48
|
+
multiGBK: "#{outdir}/Genbanks.gbk",
|
49
|
+
contigs: [],
|
50
|
+
contigs_length: [],
|
51
|
+
genes: "#{outdir}/Genes.fa",
|
52
|
+
proteins: "#{outdir}/Proteins.fa",
|
53
|
+
prot_ids_by_contig: {},
|
54
|
+
fasta_path: "#{outdir}/single-fasta/",
|
55
|
+
gbk_path: "#{outdir}/single-genbank/"
|
56
|
+
}
|
57
|
+
|
49
58
|
split_fasta outdir
|
50
59
|
split_genbank outdir, "#{outdir}/Genbanks.gbk"
|
51
60
|
extract_cds_names
|
52
|
-
@
|
61
|
+
@annotation_files
|
62
|
+
|
53
63
|
end
|
54
64
|
|
55
65
|
|
56
|
-
# Split Multi
|
66
|
+
# Split Multi Fasta file
|
57
67
|
# RETURN : array of fasta files
|
58
68
|
def split_fasta outdir
|
59
69
|
@single_fasta = {}
|
60
70
|
Dir.mkdir("#{outdir}/single-fasta") if ! Dir.exists?("#{outdir}/single-fasta")
|
61
71
|
@fasta_flat.each_entry do |seq|
|
62
72
|
file_name = seq.definition.chomp.split(" ")[0]
|
63
|
-
@
|
64
|
-
@
|
73
|
+
@annotation_files[:contigs] << "#{file_name}"
|
74
|
+
@annotation_files[:contigs_length] << seq.seq.length
|
65
75
|
File.open("#{outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
|
66
76
|
fwrite.write(seq)
|
67
77
|
end
|
@@ -108,7 +118,6 @@ class FastaManip
|
|
108
118
|
outseq = "ORIGIN\n"
|
109
119
|
# puts "ORIGIN"
|
110
120
|
|
111
|
-
ntNum = 0
|
112
121
|
sequence = seq.seq.downcase
|
113
122
|
|
114
123
|
nt_left = true
|
@@ -144,7 +153,7 @@ class FastaManip
|
|
144
153
|
|
145
154
|
prot_ids = {}
|
146
155
|
prot_length = {}
|
147
|
-
flatfile = Bio::FlatFile.auto(@
|
156
|
+
flatfile = Bio::FlatFile.auto(@annotation_files[:proteins])
|
148
157
|
|
149
158
|
flatfile.each_entry do |entry|
|
150
159
|
prot_id = entry.definition.split(" ")[0]
|
@@ -163,8 +172,8 @@ class FastaManip
|
|
163
172
|
prot_array.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
|
164
173
|
end
|
165
174
|
|
166
|
-
@
|
167
|
-
@
|
175
|
+
@annotation_files[:prot_ids_by_contig] = prot_ids
|
176
|
+
@annotation_files[:prot_ids_length] = prot_length
|
168
177
|
|
169
178
|
end
|
170
179
|
|
@@ -7,20 +7,43 @@
|
|
7
7
|
# licence:
|
8
8
|
|
9
9
|
|
10
|
+
class SequenceSynteny
|
10
11
|
|
11
|
-
|
12
|
+
attr_reader :query_file, :subject_file, :aln_hits, :query_sequences, :subject_sequences
|
12
13
|
|
13
|
-
|
14
|
-
|
15
|
-
def initialize query_file, subject_file, name, pidentity, type
|
14
|
+
# Set up a synteny comparison between a query and a subject sequence file.
#
# query_file / subject_file - paths of the two sequence files; each one is
#                             indexed right away through get_sequences.
# name                      - label stored in @name.
# pidentity                 - identity cutoff stored in @pidentity.
# min_coverage              - coverage cutoff stored in @min_coverage.
# type                      - stored in @type.
#
# @aln_file starts out as nil.
def initialize(query_file, subject_file, name, pidentity, min_coverage, type)
  @query_file, @subject_file = query_file, subject_file
  @name, @type = name, type
  @pidentity, @min_coverage = pidentity, min_coverage
  @aln_file = nil

  # name => info tables for both sides of the comparison
  @query_sequences = get_sequences(query_file)
  @subject_sequences = get_sequences(subject_file)
end # end of initialize
|
23
28
|
|
29
|
+
|
30
|
+
# get sequences name with length in hash
|
31
|
+
# Index the entries of a sequence flat file by name.
#
# seq_file - path of the sequence file (converted with to_s before opening).
#
# The name of an entry is the first whitespace-separated word of its
# definition line. Entries with an empty definition are skipped.
#
# Returns a Hash: name => { length:    length of the entry's sequence,
#                           conserved: false,
#                           contig:    the name without its last "_"-separated
#                                      field (only set when the name has "_") }
def get_sequences seq_file

  sequences = {}

  # Block form so the underlying file is closed once every entry is read.
  Bio::FlatFile.auto(seq_file.to_s) do |flat|
    flat.each_entry do |entry|
      seq_name = entry.definition.chomp.split(" ")[0]
      next if seq_name.nil?

      info = { length: entry.seq.length, conserved: false }
      info[:contig] = seq_name.split("_")[0..-2].join("_") if seq_name.include? "_"
      sequences[seq_name] = info
    end
  end

  sequences

end
|
46
|
+
|
24
47
|
# run blat on proteins
|
25
48
|
def run_blat root, outdir
|
26
49
|
base_cmd = "#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity}"
|
@@ -32,9 +55,98 @@ class SyntenyManip
|
|
32
55
|
# extract_hits
|
33
56
|
end # end of method
|
34
57
|
|
58
|
+
|
59
|
+
# Extract Hit from blast8 file and save it in hash
|
60
|
+
# contig-0_1 ABJ71957.1 96.92 65 2 0 1 65 1 65 9.2e-31 131.0
|
61
|
+
# Read the blast8 (tab-separated) alignment file @aln_file and record, for
# every query sequence, its best-scoring hit(s).
#
# Columns used (0-based): 0 query id, 1 subject id, 2 % identity,
# 3 alignment length, 6-7 query start/end, 8-9 subject start/end,
# 10 e-value, 11 bit score.
#
# mode - :refgenome or :externaldb; both take the hit id from column 1 and
#        tag the hit as a "cds" feature.
#
# For each line the coverage of query and subject is the alignment length
# divided by the length stored in @query_sequences / @subject_sequences.
# assert_cutoff is [identity, query coverage, subject coverage] with 1 when
# the value reaches the cutoff (@pidentity, @min_coverage) and 0 otherwise.
#
# Updates in place:
#   @query_sequences[query][:homology]  - data of the best hit; hits with the
#                                         same score are appended to :hits,
#                                         :length and the location lists.
#   @query_sequences[query][:conserved] and
#   @subject_sequences[hit][:conserved] - set to true.
#   @subject_sequences[hit][:hits]      - queries whose best hit is this subject.
#
# Returns nil.
def extract_hits mode

  known_mode = (mode == :refgenome || mode == :externaldb)

  File.foreach(@aln_file) do |line|

    # chomp (not chomp!): chomp! returns nil for a last line without newline
    cols = line.chomp.split("\t")
    next if cols.empty?

    key = cols[0]

    # extraction of hit id depends on mode ..
    hit = known_mode ? cols[1] : nil
    feature = known_mode ? "cds" : ""

    query = @query_sequences[key]
    subject = @subject_sequences[hit]

    # compute coverage based on sequences length
    cov_query = (cols[3].to_f/query[:length]).round(2)
    cov_subject = (cols[3].to_f/subject[:length]).round(2)

    # assert cutoff on identity and coverage
    # 1 -> pass cutoff, 0 under cutoff
    assert_cutoff = [
      cols[2].to_f < @pidentity ? 0 : 1,
      cov_query < @min_coverage ? 0 : 1,
      cov_subject < @min_coverage ? 0 : 1
    ]

    score = cols[11].to_f
    homology = query[:homology]

    if homology.nil? || score > homology[:score]
      # first hit for this query, or a better score than the recorded one
      query[:conserved] = true
      # the subject table is indexed by hit id, not by query id
      subject[:conserved] = true
      query[:homology] = {
        pId: cols[2].to_f.round(2),
        cov_query: cov_query,
        cov_subject: cov_subject,
        evalue: cols[10],
        score: score,
        hits: [hit],
        length: [cols[3].to_i],
        query_location: [[cols[6].to_i,cols[7].to_i]],
        subject_location: [[cols[8].to_i,cols[9].to_i]],
        feature: feature,
        assert_cutoff: assert_cutoff
      }
      # NOTE(review): this replaces any query previously recorded for the
      # subject, whereas the equal-score branch appends - confirm intent.
      subject[:hits] = [key]

    elsif score == homology[:score]
      # same score as the recorded best hit: keep both
      homology[:hits] << hit
      homology[:length] << cols[3].to_i
      homology[:query_location] << [cols[6].to_i,cols[7].to_i]
      homology[:subject_location] << [cols[8].to_i,cols[9].to_i]
      (subject[:hits] ||= []) << key
    end

  end

end # end of method
|
145
|
+
|
146
|
+
|
35
147
|
# Extract Hit from blast8 file and save it in hash
|
36
148
|
# contig-0_1 ABJ71957.1 96.92 65 2 0 1 65 1 65 9.2e-31 131.0
|
37
|
-
def extract_hits_prodigal mode
|
149
|
+
def extract_hits_prodigal mode
|
38
150
|
|
39
151
|
@aln_hits = {}
|
40
152
|
feature = ""
|
@@ -49,8 +161,8 @@ class SyntenyManip
|
|
49
161
|
hit = lA[1].chomp.split("|")[3]
|
50
162
|
feature = "cds"
|
51
163
|
end
|
164
|
+
next if lA[2].to_f < @pidentity
|
52
165
|
if ! @aln_hits.has_key? key
|
53
|
-
next if lA[2].to_f < @pidentity
|
54
166
|
@aln_hits[key] = {
|
55
167
|
pId: lA[2].to_f.round(2),
|
56
168
|
evalue: lA[10],
|
@@ -99,10 +211,12 @@ class SyntenyManip
|
|
99
211
|
feature = hit_split[1]
|
100
212
|
product = hit_split[2]
|
101
213
|
end
|
214
|
+
next if lA[2].to_f < @pidentity
|
102
215
|
if ! @aln_hits.has_key? key
|
103
|
-
next if lA[2].to_f < @pidentity
|
104
216
|
@aln_hits[key] = {
|
105
217
|
pId: lA[2].to_f.round(2),
|
218
|
+
# cov_query: (@query_sequences[key][:length]/lA[3].to_f).round(2),
|
219
|
+
# cov_subject: (@subject_sequences[hit][:length]/lA[3].to_f).round(2),
|
106
220
|
evalue: lA[10],
|
107
221
|
score: lA[11].to_f,
|
108
222
|
hits: [hit],
|
@@ -115,6 +229,8 @@ class SyntenyManip
|
|
115
229
|
elsif lA[11].to_f > @aln_hits[key][:score]
|
116
230
|
@aln_hits[key] = {
|
117
231
|
pId: lA[2].to_f.round(2),
|
232
|
+
# cov_query: (@query_sequences[key][:length]/lA[3].to_f).round(2),
|
233
|
+
# cov_subject: (@subject_sequences[hit][:length]/lA[3].to_f).round(2),
|
118
234
|
evalue: lA[10],
|
119
235
|
score: lA[11].to_f,
|
120
236
|
hits: [hit],
|
@@ -135,7 +251,7 @@ class SyntenyManip
|
|
135
251
|
end
|
136
252
|
end
|
137
253
|
|
138
|
-
prune_aln_hits @aln_hits
|
254
|
+
# prune_aln_hits @aln_hits
|
139
255
|
|
140
256
|
end # end of method
|
141
257
|
|
@@ -178,6 +294,10 @@ class SyntenyManip
|
|
178
294
|
annotations[p][:length] = @aln_hits[p][:length][hit_index]
|
179
295
|
i+=1
|
180
296
|
|
297
|
+
File.open("debug-annotation-by-contig.txt","a") do |fout|
|
298
|
+
fout.write("#{p} #{@aln_hits[p][:pId]} #{@aln_hits[p][:cov_query]} #{@aln_hits[p][:cov_subject]} #{ref_cds[h][:product]}\n")
|
299
|
+
end
|
300
|
+
|
181
301
|
else
|
182
302
|
|
183
303
|
annotations[p] = nil
|