bacterial-annotator 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ba_prodigal +1 -1
- data/bin/bacterial-annotator +13 -14
- data/lib/bacterial-annotator/{genbank-manip.rb → sequence-annotation.rb} +128 -16
- data/lib/bacterial-annotator/{fasta-manip.rb → sequence-fasta.rb} +32 -23
- data/lib/bacterial-annotator/{synteny-manip.rb → sequence-synteny.rb} +128 -8
- data/lib/bacterial-annotator.rb +211 -140
- data/lib/bacterial-comparator.rb +1 -0
- metadata +5 -6
- data/lib/bacterial-annotator/remote-ncbi.rb +0 -201
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: befd57ce78f0c186da1553c7372c3aa6faeb9d90
|
4
|
+
data.tar.gz: 5e37d6a7e579a1e9e428deb9864e4a9d5ea9f057
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a9a9766113cef56ae7ed35749cd5fbc10d746aa82e403596dccd0c5e7946786b136a69e19c74a4cece73549b0f1a8de077a8c106fc0e2310f7b000dc6cbad962
|
7
|
+
data.tar.gz: 00a0c5cf815252fa45ffae194318f730cf69e27dd53643659fb06d2dac131a3de881cbad2595b43df6ba5014be75cef8b337e6710afb46e42b420fdd1cf9b178
|
data/bin/ba_prodigal
CHANGED
data/bin/bacterial-annotator
CHANGED
@@ -46,22 +46,21 @@ annotate [OPTIONS]
|
|
46
46
|
--force/-f Force to overwrite the output directory
|
47
47
|
|
48
48
|
// Dataset
|
49
|
-
--refgenome/-g
|
50
|
-
--guessref
|
49
|
+
--refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
|
50
|
+
--guessref Will guess the best reference genome to use for the annotation.
|
51
51
|
|
52
|
-
--
|
53
|
-
|
54
|
-
Can be very slow, better to use an external database !
|
55
|
-
|
56
|
-
--externaldb <proteins fasta_file>
|
57
|
-
Complete or do the annotation of remaining CDS with this database (a protein fasta file).
|
52
|
+
--externaldb <proteins fasta_file>
|
53
|
+
Finish or do a complete annotation with this sequence database (a protein fasta file).
|
58
54
|
Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
|
59
55
|
>gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
|
60
56
|
>sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
|
61
57
|
|
62
58
|
// Other options
|
63
|
-
--pidentity Minimum percentage identity to incorporate a CDS annotation [default=0.7]
|
64
|
-
--
|
59
|
+
--pidentity <% identity> Minimum percentage identity to incorporate a CDS annotation [default=0.7]
|
60
|
+
--pcoverage <% identity> Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.7]
|
61
|
+
.. otherwise hint for a non-functional protein
|
62
|
+
|
63
|
+
--minlength <length> Minimum contig length for annotation [default=500]
|
65
64
|
|
66
65
|
--meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
|
67
66
|
|
@@ -77,6 +76,7 @@ def parseOptions_annotate
|
|
77
76
|
# default options
|
78
77
|
options[:outdir] = "BAnnotation"
|
79
78
|
options[:pidentity] = 70
|
79
|
+
options[:pcoverage] = 70
|
80
80
|
options[:minlength] = 500
|
81
81
|
options[:meta] = 0
|
82
82
|
|
@@ -95,10 +95,10 @@ def parseOptions_annotate
|
|
95
95
|
options[:minlength] = ARGV.shift
|
96
96
|
when "--pidentity"
|
97
97
|
options[:pidentity] = ARGV.shift
|
98
|
+
when "--pcoverage"
|
99
|
+
options[:pcoverage] = ARGV.shift
|
98
100
|
when "--meta"
|
99
101
|
options[:meta] = 1
|
100
|
-
when "--remotedb"
|
101
|
-
options[:remote_db] = ARGV.shift
|
102
102
|
when "--externaldb"
|
103
103
|
options[:external_db] = ARGV.shift
|
104
104
|
when "--help", "-h"
|
@@ -204,7 +204,7 @@ if ARGV.size > 1
|
|
204
204
|
system("ba_raxml")
|
205
205
|
|
206
206
|
options = {}
|
207
|
-
genomes_list = []
|
207
|
+
genomes_list = [] # TODO multiple input genomes
|
208
208
|
|
209
209
|
if ARGV[0] == "annotate"
|
210
210
|
|
@@ -217,7 +217,6 @@ if ARGV.size > 1
|
|
217
217
|
|
218
218
|
# Check Options
|
219
219
|
if ! options.has_key? :refgenome and
|
220
|
-
! options.has_key? :remote_db and
|
221
220
|
! options.has_key? :external_db
|
222
221
|
puts "You didn't provide a reference genome or a database for the annotation !"
|
223
222
|
elsif ! options.has_key? :input
|
@@ -1,14 +1,13 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
# author: maxime déraspe
|
3
3
|
# email: maximilien1er@gmail.com
|
4
|
-
# review:
|
5
4
|
# date: 15-02-24
|
6
5
|
# version: 0.0.1
|
7
6
|
# licence:
|
8
7
|
|
9
8
|
|
10
9
|
|
11
|
-
class
|
10
|
+
class SequenceAnnotation
|
12
11
|
|
13
12
|
attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
|
14
13
|
|
@@ -67,13 +66,16 @@ class GenbankManip
|
|
67
66
|
protId = locustag
|
68
67
|
end
|
69
68
|
|
70
|
-
@coding_seq[protId] = {
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
69
|
+
@coding_seq[protId] = {
|
70
|
+
protId: protId,
|
71
|
+
location: loc,
|
72
|
+
locustag: locustag,
|
73
|
+
gene: gene[0],
|
74
|
+
product: product[0],
|
75
|
+
bioseq: pepBioSeq,
|
76
|
+
bioseq_gene: dnaBioSeq,
|
77
|
+
bioseq_len: pepBioSeq.length
|
78
|
+
}
|
77
79
|
end
|
78
80
|
|
79
81
|
end
|
@@ -110,11 +112,13 @@ class GenbankManip
|
|
110
112
|
dna = get_DNA(ft,@bioseq)
|
111
113
|
dnaBioSeq = Bio::Sequence.auto(dna)
|
112
114
|
|
113
|
-
@rna_seq[locustag] = {
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
115
|
+
@rna_seq[locustag] = {
|
116
|
+
type: ft.feature.to_s,
|
117
|
+
location: loc,
|
118
|
+
locustag: locustag,
|
119
|
+
product: product,
|
120
|
+
bioseq_gene: dnaBioSeq
|
121
|
+
}
|
118
122
|
|
119
123
|
end
|
120
124
|
|
@@ -125,7 +129,6 @@ class GenbankManip
|
|
125
129
|
end
|
126
130
|
|
127
131
|
|
128
|
-
|
129
132
|
# Print CDS to files
|
130
133
|
# RETURN : cds_file path
|
131
134
|
def write_cds_to_file outdir
|
@@ -174,12 +177,108 @@ class GenbankManip
|
|
174
177
|
end
|
175
178
|
|
176
179
|
|
180
|
+
# add annotation from reference prot synteny
|
181
|
+
def add_annotation_ref_synteny_prot synteny_prot, annotations, ref_genome=nil
|
182
|
+
|
183
|
+
contig = @gbk.definition
|
184
|
+
|
185
|
+
prot_iterator = 0
|
186
|
+
@gbk.features.each_with_index do |cds, ft_index|
|
187
|
+
|
188
|
+
next if cds.feature != "CDS"
|
189
|
+
|
190
|
+
prot_iterator+=1
|
191
|
+
prot_id = contig+"_"+prot_iterator.to_s
|
192
|
+
|
193
|
+
ftArray = []
|
194
|
+
cds.qualifiers = []
|
195
|
+
|
196
|
+
hit = nil
|
197
|
+
|
198
|
+
next if ! synteny_prot.has_key? prot_id or
|
199
|
+
! synteny_prot[prot_id].has_key? :homology
|
200
|
+
|
201
|
+
# puts "#{annotations.keys}"
|
202
|
+
if annotations.has_key? synteny_prot[prot_id][:homology][:hits][0]
|
203
|
+
hit = annotations[synteny_prot[prot_id][:homology][:hits][0]]
|
204
|
+
# puts hit
|
205
|
+
else
|
206
|
+
puts "no hit for #{prot_id}"
|
207
|
+
next
|
208
|
+
end
|
209
|
+
|
210
|
+
# hit = annotations[synteny_prot[prot_id][:homology][:hits][0]]
|
211
|
+
|
212
|
+
if synteny_prot.has_key? prot_id
|
213
|
+
|
214
|
+
locus, gene, product, note, inference = nil
|
215
|
+
locus = hit[:locustag]
|
216
|
+
gene = hit[:gene]
|
217
|
+
product = hit[:product]
|
218
|
+
note = hit[:note]
|
219
|
+
inference = hit[:inference]
|
220
|
+
pId = synteny_prot[prot_id][:homology][:pId]
|
221
|
+
cov_query = (synteny_prot[prot_id][:homology][:cov_query]*100).round(2)
|
222
|
+
cov_subject = (synteny_prot[prot_id][:homology][:cov_subject]*100).round(2)
|
223
|
+
reference_prot_id = synteny_prot[prot_id][:homology][:hits][0]
|
224
|
+
|
225
|
+
qLocusTag = Bio::Feature::Qualifier.new('locus_tag', "#{prot_id}")
|
226
|
+
ftArray.push(qLocusTag)
|
227
|
+
|
228
|
+
if gene != nil
|
229
|
+
qGene = Bio::Feature::Qualifier.new('gene', gene)
|
230
|
+
ftArray.push(qGene)
|
231
|
+
end
|
232
|
+
|
233
|
+
if product != nil
|
234
|
+
qProd = Bio::Feature::Qualifier.new('product', product)
|
235
|
+
ftArray.push(qProd)
|
236
|
+
end
|
237
|
+
|
238
|
+
# check if there is a reference genome.. reference_locus shouldn't be nil in that case
|
239
|
+
if locus != nil
|
240
|
+
qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (AA identity: #{pId}%; coverage(q,s): #{cov_query}%,#{cov_subject}%) from #{ref_genome}")
|
241
|
+
ftArray.push(qNote)
|
242
|
+
|
243
|
+
db_source = "[DBSource]"
|
244
|
+
if reference_prot_id.include? "_"
|
245
|
+
db_source = "RefSeq"
|
246
|
+
else
|
247
|
+
db_source = "INSD"
|
248
|
+
end
|
249
|
+
qInference = Bio::Feature::Qualifier.new('inference', "similar to AA sequence:#{db_source}:#{reference_prot_id}")
|
250
|
+
ftArray.push(qInference)
|
251
|
+
|
252
|
+
end
|
253
|
+
|
254
|
+
if note != nil
|
255
|
+
qNote = Bio::Feature::Qualifier.new('note', note)
|
256
|
+
ftArray.push(qNote)
|
257
|
+
end
|
258
|
+
|
259
|
+
if inference != nil
|
260
|
+
qInference = Bio::Feature::Qualifier.new('inference', inference)
|
261
|
+
ftArray.push(qInference)
|
262
|
+
end
|
263
|
+
|
264
|
+
end
|
265
|
+
|
266
|
+
cds.qualifiers = ftArray
|
267
|
+
|
268
|
+
end
|
269
|
+
|
270
|
+
|
271
|
+
end
|
272
|
+
|
273
|
+
|
177
274
|
# add annotation to a genbank file produced by prodigal
|
178
275
|
def add_annotations annotations, mode, reference_locus=nil
|
179
276
|
|
180
277
|
# nb_of_added_ft = 0
|
181
278
|
i = 0
|
182
279
|
|
280
|
+
fdebug = File.open("debug-add-annotation.txt","w")
|
281
|
+
|
183
282
|
contig = @gbk.definition
|
184
283
|
|
185
284
|
if mode == "inplace"
|
@@ -195,9 +294,19 @@ class GenbankManip
|
|
195
294
|
i += 1
|
196
295
|
prot_id = contig+"_"+i.to_s
|
197
296
|
hit = nil
|
198
|
-
|
297
|
+
|
298
|
+
if annotations.has_key? prot_id
|
299
|
+
hit = annotations[prot_id]
|
300
|
+
else
|
301
|
+
puts "no hit for #{prot_id}"
|
302
|
+
next
|
303
|
+
end
|
199
304
|
|
200
305
|
if hit != nil
|
306
|
+
|
307
|
+
fdebug.write(hit)
|
308
|
+
fdebug.write("\n")
|
309
|
+
|
201
310
|
locus, gene, product, note = nil
|
202
311
|
locus = hit[:locustag]
|
203
312
|
gene = hit[:gene]
|
@@ -271,6 +380,8 @@ class GenbankManip
|
|
271
380
|
|
272
381
|
end
|
273
382
|
|
383
|
+
fdebug.close
|
384
|
+
|
274
385
|
end
|
275
386
|
|
276
387
|
|
@@ -315,3 +426,4 @@ class GenbankManip
|
|
315
426
|
|
316
427
|
|
317
428
|
end # end of Class
|
429
|
+
|
@@ -8,29 +8,35 @@
|
|
8
8
|
|
9
9
|
|
10
10
|
|
11
|
-
class
|
11
|
+
class SequenceFasta
|
12
12
|
|
13
|
-
attr_reader :fasta_flat, :fasta_file, :
|
13
|
+
attr_reader :fasta_flat, :fasta_file, :annotation_files
|
14
14
|
|
15
15
|
# Initialize fasta holder
|
16
16
|
def initialize fasta_file, meta
|
17
17
|
|
18
18
|
@fasta_file = fasta_file
|
19
19
|
@fasta_flat = Bio::FlatFile.auto(@fasta_file)
|
20
|
-
@meta = meta
|
21
|
-
@prodigal_files = nil
|
22
|
-
@single_fasta = nil
|
23
|
-
@seq_info = nil
|
24
20
|
|
25
21
|
if @fasta_flat.dbclass != Bio::FastaFormat
|
26
22
|
abort "Aborting : The input sequence is not a fasta file !"
|
27
23
|
end
|
28
24
|
|
25
|
+
# @contigs = extract_contigs(@fasta_flat)
|
26
|
+
|
27
|
+
@meta = meta
|
28
|
+
|
29
|
+
@annotation_files = nil
|
30
|
+
@single_fasta = nil
|
31
|
+
@seq_info = nil
|
32
|
+
|
29
33
|
end
|
30
34
|
|
35
|
+
|
31
36
|
# Run prodigal on the genome to annotate
|
32
37
|
def run_prodigal root, outdir
|
33
|
-
|
38
|
+
|
39
|
+
@annotation_files = {}
|
34
40
|
Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
|
35
41
|
if @meta
|
36
42
|
system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
@@ -38,30 +44,34 @@ class FastaManip
|
|
38
44
|
system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
39
45
|
end
|
40
46
|
|
41
|
-
@
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
47
|
+
@annotation_files = {
|
48
|
+
multiGBK: "#{outdir}/Genbanks.gbk",
|
49
|
+
contigs: [],
|
50
|
+
contigs_length: [],
|
51
|
+
genes: "#{outdir}/Genes.fa",
|
52
|
+
proteins: "#{outdir}/Proteins.fa",
|
53
|
+
prot_ids_by_contig: {},
|
54
|
+
fasta_path: "#{outdir}/single-fasta/",
|
55
|
+
gbk_path: "#{outdir}/single-genbank/"
|
56
|
+
}
|
57
|
+
|
49
58
|
split_fasta outdir
|
50
59
|
split_genbank outdir, "#{outdir}/Genbanks.gbk"
|
51
60
|
extract_cds_names
|
52
|
-
@
|
61
|
+
@annotation_files
|
62
|
+
|
53
63
|
end
|
54
64
|
|
55
65
|
|
56
|
-
# Split Multi
|
66
|
+
# Split Multi Fasta file
|
57
67
|
# RETURN : array of fasta files
|
58
68
|
def split_fasta outdir
|
59
69
|
@single_fasta = {}
|
60
70
|
Dir.mkdir("#{outdir}/single-fasta") if ! Dir.exists?("#{outdir}/single-fasta")
|
61
71
|
@fasta_flat.each_entry do |seq|
|
62
72
|
file_name = seq.definition.chomp.split(" ")[0]
|
63
|
-
@
|
64
|
-
@
|
73
|
+
@annotation_files[:contigs] << "#{file_name}"
|
74
|
+
@annotation_files[:contigs_length] << seq.seq.length
|
65
75
|
File.open("#{outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
|
66
76
|
fwrite.write(seq)
|
67
77
|
end
|
@@ -108,7 +118,6 @@ class FastaManip
|
|
108
118
|
outseq = "ORIGIN\n"
|
109
119
|
# puts "ORIGIN"
|
110
120
|
|
111
|
-
ntNum = 0
|
112
121
|
sequence = seq.seq.downcase
|
113
122
|
|
114
123
|
nt_left = true
|
@@ -144,7 +153,7 @@ class FastaManip
|
|
144
153
|
|
145
154
|
prot_ids = {}
|
146
155
|
prot_length = {}
|
147
|
-
flatfile = Bio::FlatFile.auto(@
|
156
|
+
flatfile = Bio::FlatFile.auto(@annotation_files[:proteins])
|
148
157
|
|
149
158
|
flatfile.each_entry do |entry|
|
150
159
|
prot_id = entry.definition.split(" ")[0]
|
@@ -163,8 +172,8 @@ class FastaManip
|
|
163
172
|
prot_array.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
|
164
173
|
end
|
165
174
|
|
166
|
-
@
|
167
|
-
@
|
175
|
+
@annotation_files[:prot_ids_by_contig] = prot_ids
|
176
|
+
@annotation_files[:prot_ids_length] = prot_length
|
168
177
|
|
169
178
|
end
|
170
179
|
|
@@ -7,20 +7,43 @@
|
|
7
7
|
# licence:
|
8
8
|
|
9
9
|
|
10
|
+
class SequenceSynteny
|
10
11
|
|
11
|
-
|
12
|
+
attr_reader :query_file, :subject_file, :aln_hits, :query_sequences, :subject_sequences
|
12
13
|
|
13
|
-
|
14
|
-
|
15
|
-
def initialize query_file, subject_file, name, pidentity, type
|
14
|
+
def initialize query_file, subject_file, name, pidentity, min_coverage, type
|
16
15
|
@query_file = query_file
|
17
16
|
@subject_file = subject_file
|
17
|
+
|
18
|
+
@query_sequences = get_sequences(query_file)
|
19
|
+
@subject_sequences = get_sequences(subject_file)
|
20
|
+
|
18
21
|
@name = name
|
19
22
|
@pidentity = pidentity
|
23
|
+
@min_coverage = min_coverage
|
20
24
|
@aln_file = nil
|
21
25
|
@type = type
|
26
|
+
|
22
27
|
end # end of initialize
|
23
28
|
|
29
|
+
|
30
|
+
# get sequences name with length in hash
|
31
|
+
def get_sequences seq_file
|
32
|
+
|
33
|
+
sequences = {}
|
34
|
+
flat = Bio::FlatFile.auto("#{seq_file}")
|
35
|
+
flat.each_entry do |s|
|
36
|
+
s_name = s.definition.chomp.split(" ")[0]
|
37
|
+
sequences[s_name] = {}
|
38
|
+
sequences[s_name][:length] = s.seq.length
|
39
|
+
sequences[s_name][:conserved] = false
|
40
|
+
sequences[s_name][:contig] = s_name.split("_")[0..-2].join("_") if s_name.include? "_"
|
41
|
+
end
|
42
|
+
|
43
|
+
sequences
|
44
|
+
|
45
|
+
end
|
46
|
+
|
24
47
|
# run blat on proteins
|
25
48
|
def run_blat root, outdir
|
26
49
|
base_cmd = "#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity}"
|
@@ -32,9 +55,98 @@ class SyntenyManip
|
|
32
55
|
# extract_hits
|
33
56
|
end # end of method
|
34
57
|
|
58
|
+
|
59
|
+
# Extract Hit from blast8 file and save it in hash
|
60
|
+
# contig-0_1 ABJ71957.1 96.92 65 2 0 1 65 1 65 9.2e-31 131.0
|
61
|
+
def extract_hits mode
|
62
|
+
|
63
|
+
feature = ""
|
64
|
+
File.open(@aln_file,"r") do |fread|
|
65
|
+
while l = fread.gets
|
66
|
+
|
67
|
+
lA = l.chomp!.split("\t")
|
68
|
+
key = lA[0]
|
69
|
+
|
70
|
+
# extraction of hit id depends on mode ..
|
71
|
+
if mode == :refgenome
|
72
|
+
hit = lA[1]
|
73
|
+
feature = "cds"
|
74
|
+
elsif mode == :externaldb
|
75
|
+
# hit = lA[1].chomp.split("|")[3]
|
76
|
+
hit = lA[1]
|
77
|
+
feature = "cds"
|
78
|
+
end
|
79
|
+
|
80
|
+
# compute coverage based on sequences length
|
81
|
+
cov_query = (lA[3].to_f/@query_sequences[key][:length]).round(2)
|
82
|
+
cov_subject = (lA[3].to_f/@subject_sequences[hit][:length]).round(2)
|
83
|
+
|
84
|
+
# assert cutoff on identity and coverage
|
85
|
+
# 1 -> pass cutoff, 0 under cutoff
|
86
|
+
assert_cutoff = [1,1,1]
|
87
|
+
assert_cutoff[0] = 0 if lA[2].to_f < @pidentity
|
88
|
+
assert_cutoff[1] = 0 if cov_query < @min_coverage
|
89
|
+
assert_cutoff[2] = 0 if cov_subject < @min_coverage
|
90
|
+
|
91
|
+
# first hit for query
|
92
|
+
if ! @query_sequences[key].has_key? :homology
|
93
|
+
@query_sequences[key][:conserved] = true
|
94
|
+
@subject_sequences[key][:conserved] = true
|
95
|
+
@query_sequences[key][:homology] = {
|
96
|
+
pId: lA[2].to_f.round(2),
|
97
|
+
cov_query: cov_query,
|
98
|
+
cov_subject: cov_subject,
|
99
|
+
evalue: lA[10],
|
100
|
+
score: lA[11].to_f,
|
101
|
+
hits: [hit],
|
102
|
+
length: [lA[3].to_i],
|
103
|
+
query_location: [[lA[6].to_i,lA[7].to_i]],
|
104
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
105
|
+
feature: feature,
|
106
|
+
assert_cutoff: assert_cutoff
|
107
|
+
}
|
108
|
+
@subject_sequences[hit][:hits] = [key]
|
109
|
+
|
110
|
+
# query already got at least 1 hit and new_score > last_score
|
111
|
+
elsif lA[11].to_f > @query_sequences[key][:homology][:score]
|
112
|
+
@query_sequences[key][:conserved] = true
|
113
|
+
@subject_sequences[key][:conserved] = true
|
114
|
+
@query_sequences[key][:homology] = {
|
115
|
+
pId: lA[2].to_f.round(2),
|
116
|
+
cov_query: cov_query,
|
117
|
+
cov_subject: cov_subject,
|
118
|
+
evalue: lA[10],
|
119
|
+
score: lA[11].to_f,
|
120
|
+
hits: [hit],
|
121
|
+
length: [lA[3].to_i],
|
122
|
+
query_location: [[lA[6].to_i,lA[7].to_i]],
|
123
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]],
|
124
|
+
feature: feature,
|
125
|
+
assert_cutoff: assert_cutoff
|
126
|
+
}
|
127
|
+
@subject_sequences[hit][:hits] = [key]
|
128
|
+
|
129
|
+
# query already got at least 1 hit and score == last_score
|
130
|
+
elsif lA[11].to_f == @query_sequences[key][:homology][:score]
|
131
|
+
@query_sequences[key][:homology][:hits] << hit
|
132
|
+
@query_sequences[key][:homology][:length] << lA[3].to_i
|
133
|
+
@query_sequences[key][:homology][:query_location] << [lA[6].to_i,lA[7].to_i]
|
134
|
+
@query_sequences[key][:homology][:subject_location] << [lA[8].to_i,lA[9].to_i]
|
135
|
+
if @subject_sequences[hit].has_key? :hits
|
136
|
+
@subject_sequences[hit][:hits] << key
|
137
|
+
else
|
138
|
+
@subject_sequences[hit][:hits] = [key]
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
end # end of method
|
145
|
+
|
146
|
+
|
35
147
|
# Extract Hit from blast8 file and save it in hash
|
36
148
|
# contig-0_1 ABJ71957.1 96.92 65 2 0 1 65 1 65 9.2e-31 131.0
|
37
|
-
def extract_hits_prodigal mode
|
149
|
+
def extract_hits_prodigal mode
|
38
150
|
|
39
151
|
@aln_hits = {}
|
40
152
|
feature = ""
|
@@ -49,8 +161,8 @@ class SyntenyManip
|
|
49
161
|
hit = lA[1].chomp.split("|")[3]
|
50
162
|
feature = "cds"
|
51
163
|
end
|
164
|
+
next if lA[2].to_f < @pidentity
|
52
165
|
if ! @aln_hits.has_key? key
|
53
|
-
next if lA[2].to_f < @pidentity
|
54
166
|
@aln_hits[key] = {
|
55
167
|
pId: lA[2].to_f.round(2),
|
56
168
|
evalue: lA[10],
|
@@ -99,10 +211,12 @@ class SyntenyManip
|
|
99
211
|
feature = hit_split[1]
|
100
212
|
product = hit_split[2]
|
101
213
|
end
|
214
|
+
next if lA[2].to_f < @pidentity
|
102
215
|
if ! @aln_hits.has_key? key
|
103
|
-
next if lA[2].to_f < @pidentity
|
104
216
|
@aln_hits[key] = {
|
105
217
|
pId: lA[2].to_f.round(2),
|
218
|
+
# cov_query: (@query_sequences[key][:length]/lA[3].to_f).round(2),
|
219
|
+
# cov_subject: (@subject_sequences[hit][:length]/lA[3].to_f).round(2),
|
106
220
|
evalue: lA[10],
|
107
221
|
score: lA[11].to_f,
|
108
222
|
hits: [hit],
|
@@ -115,6 +229,8 @@ class SyntenyManip
|
|
115
229
|
elsif lA[11].to_f > @aln_hits[key][:score]
|
116
230
|
@aln_hits[key] = {
|
117
231
|
pId: lA[2].to_f.round(2),
|
232
|
+
# cov_query: (@query_sequences[key][:length]/lA[3].to_f).round(2),
|
233
|
+
# cov_subject: (@subject_sequences[hit][:length]/lA[3].to_f).round(2),
|
118
234
|
evalue: lA[10],
|
119
235
|
score: lA[11].to_f,
|
120
236
|
hits: [hit],
|
@@ -135,7 +251,7 @@ class SyntenyManip
|
|
135
251
|
end
|
136
252
|
end
|
137
253
|
|
138
|
-
prune_aln_hits @aln_hits
|
254
|
+
# prune_aln_hits @aln_hits
|
139
255
|
|
140
256
|
end # end of method
|
141
257
|
|
@@ -178,6 +294,10 @@ class SyntenyManip
|
|
178
294
|
annotations[p][:length] = @aln_hits[p][:length][hit_index]
|
179
295
|
i+=1
|
180
296
|
|
297
|
+
File.open("debug-annotation-by-contig.txt","a") do |fout|
|
298
|
+
fout.write("#{p} #{@aln_hits[p][:pId]} #{@aln_hits[p][:cov_query]} #{@aln_hits[p][:cov_subject]} #{ref_cds[h][:product]}\n")
|
299
|
+
end
|
300
|
+
|
181
301
|
else
|
182
302
|
|
183
303
|
annotations[p] = nil
|