bacterial-annotator 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/bacterial-annotator +2 -2
- data/lib/bacterial-annotator.rb +42 -1
- data/lib/bacterial-annotator/fasta-manip.rb +11 -4
- data/lib/bacterial-annotator/genbank-manip.rb +8 -7
- data/lib/bacterial-annotator/remote-ncbi.rb +1 -1
- data/lib/bacterial-annotator/synteny-manip.rb +13 -5
- metadata +4 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4dbc3b5e016a3f24ba82ab525a0c8daa9617afec
|
4
|
+
data.tar.gz: 7c8ba281632d84d7131cf1a27e143f596e76f414
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 73f72513dc9964c46c73ef39477a9b5148b010d68449ff270f5b15f556a5e56e027a9ddb80149b2ce44e12c09466ac11267c118c97c8ea8892eb582d220588cf
|
7
|
+
data.tar.gz: 0a92c0ceec70a12500a8dcf4433d76eff7edf2e4fe7ca180151efc016ee81a3e7a6279cc633a6adaaec5936425908c3a4299742ae1571bc1970017a058a8dedb
|
data/bin/bacterial-annotator
CHANGED
@@ -64,7 +64,7 @@ def parseOptions
|
|
64
64
|
options[:pidentity] = 70
|
65
65
|
options[:minlength] = 500
|
66
66
|
options[:meta] = 0
|
67
|
-
|
67
|
+
|
68
68
|
while x = ARGV.shift
|
69
69
|
|
70
70
|
case x.downcase
|
@@ -113,7 +113,7 @@ if ARGV.size > 1
|
|
113
113
|
abort "exiting blat is missing"
|
114
114
|
end
|
115
115
|
|
116
|
-
# Check Options
|
116
|
+
# Check Options
|
117
117
|
if ! options.has_key? :refgenome and ! options.has_key? :remote_db and ! options.has_key? :external_db
|
118
118
|
puts "You didn't provide a reference genome or a database for the annotation !"
|
119
119
|
elsif ! options.has_key? :input
|
data/lib/bacterial-annotator.rb
CHANGED
@@ -106,7 +106,46 @@ class BacterialAnnotator
|
|
106
106
|
# dump foreign proteins to file
|
107
107
|
foreign_cds_file = dump_cds
|
108
108
|
|
109
|
-
|
109
|
+
# Iterate over each Ref protein and print syntheny
|
110
|
+
synteny_file = File.open("#{@outdir}/Prot-Synteny.tsv","w")
|
111
|
+
synteny_file.write("RefLocusTag\tRefProtID\tRefLength\tRefCoverage\tIdentity\tQueryGene\tQueryLength\tQueryCoverage\n")
|
112
|
+
ref_annotated = {}
|
113
|
+
@contig_annotations.each do |contig,prot_annotations|
|
114
|
+
prot_annotations.each do |key,prot|
|
115
|
+
# p key
|
116
|
+
# p prot
|
117
|
+
ref_annotated[prot[:protId]] = {key: key, length: prot[:length], pId: prot[:pId]} if prot != nil
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
@refgenome.coding_seq.each do |ref_k, ref_v|
|
122
|
+
gene = ""
|
123
|
+
coverage_ref = ""
|
124
|
+
coverage_query = ""
|
125
|
+
query_length = ""
|
126
|
+
pId = ""
|
127
|
+
if ref_annotated[ref_v[:protId]] != nil
|
128
|
+
gene = ref_annotated[ref_v[:protId]][:key]
|
129
|
+
coverage_ref = (ref_annotated[ref_v[:protId]][:length].to_f/ref_v[:bioseq].seq.length.to_f).round(2)
|
130
|
+
query_length = @fasta.prodigal_files[:prot_ids_length][gene]
|
131
|
+
coverage_query = (ref_annotated[ref_v[:protId]][:length].to_f/query_length.to_f).round(2)
|
132
|
+
pId = ref_annotated[ref_v[:protId]][:pId]
|
133
|
+
end
|
134
|
+
|
135
|
+
synteny_file.write(ref_v[:protId])
|
136
|
+
synteny_file.write("\t"+ref_v[:locustag])
|
137
|
+
synteny_file.write("\t"+ref_v[:bioseq].seq.length.to_s)
|
138
|
+
synteny_file.write("\t"+coverage_ref.to_s)
|
139
|
+
synteny_file.write("\t"+pId.to_s)
|
140
|
+
synteny_file.write("\t"+gene)
|
141
|
+
synteny_file.write("\t"+query_length.to_s)
|
142
|
+
synteny_file.write("\t"+coverage_query.to_s)
|
143
|
+
synteny_file.write("\n")
|
144
|
+
|
145
|
+
end
|
146
|
+
synteny_file.close
|
147
|
+
|
148
|
+
else # no reference genome
|
110
149
|
|
111
150
|
# no reference genome .. will process all the CDS
|
112
151
|
foreign_cds_file = @fasta.prodigal_files[:proteins]
|
@@ -122,12 +161,14 @@ class BacterialAnnotator
|
|
122
161
|
puts "\nPrinting Statistics.."
|
123
162
|
print_stats "#{@outdir}/Annotation-Stats.txt"
|
124
163
|
|
164
|
+
|
125
165
|
end # end of method
|
126
166
|
|
127
167
|
|
128
168
|
# Finishing the annotation of the remaining CDS
|
129
169
|
def finish_annotation remaining_cds_file
|
130
170
|
|
171
|
+
# only finish the annotation with an external DB
|
131
172
|
if @options.has_key? :external_db # from an external DB
|
132
173
|
|
133
174
|
db_file = @options[:external_db]
|
@@ -20,6 +20,7 @@ class FastaManip
|
|
20
20
|
@meta = meta
|
21
21
|
@prodigal_files = nil
|
22
22
|
@single_fasta = nil
|
23
|
+
@seq_info = nil
|
23
24
|
|
24
25
|
if @fasta_flat.dbclass != Bio::FastaFormat
|
25
26
|
abort "Aborting : The input sequence is not a fasta file !"
|
@@ -61,7 +62,7 @@ class FastaManip
|
|
61
62
|
file_name = seq.definition.chomp.split(" ")[0]
|
62
63
|
@prodigal_files[:contigs] << "#{file_name}"
|
63
64
|
@prodigal_files[:contigs_length] << seq.seq.length
|
64
|
-
File.open("#{outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
|
65
|
+
File.open("#{outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
|
65
66
|
fwrite.write(seq)
|
66
67
|
end
|
67
68
|
@single_fasta[file_name] = seq
|
@@ -132,17 +133,19 @@ class FastaManip
|
|
132
133
|
end
|
133
134
|
|
134
135
|
end
|
135
|
-
|
136
|
+
|
136
137
|
return outseq, sequence.length
|
137
138
|
|
138
139
|
end
|
139
140
|
|
140
141
|
|
141
|
-
# extract protein and gene names
|
142
|
+
# extract protein and gene names from prodigal... with contig numbering
|
142
143
|
def extract_cds_names
|
143
144
|
|
144
145
|
prot_ids = {}
|
146
|
+
prot_length = {}
|
145
147
|
flatfile = Bio::FlatFile.auto(@prodigal_files[:proteins])
|
148
|
+
|
146
149
|
flatfile.each_entry do |entry|
|
147
150
|
prot_id = entry.definition.split(" ")[0]
|
148
151
|
contig = prot_id.split("_")[0..-2].join("_")
|
@@ -150,6 +153,10 @@ class FastaManip
|
|
150
153
|
prot_ids[contig] = []
|
151
154
|
end
|
152
155
|
prot_ids[contig] << prot_id
|
156
|
+
|
157
|
+
# puts "Prodigal length : " + entry.seq.length.to_s
|
158
|
+
prot_length[prot_id] = entry.seq.length-1 # minus the stop codon
|
159
|
+
|
153
160
|
end
|
154
161
|
|
155
162
|
prot_ids.each do |k,prot_array|
|
@@ -157,10 +164,10 @@ class FastaManip
|
|
157
164
|
end
|
158
165
|
|
159
166
|
@prodigal_files[:prot_ids_by_contig] = prot_ids
|
167
|
+
@prodigal_files[:prot_ids_length] = prot_length
|
160
168
|
|
161
169
|
end
|
162
170
|
|
163
|
-
|
164
171
|
private :extract_cds_names # :split_fasta, :split_genbank
|
165
172
|
|
166
173
|
end
|
@@ -25,7 +25,7 @@ class GenbankManip
|
|
25
25
|
flat_gbk = Bio::FlatFile.auto(@gbk_file)
|
26
26
|
|
27
27
|
# Check if gbk is valid
|
28
|
-
if flat_gbk.dbclass != Bio::GenBank
|
28
|
+
if flat_gbk.dbclass != Bio::GenBank
|
29
29
|
abort "Aborting : The input #{@gbk_file} is not a valid genbank file !"
|
30
30
|
else
|
31
31
|
@gbk = flat_gbk.next_entry
|
@@ -67,10 +67,11 @@ class GenbankManip
|
|
67
67
|
pepBioSeq = Bio::Sequence.auto(pep)
|
68
68
|
|
69
69
|
if protId.strip == ""
|
70
|
-
protId = locustag
|
70
|
+
protId = locustag
|
71
71
|
end
|
72
72
|
|
73
|
-
@coding_seq[protId] = {
|
73
|
+
@coding_seq[protId] = {protId: protId,
|
74
|
+
location: loc,
|
74
75
|
locustag: locustag,
|
75
76
|
gene: gene[0],
|
76
77
|
product: product[0],
|
@@ -113,7 +114,7 @@ class GenbankManip
|
|
113
114
|
|
114
115
|
contig = @gbk.definition
|
115
116
|
|
116
|
-
# iterate through
|
117
|
+
# iterate through
|
117
118
|
@gbk.features.each_with_index do |cds, ft_index|
|
118
119
|
|
119
120
|
next if cds.feature != "CDS"
|
@@ -150,7 +151,7 @@ class GenbankManip
|
|
150
151
|
|
151
152
|
# check if there is a reference genome.. reference_locus shouldn't be nil in that case
|
152
153
|
if locus != nil
|
153
|
-
qNote = Bio::Feature::Qualifier.new('note', "
|
154
|
+
qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (#{pId}% identity) from #{reference_locus.entry_id}")
|
154
155
|
ftArray.push(qNote)
|
155
156
|
end
|
156
157
|
|
@@ -180,7 +181,7 @@ class GenbankManip
|
|
180
181
|
###################
|
181
182
|
# Private Methods #
|
182
183
|
###################
|
183
|
-
|
184
|
+
|
184
185
|
# Fct: Get dna sequence
|
185
186
|
def get_DNA (cds, seq)
|
186
187
|
loc = cds.locations
|
@@ -208,5 +209,5 @@ class GenbankManip
|
|
208
209
|
|
209
210
|
private :fetch_ncbi_genome, :get_DNA
|
210
211
|
|
211
|
-
|
212
|
+
|
212
213
|
end # end of Class
|
@@ -24,7 +24,7 @@ class SyntenyManip
|
|
24
24
|
def run_blat root, outdir
|
25
25
|
system("#{root}/blat.linux -out=blast8 -minIdentity=#{@pidentity} -prot #{@subject_file} #{@query_file} #{outdir}/#{@name}.blat8.tsv")
|
26
26
|
@aln_file = "#{outdir}/#{@name}.blat8.tsv"
|
27
|
-
# extract_hits
|
27
|
+
# extract_hits
|
28
28
|
end # end of method
|
29
29
|
|
30
30
|
# Extract Hit from blast8 file and save it in hash
|
@@ -45,21 +45,28 @@ class SyntenyManip
|
|
45
45
|
next if lA[2].to_f < @pidentity
|
46
46
|
@aln_hits[key] = {
|
47
47
|
pId: lA[2].to_f.round(2),
|
48
|
-
length: lA[3].to_i,
|
49
48
|
evalue: lA[10],
|
50
49
|
score: lA[11].to_f,
|
51
|
-
hits: [hit]
|
50
|
+
hits: [hit],
|
51
|
+
length: [lA[3].to_i],
|
52
|
+
query_location: [[lA[6].to_i,lA[7].to_i]],
|
53
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]]
|
52
54
|
}
|
53
55
|
elsif lA[11].to_f > @aln_hits[key][:score]
|
54
56
|
@aln_hits[key] = {
|
55
57
|
pId: lA[2].to_f.round(2),
|
56
|
-
length: lA[3].to_i,
|
57
58
|
evalue: lA[10],
|
58
59
|
score: lA[11].to_f,
|
59
|
-
hits: [hit]
|
60
|
+
hits: [hit],
|
61
|
+
length: [lA[3].to_i],
|
62
|
+
query_location: [[lA[6].to_i,lA[7].to_i]],
|
63
|
+
subject_location: [[lA[8].to_i,lA[9].to_i]]
|
60
64
|
}
|
61
65
|
elsif lA[11].to_f == @aln_hits[key][:score]
|
62
66
|
@aln_hits[key][:hits] << hit
|
67
|
+
@aln_hits[key][:length] << lA[3].to_i
|
68
|
+
@aln_hits[key][:query_location] << [lA[6].to_i,lA[7].to_i]
|
69
|
+
@aln_hits[key][:subject_location] << [lA[8].to_i,lA[9].to_i]
|
63
70
|
end
|
64
71
|
end
|
65
72
|
end
|
@@ -102,6 +109,7 @@ class SyntenyManip
|
|
102
109
|
hit = ref_cds[h]
|
103
110
|
annotations[p] = hit
|
104
111
|
annotations[p][:pId] = @aln_hits[p][:pId]
|
112
|
+
annotations[p][:length] = @aln_hits[p][:length][hit_index]
|
105
113
|
i+=1
|
106
114
|
|
107
115
|
else
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bacterial-annotator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maxime Deraspe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -70,7 +70,7 @@ files:
|
|
70
70
|
- lib/bacterial-annotator/synteny-manip.rb
|
71
71
|
homepage: http://rubygems.org/gems/bacterial-annotator
|
72
72
|
licenses:
|
73
|
-
-
|
73
|
+
- GPL-3.0
|
74
74
|
metadata: {}
|
75
75
|
post_install_message:
|
76
76
|
rdoc_options: []
|
@@ -88,9 +88,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
88
|
version: '0'
|
89
89
|
requirements: []
|
90
90
|
rubyforge_project:
|
91
|
-
rubygems_version: 2.
|
91
|
+
rubygems_version: 2.5.1
|
92
92
|
signing_key:
|
93
93
|
specification_version: 4
|
94
94
|
summary: Bacterial Annotator
|
95
95
|
test_files: []
|
96
|
-
has_rdoc:
|