bacterial-annotator 0.7.0 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/bacterial-annotator +39 -29
- data/lib/bacterial-annotator/sequence-annotation.rb +209 -30
- data/lib/bacterial-annotator/sequence-fasta.rb +21 -18
- data/lib/bacterial-annotator/sequence-synteny.rb +77 -20
- data/lib/bacterial-annotator.rb +201 -64
- data/lib/bacterial-comparator.rb +42 -26
- data/lib/bacterial-identificator.rb +86 -13
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 10f3d2469fb3aaf64e6b84076e05ab9e1ae41cd6
|
4
|
+
data.tar.gz: f08a5465ce584dd888074c7d0146c1450386598e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bd006cf021f0a74f1e98fa6367ca4aca0abb36004f375654ec552b68e1ac8ebc5c1f65e38a480473551848d25ac6be904c5d1841cc60657a47384169d368a18c
|
7
|
+
data.tar.gz: b5a8cb5c74c028e813bbc585e70b6dcb420b8c8f4ad659e8e4c985bce868009a7f5d6015c4e396768a6146c918943e4586a638c084d844cc91be6ac927c993b6
|
data/bin/bacterial-annotator
CHANGED
@@ -63,27 +63,28 @@ def usage_annotate
|
|
63
63
|
annotate [OPTIONS]
|
64
64
|
|
65
65
|
// IO
|
66
|
-
--input/-i
|
67
|
-
--outdir/-o
|
68
|
-
--
|
69
|
-
--
|
66
|
+
--input/-i <fasta_file> Provide the fasta file to annotate
|
67
|
+
--outdir/-o <outdir> Output directory [default=BAnnotation]
|
68
|
+
--name/-n <name> Sample name
|
69
|
+
--force/-f Force to overwrite the output directory
|
70
70
|
|
71
71
|
// MERGEM-based Annotation (Recommended)
|
72
|
-
--db/-d
|
72
|
+
--db/-d <species_dir> From MERGEM database (include CDS and RNAs fasta)
|
73
|
+
// see bacteriapps.genome.ulaval.ca/mergem
|
73
74
|
|
74
75
|
// Reference-Based Annotation
|
75
|
-
--refgenome/-g
|
76
|
-
--externaldb
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
--
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
--meta
|
76
|
+
--refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
|
77
|
+
--externaldb <fasta_file> Finish or do a complete annotation with this sequence database (protein fasta file).
|
78
|
+
Fasta headers need to look similar to NCBI or EBI fasta headers
|
79
|
+
EX: >gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
|
80
|
+
>sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
|
81
|
+
|
82
|
+
// Options
|
83
|
+
--pidentity <% identity> Minimum percentage identity to incorporate a CDS annotation [default=0.8]
|
84
|
+
--pcoverage <% identity> Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.8]
|
85
|
+
// otherwise hint for a non-functional protein
|
86
|
+
--minlength <length> Minimum contig length for annotation [default=500]
|
87
|
+
--meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
|
87
88
|
|
88
89
|
OEM
|
89
90
|
|
@@ -101,6 +102,11 @@ def parseOptions_annotate
|
|
101
102
|
options[:minlength] = 500
|
102
103
|
options[:meta] = 0
|
103
104
|
|
105
|
+
if ARGV.length == 0
|
106
|
+
usage_annotate
|
107
|
+
abort
|
108
|
+
end
|
109
|
+
|
104
110
|
while x = ARGV.shift
|
105
111
|
|
106
112
|
case x.downcase
|
@@ -224,12 +230,14 @@ def usage_identify
|
|
224
230
|
|
225
231
|
identify [OPTIONS] genome_1.fasta genome_2.fasta genome_x.fasta
|
226
232
|
|
227
|
-
//
|
228
|
-
--
|
233
|
+
//Mash Sketch
|
234
|
+
--mash/-m <mash sketch file>
|
229
235
|
|
230
236
|
//IO
|
231
237
|
--proc <nb of process> Number of process to run the comparison
|
232
238
|
|
239
|
+
--output [csv,tsv|json]
|
240
|
+
|
233
241
|
OEM
|
234
242
|
|
235
243
|
end
|
@@ -238,21 +246,24 @@ def parseOptions_identify
|
|
238
246
|
|
239
247
|
options = {}
|
240
248
|
options[:proc] = 2
|
241
|
-
options[:
|
249
|
+
options[:genome_list] = []
|
250
|
+
options[:output] = "tsv"
|
242
251
|
|
243
252
|
while x = ARGV.shift
|
244
253
|
|
245
254
|
case x.downcase
|
246
|
-
when "--
|
247
|
-
options[:
|
255
|
+
when "--mash", "-m"
|
256
|
+
options[:mash_file] = ARGV.shift
|
248
257
|
when "--proc", "-p"
|
249
258
|
options[:proc] = ARGV.shift
|
259
|
+
when "--output", "-o"
|
260
|
+
options[:output] = ARGV.shift
|
250
261
|
when "--help", "-h"
|
251
262
|
usage_identify
|
252
263
|
abort
|
253
264
|
else
|
254
265
|
if File.exists? "#{x}"
|
255
|
-
options[:
|
266
|
+
options[:genome_list] << x
|
256
267
|
else
|
257
268
|
puts "#{x} file doesn't exist"
|
258
269
|
usage_identify
|
@@ -302,14 +313,14 @@ if ARGV.size >= 1
|
|
302
313
|
|
303
314
|
# Check Options
|
304
315
|
if ! options.has_key? :refgenome and
|
305
|
-
! options.has_key? :external_db
|
316
|
+
! options.has_key? :external_db and
|
317
|
+
! options.has_key? :mergem
|
306
318
|
puts "You didn't provide a reference genome or a database for the annotation !"
|
307
319
|
elsif ! options.has_key? :input
|
308
320
|
puts "You didn't provide a fasta file to annotate !"
|
309
321
|
end
|
310
322
|
|
311
323
|
bannot = BacterialAnnotator.new(options, ROOT)
|
312
|
-
bannot.prepare_files_for_annotation
|
313
324
|
bannot.run_annotation
|
314
325
|
|
315
326
|
elsif ARGV[0] == "compare"
|
@@ -317,20 +328,19 @@ if ARGV.size >= 1
|
|
317
328
|
ARGV.shift
|
318
329
|
options = parseOptions_compare
|
319
330
|
bcomp = BacterialComparator.new(options, ROOT)
|
320
|
-
|
321
|
-
bcomp.mafft_aln aln_opt
|
322
|
-
bcomp.raxml_tree aln_opt, options[:bootstrap] if options[:phylogeny] == 1
|
331
|
+
bcomp.run_comparison
|
323
332
|
|
324
333
|
elsif ARGV[0] == "identify"
|
325
334
|
|
326
335
|
ARGV.shift
|
327
336
|
options = parseOptions_identify
|
328
|
-
if options[:
|
337
|
+
if options[:genome_list].empty?
|
329
338
|
puts "You need at least 1 genome fasta to identify !!"
|
330
339
|
usage_identify
|
331
340
|
abort
|
332
341
|
end
|
333
342
|
bident = BacterialIdentificator.new(options, ROOT)
|
343
|
+
bident.run_identification
|
334
344
|
|
335
345
|
elsif ARGV[0] == "--version" or ARGV[0] == "-v"
|
336
346
|
|
@@ -5,27 +5,208 @@
|
|
5
5
|
# version: 0.0.1
|
6
6
|
# licence:
|
7
7
|
|
8
|
+
require 'json'
|
9
|
+
require 'zlib'
|
8
10
|
|
9
11
|
|
10
12
|
class SequenceAnnotation
|
11
13
|
|
12
|
-
attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
|
14
|
+
attr_accessor :gbk, :coding_seq, :rna_seq, :cds_file, :rna_file
|
13
15
|
|
14
16
|
# Build an annotation holder and fill it from the requested source.
#
# root     - kept in @root
# outdir   - kept in @outdir
# file_ref - path or identifier handed to the loader selected by +type+
# type     - one of "refGbk", "db", "fasta" or "newGbk"; any other value
#            leaves the holder with empty @coding_seq / @rna_seq
def initialize root, outdir, file_ref, type

  @root, @outdir = root, outdir
  @coding_seq, @rna_seq = {}, {}

  if type == "refGbk"
    # reference genome used for annotation
    reference_gbk file_ref
  elsif type == "db"
    # reference database used for annotation
    reference_db file_ref
  elsif type == "fasta"
    # single fasta database for annotation (completion)
    single_fasta file_ref
  elsif type == "newGbk"
    # new genbank holder to be annotated
    new_gbk file_ref
  end

end
|
40
|
+
|
41
|
+
|
42
|
+
# Use a MERGEM database directory to get annotation from it.
#
# dir - directory expected to contain cds.dmnd, rnas.fasta, cds.json.gz
#       and rnas.txt
#
# Sets @cds_file and @rna_file, and fills @coding_seq (from cds.json.gz)
# and @rna_seq (from rnas.txt). Aborts the program when +dir+ is missing.
def reference_db dir

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
  abort "Aborting: Can't find MERGEM db directory" unless File.exist? dir

  @cds_file = "#{dir}/cds.dmnd"
  @rna_file = "#{dir}/rnas.fasta"

  # cds.json.gz is read as a list of gene records; only the keys
  # cluster_id, consensus_name and consensus_length are used here
  json_genes = {}
  Zlib::GzipReader.open("#{dir}/cds.json.gz") do |gz|
    json_genes = JSON.parse(gz.read)
  end

  json_genes.each do |gene|
    prot_id = gene["cluster_id"]
    @coding_seq[prot_id] = {
      protId: prot_id,
      location: nil,
      product: gene["consensus_name"],
      length: gene["consensus_length"]
    }
  end

  # rnas.txt: one ">id product words..." entry per line
  File.foreach("#{dir}/rnas.txt") do |l|
    lA = l.chomp.split(" ")
    next if lA.empty? # a blank line would otherwise crash on nil.gsub
    rna_id = lA[0].gsub(">", "")
    @rna_seq[rna_id] = {
      protId: rna_id,
      location: nil,
      product: lA[1..-1].join(' '),
    }
  end

end
|
90
|
+
|
91
|
+
# Use a Genbank reference and read annotation from it.
#
# gbk_file - path to a Genbank file; when no such file exists the value is
#            passed to fetch_ncbi_genome and the file is then expected at
#            "#{@outdir}/<gbk_file>.gbk"
#
# Sets @gbk (first entry of the file) and @bioseq, then writes the CDS and
# RNA sequences through write_cds_to_file / write_rna_to_file.
# Aborts the program when the file is not recognised as Genbank.
def reference_gbk gbk_file

  puts "# Preparing reference genome files.."

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
  unless File.exist? gbk_file
    fetch_ncbi_genome(gbk_file)
    gbk_file = "#{@outdir}/#{gbk_file}.gbk"
  end

  flat_gbk = Bio::FlatFile.auto(gbk_file)

  # Check if gbk is valid
  if flat_gbk.dbclass != Bio::GenBank
    abort "Aborting : The input #{gbk_file} is not a valid genbank file !"
  end

  @gbk = flat_gbk.next_entry
  @bioseq = @gbk.to_biosequence

  write_cds_to_file
  write_rna_to_file

end
|
116
|
+
|
117
|
+
# Use a single protein fasta database and read annotation from its headers.
#
# fasta_file - path to a protein fasta file whose headers look like NCBI
#              (">gi|..|gb|ACC| product [organism]") or UniProt
#              (">sp|ACC|NAME product OS=organism ..") headers
#
# For every header line an entry is stored in @coding_seq, keyed by the first
# whitespace-delimited token of the header (without ">"), holding :product,
# :org, :prot_id and :db_source. Sequence lines are ignored.
#
# Returns "" when the file does not exist.
def single_fasta fasta_file

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version
  return "" unless File.exist? fasta_file

  File.foreach(fasta_file) do |l|

    next unless l[0] == ">"

    header = l.chomp
    lA = header.split("|")
    key_gi = header.split(" ")[0][1..-1]

    # Reset per header so nothing leaks over from the previous entry
    organism = ""
    product = ""
    db_source = "[DBSource]"
    prot_id = nil

    if lA.length > 1 # refseq, ncbi, trembl, swissprot

      product_long = lA[-1]

      # NOTE(review): the original tested `product_long.scan(/|/).count >= 5`,
      # whose unescaped "|" matches the empty string and so was true for almost
      # every header. Counting literal pipes in the header is assumed to be the
      # intent (it agrees with the 5-pipe RefSeq case below) -- confirm.
      if header.count("|") >= 5 # FROM BIORUBY SCRIPTS
        product = product_long
        db_source = "RefSeq"
      elsif product_long.include?(" [") && product_long.include?("]") # NCBI
        organism = product_long[/\[.*?\]/].to_s
        product = product_long.split(" [")[0].to_s.strip
      elsif product_long.include?("OS=") # Swissprot / TrEMBL
        # text before "OS=" is the product; text after it, up to the next
        # two-letter "XX=" field, is the organism
        name_part, _sep, organism_part = product_long.partition("OS=")
        organism = organism_part.split(/[A-Z][A-Z]=/)[0].to_s.strip
        product = name_part.strip
      elsif product_long =~ /[A-Z][A-Z]=/ # other "XX=" annotated headers
        product = product_long.split(/[A-Z][A-Z]=/)[0].to_s.strip
      else
        product = product_long
      end

      product = product.lstrip

      # The number of pipes in the identifier token selects the id layout
      case key_gi.count("|")
      when 4 # gi|<gi>|<db>|<accession>|
        db_source = "RefSeq" if lA[2] == "ref"
        prot_id = lA[3]
      when 2 # sp|<accession>|<name> or tr|<accession>|<name>
        db_source = "UniProtKB" if %w[sp tr].include?(lA[0][1..-1])
        prot_id = lA[1]
      when 5
        db_source = "RefSeq"
        prot_id = lA[2]
      end

    else # mergem

      # Header without "|": ">id free text". The original left this branch
      # empty and then stored nil/stale values.
      # NOTE(review): using the id as prot_id is an assumption -- confirm
      # against the callers that read :prot_id.
      product = header.split(" ")[1..-1].to_a.join(" ")
      prot_id = key_gi

    end

    @coding_seq[key_gi] = { product: product,
                            org: organism.delete("[]"),
                            prot_id: prot_id,
                            db_source: db_source }

  end

end
|
194
|
+
|
195
|
+
|
196
|
+
# New Genbank Holder to add annotation to it
|
197
|
+
def new_gbk gbk_file
|
198
|
+
|
199
|
+
if ! File.exists? gbk_file
|
200
|
+
fetch_ncbi_genome(gbk_file)
|
201
|
+
gbk_file = "#{@outdir}/#{gbk_file}.gbk"
|
202
|
+
# gbk_file += ".gbk"
|
203
|
+
end
|
204
|
+
|
205
|
+
flat_gbk = Bio::FlatFile.auto(gbk_file)
|
206
|
+
|
207
|
+
# Check if gbk is valid
|
208
|
+
if flat_gbk.dbclass != Bio::GenBank
|
209
|
+
abort "Aborting : The input #{gbk_file} is not a valid genbank file !"
|
29
210
|
else
|
30
211
|
@gbk = flat_gbk.next_entry
|
31
212
|
end
|
@@ -38,9 +219,7 @@ class SequenceAnnotation
|
|
38
219
|
# Prepare CDS/proteins
|
39
220
|
def get_cds
|
40
221
|
|
41
|
-
if @coding_seq
|
42
|
-
|
43
|
-
@coding_seq = {}
|
222
|
+
if @coding_seq.empty?
|
44
223
|
|
45
224
|
# Iterate over each CDS
|
46
225
|
@gbk.each_cds do |ft|
|
@@ -74,7 +253,7 @@ class SequenceAnnotation
|
|
74
253
|
product: product[0],
|
75
254
|
bioseq: pepBioSeq,
|
76
255
|
bioseq_gene: dnaBioSeq,
|
77
|
-
|
256
|
+
length: pepBioSeq.length
|
78
257
|
}
|
79
258
|
|
80
259
|
end
|
@@ -88,12 +267,12 @@ class SequenceAnnotation
|
|
88
267
|
# Prepare rRNA tRNA
|
89
268
|
def get_rna
|
90
269
|
|
91
|
-
if @rna_seq
|
270
|
+
if @rna_seq.empty?
|
92
271
|
|
93
272
|
@rna_seq = {}
|
94
273
|
@gbk.features do |ft|
|
95
274
|
|
96
|
-
next if ! ft.feature.to_s.include? "
|
275
|
+
next if ! ft.feature.to_s.include? "rRNA"
|
97
276
|
|
98
277
|
ftH = ft.to_hash
|
99
278
|
loc = ft.locations
|
@@ -129,20 +308,19 @@ class SequenceAnnotation
|
|
129
308
|
|
130
309
|
end
|
131
310
|
|
132
|
-
|
133
311
|
# Print CDS to files
|
134
312
|
# RETURN : cds_file path
|
135
|
-
def write_cds_to_file
|
313
|
+
def write_cds_to_file
|
136
314
|
|
137
315
|
cds_file = "#{@gbk.accession}.pep"
|
138
316
|
dna_file = "#{@gbk.accession}.dna"
|
139
317
|
|
140
|
-
if @coding_seq
|
318
|
+
if @coding_seq.empty?
|
141
319
|
get_cds
|
142
320
|
end
|
143
321
|
|
144
|
-
dna_out = File.open("#{outdir}/#{dna_file}", "w")
|
145
|
-
File.open("#{outdir}/#{cds_file}", "w") do |fwrite|
|
322
|
+
dna_out = File.open("#{@outdir}/#{dna_file}", "w")
|
323
|
+
File.open("#{@outdir}/#{cds_file}", "w") do |fwrite|
|
146
324
|
@coding_seq.each_key do |k|
|
147
325
|
seqout = @coding_seq[k][:bioseq].output_fasta("#{k}",60)
|
148
326
|
seqout_dna = @coding_seq[k][:bioseq_gene].output_fasta("#{k}",60)
|
@@ -152,28 +330,28 @@ class SequenceAnnotation
|
|
152
330
|
end
|
153
331
|
dna_out.close
|
154
332
|
|
155
|
-
@cds_file = "#{outdir}/" + cds_file
|
333
|
+
@cds_file = "#{@outdir}/" + cds_file
|
156
334
|
|
157
335
|
end
|
158
336
|
|
159
337
|
# Print RNA to files
|
160
338
|
# RETURN : rna_file path
|
161
|
-
def write_rna_to_file
|
339
|
+
def write_rna_to_file
|
162
340
|
|
163
341
|
rna_file = "#{@gbk.accession}.rna"
|
164
342
|
|
165
|
-
if @rna_seq
|
343
|
+
if @rna_seq.empty?
|
166
344
|
get_rna
|
167
345
|
end
|
168
346
|
|
169
|
-
File.open("#{outdir}/#{rna_file}", "w") do |fwrite|
|
347
|
+
File.open("#{@outdir}/#{rna_file}", "w") do |fwrite|
|
170
348
|
@rna_seq.each_key do |k|
|
171
349
|
seqout_dna = @rna_seq[k][:bioseq_gene].output_fasta("#{k}|#{@rna_seq[k][:type]}|#{@rna_seq[k][:product]}",60)
|
172
350
|
fwrite.write(seqout_dna)
|
173
351
|
end
|
174
352
|
end
|
175
353
|
|
176
|
-
@rna_file = "#{outdir}/" + rna_file
|
354
|
+
@rna_file = "#{@outdir}/" + rna_file
|
177
355
|
|
178
356
|
end
|
179
357
|
|
@@ -247,6 +425,7 @@ class SequenceAnnotation
|
|
247
425
|
|
248
426
|
# check if there is a reference genome.. reference_locus shouldn't be nil in that case
|
249
427
|
if locus != nil
|
428
|
+
|
250
429
|
qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (AA identity: #{pId}%; coverage(q,s): #{cov_query}%,#{cov_subject}%) from #{ref_genome}")
|
251
430
|
ftArray.push(qNote)
|
252
431
|
|
@@ -390,9 +569,9 @@ class SequenceAnnotation
|
|
390
569
|
end
|
391
570
|
|
392
571
|
|
393
|
-
def save_genbank_to_file
|
572
|
+
def save_genbank_to_file
|
394
573
|
|
395
|
-
File.open("#{outdir}/#{@gbk.definition}.gbk", "w") do |f|
|
574
|
+
File.open("#{@outdir}/#{@gbk.definition}.gbk", "w") do |f|
|
396
575
|
f.write(@gbk.to_biosequence.output(:genbank))
|
397
576
|
end
|
398
577
|
|
@@ -403,7 +582,7 @@ class SequenceAnnotation
|
|
403
582
|
###################
|
404
583
|
|
405
584
|
# Fct: Get dna sequence
|
406
|
-
def get_DNA
|
585
|
+
def get_DNA cds, seq
|
407
586
|
loc = cds.locations
|
408
587
|
sbeg = loc[0].from.to_i
|
409
588
|
send = loc[0].to.to_i
|
@@ -418,11 +597,11 @@ class SequenceAnnotation
|
|
418
597
|
|
419
598
|
|
420
599
|
# Fetch genbank genome from NCBI
|
421
|
-
def fetch_ncbi_genome refgenome_id
|
600
|
+
def fetch_ncbi_genome refgenome_id
|
422
601
|
Bio::NCBI.default_email = 'default@default.com'
|
423
602
|
ncbi = Bio::NCBI::REST.new
|
424
603
|
genbankstring = ncbi.efetch(refgenome_id, {"db"=>'nucleotide', "rettype"=>'gb'})
|
425
|
-
File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
|
604
|
+
File.open("#{@outdir}/#{refgenome_id}.gbk", "w") do |f|
|
426
605
|
f.write(genbankstring)
|
427
606
|
end
|
428
607
|
end
|
@@ -13,8 +13,10 @@ class SequenceFasta
|
|
13
13
|
attr_reader :fasta_flat, :fasta_file, :annotation_files
|
14
14
|
|
15
15
|
# Initialize fasta holder
|
16
|
-
def initialize fasta_file, meta
|
16
|
+
def initialize root, outdir, fasta_file, meta
|
17
17
|
|
18
|
+
@root = root
|
19
|
+
@outdir = outdir
|
18
20
|
@fasta_file = fasta_file
|
19
21
|
@fasta_flat = Bio::FlatFile.auto(@fasta_file)
|
20
22
|
|
@@ -32,29 +34,29 @@ class SequenceFasta
|
|
32
34
|
|
33
35
|
|
34
36
|
# Run prodigal on the genome to annotate
|
35
|
-
def run_prodigal
|
37
|
+
def run_prodigal
|
36
38
|
|
37
39
|
@annotation_files = {}
|
38
|
-
Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
|
40
|
+
Dir.mkdir "#{@outdir}" if ! Dir.exists? "#{@outdir}"
|
39
41
|
if @meta==1
|
40
|
-
system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
42
|
+
system("#{@root}/prodigal.linux -p meta -i #{@fasta_file} -a #{@outdir}/Proteins.fa -d #{@outdir}/Genes.fa -o #{@outdir}/Genbanks.gbk -q")
|
41
43
|
else
|
42
|
-
system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
44
|
+
system("#{@root}/prodigal.linux -i #{@fasta_file} -a #{@outdir}/Proteins.fa -d #{@outdir}/Genes.fa -o #{@outdir}/Genbanks.gbk -q")
|
43
45
|
end
|
44
46
|
|
45
47
|
@annotation_files = {
|
46
|
-
multiGBK: "#{outdir}/Genbanks.gbk",
|
48
|
+
multiGBK: "#{@outdir}/Genbanks.gbk",
|
47
49
|
contigs: [],
|
48
50
|
contigs_length: [],
|
49
|
-
genes: "#{outdir}/Genes.fa",
|
50
|
-
proteins: "#{outdir}/Proteins.fa",
|
51
|
+
genes: "#{@outdir}/Genes.fa",
|
52
|
+
proteins: "#{@outdir}/Proteins.fa",
|
51
53
|
prot_ids_by_contig: {},
|
52
|
-
fasta_path: "#{outdir}/single-fasta/",
|
53
|
-
gbk_path: "#{outdir}/single-genbank/"
|
54
|
+
fasta_path: "#{@outdir}/single-fasta/",
|
55
|
+
gbk_path: "#{@outdir}/single-genbank/"
|
54
56
|
}
|
55
57
|
|
56
|
-
split_fasta
|
57
|
-
split_genbank
|
58
|
+
split_fasta
|
59
|
+
split_genbank
|
58
60
|
extract_cds_names
|
59
61
|
@annotation_files
|
60
62
|
|
@@ -63,14 +65,14 @@ class SequenceFasta
|
|
63
65
|
|
64
66
|
# Split Multi Fasta file
|
65
67
|
# RETURN : array of fasta files
|
66
|
-
def split_fasta
|
68
|
+
def split_fasta
|
67
69
|
@single_fasta = {}
|
68
|
-
Dir.mkdir("#{outdir}/single-fasta") if ! Dir.exists?("#{outdir}/single-fasta")
|
70
|
+
Dir.mkdir("#{@outdir}/single-fasta") if ! Dir.exists?("#{@outdir}/single-fasta")
|
69
71
|
@fasta_flat.each_entry do |seq|
|
70
72
|
file_name = seq.definition.chomp.split(" ")[0]
|
71
73
|
@annotation_files[:contigs] << "#{file_name}"
|
72
74
|
@annotation_files[:contigs_length] << seq.seq.length
|
73
|
-
File.open("#{outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
|
75
|
+
File.open("#{@outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
|
74
76
|
fwrite.write(seq)
|
75
77
|
end
|
76
78
|
@single_fasta[file_name] = seq
|
@@ -80,9 +82,10 @@ class SequenceFasta
|
|
80
82
|
|
81
83
|
# Split Multi Genbanks file
|
82
84
|
# RETURN : array of genbank files
|
83
|
-
def split_genbank
|
85
|
+
def split_genbank
|
84
86
|
|
85
|
-
|
87
|
+
multigbk = "#{@outdir}/Genbanks.gbk"
|
88
|
+
Dir.mkdir("#{@outdir}/single-genbank")if ! Dir.exists?("#{@outdir}/single-genbank")
|
86
89
|
File.open(multigbk,"r") do |f|
|
87
90
|
fopen = nil
|
88
91
|
while l = f.gets
|
@@ -96,7 +99,7 @@ class SequenceFasta
|
|
96
99
|
year = date.year
|
97
100
|
locus = "LOCUS #{file_name}#{spacer}#{seq_length.to_s} bp DNA linear BCT #{day}-#{month}-#{year}\n"
|
98
101
|
locus += "DEFINITION #{file_name}\n"
|
99
|
-
fopen = File.open("#{outdir}/single-genbank/#{file_name}.gbk", "w")
|
102
|
+
fopen = File.open("#{@outdir}/single-genbank/#{file_name}.gbk", "w")
|
100
103
|
fopen.write(locus)
|
101
104
|
elsif l[0..1] == "//"
|
102
105
|
fopen.write(outseq)
|