bacterial-annotator 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/bacterial-annotator +39 -29
- data/lib/bacterial-annotator/sequence-annotation.rb +209 -30
- data/lib/bacterial-annotator/sequence-fasta.rb +21 -18
- data/lib/bacterial-annotator/sequence-synteny.rb +77 -20
- data/lib/bacterial-annotator.rb +201 -64
- data/lib/bacterial-comparator.rb +42 -26
- data/lib/bacterial-identificator.rb +86 -13
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 10f3d2469fb3aaf64e6b84076e05ab9e1ae41cd6
|
4
|
+
data.tar.gz: f08a5465ce584dd888074c7d0146c1450386598e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bd006cf021f0a74f1e98fa6367ca4aca0abb36004f375654ec552b68e1ac8ebc5c1f65e38a480473551848d25ac6be904c5d1841cc60657a47384169d368a18c
|
7
|
+
data.tar.gz: b5a8cb5c74c028e813bbc585e70b6dcb420b8c8f4ad659e8e4c985bce868009a7f5d6015c4e396768a6146c918943e4586a638c084d844cc91be6ac927c993b6
|
data/bin/bacterial-annotator
CHANGED
@@ -63,27 +63,28 @@ def usage_annotate
|
|
63
63
|
annotate [OPTIONS]
|
64
64
|
|
65
65
|
// IO
|
66
|
-
--input/-i
|
67
|
-
--outdir/-o
|
68
|
-
--
|
69
|
-
--
|
66
|
+
--input/-i <fasta_file> Provide the fasta file to annotate
|
67
|
+
--outdir/-o <outdir> Output directory [default=BAnnotation]
|
68
|
+
--name/-n <name> Sample name
|
69
|
+
--force/-f Force to overwrite the output directory
|
70
70
|
|
71
71
|
// MERGEM-based Annotation (Recommended)
|
72
|
-
--db/-d
|
72
|
+
--db/-d <species_dir> From MERGEM database (include CDS and RNAs fasta)
|
73
|
+
// see bacteriapps.genome.ulaval.ca/mergem
|
73
74
|
|
74
75
|
// Reference-Based Annotation
|
75
|
-
--refgenome/-g
|
76
|
-
--externaldb
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
--
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
--meta
|
76
|
+
--refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
|
77
|
+
--externaldb <fasta_file> Finish or do a complete annotation with this sequence database (protein fasta file).
|
78
|
+
Fasta headers need to look similar to NCBI or EBI fasta headers
|
79
|
+
EX: >gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
|
80
|
+
>sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
|
81
|
+
|
82
|
+
// Options
|
83
|
+
--pidentity <% identity> Minimum percentage identity to incorporate a CDS annotation [default=0.8]
|
84
|
+
--pcoverage <% identity> Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.8]
|
85
|
+
// otherwise hint for a non-functional protein
|
86
|
+
--minlength <length> Minimum contig length for annotation [default=500]
|
87
|
+
--meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
|
87
88
|
|
88
89
|
OEM
|
89
90
|
|
@@ -101,6 +102,11 @@ def parseOptions_annotate
|
|
101
102
|
options[:minlength] = 500
|
102
103
|
options[:meta] = 0
|
103
104
|
|
105
|
+
if ARGV.length == 0
|
106
|
+
usage_annotate
|
107
|
+
abort
|
108
|
+
end
|
109
|
+
|
104
110
|
while x = ARGV.shift
|
105
111
|
|
106
112
|
case x.downcase
|
@@ -224,12 +230,14 @@ def usage_identify
|
|
224
230
|
|
225
231
|
identify [OPTIONS] genome_1.fasta genome_2.fasta genome_x.fasta
|
226
232
|
|
227
|
-
//
|
228
|
-
--
|
233
|
+
//Mash Sketch
|
234
|
+
--mash/-m <mash sketch file>
|
229
235
|
|
230
236
|
//IO
|
231
237
|
--proc <nb of process> Number of process to run the comparison
|
232
238
|
|
239
|
+
--output [csv,tsv|json]
|
240
|
+
|
233
241
|
OEM
|
234
242
|
|
235
243
|
end
|
@@ -238,21 +246,24 @@ def parseOptions_identify
|
|
238
246
|
|
239
247
|
options = {}
|
240
248
|
options[:proc] = 2
|
241
|
-
options[:
|
249
|
+
options[:genome_list] = []
|
250
|
+
options[:output] = "tsv"
|
242
251
|
|
243
252
|
while x = ARGV.shift
|
244
253
|
|
245
254
|
case x.downcase
|
246
|
-
when "--
|
247
|
-
options[:
|
255
|
+
when "--mash", "-m"
|
256
|
+
options[:mash_file] = ARGV.shift
|
248
257
|
when "--proc", "-p"
|
249
258
|
options[:proc] = ARGV.shift
|
259
|
+
when "--output", "-o"
|
260
|
+
options[:output] = ARGV.shift
|
250
261
|
when "--help", "-h"
|
251
262
|
usage_identify
|
252
263
|
abort
|
253
264
|
else
|
254
265
|
if File.exists? "#{x}"
|
255
|
-
options[:
|
266
|
+
options[:genome_list] << x
|
256
267
|
else
|
257
268
|
puts "#{x} file doesn't exist"
|
258
269
|
usage_identify
|
@@ -302,14 +313,14 @@ if ARGV.size >= 1
|
|
302
313
|
|
303
314
|
# Check Options
|
304
315
|
if ! options.has_key? :refgenome and
|
305
|
-
! options.has_key? :external_db
|
316
|
+
! options.has_key? :external_db and
|
317
|
+
! options.has_key? :mergem
|
306
318
|
puts "You didn't provide a reference genome or a database for the annotation !"
|
307
319
|
elsif ! options.has_key? :input
|
308
320
|
puts "You didn't provide a fasta file to annotate !"
|
309
321
|
end
|
310
322
|
|
311
323
|
bannot = BacterialAnnotator.new(options, ROOT)
|
312
|
-
bannot.prepare_files_for_annotation
|
313
324
|
bannot.run_annotation
|
314
325
|
|
315
326
|
elsif ARGV[0] == "compare"
|
@@ -317,20 +328,19 @@ if ARGV.size >= 1
|
|
317
328
|
ARGV.shift
|
318
329
|
options = parseOptions_compare
|
319
330
|
bcomp = BacterialComparator.new(options, ROOT)
|
320
|
-
|
321
|
-
bcomp.mafft_aln aln_opt
|
322
|
-
bcomp.raxml_tree aln_opt, options[:bootstrap] if options[:phylogeny] == 1
|
331
|
+
bcomp.run_comparison
|
323
332
|
|
324
333
|
elsif ARGV[0] == "identify"
|
325
334
|
|
326
335
|
ARGV.shift
|
327
336
|
options = parseOptions_identify
|
328
|
-
if options[:
|
337
|
+
if options[:genome_list].empty?
|
329
338
|
puts "You need at least 1 genome fasta to identify !!"
|
330
339
|
usage_identify
|
331
340
|
abort
|
332
341
|
end
|
333
342
|
bident = BacterialIdentificator.new(options, ROOT)
|
343
|
+
bident.run_identification
|
334
344
|
|
335
345
|
elsif ARGV[0] == "--version" or ARGV[0] == "-v"
|
336
346
|
|
@@ -5,27 +5,208 @@
|
|
5
5
|
# version: 0.0.1
|
6
6
|
# licence:
|
7
7
|
|
8
|
+
require 'json'
|
9
|
+
require 'zlib'
|
8
10
|
|
9
11
|
|
10
12
|
class SequenceAnnotation
|
11
13
|
|
12
|
-
attr_accessor :gbk, :coding_seq, :cds_file, :rna_file
|
14
|
+
attr_accessor :gbk, :coding_seq, :rna_seq, :cds_file, :rna_file
|
13
15
|
|
14
16
|
# Initialize the genbank file
|
15
|
-
def initialize
|
17
|
+
# Build an annotation holder and load it from file_ref.
#
# root     - kept in @root
# outdir   - kept in @outdir
# file_ref - handed to the loader selected by type
# type     - "refGbk" (reference genome used for annotation),
#            "db"     (reference database used for annotation),
#            "fasta"  (single fasta database, for completion),
#            "newGbk" (new genbank holder to be annotated);
#            any other value loads nothing
def initialize root, outdir, file_ref, type

  @root, @outdir = root, outdir
  @coding_seq = {}
  @rna_seq = {}

  loaders = {
    "refGbk" => :reference_gbk,
    "db"     => :reference_db,
    "fasta"  => :single_fasta,
    "newGbk" => :new_gbk
  }

  loader = loaders[type]
  send(loader, file_ref) if loader

end
|
40
|
+
|
41
|
+
|
42
|
+
# Use a MERGEM database to get annotation from it
#
# dir - MERGEM species directory. This method reads "cds.json.gz" and
#       "rnas.txt" from it and records the paths of "cds.dmnd" and
#       "rnas.fasta" in @cds_file / @rna_file (those two are not opened here).
#
# Fills @coding_seq (keyed by cluster_id) and @rna_seq (keyed by the first
# token of each rnas.txt line, without ">").
# Aborts when dir doesn't exist.
def reference_db dir

  # File.exist? instead of File.exists? (the latter was removed in Ruby 3.2)
  abort "Aborting: Can't find MERGEM db directory" if ! File.exist? dir

  @cds_file = "#{dir}/cds.dmnd"
  @rna_file = "#{dir}/rnas.fasta"

  json_genes = {}
  Zlib::GzipReader.open("#{dir}/cds.json.gz") {|gz|
    json_genes = JSON.parse(gz.read)
  }

  json_genes.each do |gene|

    prot_id = gene["cluster_id"]
    @coding_seq[prot_id] = {
      protId: prot_id,
      location: nil,
      product: gene["consensus_name"],
      length: gene["consensus_length"]
    }

  end

  # Legacy plain-text loader, kept for reference:
  # File.open("#{dir}/cds.txt") do |f|
  #   while l = f.gets
  #     lA = l.chomp.split(" ")
  #     @coding_seq[lA[0].gsub(">","")] = {
  #       protId: lA[0].gsub(">",""),
  #       location: nil,
  #       product: lA[1..-1].join(' '),
  #     }
  #   end
  # end

  File.open("#{dir}/rnas.txt") do |f|
    while l = f.gets
      lA = l.chomp.split(" ")
      # a blank line has no id token; skipping it avoids calling gsub on nil
      next if lA.empty?
      rna_id = lA[0].gsub(">","")
      @rna_seq[rna_id] = {
        protId: rna_id,
        location: nil,
        product: lA[1..-1].join(' '),
      }
    end
  end

end
|
90
|
+
|
91
|
+
# Use a Genbank Reference and read annotation from it
|
92
|
+
def reference_gbk gbk_file
|
93
|
+
|
94
|
+
puts "# Preparing reference genome files.."
|
95
|
+
if ! File.exists? gbk_file
|
96
|
+
fetch_ncbi_genome(gbk_file)
|
97
|
+
gbk_file = "#{@outdir}/#{gbk_file}.gbk"
|
98
|
+
# gbk_file += ".gbk"
|
99
|
+
end
|
100
|
+
|
101
|
+
flat_gbk = Bio::FlatFile.auto(gbk_file)
|
25
102
|
|
26
103
|
# Check if gbk is valid
|
27
104
|
if flat_gbk.dbclass != Bio::GenBank
|
28
|
-
abort "Aborting : The input #{
|
105
|
+
abort "Aborting : The input #{gbk_file} is not a valid genbank file !"
|
106
|
+
else
|
107
|
+
@gbk = flat_gbk.next_entry
|
108
|
+
end
|
109
|
+
|
110
|
+
@bioseq = @gbk.to_biosequence
|
111
|
+
|
112
|
+
write_cds_to_file
|
113
|
+
write_rna_to_file
|
114
|
+
|
115
|
+
end
|
116
|
+
|
117
|
+
# Use a single protein fasta file as an annotation database.
# Only the header lines (starting with ">") are read; sequence lines are ignored.
#
# For every header, @coding_seq[<id>] is set to a hash with :product, :org,
# :prot_id and :db_source, where <id> is the first whitespace-delimited token
# of the header without its leading ">".
#
# fasta_file - path of the fasta file
# RETURN : "" when fasta_file doesn't exist
def single_fasta fasta_file

  # File.exist? instead of File.exists? (the latter was removed in Ruby 3.2)
  return "" if ! File.exist? fasta_file

  File.open(fasta_file, "r") do |dbfile|

    while l = dbfile.gets

      next if l[0] != ">"

      header = l.chomp
      lA = header.split("|")
      key_gi = header.split(" ")[0][1..-1]
      nb_pipes = key_gi.count("|")

      # reset for every header so nothing leaks from the previous entry
      organism = ""
      product = ""
      prot_id = nil
      db_source = "[DBSource]"

      if lA.length > 1 # refseq, ncbi, trembl, swissprot

        product_long = lA[-1]

        # product_long is the text after the last "|", so it can never hold a
        # pipe itself: the pipe count has to come from the header id.
        if nb_pipes >= 5 # FROM BIORUBY SCRIPTS
          product = product_long
          db_source = "RefSeq"
        elsif product_long.include?(" [") && product_long.include?("]") # NCBI
          organism = product_long[/\[.*?\]/].to_s
          product = product_long.split(" [")[0].to_s.strip
        elsif product_long.include? "OS=" # Swissprot / TrEMBL
          product_tmp = product_long.split("OS=")
          organism = product_tmp[1].to_s.split(/[A-Z][A-Z]=/)[0].to_s.strip
          product = product_tmp[0].to_s.strip
        elsif product_long =~ /[A-Z][A-Z]=/ # NCBI
          product = product_long.split(/[A-Z][A-Z]=/)[0].to_s.strip
        else
          product = product_long
        end

        if nb_pipes == 4
          db_source = "RefSeq" if lA[2] == "ref"
          prot_id = lA[3]
        elsif nb_pipes == 2
          db_source = "UniProtKB" if lA[0].include?("sp") || lA[0].include?("tr")
          prot_id = lA[1]
        elsif nb_pipes == 5
          db_source = "RefSeq"
          prot_id = lA[2]
        end

      else # mergem / plain header: ">id description"

        # NOTE(review): the header id is used as prot_id here - confirm this
        # is what downstream consumers expect for pipe-less headers.
        prot_id = key_gi
        product = header.split(" ")[1..-1].to_a.join(" ")

      end

      @coding_seq[key_gi] = { product: product.lstrip,
                              org: organism.gsub("[","").gsub("]",""),
                              prot_id: prot_id,
                              db_source: db_source }

    end

  end

end
|
194
|
+
|
195
|
+
|
196
|
+
# New Genbank Holder to add annotation to it
|
197
|
+
def new_gbk gbk_file
|
198
|
+
|
199
|
+
if ! File.exists? gbk_file
|
200
|
+
fetch_ncbi_genome(gbk_file)
|
201
|
+
gbk_file = "#{@outdir}/#{gbk_file}.gbk"
|
202
|
+
# gbk_file += ".gbk"
|
203
|
+
end
|
204
|
+
|
205
|
+
flat_gbk = Bio::FlatFile.auto(gbk_file)
|
206
|
+
|
207
|
+
# Check if gbk is valid
|
208
|
+
if flat_gbk.dbclass != Bio::GenBank
|
209
|
+
abort "Aborting : The input #{gbk_file} is not a valid genbank file !"
|
29
210
|
else
|
30
211
|
@gbk = flat_gbk.next_entry
|
31
212
|
end
|
@@ -38,9 +219,7 @@ class SequenceAnnotation
|
|
38
219
|
# Prepare CDS/proteins
|
39
220
|
def get_cds
|
40
221
|
|
41
|
-
if @coding_seq
|
42
|
-
|
43
|
-
@coding_seq = {}
|
222
|
+
if @coding_seq.empty?
|
44
223
|
|
45
224
|
# Iterate over each CDS
|
46
225
|
@gbk.each_cds do |ft|
|
@@ -74,7 +253,7 @@ class SequenceAnnotation
|
|
74
253
|
product: product[0],
|
75
254
|
bioseq: pepBioSeq,
|
76
255
|
bioseq_gene: dnaBioSeq,
|
77
|
-
|
256
|
+
length: pepBioSeq.length
|
78
257
|
}
|
79
258
|
|
80
259
|
end
|
@@ -88,12 +267,12 @@ class SequenceAnnotation
|
|
88
267
|
# Prepare rRNA tRNA
|
89
268
|
def get_rna
|
90
269
|
|
91
|
-
if @rna_seq
|
270
|
+
if @rna_seq.empty?
|
92
271
|
|
93
272
|
@rna_seq = {}
|
94
273
|
@gbk.features do |ft|
|
95
274
|
|
96
|
-
next if ! ft.feature.to_s.include? "
|
275
|
+
next if ! ft.feature.to_s.include? "rRNA"
|
97
276
|
|
98
277
|
ftH = ft.to_hash
|
99
278
|
loc = ft.locations
|
@@ -129,20 +308,19 @@ class SequenceAnnotation
|
|
129
308
|
|
130
309
|
end
|
131
310
|
|
132
|
-
|
133
311
|
# Print CDS to files
|
134
312
|
# RETURN : cds_file path
|
135
|
-
def write_cds_to_file
|
313
|
+
def write_cds_to_file
|
136
314
|
|
137
315
|
cds_file = "#{@gbk.accession}.pep"
|
138
316
|
dna_file = "#{@gbk.accession}.dna"
|
139
317
|
|
140
|
-
if @coding_seq
|
318
|
+
if @coding_seq.empty?
|
141
319
|
get_cds
|
142
320
|
end
|
143
321
|
|
144
|
-
dna_out = File.open("#{outdir}/#{dna_file}", "w")
|
145
|
-
File.open("#{outdir}/#{cds_file}", "w") do |fwrite|
|
322
|
+
dna_out = File.open("#{@outdir}/#{dna_file}", "w")
|
323
|
+
File.open("#{@outdir}/#{cds_file}", "w") do |fwrite|
|
146
324
|
@coding_seq.each_key do |k|
|
147
325
|
seqout = @coding_seq[k][:bioseq].output_fasta("#{k}",60)
|
148
326
|
seqout_dna = @coding_seq[k][:bioseq_gene].output_fasta("#{k}",60)
|
@@ -152,28 +330,28 @@ class SequenceAnnotation
|
|
152
330
|
end
|
153
331
|
dna_out.close
|
154
332
|
|
155
|
-
@cds_file = "#{outdir}/" + cds_file
|
333
|
+
@cds_file = "#{@outdir}/" + cds_file
|
156
334
|
|
157
335
|
end
|
158
336
|
|
159
337
|
# Print RNA to files
|
160
338
|
# RETURN : rna_file path
|
161
|
-
def write_rna_to_file
|
339
|
+
def write_rna_to_file
|
162
340
|
|
163
341
|
rna_file = "#{@gbk.accession}.rna"
|
164
342
|
|
165
|
-
if @rna_seq
|
343
|
+
if @rna_seq.empty?
|
166
344
|
get_rna
|
167
345
|
end
|
168
346
|
|
169
|
-
File.open("#{outdir}/#{rna_file}", "w") do |fwrite|
|
347
|
+
File.open("#{@outdir}/#{rna_file}", "w") do |fwrite|
|
170
348
|
@rna_seq.each_key do |k|
|
171
349
|
seqout_dna = @rna_seq[k][:bioseq_gene].output_fasta("#{k}|#{@rna_seq[k][:type]}|#{@rna_seq[k][:product]}",60)
|
172
350
|
fwrite.write(seqout_dna)
|
173
351
|
end
|
174
352
|
end
|
175
353
|
|
176
|
-
@rna_file = "#{outdir}/" + rna_file
|
354
|
+
@rna_file = "#{@outdir}/" + rna_file
|
177
355
|
|
178
356
|
end
|
179
357
|
|
@@ -247,6 +425,7 @@ class SequenceAnnotation
|
|
247
425
|
|
248
426
|
# check if there is a reference genome.. reference_locus shouldn't be nil in that case
|
249
427
|
if locus != nil
|
428
|
+
|
250
429
|
qNote = Bio::Feature::Qualifier.new('note', "corresponds to #{locus} locus (AA identity: #{pId}%; coverage(q,s): #{cov_query}%,#{cov_subject}%) from #{ref_genome}")
|
251
430
|
ftArray.push(qNote)
|
252
431
|
|
@@ -390,9 +569,9 @@ class SequenceAnnotation
|
|
390
569
|
end
|
391
570
|
|
392
571
|
|
393
|
-
def save_genbank_to_file
|
572
|
+
def save_genbank_to_file
|
394
573
|
|
395
|
-
File.open("#{outdir}/#{@gbk.definition}.gbk", "w") do |f|
|
574
|
+
File.open("#{@outdir}/#{@gbk.definition}.gbk", "w") do |f|
|
396
575
|
f.write(@gbk.to_biosequence.output(:genbank))
|
397
576
|
end
|
398
577
|
|
@@ -403,7 +582,7 @@ class SequenceAnnotation
|
|
403
582
|
###################
|
404
583
|
|
405
584
|
# Fct: Get dna sequence
|
406
|
-
def get_DNA
|
585
|
+
def get_DNA cds, seq
|
407
586
|
loc = cds.locations
|
408
587
|
sbeg = loc[0].from.to_i
|
409
588
|
send = loc[0].to.to_i
|
@@ -418,11 +597,11 @@ class SequenceAnnotation
|
|
418
597
|
|
419
598
|
|
420
599
|
# Fetch genbank genome from NCBI
|
421
|
-
def fetch_ncbi_genome refgenome_id
|
600
|
+
def fetch_ncbi_genome refgenome_id
|
422
601
|
Bio::NCBI.default_email = 'default@default.com'
|
423
602
|
ncbi = Bio::NCBI::REST.new
|
424
603
|
genbankstring = ncbi.efetch(refgenome_id, {"db"=>'nucleotide', "rettype"=>'gb'})
|
425
|
-
File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
|
604
|
+
File.open("#{@outdir}/#{refgenome_id}.gbk", "w") do |f|
|
426
605
|
f.write(genbankstring)
|
427
606
|
end
|
428
607
|
end
|
@@ -13,8 +13,10 @@ class SequenceFasta
|
|
13
13
|
attr_reader :fasta_flat, :fasta_file, :annotation_files
|
14
14
|
|
15
15
|
# Initialize fasta holder
|
16
|
-
def initialize fasta_file, meta
|
16
|
+
def initialize root, outdir, fasta_file, meta
|
17
17
|
|
18
|
+
@root = root
|
19
|
+
@outdir = outdir
|
18
20
|
@fasta_file = fasta_file
|
19
21
|
@fasta_flat = Bio::FlatFile.auto(@fasta_file)
|
20
22
|
|
@@ -32,29 +34,29 @@ class SequenceFasta
|
|
32
34
|
|
33
35
|
|
34
36
|
# Run prodigal on the genome to annotate
|
35
|
-
def run_prodigal
|
37
|
+
def run_prodigal
|
36
38
|
|
37
39
|
@annotation_files = {}
|
38
|
-
Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
|
40
|
+
Dir.mkdir "#{@outdir}" if ! Dir.exists? "#{@outdir}"
|
39
41
|
if @meta==1
|
40
|
-
system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
42
|
+
system("#{@root}/prodigal.linux -p meta -i #{@fasta_file} -a #{@outdir}/Proteins.fa -d #{@outdir}/Genes.fa -o #{@outdir}/Genbanks.gbk -q")
|
41
43
|
else
|
42
|
-
system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
44
|
+
system("#{@root}/prodigal.linux -i #{@fasta_file} -a #{@outdir}/Proteins.fa -d #{@outdir}/Genes.fa -o #{@outdir}/Genbanks.gbk -q")
|
43
45
|
end
|
44
46
|
|
45
47
|
@annotation_files = {
|
46
|
-
multiGBK: "#{outdir}/Genbanks.gbk",
|
48
|
+
multiGBK: "#{@outdir}/Genbanks.gbk",
|
47
49
|
contigs: [],
|
48
50
|
contigs_length: [],
|
49
|
-
genes: "#{outdir}/Genes.fa",
|
50
|
-
proteins: "#{outdir}/Proteins.fa",
|
51
|
+
genes: "#{@outdir}/Genes.fa",
|
52
|
+
proteins: "#{@outdir}/Proteins.fa",
|
51
53
|
prot_ids_by_contig: {},
|
52
|
-
fasta_path: "#{outdir}/single-fasta/",
|
53
|
-
gbk_path: "#{outdir}/single-genbank/"
|
54
|
+
fasta_path: "#{@outdir}/single-fasta/",
|
55
|
+
gbk_path: "#{@outdir}/single-genbank/"
|
54
56
|
}
|
55
57
|
|
56
|
-
split_fasta
|
57
|
-
split_genbank
|
58
|
+
split_fasta
|
59
|
+
split_genbank
|
58
60
|
extract_cds_names
|
59
61
|
@annotation_files
|
60
62
|
|
@@ -63,14 +65,14 @@ class SequenceFasta
|
|
63
65
|
|
64
66
|
# Split Multi Fasta file
|
65
67
|
# RETURN : array of fasta files
|
66
|
-
def split_fasta
|
68
|
+
def split_fasta
|
67
69
|
@single_fasta = {}
|
68
|
-
Dir.mkdir("#{outdir}/single-fasta") if ! Dir.exists?("#{outdir}/single-fasta")
|
70
|
+
Dir.mkdir("#{@outdir}/single-fasta") if ! Dir.exists?("#{@outdir}/single-fasta")
|
69
71
|
@fasta_flat.each_entry do |seq|
|
70
72
|
file_name = seq.definition.chomp.split(" ")[0]
|
71
73
|
@annotation_files[:contigs] << "#{file_name}"
|
72
74
|
@annotation_files[:contigs_length] << seq.seq.length
|
73
|
-
File.open("#{outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
|
75
|
+
File.open("#{@outdir}/single-fasta/#{file_name}.fasta", "w") do |fwrite|
|
74
76
|
fwrite.write(seq)
|
75
77
|
end
|
76
78
|
@single_fasta[file_name] = seq
|
@@ -80,9 +82,10 @@ class SequenceFasta
|
|
80
82
|
|
81
83
|
# Split Multi Genbanks file
|
82
84
|
# RETURN : array of genbank files
|
83
|
-
def split_genbank
|
85
|
+
def split_genbank
|
84
86
|
|
85
|
-
|
87
|
+
multigbk = "#{@outdir}/Genbanks.gbk"
|
88
|
+
Dir.mkdir("#{@outdir}/single-genbank")if ! Dir.exists?("#{@outdir}/single-genbank")
|
86
89
|
File.open(multigbk,"r") do |f|
|
87
90
|
fopen = nil
|
88
91
|
while l = f.gets
|
@@ -96,7 +99,7 @@ class SequenceFasta
|
|
96
99
|
year = date.year
|
97
100
|
locus = "LOCUS #{file_name}#{spacer}#{seq_length.to_s} bp DNA linear BCT #{day}-#{month}-#{year}\n"
|
98
101
|
locus += "DEFINITION #{file_name}\n"
|
99
|
-
fopen = File.open("#{outdir}/single-genbank/#{file_name}.gbk", "w")
|
102
|
+
fopen = File.open("#{@outdir}/single-genbank/#{file_name}.gbk", "w")
|
100
103
|
fopen.write(locus)
|
101
104
|
elsif l[0..1] == "//"
|
102
105
|
fopen.write(outseq)
|