bacterial-annotator 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ba_mafft +48 -0
- data/bin/bacterial-annotator +154 -40
- data/lib/bacterial-annotator/genbank-manip.rb +19 -8
- data/lib/bacterial-comparator.rb +221 -0
- metadata +25 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 34d344cebc006441522c54fa91a021c43c180299
|
4
|
+
data.tar.gz: e3902d68c0e931a4054dddd4d36ac08b5c57ec98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e6104a764e106f3740410bd7e9a11f5d49e345b32797c854013dcc9a91fbd2540072f32fb85020896e8534eb2427fbdb6853ece60a123cd77a4f1c085fc34272
|
7
|
+
data.tar.gz: 1fc359b7523c87f66dcfd020092fc21b1691e8029bfa4684b35515f98b8f72c3160e2ef9d7aefc541f1eb91a803b1db839c02c24744d5e992fd6a2720b685b87
|
data/bin/ba_mafft
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# author: maxime déraspe
|
4
|
+
# email: maxime@deraspe.net
|
5
|
+
# review:
|
6
|
+
# date: 15-02-24
|
7
|
+
# version: 0.01
|
8
|
+
# licence:
|
9
|
+
|
10
|
+
require 'open-uri'
|
11
|
+
|
12
|
+
ROOT_path = File.dirname(__FILE__)
|
13
|
+
# mafft_url = http://mafft.cbrc.jp/alignment/software/mafft-7.222-without-extensions-src.tgz
|
14
|
+
# Install MAFFT on the user system
|
15
|
+
# Download the MAFFT 7.222 sources, build them next to this script and
# generate the `mafft.linux` launcher used by the rest of the gem.
#
# Every external command is checked for success: a failing `tar` or `make`
# aborts the installation instead of silently leaving a broken launcher.
# Aborts the process (via Kernel#abort) on any failure.
def installMafft
  # Absolute path, so nothing below depends on the current working directory.
  root    = File.expand_path(ROOT_path)
  tarball = "mafft-7.222-without-extensions-src.tgz"
  src_dir = "#{root}/mafft-7.222-without-extensions"
  wrapper = "#{root}/mafft.linux"

  # Run a command inside +dir+, discarding its stdout (as the former
  # backticks did) and raising when it does not exit successfully.
  run = lambda do |dir, *cmd|
    system(*cmd, chdir: dir, out: File::NULL) || raise("command failed: #{cmd.join(' ')}")
  end

  begin
    # URI#open (provided by open-uri) works on old and new Rubies, whereas
    # Kernel#open no longer accepts URLs on Ruby 3.
    URI.parse("http://mafft.cbrc.jp/alignment/software/#{tarball}").open do |resp|
      File.open("#{root}/#{tarball}", "wb") { |file| file.write(resp.read) }
    end

    run.call(root, "tar", "xvf", tarball)
    core = File.realpath("#{src_dir}/core") # same value as `pwd -P` in core/
    run.call(core, "make")
    run.call(core, "make", "install", "PREFIX=#{core}/../")

    # Launcher = our own header + the installed mafft script minus its
    # first line (equivalent of `tail -n +2`).
    launcher = File.readlines("#{src_dir}/bin/mafft").drop(1)
    File.open(wrapper, "w") do |file|
      file.puts "#! /bin/sh"
      file.puts "export MAFFT_BINARIES=\"#{core}/../binaries\""
      file.write launcher.join
    end
    File.chmod(0755, wrapper)
  rescue => e
    abort "Problem installing MAFFT, aborting (#{e.message})"
  end
end
|
36
|
+
|
37
|
+
|
38
|
+
# Install MAFFT if the launcher has not been generated yet.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/mafft.linux"

  puts "Installing MAFFT 7.222 the aligner.."
  puts "See http://mafft.cbrc.jp/alignment/software/"
  puts "License BSD : http://mafft.cbrc.jp/alignment/software/license.txt"
  installMafft
  puts "MAFFT successfully installed in #{ROOT_path}/mafft-7.222-without-extensions"
  puts ""

end
|
data/bin/bacterial-annotator
CHANGED
@@ -9,6 +9,7 @@
|
|
9
9
|
|
10
10
|
|
11
11
|
require 'bacterial-annotator'
|
12
|
+
require 'bacterial-comparator'
|
12
13
|
|
13
14
|
|
14
15
|
# Usage message to print to CLI
|
@@ -16,46 +17,61 @@ def usage
|
|
16
17
|
|
17
18
|
print <<OEM
|
18
19
|
|
19
|
-
bacterial-annotator [OPTIONS]
|
20
|
+
bacterial-annotator [annotate | compare] [OPTIONS]
|
20
21
|
|
21
|
-
|
22
|
+
# Choose either to annotate a genome or compare several genome annotations
|
22
23
|
|
23
|
-
|
24
|
+
annotate [OPTIONS]
|
25
|
+
.. see annotate -h for OPTIONS
|
24
26
|
|
25
|
-
|
26
|
-
|
27
|
-
|
27
|
+
compare [OPTIONS] [all annotation directories]*
|
28
|
+
.. see compare -h for OPTIONS
|
29
|
+
|
30
|
+
--help/-h Print this !
|
31
|
+
|
32
|
+
OEM
|
33
|
+
|
34
|
+
end
|
28
35
|
|
29
|
-
// Dataset
|
30
36
|
|
31
|
-
|
32
|
-
--guessref Will guess the best reference genome to use for the annotation.
|
37
|
+
def usage_annotate
|
33
38
|
|
34
|
-
|
35
|
-
Complete the annotation of remaining CDS with a remote NCBI BLAST
|
36
|
-
Can be very slow, better to use an external database !
|
39
|
+
print <<OEM
|
37
40
|
|
38
|
-
|
39
|
-
Complete or do the annotation of remaining CDS with this database (a protein fasta file).
|
40
|
-
Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
|
41
|
-
>gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
|
42
|
-
>sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
|
41
|
+
annotate [OPTIONS]
|
43
42
|
|
44
|
-
//
|
43
|
+
// IO
|
44
|
+
--input/-i <fasta_file> Provide the fasta file to annotate
|
45
|
+
--outdir/-o <outdir> Output directory [default=BAnnotation]
|
46
|
+
--force/-f Force to overwrite the output directory
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
+
// Dataset
|
49
|
+
--refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
|
50
|
+
--guessref Will guess the best reference genome to use for the annotation.
|
48
51
|
|
49
|
-
|
50
|
-
|
52
|
+
--remotedb <remote_database> [nr|refseq|swissprot]
|
53
|
+
Complete the annotation of remaining CDS with a remote NCBI BLAST
|
54
|
+
Can be very slow, better to use an external database !
|
55
|
+
|
56
|
+
--externaldb <proteins fasta_file>
|
57
|
+
Complete or do the annotation of remaining CDS with this database (a protein fasta file).
|
58
|
+
Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
|
59
|
+
>gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
|
60
|
+
>sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
|
61
|
+
|
62
|
+
// Other options
|
63
|
+
--pidentity Minimum percentage identity to incorporate a CDS annotation [default=0.7]
|
64
|
+
--minlength Minimum contig length for annotation [default=500]
|
65
|
+
|
66
|
+
--meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
|
67
|
+
--gff Will also generate gff annotation files [off by default]
|
51
68
|
|
52
|
-
--help/-h Print this !
|
53
69
|
OEM
|
54
70
|
|
55
71
|
end
|
56
72
|
|
57
73
|
# Parse the Options given on the CLI
|
58
|
-
def
|
74
|
+
def parseOptions_annotate
|
59
75
|
|
60
76
|
options = {}
|
61
77
|
|
@@ -89,7 +105,73 @@ def parseOptions
|
|
89
105
|
when "--externaldb"
|
90
106
|
options[:external_db] = ARGV.shift
|
91
107
|
when "--help", "-h"
|
92
|
-
|
108
|
+
usage_annotate
|
109
|
+
abort
|
110
|
+
end
|
111
|
+
|
112
|
+
end
|
113
|
+
|
114
|
+
options
|
115
|
+
|
116
|
+
end
|
117
|
+
|
118
|
+
|
119
|
+
def usage_compare
|
120
|
+
|
121
|
+
print <<OEM
|
122
|
+
|
123
|
+
compare [OPTIONS]
|
124
|
+
|
125
|
+
//IO
|
126
|
+
--outdir/-o <output directory>
|
127
|
+
--proc <nb of process> Number of process to run the comparison
|
128
|
+
|
129
|
+
//Synteny
|
130
|
+
--pidentity <default 0.80> Minimal percentage identity to call a syntenic protein
|
131
|
+
--min_cov <default 0.80> Minimal coverage for the alignment of the protein / gene
|
132
|
+
|
133
|
+
//Alignment (MAFFT)
|
134
|
+
--align [dna|prot|both] by default align only proteins
|
135
|
+
--concat <nb of genes | all> by default all
|
136
|
+
|
137
|
+
//Phylo (RAXML)
|
138
|
+
|
139
|
+
OEM
|
140
|
+
|
141
|
+
end
|
142
|
+
|
143
|
+
|
144
|
+
# Parse the Options given on the CLI
|
145
|
+
def parseOptions_compare
|
146
|
+
|
147
|
+
options = {}
|
148
|
+
|
149
|
+
# default options
|
150
|
+
options[:outdir] = "phylogenomics"
|
151
|
+
options[:pidentity] = 0.8
|
152
|
+
options[:min_cov] = 0.8
|
153
|
+
options[:proc] = 2
|
154
|
+
options[:align] = "prot"
|
155
|
+
options[:genomes_list] = []
|
156
|
+
|
157
|
+
while x = ARGV.shift
|
158
|
+
|
159
|
+
case x.downcase
|
160
|
+
when "--outdir", "-o"
|
161
|
+
options[:outdir] = ARGV.shift
|
162
|
+
when "--pidentity"
|
163
|
+
options[:pidentity] = ARGV.shift
|
164
|
+
when "--min_cov"
|
165
|
+
options[:min_cov] = ARGV.shift
|
166
|
+
when "--proc", "-p"
|
167
|
+
options[:proc] = ARGV.shift
|
168
|
+
when "--align"
|
169
|
+
options[:align] = ARGV.shift
|
170
|
+
when "--help", "-h"
|
171
|
+
usage_compare
|
172
|
+
abort
|
173
|
+
else
|
174
|
+
options[:genomes_list] << x if File.exists? "#{x}"
|
93
175
|
end
|
94
176
|
|
95
177
|
end
|
@@ -103,28 +185,60 @@ end
|
|
103
185
|
if ARGV.size > 1
|
104
186
|
|
105
187
|
ROOT = File.dirname(__FILE__)
|
106
|
-
options = parseOptions
|
107
188
|
|
108
|
-
# Check for 3rd party dependencies : Prodigal
|
189
|
+
# Check for 3rd party dependencies : Prodigal, Blat, MAFFT
|
109
190
|
system("ba_prodigal")
|
110
191
|
system("ba_blat")
|
192
|
+
system("ba_mafft")
|
111
193
|
|
112
|
-
|
113
|
-
|
114
|
-
|
194
|
+
options = {}
|
195
|
+
genomes_list = []
|
196
|
+
|
197
|
+
if ARGV[0] == "annotate"
|
198
|
+
|
199
|
+
ARGV.shift
|
200
|
+
options = parseOptions_annotate
|
201
|
+
|
202
|
+
if ! File.exist? ("#{ROOT}/blat.linux")
|
203
|
+
abort "#exiting blat is missing"
|
204
|
+
end
|
205
|
+
|
206
|
+
# Check Options
|
207
|
+
if ! options.has_key? :refgenome and
|
208
|
+
! options.has_key? :remote_db and
|
209
|
+
! options.has_key? :external_db
|
210
|
+
puts "You didn't provide a reference genome or a database for the annotation !"
|
211
|
+
elsif ! options.has_key? :input
|
212
|
+
puts "You didn't provide a fasta file to annotate !"
|
213
|
+
elsif
|
214
|
+
puts ""
|
215
|
+
end
|
216
|
+
|
217
|
+
bannot = BacterialAnnotator.new(options, ROOT)
|
218
|
+
bannot.prepare_files_for_annotation
|
219
|
+
bannot.run_annotation
|
220
|
+
|
221
|
+
elsif ARGV[0] == "compare"
|
222
|
+
|
223
|
+
ARGV.shift
|
224
|
+
options = parseOptions_compare
|
225
|
+
bcomp = BacterialComparator.new(options, ROOT)
|
226
|
+
if options[:align].downcase == "both"
|
227
|
+
bcomp.mafft_align_all_pep
|
228
|
+
bcomp.mafft_align_all_dna
|
229
|
+
elsif options[:align].downcase == "prot"
|
230
|
+
bcomp.mafft_align_all_pep
|
231
|
+
elsif options[:align].downcase == "dna"
|
232
|
+
bcomp.mafft_align_all_dna
|
233
|
+
end
|
234
|
+
|
235
|
+
else
|
236
|
+
|
237
|
+
usage
|
238
|
+
abort
|
115
239
|
|
116
|
-
# Check Options
|
117
|
-
if ! options.has_key? :refgenome and ! options.has_key? :remote_db and ! options.has_key? :external_db
|
118
|
-
puts "You didn't provide a reference genome or a database for the annotation !"
|
119
|
-
elsif ! options.has_key? :input
|
120
|
-
puts "You didn't provide a fasta file to annotate !"
|
121
|
-
elsif
|
122
|
-
puts ""
|
123
240
|
end
|
124
241
|
|
125
|
-
bannot = BacterialAnnotator.new(options, ROOT)
|
126
|
-
bannot.prepare_files_for_annotation
|
127
|
-
bannot.run_annotation
|
128
242
|
|
129
243
|
else
|
130
244
|
usage
|
@@ -57,14 +57,18 @@ class GenbankManip
|
|
57
57
|
product = ftH["product"] if !ftH["product"].nil?
|
58
58
|
protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
|
59
59
|
locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
|
60
|
-
if ftH.has_key? "translation"
|
61
|
-
pep = ftH["translation"][0] if !ftH["translation"].nil?
|
62
|
-
else
|
63
|
-
dna = get_DNA(ft,@bioseq)
|
64
|
-
pep = dna.translate
|
65
|
-
end
|
66
60
|
|
61
|
+
# if ftH.has_key? "translation"
|
62
|
+
# pep = ftH["translation"][0] if !ftH["translation"].nil?
|
63
|
+
# else
|
64
|
+
# dna = get_DNA(ft,@bioseq)
|
65
|
+
# pep = dna.translate
|
66
|
+
# end
|
67
|
+
|
68
|
+
dna = get_DNA(ft,@bioseq)
|
69
|
+
pep = dna.translate
|
67
70
|
pepBioSeq = Bio::Sequence.auto(pep)
|
71
|
+
dnaBioSeq = Bio::Sequence.auto(dna)
|
68
72
|
|
69
73
|
if protId.strip == ""
|
70
74
|
protId = locustag
|
@@ -75,7 +79,8 @@ class GenbankManip
|
|
75
79
|
locustag: locustag,
|
76
80
|
gene: gene[0],
|
77
81
|
product: product[0],
|
78
|
-
bioseq: pepBioSeq
|
82
|
+
bioseq: pepBioSeq,
|
83
|
+
bioseq_gene: dnaBioSeq}
|
79
84
|
end
|
80
85
|
|
81
86
|
end
|
@@ -90,16 +95,22 @@ class GenbankManip
|
|
90
95
|
def write_cds_to_file outdir
|
91
96
|
|
92
97
|
cds_file = "#{@gbk.accession}.pep"
|
98
|
+
dna_file = "#{@gbk.accession}.dna"
|
99
|
+
|
93
100
|
if @coding_seq == nil
|
94
101
|
get_cds
|
95
102
|
end
|
96
103
|
|
104
|
+
dna_out = File.open("#{outdir}/#{dna_file}", "w")
|
97
105
|
File.open("#{outdir}/#{cds_file}", "w") do |fwrite|
|
98
106
|
@coding_seq.each_key do |k|
|
99
107
|
seqout = @coding_seq[k][:bioseq].output_fasta("#{k}",60)
|
108
|
+
seqout_dna = @coding_seq[k][:bioseq_gene].output_fasta("#{k}",60)
|
100
109
|
fwrite.write(seqout)
|
110
|
+
dna_out.write(seqout_dna)
|
101
111
|
end
|
102
112
|
end
|
113
|
+
dna_out.close
|
103
114
|
|
104
115
|
@cds_file = "#{outdir}/" + cds_file
|
105
116
|
|
@@ -202,7 +213,7 @@ class GenbankManip
|
|
202
213
|
Bio::NCBI.default_email = 'default@default.com'
|
203
214
|
ncbi = Bio::NCBI::REST.new
|
204
215
|
genbankstring = ncbi.efetch(refgenome_id, {"db"=>'nucleotide', "rettype"=>'gb'})
|
205
|
-
File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
|
216
|
+
File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
|
206
217
|
f.write(genbankstring)
|
207
218
|
end
|
208
219
|
end
|
@@ -0,0 +1,221 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
# author: maxime déraspe
|
3
|
+
# email: maxime@deraspe.net
|
4
|
+
# review:
|
5
|
+
# date: 15-02-24
|
6
|
+
# version: 0.0.1
|
7
|
+
# licence:
|
8
|
+
|
9
|
+
require 'bio'
|
10
|
+
require 'fileutils'
|
11
|
+
require 'parallel'
|
12
|
+
|
13
|
+
# Compares several annotated genomes: reads each genome's Prot-Synteny.tsv,
# keeps the reference proteins that are syntenic in every genome, writes one
# multifasta per syntenic gene (protein and DNA) and aligns them with MAFFT.
class BacterialComparator

  attr_reader :genomes_list, :stats

  # Initialize BacterialComparator and run the synteny extraction.
  #
  # options - Hash read for :outdir, :genomes_list, :proc, :min_cov and
  #           :pidentity (thresholds may be given as 0-1 fractions or as
  #           percentages; values > 1 are divided by 100).
  # root    - directory holding the `mafft.linux` launcher.
  def initialize(options, root)
    # Absolute, so the launcher is still found after changing directory.
    @root = File.expand_path(root)
    @outdir = options[:outdir]
    FileUtils.mkdir_p(@outdir)
    @genomes_list = options[:genomes_list]
    @proc = options[:proc].to_i

    min_cov = as_fraction(options[:min_cov].to_f)
    min_pid = as_fraction(options[:pidentity].to_f)

    @ref_prot = get_ref_prot
    @synteny = read_prot_synteny
    @stats = extract_syntenic_fasta(min_cov, min_pid)
  end

  # Read "<genome>/Prot-Synteny.tsv" for every genome of the list.
  #
  # Returns a Hash keyed by the first column (reference protein) whose value
  # is an Array of hits, appended in genome-list order, each hit being
  # {ref_cov:, pId:, query_prot:, query_cov:} taken from columns 4, 5, 6, 8.
  def read_prot_synteny
    synteny = {}
    @genomes_list.each do |g|
      puts "#{g}/Prot-Synteny.tsv"
      File.open("#{g}/Prot-Synteny.tsv", "r") do |file|
        file.gets # skip header
        while (l = file.gets)
          # AAK98805.1 spr0001 453 1.0 100.0 ABAC01000005_14 453 1.0
          lA = l.chomp.split("\t")
          next if lA.empty? # ignore blank lines
          (synteny[lA[0]] ||= []) << {
            ref_cov: lA[3].to_f,
            pId: lA[4].to_f,
            query_prot: lA[5],
            query_cov: lA[7].to_f
          }
        end
      end
    end
    synteny
  end

  # Returns the identifiers (first word of each fasta definition) found in
  # the first *.pep file of the first genome directory.
  def get_ref_prot
    ref_prot = []
    flatfile = Bio::FlatFile.auto(reference_fasta("pep"))
    flatfile.each_entry do |entry|
      ref_prot << entry.definition.split(" ")[0]
    end
    ref_prot
  ensure
    flatfile.close if flatfile
  end

  # Returns the fasta-formatted record (60 columns) of the entry whose
  # definition starts with +name+, or "" when no entry matches. When several
  # entries match, the last one is returned.
  def get_sequence_from_flatfile(flatfile, name)
    out = ""
    flatfile.each_entry do |entry|
      next unless entry.definition.split(" ")[0] == name
      out = Bio::Sequence.auto(entry.seq).output_fasta("#{name}", 60)
    end
    out
  end

  # Write the protein (.pep) and gene (.dna) multifasta of one syntenic gene.
  #
  # ref_prot - reference protein identifier.
  # synteny  - Array of hits for that protein, indexed like @genomes_list.
  #
  # Files that already exist are left untouched.
  def build_multifasta(ref_prot, synteny)
    # create multifasta by syntenic proteins (pep)
    write_multifasta(File.join(@outdir, "genes-align-pep", "#{ref_prot}.pep"),
                     "pep", "Proteins.fa", ref_prot, synteny)
    # create multifasta by syntenic genes (dna)
    write_multifasta(File.join(@outdir, "genes-align-dna", "#{ref_prot}.dna"),
                     "dna", "Genes.fa", ref_prot, synteny)
  end

  # Select the syntenic genes, write "<outdir>/cds-synteny.tsv" and build the
  # per-gene multifasta files in parallel.
  #
  # min_cov - minimal coverage (0-1) required on both reference and query.
  # min_pid - minimal identity (0-1).
  #
  # Returns a Hash {syntenic: [gene, ...], nb_of_syntenic: Integer}.
  def extract_syntenic_fasta(min_cov, min_pid)
    puts "# Extracting Proteins and Genes multifasta.."
    stats = { syntenic: [] }
    to_build_multifasta = []

    File.open(File.join(@outdir, "cds-synteny.tsv"), "w") do |fout|
      fout.write("Gene\t" + @genomes_list.join("\t") + "\n")

      @synteny.each do |k, v|
        next unless syntenic?(v, min_cov, min_pid)

        to_build_multifasta << [k, v]
        stats[:syntenic] << k # once per gene, not once per genome
        fout.write("#{k}")
        v.each do |x|
          fout.write("\t#{x[:query_prot]}|#{x[:query_cov]}|#{x[:ref_cov]}")
        end
        fout.write("\n")
      end
    end

    # Same directories as the ones mafft_align_all_* walk through; built with
    # File.join so an absolute outdir works too.
    FileUtils.mkdir_p(File.join(@outdir, "genes-align-pep"))
    FileUtils.mkdir_p(File.join(@outdir, "genes-align-dna"))

    Parallel.map(to_build_multifasta, in_processes: @proc) { |k, v|
      build_multifasta k, v
    }

    stats[:nb_of_syntenic] = stats[:syntenic].length
    puts "Syntenic genes : " + stats[:nb_of_syntenic].to_s + " / " + @ref_prot.length.to_s

    stats # explicit: the value of `puts` (nil) must not become @stats
  end

  # Align one fasta file with MAFFT into "<f>.aln".
  #
  # An empty or missing output is retried up to 3 times before the alignment
  # is reported as FAILED. The command is run without a shell so file names
  # containing shell metacharacters are passed through unchanged.
  def mafft_align(f)
    mafft = "#{@root}/mafft.linux"
    trying = 0
    begin
      cmd = system(mafft, "--quiet", f, out: "#{f}.aln")
      if File.size("#{f}.aln") == 0
        puts "File size of 0.. --#{f}--"
        puts "Command used : #{mafft} --quiet #{f} > #{f}.aln"
        fail
      end
      status = cmd == true ? "OK" : "FAILED"
      puts "Alignment #{f} : #{status}"
    rescue
      if trying < 3
        trying += 1
        retry
      end
      puts "Alignment #{f} : FAILED"
    end
  end

  # Align every *.pep multifasta of "<outdir>/genes-align-pep".
  def mafft_align_all_pep
    puts "# MAFFT multialign all protein sequences.."
    align_all(File.join(@outdir, "genes-align-pep"), "*.pep")
  end

  # Align every *.dna multifasta of "<outdir>/genes-align-dna".
  def mafft_align_all_dna
    puts "# MAFFT multialign all gene sequences.."
    align_all(File.join(@outdir, "genes-align-dna"), "*.dna")
  end

  private

  # Convert a threshold/identity given as a percentage (> 1) to a fraction.
  def as_fraction(value)
    value > 1 ? value / 100.0 : value
  end

  # True when there is one hit per genome and every hit passes the coverage
  # and identity thresholds. pId is normalised because the synteny file may
  # store it as a percentage (e.g. 100.0) while min_pid is a fraction.
  def syntenic?(hits, min_cov, min_pid)
    # build_multifasta indexes hits by genome position: a short list would
    # make it fail on a nil hit.
    return false if hits.length < @genomes_list.length

    hits.all? do |hit|
      !hit[:query_cov].nil? &&
        hit[:query_cov] > min_cov &&
        hit[:ref_cov] > min_cov &&
        as_fraction(hit[:pId]) > min_pid
    end
  end

  # Path of the first "*.<ext>" file of the first genome directory.
  def reference_fasta(ext)
    path = Dir["#{@genomes_list[0]}/*.#{ext}"][0]
    raise "No reference .#{ext} file found in #{@genomes_list[0].inspect}" if path.nil?
    path
  end

  # Fasta record named +name+ read from the fasta file at +path+; the file
  # is closed afterwards.
  def fasta_record(path, name)
    flatfile = Bio::FlatFile.auto(path)
    get_sequence_from_flatfile(flatfile, name)
  ensure
    flatfile.close if flatfile
  end

  # Write one multifasta: the reference record followed by the matching
  # record of each genome. Does nothing when +out_path+ already exists.
  def write_multifasta(out_path, ref_ext, query_fasta, ref_prot, synteny)
    return if File.exist?(out_path)

    File.open(out_path, "w") do |out|
      out.write(fasta_record(reference_fasta(ref_ext), ref_prot))
      @genomes_list.each_with_index do |g, i|
        out.write(fasta_record("#{g}/#{query_fasta}", synteny[i][:query_prot]))
      end
    end
  end

  # Run mafft_align in parallel on the files of +dir+ matching +pattern+.
  # The block form of Dir.chdir restores the previous working directory, so
  # a relative outdir keeps working when both alignments are requested.
  def align_all(dir, pattern)
    Dir.chdir(dir) do
      Parallel.map(Dir[pattern], in_processes: @proc) { |f|
        mafft_align f
      }
    end
  end

end # end of Class
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bacterial-annotator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maxime Deraspe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -50,6 +50,26 @@ dependencies:
|
|
50
50
|
- - ">="
|
51
51
|
- !ruby/object:Gem::Version
|
52
52
|
version: 2.7.3
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: parallel
|
55
|
+
requirement: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '1.9'
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 1.9.0
|
63
|
+
type: :runtime
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '1.9'
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 1.9.0
|
53
73
|
description: GEM to annotate bacterial genome sequence based on a reference genome
|
54
74
|
and complete the annotation with an external database or a remote database.
|
55
75
|
email: maxime@deraspe.net
|
@@ -57,10 +77,12 @@ executables:
|
|
57
77
|
- bacterial-annotator
|
58
78
|
- ba_prodigal
|
59
79
|
- ba_blat
|
80
|
+
- ba_mafft
|
60
81
|
extensions: []
|
61
82
|
extra_rdoc_files: []
|
62
83
|
files:
|
63
84
|
- bin/ba_blat
|
85
|
+
- bin/ba_mafft
|
64
86
|
- bin/ba_prodigal
|
65
87
|
- bin/bacterial-annotator
|
66
88
|
- lib/bacterial-annotator.rb
|
@@ -68,6 +90,7 @@ files:
|
|
68
90
|
- lib/bacterial-annotator/genbank-manip.rb
|
69
91
|
- lib/bacterial-annotator/remote-ncbi.rb
|
70
92
|
- lib/bacterial-annotator/synteny-manip.rb
|
93
|
+
- lib/bacterial-comparator.rb
|
71
94
|
homepage: http://rubygems.org/gems/bacterial-annotator
|
72
95
|
licenses:
|
73
96
|
- GPL-3.0
|