bacterial-annotator 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/ba_mafft +48 -0
- data/bin/bacterial-annotator +154 -40
- data/lib/bacterial-annotator/genbank-manip.rb +19 -8
- data/lib/bacterial-comparator.rb +221 -0
- metadata +25 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 34d344cebc006441522c54fa91a021c43c180299
|
4
|
+
data.tar.gz: e3902d68c0e931a4054dddd4d36ac08b5c57ec98
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e6104a764e106f3740410bd7e9a11f5d49e345b32797c854013dcc9a91fbd2540072f32fb85020896e8534eb2427fbdb6853ece60a123cd77a4f1c085fc34272
|
7
|
+
data.tar.gz: 1fc359b7523c87f66dcfd020092fc21b1691e8029bfa4684b35515f98b8f72c3160e2ef9d7aefc541f1eb91a803b1db839c02c24744d5e992fd6a2720b685b87
|
data/bin/ba_mafft
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# author: maxime déraspe
|
4
|
+
# email: maxime@deraspe.net
|
5
|
+
# review:
|
6
|
+
# date: 15-02-24
|
7
|
+
# version: 0.01
|
8
|
+
# licence:
|
9
|
+
|
10
|
+
require 'open-uri'
|
11
|
+
|
12
|
+
ROOT_path = File.dirname(__FILE__)
|
13
|
+
# mafft_url = http://mafft.cbrc.jp/alignment/software/mafft-7.222-without-extensions-src.tgz
|
14
|
+
# Install MAFFT on the user system
|
15
|
+
# Download, build and install MAFFT 7.222 next to this script.
#
# Steps: fetch the source tarball into ROOT_path, unpack it, run `make` and
# `make install` inside its `core` directory, then generate the
# `mafft.linux` wrapper (the upstream `bin/mafft` script with its first line
# replaced by a `/bin/sh` shebang plus an exported MAFFT_BINARIES).
#
# The wrapper is written last and only after every step succeeded, because
# the caller uses the existence of `mafft.linux` as its "already installed"
# marker. Any failure aborts the program with a message.
#
# NOTE(review): the tarball is fetched over plain HTTP and is not checked
# against a checksum before being built.
def installMafft

  release = "mafft-7.222-without-extensions"
  tarball = "#{release}-src.tgz"
  # Absolute path, so nothing below depends on the current working directory.
  root = File.expand_path(ROOT_path)

  # Run a command (no shell) inside `dir`, discarding its stdout; raise when
  # it cannot be started or exits with a non-zero status.
  run = lambda do |dir, *cmd|
    system(*cmd, chdir: dir, out: File::NULL) ||
      raise("command failed: #{cmd.join(' ')}")
  end

  begin
    # URI#open (open-uri) works on old and new rubies; Kernel#open no longer
    # accepts URLs since Ruby 3.0.
    URI.parse("http://mafft.cbrc.jp/alignment/software/#{tarball}").open do |resp|
      File.open(File.join(root, tarball), "wb") do |file|
        file.write(resp.read)
      end
    end

    run.call(root, "tar", "xf", tarball)

    # Physical path of the build directory (equivalent of `pwd -P`).
    core = File.realpath(File.join(root, release, "core"))
    run.call(core, "make")
    run.call(core, "make", "install", "PREFIX=#{core}/../")

    # Everything but the first line of the installed launcher script.
    launcher_body = File.readlines(File.join(root, release, "bin", "mafft")).drop(1).join

    wrapper = File.join(root, "mafft.linux")
    File.open(wrapper, "w") do |out|
      out.puts "#! /bin/sh"
      out.puts "export MAFFT_BINARIES=\"#{core}/../binaries\""
      out.write(launcher_body)
    end
    File.chmod(0755, wrapper)
  rescue StandardError => e
    abort "Problem installing MAFFT, aborting (#{e.message})"
  end

end
|
36
|
+
|
37
|
+
|
38
|
+
# Install MAFFT if it is not already installed.
# The generated wrapper script `mafft.linux` is used as the "installed" marker.
unless File.exist?("#{ROOT_path}/mafft.linux")

  puts "Installing MAFFT 7.222 the aligner.."
  puts "See http://mafft.cbrc.jp/alignment/software/"
  puts "License BSD : http://mafft.cbrc.jp/alignment/software/license.txt"
  installMafft
  puts "MAFFT successfully installed in #{ROOT_path}/mafft-7.222-without-extensions"
  puts ""

end
|
data/bin/bacterial-annotator
CHANGED
@@ -9,6 +9,7 @@
|
|
9
9
|
|
10
10
|
|
11
11
|
require 'bacterial-annotator'
|
12
|
+
require 'bacterial-comparator'
|
12
13
|
|
13
14
|
|
14
15
|
# Usage message to print to CLI
|
@@ -16,46 +17,61 @@ def usage
|
|
16
17
|
|
17
18
|
print <<OEM
|
18
19
|
|
19
|
-
bacterial-annotator [OPTIONS]
|
20
|
+
bacterial-annotator [annotate | compare] [OPTIONS]
|
20
21
|
|
21
|
-
|
22
|
+
# Choose either to annotate a genome or compare several genome annotations
|
22
23
|
|
23
|
-
|
24
|
+
annotate [OPTIONS]
|
25
|
+
.. see annotate -h for OPTIONS
|
24
26
|
|
25
|
-
|
26
|
-
|
27
|
-
|
27
|
+
compare [OPTIONS] [all annotation directories]*
|
28
|
+
.. see compare -h for OPTIONS
|
29
|
+
|
30
|
+
--help/-h Print this !
|
31
|
+
|
32
|
+
OEM
|
33
|
+
|
34
|
+
end
|
28
35
|
|
29
|
-
// Dataset
|
30
36
|
|
31
|
-
|
32
|
-
--guessref Will guess the best reference genome to use for the annotation.
|
37
|
+
def usage_annotate
|
33
38
|
|
34
|
-
|
35
|
-
Complete the annotation of remaining CDS with a remote NCBI BLAST
|
36
|
-
Can be very slow, better to use an external database !
|
39
|
+
print <<OEM
|
37
40
|
|
38
|
-
|
39
|
-
Complete or do the annotation of remaining CDS with this database (a protein fasta file).
|
40
|
-
Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
|
41
|
-
>gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
|
42
|
-
>sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
|
41
|
+
annotate [OPTIONS]
|
43
42
|
|
44
|
-
//
|
43
|
+
// IO
|
44
|
+
--input/-i <fasta_file> Provide the fasta file to annotate
|
45
|
+
--outdir/-o <outdir> Output directory [default=BAnnotation]
|
46
|
+
--force/-f Force to overwrite the output directory
|
45
47
|
|
46
|
-
|
47
|
-
|
48
|
+
// Dataset
|
49
|
+
--refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
|
50
|
+
--guessref Will guess the best reference genome to use for the annotation.
|
48
51
|
|
49
|
-
|
50
|
-
|
52
|
+
--remotedb <remote_database> [nr|refseq|swissprot]
|
53
|
+
Complete the annotation of remaining CDS with a remote NCBI BLAST
|
54
|
+
Can be very slow, better to use an external database !
|
55
|
+
|
56
|
+
--externaldb <proteins fasta_file>
|
57
|
+
Complete or do the annotation of remaining CDS with this database (a protein fasta file).
|
58
|
+
Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
|
59
|
+
>gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
|
60
|
+
>sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
|
61
|
+
|
62
|
+
// Other options
|
63
|
+
--pidentity Minimum percentage identity to incorporate a CDS annotation [default=0.7]
|
64
|
+
--minlength Minimum contig length for annotation [default=500]
|
65
|
+
|
66
|
+
--meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
|
67
|
+
--gff Will also generate gff annotation files [off by default]
|
51
68
|
|
52
|
-
--help/-h Print this !
|
53
69
|
OEM
|
54
70
|
|
55
71
|
end
|
56
72
|
|
57
73
|
# Parse the Options given on the CLI
|
58
|
-
def
|
74
|
+
def parseOptions_annotate
|
59
75
|
|
60
76
|
options = {}
|
61
77
|
|
@@ -89,7 +105,73 @@ def parseOptions
|
|
89
105
|
when "--externaldb"
|
90
106
|
options[:external_db] = ARGV.shift
|
91
107
|
when "--help", "-h"
|
92
|
-
|
108
|
+
usage_annotate
|
109
|
+
abort
|
110
|
+
end
|
111
|
+
|
112
|
+
end
|
113
|
+
|
114
|
+
options
|
115
|
+
|
116
|
+
end
|
117
|
+
|
118
|
+
|
119
|
+
def usage_compare
|
120
|
+
|
121
|
+
print <<OEM
|
122
|
+
|
123
|
+
compare [OPTIONS]
|
124
|
+
|
125
|
+
//IO
|
126
|
+
--outdir/-o <output directory>
|
127
|
+
--proc <nb of process> Number of process to run the comparison
|
128
|
+
|
129
|
+
//Synteny
|
130
|
+
--pidentity <default 0.80> Minimal percentage identity to call a syntenic protein
|
131
|
+
--min_cov <default 0.80> Minimal coverage for the alignment of the protein / gene
|
132
|
+
|
133
|
+
//Alignment (MAFFT)
|
134
|
+
--align [dna|prot|both] by default align only proteins
|
135
|
+
--concat <nb of genes | all> by default all
|
136
|
+
|
137
|
+
//Phylo (RAXML)
|
138
|
+
|
139
|
+
OEM
|
140
|
+
|
141
|
+
end
|
142
|
+
|
143
|
+
|
144
|
+
# Parse the Options given on the CLI
|
145
|
+
def parseOptions_compare
|
146
|
+
|
147
|
+
options = {}
|
148
|
+
|
149
|
+
# default options
|
150
|
+
options[:outdir] = "phylogenomics"
|
151
|
+
options[:pidentity] = 0.8
|
152
|
+
options[:min_cov] = 0.8
|
153
|
+
options[:proc] = 2
|
154
|
+
options[:align] = "prot"
|
155
|
+
options[:genomes_list] = []
|
156
|
+
|
157
|
+
while x = ARGV.shift
|
158
|
+
|
159
|
+
case x.downcase
|
160
|
+
when "--outdir", "-o"
|
161
|
+
options[:outdir] = ARGV.shift
|
162
|
+
when "--pidentity"
|
163
|
+
options[:pidentity] = ARGV.shift
|
164
|
+
when "--min_cov"
|
165
|
+
options[:min_cov] = ARGV.shift
|
166
|
+
when "--proc", "-p"
|
167
|
+
options[:proc] = ARGV.shift
|
168
|
+
when "--align"
|
169
|
+
options[:align] = ARGV.shift
|
170
|
+
when "--help", "-h"
|
171
|
+
usage_compare
|
172
|
+
abort
|
173
|
+
else
|
174
|
+
options[:genomes_list] << x if File.exists? "#{x}"
|
93
175
|
end
|
94
176
|
|
95
177
|
end
|
@@ -103,28 +185,60 @@ end
|
|
103
185
|
if ARGV.size > 1
|
104
186
|
|
105
187
|
ROOT = File.dirname(__FILE__)
|
106
|
-
options = parseOptions
|
107
188
|
|
108
|
-
# Check for 3rd party dependencies : Prodigal
|
189
|
+
# Check for 3rd party dependencies : Prodigal, Blat, MAFFT
|
109
190
|
system("ba_prodigal")
|
110
191
|
system("ba_blat")
|
192
|
+
system("ba_mafft")
|
111
193
|
|
112
|
-
|
113
|
-
|
114
|
-
|
194
|
+
options = {}
|
195
|
+
genomes_list = []
|
196
|
+
|
197
|
+
if ARGV[0] == "annotate"
|
198
|
+
|
199
|
+
ARGV.shift
|
200
|
+
options = parseOptions_annotate
|
201
|
+
|
202
|
+
if ! File.exist? ("#{ROOT}/blat.linux")
|
203
|
+
abort "#exiting blat is missing"
|
204
|
+
end
|
205
|
+
|
206
|
+
# Check Options
|
207
|
+
if ! options.has_key? :refgenome and
|
208
|
+
! options.has_key? :remote_db and
|
209
|
+
! options.has_key? :external_db
|
210
|
+
puts "You didn't provide a reference genome or a database for the annotation !"
|
211
|
+
elsif ! options.has_key? :input
|
212
|
+
puts "You didn't provide a fasta file to annotate !"
|
213
|
+
elsif
|
214
|
+
puts ""
|
215
|
+
end
|
216
|
+
|
217
|
+
bannot = BacterialAnnotator.new(options, ROOT)
|
218
|
+
bannot.prepare_files_for_annotation
|
219
|
+
bannot.run_annotation
|
220
|
+
|
221
|
+
elsif ARGV[0] == "compare"
|
222
|
+
|
223
|
+
ARGV.shift
|
224
|
+
options = parseOptions_compare
|
225
|
+
bcomp = BacterialComparator.new(options, ROOT)
|
226
|
+
if options[:align].downcase == "both"
|
227
|
+
bcomp.mafft_align_all_pep
|
228
|
+
bcomp.mafft_align_all_dna
|
229
|
+
elsif options[:align].downcase == "prot"
|
230
|
+
bcomp.mafft_align_all_pep
|
231
|
+
elsif options[:align].downcase == "dna"
|
232
|
+
bcomp.mafft_align_all_dna
|
233
|
+
end
|
234
|
+
|
235
|
+
else
|
236
|
+
|
237
|
+
usage
|
238
|
+
abort
|
115
239
|
|
116
|
-
# Check Options
|
117
|
-
if ! options.has_key? :refgenome and ! options.has_key? :remote_db and ! options.has_key? :external_db
|
118
|
-
puts "You didn't provide a reference genome or a database for the annotation !"
|
119
|
-
elsif ! options.has_key? :input
|
120
|
-
puts "You didn't provide a fasta file to annotate !"
|
121
|
-
elsif
|
122
|
-
puts ""
|
123
240
|
end
|
124
241
|
|
125
|
-
bannot = BacterialAnnotator.new(options, ROOT)
|
126
|
-
bannot.prepare_files_for_annotation
|
127
|
-
bannot.run_annotation
|
128
242
|
|
129
243
|
else
|
130
244
|
usage
|
@@ -57,14 +57,18 @@ class GenbankManip
|
|
57
57
|
product = ftH["product"] if !ftH["product"].nil?
|
58
58
|
protId = ftH["protein_id"][0] if !ftH["protein_id"].nil?
|
59
59
|
locustag = ftH["locus_tag"][0] if !ftH["locus_tag"].nil?
|
60
|
-
if ftH.has_key? "translation"
|
61
|
-
pep = ftH["translation"][0] if !ftH["translation"].nil?
|
62
|
-
else
|
63
|
-
dna = get_DNA(ft,@bioseq)
|
64
|
-
pep = dna.translate
|
65
|
-
end
|
66
60
|
|
61
|
+
# if ftH.has_key? "translation"
|
62
|
+
# pep = ftH["translation"][0] if !ftH["translation"].nil?
|
63
|
+
# else
|
64
|
+
# dna = get_DNA(ft,@bioseq)
|
65
|
+
# pep = dna.translate
|
66
|
+
# end
|
67
|
+
|
68
|
+
dna = get_DNA(ft,@bioseq)
|
69
|
+
pep = dna.translate
|
67
70
|
pepBioSeq = Bio::Sequence.auto(pep)
|
71
|
+
dnaBioSeq = Bio::Sequence.auto(dna)
|
68
72
|
|
69
73
|
if protId.strip == ""
|
70
74
|
protId = locustag
|
@@ -75,7 +79,8 @@ class GenbankManip
|
|
75
79
|
locustag: locustag,
|
76
80
|
gene: gene[0],
|
77
81
|
product: product[0],
|
78
|
-
bioseq: pepBioSeq
|
82
|
+
bioseq: pepBioSeq,
|
83
|
+
bioseq_gene: dnaBioSeq}
|
79
84
|
end
|
80
85
|
|
81
86
|
end
|
@@ -90,16 +95,22 @@ class GenbankManip
|
|
90
95
|
def write_cds_to_file outdir
|
91
96
|
|
92
97
|
cds_file = "#{@gbk.accession}.pep"
|
98
|
+
dna_file = "#{@gbk.accession}.dna"
|
99
|
+
|
93
100
|
if @coding_seq == nil
|
94
101
|
get_cds
|
95
102
|
end
|
96
103
|
|
104
|
+
dna_out = File.open("#{outdir}/#{dna_file}", "w")
|
97
105
|
File.open("#{outdir}/#{cds_file}", "w") do |fwrite|
|
98
106
|
@coding_seq.each_key do |k|
|
99
107
|
seqout = @coding_seq[k][:bioseq].output_fasta("#{k}",60)
|
108
|
+
seqout_dna = @coding_seq[k][:bioseq_gene].output_fasta("#{k}",60)
|
100
109
|
fwrite.write(seqout)
|
110
|
+
dna_out.write(seqout_dna)
|
101
111
|
end
|
102
112
|
end
|
113
|
+
dna_out.close
|
103
114
|
|
104
115
|
@cds_file = "#{outdir}/" + cds_file
|
105
116
|
|
@@ -202,7 +213,7 @@ class GenbankManip
|
|
202
213
|
Bio::NCBI.default_email = 'default@default.com'
|
203
214
|
ncbi = Bio::NCBI::REST.new
|
204
215
|
genbankstring = ncbi.efetch(refgenome_id, {"db"=>'nucleotide', "rettype"=>'gb'})
|
205
|
-
File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
|
216
|
+
File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
|
206
217
|
f.write(genbankstring)
|
207
218
|
end
|
208
219
|
end
|
@@ -0,0 +1,221 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
# author: maxime déraspe
|
3
|
+
# email: maxime@deraspe.net
|
4
|
+
# review:
|
5
|
+
# date: 15-02-24
|
6
|
+
# version: 0.0.1
|
7
|
+
# licence:
|
8
|
+
|
9
|
+
require 'bio'
|
10
|
+
require 'fileutils'
|
11
|
+
require 'parallel'
|
12
|
+
|
13
|
+
# Compares several annotation directories against a reference: reads each
# directory's Prot-Synteny.tsv, keeps the reference proteins whose hits pass
# the coverage / identity thresholds in every row read for them, writes one
# multifasta per kept protein (pep and dna) and aligns them with MAFFT.
class BacterialComparator

  attr_reader :genomes_list, :stats

  # Initialize BacterialComparator
  #
  # options - Hash with :outdir, :genomes_list, :proc, :min_cov, :pidentity
  # root    - directory that contains the `mafft.linux` wrapper
  #
  # Reads the synteny tables and immediately extracts the syntenic
  # multifasta files; the resulting statistics are available through #stats.
  def initialize options, root

    # Absolute paths: the alignment steps change the working directory.
    @root = File.expand_path(root)
    @outdir = File.expand_path(options[:outdir])
    FileUtils.mkdir_p(@outdir)
    @genomes_list = options[:genomes_list]
    @proc = options[:proc].to_i

    min_cov = as_fraction(options[:min_cov])
    min_pid = as_fraction(options[:pidentity])

    @ref_prot = get_ref_prot
    @synteny = read_prot_synteny
    @stats = extract_syntenic_fasta min_cov, min_pid

  end

  # Read "<genome dir>/Prot-Synteny.tsv" of every genome.
  #
  # Returns a Hash: reference protein id => Array of hit Hashes
  # ({ref_cov:, pId:, query_prot:, query_cov:}), appended in genome order.
  def read_prot_synteny
    synteny = {}
    @genomes_list.each do |g|
      puts "#{g}/Prot-Synteny.tsv"
      File.open("#{g}/Prot-Synteny.tsv", "r") do |file|
        file.gets # skip header
        while (l = file.gets)
          # AAK98805.1 spr0001 453 1.0 100.0 ABAC01000005_14 453 1.0
          lA = l.chomp.split("\t")
          next if lA.empty? # ignore blank lines
          (synteny[lA[0]] ||= []) << {ref_cov: lA[3].to_f,
                                      pId: lA[4].to_f,
                                      query_prot: lA[5],
                                      query_cov: lA[7].to_f}
        end
      end
    end
    synteny
  end

  # Returns the ids (first word of each fasta definition) found in the
  # reference .pep file of the first genome directory.
  def get_ref_prot
    ref_prot = []
    Bio::FlatFile.auto(reference_file("pep")) do |flatfile|
      flatfile.each_entry do |entry|
        ref_prot << entry.definition.split(" ")[0]
      end
    end
    ref_prot
  end

  # Scan +flatfile+ for the entry whose id (first word of the definition)
  # equals +name+ and return it as a fasta String (60 columns), or "" when
  # there is no such entry. When several entries match, the last one wins.
  def get_sequence_from_flatfile flatfile, name

    out = ""
    flatfile.each_entry do |entry|
      if entry.definition.split(" ")[0] == name
        bioseq = Bio::Sequence.auto(entry.seq)
        out = bioseq.output_fasta("#{name}",60)
      end
    end
    out

  end

  # Write the protein (.pep) and gene (.dna) multifasta of one syntenic
  # reference protein: the reference sequence followed by the matching
  # sequence of every genome. Files that already exist are left untouched.
  #
  # ref_prot - reference protein id
  # synteny  - Array of hit Hashes, indexed like @genomes_list
  def build_multifasta ref_prot, synteny

    # create multifasta by syntenic proteins (pep)
    write_multifasta File.join(pep_out_dir, "#{ref_prot}.pep"),
                     "pep", "Proteins.fa", ref_prot, synteny

    # create multifasta by syntenic genes (dna)
    write_multifasta File.join(dna_out_dir, "#{ref_prot}.dna"),
                     "dna", "Genes.fa", ref_prot, synteny

  end

  # Select the reference proteins for which every hit passes the thresholds,
  # write "<outdir>/cds-synteny.tsv" and build their multifasta files in
  # parallel.
  #
  # Returns a Hash with :syntenic (ids, once each) and :nb_of_syntenic.
  #
  # NOTE(review): the sample row in read_prot_synteny shows identity as
  # 100.0 while coverage is 1.0; min_pid is a 0..1 fraction here, so the
  # identity threshold may be compared on a different scale - confirm.
  def extract_syntenic_fasta min_cov, min_pid

    puts "# Extracting Proteins and Genes multifasta.."
    stats = {}
    stats[:syntenic] = []
    to_build_multifasta = []

    File.open(File.join(@outdir, "cds-synteny.tsv"), "w") do |fout|
      fout.write("Gene\t"+@genomes_list.join("\t")+"\n")

      @synteny.each do |k,v|
        next unless v.all? { |hit| syntenic_hit?(hit, min_cov, min_pid) }

        to_build_multifasta << [k,v]
        stats[:syntenic] << k
        columns = v.map { |x| "\t#{x[:query_prot]}|#{x[:query_cov]}|#{x[:ref_cov]}" }
        fout.write("#{k}" + columns.join + "\n")
      end
    end

    Dir.mkdir(pep_out_dir) unless Dir.exist? pep_out_dir
    Dir.mkdir(dna_out_dir) unless Dir.exist? dna_out_dir

    Parallel.map(to_build_multifasta, in_processes: @proc) { |k,v|
      build_multifasta k, v
    }

    nb_of_syntenic = to_build_multifasta.length
    stats[:nb_of_syntenic] = nb_of_syntenic
    puts "Syntenic genes : " + nb_of_syntenic.to_s + " / " + @ref_prot.length.to_s

    # Explicit return value: this is what ends up in @stats.
    stats

  end

  # Align the multifasta file +f+ with MAFFT into "<f>.aln".
  # An empty or missing output is retried up to 3 more times before the
  # alignment is reported as FAILED. Prints the outcome.
  def mafft_align f

    mafft = "#{@root}/mafft.linux"
    aln = "#{f}.aln"
    trying = 0
    begin
      # No shell involved: stdout is redirected straight into the .aln file.
      cmd = system(mafft, "--quiet", f, out: aln)
      if File.size(aln) == 0
        puts "File size of 0.. --#{f}--"
        puts "Command used : #{@root}/mafft.linux --quiet #{f} > #{f}.aln"
        fail
      else
        status = cmd == true ? "OK" : "FAILED"
        puts "Alignment #{f} : #{status}"
      end
    rescue StandardError
      if trying < 3
        trying += 1
        retry
      end
      status = "FAILED"
      puts "Alignment #{f} : #{status}"
    end

  end

  # Align every protein multifasta of <outdir>/genes-align-pep.
  def mafft_align_all_pep
    puts "# MAFFT multialign all protein sequences.."
    align_all pep_out_dir, "*.pep"
  end

  # Align every gene multifasta of <outdir>/genes-align-dna.
  def mafft_align_all_dna
    puts "# MAFFT multialign all gene sequences.."
    align_all dna_out_dir, "*.dna"
  end

  private

  # Thresholds given as percentages (> 1) are converted to 0..1 fractions.
  def as_fraction value
    ratio = value.to_f
    ratio > 1 ? ratio / 100 : ratio
  end

  # Directory receiving the protein multifasta files and alignments.
  def pep_out_dir
    File.join(@outdir, "genes-align-pep")
  end

  # Directory receiving the gene multifasta files and alignments.
  def dna_out_dir
    File.join(@outdir, "genes-align-dna")
  end

  # First "*.<ext>" file of the first genome directory (the reference).
  def reference_file ext
    found = Dir["#{@genomes_list[0]}/*.#{ext}"][0]
    raise ArgumentError, "no reference .#{ext} file in #{@genomes_list[0].inspect}" if found.nil?
    found
  end

  # True when a hit passes both coverage thresholds and the identity one.
  def syntenic_hit? hit, min_cov, min_pid
    return false if hit[:query_cov].nil?
    hit[:query_cov] > min_cov && hit[:ref_cov] > min_cov && hit[:pId] > min_pid
  end

  # Fasta text of entry +name+ in the flat file at +path+ (handle is closed).
  def fasta_entry path, name
    fasta = ""
    Bio::FlatFile.auto(path) do |flatfile|
      fasta = get_sequence_from_flatfile flatfile, name
    end
    fasta
  end

  # Write one multifasta (reference entry first, then one entry per genome)
  # unless +out_path+ already exists.
  def write_multifasta out_path, ref_ext, query_basename, ref_prot, synteny
    return if File.exist? out_path
    File.open(out_path, "w") do |out|
      out.write(fasta_entry(reference_file(ref_ext), ref_prot))
      @genomes_list.each_with_index do |g,i|
        out.write(fasta_entry("#{g}/#{query_basename}", synteny[i][:query_prot]))
      end
    end
  end

  # Run mafft_align in parallel on every file matching +pattern+ in +dir+.
  # The block form of Dir.chdir restores the working directory afterwards,
  # so the pep and dna steps can run one after the other.
  def align_all dir, pattern
    Dir.chdir(dir) do
      Parallel.map(Dir[pattern], in_processes: @proc) { |f|
        mafft_align f
      }
    end
  end

end # end of Class
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bacterial-annotator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maxime Deraspe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -50,6 +50,26 @@ dependencies:
|
|
50
50
|
- - ">="
|
51
51
|
- !ruby/object:Gem::Version
|
52
52
|
version: 2.7.3
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: parallel
|
55
|
+
requirement: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '1.9'
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 1.9.0
|
63
|
+
type: :runtime
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '1.9'
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 1.9.0
|
53
73
|
description: GEM to annotate bacterial genome sequence based on a reference genome
|
54
74
|
and complete the annotation with an external database or a remote database.
|
55
75
|
email: maxime@deraspe.net
|
@@ -57,10 +77,12 @@ executables:
|
|
57
77
|
- bacterial-annotator
|
58
78
|
- ba_prodigal
|
59
79
|
- ba_blat
|
80
|
+
- ba_mafft
|
60
81
|
extensions: []
|
61
82
|
extra_rdoc_files: []
|
62
83
|
files:
|
63
84
|
- bin/ba_blat
|
85
|
+
- bin/ba_mafft
|
64
86
|
- bin/ba_prodigal
|
65
87
|
- bin/bacterial-annotator
|
66
88
|
- lib/bacterial-annotator.rb
|
@@ -68,6 +90,7 @@ files:
|
|
68
90
|
- lib/bacterial-annotator/genbank-manip.rb
|
69
91
|
- lib/bacterial-annotator/remote-ncbi.rb
|
70
92
|
- lib/bacterial-annotator/synteny-manip.rb
|
93
|
+
- lib/bacterial-comparator.rb
|
71
94
|
homepage: http://rubygems.org/gems/bacterial-annotator
|
72
95
|
licenses:
|
73
96
|
- GPL-3.0
|