bacterial-annotator 0.6.6 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/ba_mash +45 -0
- data/bin/bacterial-annotator +72 -5
- data/lib/bacterial-annotator/sequence-fasta.rb +1 -3
- data/lib/bacterial-annotator/sequence-synteny.rb +2 -2
- data/lib/bacterial-annotator.rb +25 -6
- data/lib/bacterial-comparator.rb +5 -3
- data/lib/bacterial-identificator.rb +64 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0228aafd97af13b8756df42db362e4a53a30f4f0
|
4
|
+
data.tar.gz: 858942624597354dd0f52ad98ea1e94373289174
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 82c36c4fba00b437e721991c739517b7cfeb5edaa7e1ac49849e59d3ffac2165f1ef39f9961aa756ff8ad691fec36a8b3424cf8ce4d0e1125d486fa2e2a38593
|
7
|
+
data.tar.gz: e8b569f61f2dcb7309c6587ce619f7432e2588a717f3017053adcb693327ac2f21850785883eae4477226d057dde18a308a88d85d2a0558561d567865d1348cc
|
data/bin/ba_mash
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# author: maxime déraspe
|
4
|
+
# email: maximilien1er@gmail.com
|
5
|
+
# review:
|
6
|
+
# date: 17-10-12
|
7
|
+
# version: 0.01
|
8
|
+
# licence:
|
9
|
+
|
10
|
+
require 'open-uri'
|
11
|
+
|
12
|
+
ROOT_path = File.dirname(__FILE__)
|
13
|
+
# url = "https://github.com/marbl/Mash/releases/download/v2.0/mash-Linux64-v2.0.tar"
|
14
|
+
|
15
|
+
# Install Mash on the user system
|
16
|
+
# Download the Mash v2.0 Linux release and install its binary as
# "#{ROOT_path}/mash.linux".
#
# Steps: fetch the release tarball into ROOT_path, extract it, copy the
# `mash` executable to `mash.linux`, make it executable, then remove the
# tarball and the extracted directory.
#
# Aborts the process (non-zero exit) with a message that includes the
# underlying error when any step fails.
def installMash

  url = "https://github.com/marbl/Mash/releases/download/v2.0/mash-Linux64-v2.0.tar"

  begin
    # URI#open (provided by open-uri) is used instead of Kernel#open, which no
    # longer accepts URLs since Ruby 3.0. The block form closes the response.
    URI.parse(url).open do |resp|
      File.open("#{ROOT_path}/mash.tar", "wb") do |file|
        file.write(resp.read)
      end
    end

    # Block form restores the previous working directory afterwards.
    Dir.chdir(ROOT_path) do
      # system returns false/nil on failure; raise so the rescue below reports it.
      system("tar -xvf mash.tar > /dev/null 2>&1") || raise("could not extract mash.tar")
      system("cp mash-Linux64-v2.0/mash mash.linux") || raise("could not copy the mash binary")
      File.chmod(0755, "mash.linux")
      system("rm -fr mash.tar mash-Linux64-v2.0/")
    end
  rescue StandardError => e
    abort "Problem installing Mash, aborting (#{e.message})"
  end

end
|
33
|
+
|
34
|
+
|
35
|
+
# Install Mash if it is not already installed next to this script.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/mash.linux"

  puts "Installing Mash the MinHash sequence comparator.."
  puts "See https://github.com/marbl/Mash/"
  puts "The Licence is Custom"
  installMash
  puts "Mash successfully installed in #{ROOT_path}/mash.linux"
  puts ""

end
|
data/bin/bacterial-annotator
CHANGED
@@ -10,6 +10,7 @@
|
|
10
10
|
|
11
11
|
require 'bacterial-annotator'
|
12
12
|
require 'bacterial-comparator'
|
13
|
+
require 'bacterial-identificator'
|
13
14
|
|
14
15
|
VERSION = "0.6.6"
|
15
16
|
|
@@ -22,6 +23,7 @@ def print_version
|
|
22
23
|
version += " -- fasta36 v36.3.8d\n"
|
23
24
|
version += " -- RAxML v8.2.11\n"
|
24
25
|
version += " -- FastTree v2.1.10\n"
|
26
|
+
version += " -- MASH v2.0\n"
|
25
27
|
puts version
|
26
28
|
end
|
27
29
|
|
@@ -36,6 +38,9 @@ bacterial-annotator [annotate | compare] [OPTIONS]
|
|
36
38
|
|
37
39
|
# Choose either to annotate a genome or compare several genome annotations
|
38
40
|
|
41
|
+
identify [OPTIONS]
|
42
|
+
.. see identify -h for OPTIONS
|
43
|
+
|
39
44
|
annotate [OPTIONS]
|
40
45
|
.. see annotate -h for OPTIONS
|
41
46
|
|
@@ -64,7 +69,7 @@ annotate [OPTIONS]
|
|
64
69
|
--name/-n <name> Sample name
|
65
70
|
|
66
71
|
// MERGEM-based Annotation (Recommended)
|
67
|
-
--db/-d
|
72
|
+
--db/-d <directory> MERGEM database directory
|
68
73
|
|
69
74
|
// Reference-Based Annotation
|
70
75
|
--refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
|
@@ -122,7 +127,6 @@ def parseOptions_annotate
|
|
122
127
|
when "--externaldb"
|
123
128
|
options[:external_db] = ARGV.shift
|
124
129
|
when "--help", "-h"
|
125
|
-
|
126
130
|
usage_annotate
|
127
131
|
abort
|
128
132
|
when "--version", "-v"
|
@@ -213,6 +217,58 @@ def parseOptions_compare
|
|
213
217
|
|
214
218
|
end
|
215
219
|
|
220
|
+
|
221
|
+
# Print the command-line help of the `identify` sub-command to stdout.
# Lists every switch that parseOptions_identify accepts, including the
# short forms.
def usage_identify

  print <<OEM

identify [OPTIONS] genome_1.fasta genome_2.fasta genome_x.fasta

  //MERGEM Database
    --db/-d <database directory>    Directory containing species-sequences.msh

  //IO
    --proc/-p <nb of process>       Number of process to run the comparison
    --help/-h                       Print this help message

OEM

end
|
236
|
+
|
237
|
+
# Parse the command-line arguments of the `identify` sub-command.
#
# Consumes ARGV destructively (every argument is shifted off). Switches are
# matched case-insensitively; any other argument is treated as a genome file
# path and must exist on disk.
#
# Returns a Hash with:
#   :proc         - 2 by default, otherwise the raw String given to --proc/-p
#   :genomes_list - Array of the existing file paths, in command-line order
#   :database     - value given to --db/-d (key absent when not supplied)
#
# Prints the usage and aborts on --help/-h or when a listed file is missing.
def parseOptions_identify

  options = {}
  options[:proc] = 2
  options[:genomes_list] = []

  while (arg = ARGV.shift)

    case arg.downcase
    when "--db", "-d"
      options[:database] = ARGV.shift
    when "--proc", "-p"
      options[:proc] = ARGV.shift
    when "--help", "-h"
      usage_identify
      abort
    else
      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      unless File.exist? arg
        puts "#{arg} file doesn't exist"
        usage_identify
        abort
      end
      options[:genomes_list] << arg
    end

  end

  options

end
|
268
|
+
|
269
|
+
|
270
|
+
|
271
|
+
|
216
272
|
########
|
217
273
|
# MAIN #
|
218
274
|
########
|
@@ -230,9 +286,9 @@ if ARGV.size >= 1
|
|
230
286
|
system("ba_fasta36")
|
231
287
|
system("ba_cdhit")
|
232
288
|
system("ba_fasttree")
|
289
|
+
system("ba_mash")
|
233
290
|
|
234
291
|
options = {}
|
235
|
-
genomes_list = [] # TODO multiple input genomes
|
236
292
|
|
237
293
|
if ARGV[0] == "annotate"
|
238
294
|
|
@@ -240,7 +296,8 @@ if ARGV.size >= 1
|
|
240
296
|
options = parseOptions_annotate
|
241
297
|
|
242
298
|
if ! File.exist? ("#{ROOT}/blat.linux")
|
243
|
-
|
299
|
+
puts "Exiting program because blat is missing"
|
300
|
+
abort
|
244
301
|
end
|
245
302
|
|
246
303
|
# Check Options
|
@@ -264,6 +321,17 @@ if ARGV.size >= 1
|
|
264
321
|
bcomp.mafft_aln aln_opt
|
265
322
|
bcomp.raxml_tree aln_opt, options[:bootstrap] if options[:phylogeny] == 1
|
266
323
|
|
324
|
+
elsif ARGV[0] == "identify"
|
325
|
+
|
326
|
+
ARGV.shift
|
327
|
+
options = parseOptions_identify
|
328
|
+
if options[:genomes_list].empty?
|
329
|
+
puts "You need at least 1 genome fasta to identify !!"
|
330
|
+
usage_identify
|
331
|
+
abort
|
332
|
+
end
|
333
|
+
bident = BacterialIdentificator.new(options, ROOT)
|
334
|
+
|
267
335
|
elsif ARGV[0] == "--version" or ARGV[0] == "-v"
|
268
336
|
|
269
337
|
print_version
|
@@ -273,7 +341,6 @@ if ARGV.size >= 1
|
|
273
341
|
usage
|
274
342
|
end
|
275
343
|
|
276
|
-
|
277
344
|
else
|
278
345
|
usage
|
279
346
|
end
|
@@ -22,8 +22,6 @@ class SequenceFasta
|
|
22
22
|
abort "Aborting : The input sequence is not a fasta file !"
|
23
23
|
end
|
24
24
|
|
25
|
-
# @contigs = extract_contigs(@fasta_flat)
|
26
|
-
|
27
25
|
@meta = meta
|
28
26
|
|
29
27
|
@annotation_files = nil
|
@@ -38,7 +36,7 @@ class SequenceFasta
|
|
38
36
|
|
39
37
|
@annotation_files = {}
|
40
38
|
Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
|
41
|
-
if @meta
|
39
|
+
if @meta==1
|
42
40
|
system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
43
41
|
else
|
44
42
|
system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
@@ -100,7 +100,7 @@ class SequenceSynteny
|
|
100
100
|
# first hit for query
|
101
101
|
if ! @query_sequences[key].has_key? :homology
|
102
102
|
@query_sequences[key][:conserved] = true
|
103
|
-
@subject_sequences[key][:conserved] = true
|
103
|
+
# @subject_sequences[key][:conserved] = true
|
104
104
|
@query_sequences[key][:homology] = {
|
105
105
|
pId: lA[2].to_f.round(2),
|
106
106
|
cov_query: cov_query,
|
@@ -119,7 +119,7 @@ class SequenceSynteny
|
|
119
119
|
# query already got at least 1 hit and new_score > last_score
|
120
120
|
elsif lA[11].to_f > @query_sequences[key][:homology][:score]
|
121
121
|
@query_sequences[key][:conserved] = true
|
122
|
-
@subject_sequences[key][:conserved] = true
|
122
|
+
# @subject_sequences[key][:conserved] = true
|
123
123
|
@query_sequences[key][:homology] = {
|
124
124
|
pId: lA[2].to_f.round(2),
|
125
125
|
cov_query: cov_query,
|
data/lib/bacterial-annotator.rb
CHANGED
@@ -26,7 +26,7 @@ class BacterialAnnotator
|
|
26
26
|
@minlength = @options[:minlength].to_i
|
27
27
|
@options[:minlength] = @options[:minlength].to_i
|
28
28
|
@options[:pidentity] = @options[:pidentity].to_f
|
29
|
-
@options[:
|
29
|
+
@options[:pidentity] = @options[:pidentity] * 100 if @options[:pidentity] <= 1.00
|
30
30
|
@options[:pcoverage] = @options[:pcoverage].to_f
|
31
31
|
@options[:pcoverage] = @options[:pcoverage] / 100 if @options[:pcoverage] > 1.00
|
32
32
|
|
@@ -200,8 +200,12 @@ class BacterialAnnotator
|
|
200
200
|
"Prot-ExternalDB", @options[:pidentity],
|
201
201
|
@options[:pcoverage], "prot")
|
202
202
|
|
203
|
-
|
203
|
+
print "# Running BLAT alignment with External Database.."
|
204
|
+
start_time = Time.now
|
204
205
|
@externaldb_synteny.run_blat @root, @options[:outdir]
|
206
|
+
end_time = Time.now
|
207
|
+
c_time = Helper.sec2str(end_time-start_time)
|
208
|
+
print "done (#{c_time})\n"
|
205
209
|
@externaldb_synteny.extract_hits :externaldb
|
206
210
|
|
207
211
|
@externaldb_synteny.query_sequences.each do |k, v|
|
@@ -214,6 +218,9 @@ class BacterialAnnotator
|
|
214
218
|
|
215
219
|
next if ! v.has_key? :homology
|
216
220
|
|
221
|
+
if ! @contig_annotations_cds.has_key? contig_of_protein
|
222
|
+
@contig_annotations_cds[contig_of_protein] = []
|
223
|
+
end
|
217
224
|
@contig_annotations_cds[contig_of_protein] << k
|
218
225
|
|
219
226
|
hit_gi = v[:homology][:hits][0]
|
@@ -238,7 +245,6 @@ class BacterialAnnotator
|
|
238
245
|
inference: inference
|
239
246
|
}
|
240
247
|
|
241
|
-
|
242
248
|
end
|
243
249
|
|
244
250
|
end
|
@@ -256,12 +262,17 @@ class BacterialAnnotator
|
|
256
262
|
gbk_path = @query_fasta.annotation_files[:gbk_path]
|
257
263
|
gbk_to_annotate = SequenceAnnotation.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
|
258
264
|
|
259
|
-
if @with_external_db
|
265
|
+
if @with_external_db and @with_refence_genome
|
260
266
|
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
261
267
|
(@prot_synteny_refgenome.query_sequences.merge(@externaldb_synteny.query_sequences)),
|
262
268
|
@contig_annotations_externaldb[contig].merge(@ref_genome.coding_seq),
|
263
269
|
(File.basename @options[:refgenome]).gsub(/.gb.*/,"")
|
264
270
|
)
|
271
|
+
elsif @with_external_db
|
272
|
+
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
273
|
+
@externaldb_synteny.query_sequences,
|
274
|
+
@contig_annotations_externaldb[contig]
|
275
|
+
)
|
265
276
|
else
|
266
277
|
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
267
278
|
@prot_synteny_refgenome.query_sequences,
|
@@ -270,7 +281,7 @@ class BacterialAnnotator
|
|
270
281
|
)
|
271
282
|
end
|
272
283
|
|
273
|
-
if @contig_annotations_rna.has_key? contig
|
284
|
+
if @contig_annotations_rna and @contig_annotations_rna.has_key? contig
|
274
285
|
# puts "RNA annotation"
|
275
286
|
gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
|
276
287
|
end
|
@@ -419,6 +430,8 @@ class BacterialAnnotator
|
|
419
430
|
# >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae GN=blaNDM-1 PE=1 SV=1
|
420
431
|
# TrEMBL
|
421
432
|
# >tr|E5KIY2|E5KIY2_ECOLX Beta-lactamase NDM-1 OS=Escherichia coli GN=blaNDM-1 PE=1 SV=1
|
433
|
+
# MERGEM
|
434
|
+
# >Genome_ID|location|Protein_ID|LocusTag|Gene|Protein_Product
|
422
435
|
|
423
436
|
ref_cds = {}
|
424
437
|
|
@@ -436,7 +449,10 @@ class BacterialAnnotator
|
|
436
449
|
product = ""
|
437
450
|
db_source = "[DBSource]"
|
438
451
|
|
439
|
-
if product_long.
|
452
|
+
if product_long.scan(/|/).count >= 5 # MERGEM
|
453
|
+
product = product_long
|
454
|
+
db_source = "RefSeq"
|
455
|
+
elsif product_long.include? " [" and product_long.include? "]" # NCBI
|
440
456
|
organism = product_long[/\[.*?\]/]
|
441
457
|
product = product_long.split(" [")[0].strip
|
442
458
|
elsif product_long.include? "OS=" # Swissprot / TrEMBL
|
@@ -465,6 +481,9 @@ class BacterialAnnotator
|
|
465
481
|
db_source = "UniProtKB"
|
466
482
|
end
|
467
483
|
prot_id = lA[1]
|
484
|
+
elsif key_gi.count("|") == 5
|
485
|
+
db_source = "RefSeq"
|
486
|
+
prot_id = lA[2]
|
468
487
|
end
|
469
488
|
|
470
489
|
ref_cds[key_gi] = {product: product, org: org, prot_id: prot_id, db_source: db_source}
|
data/lib/bacterial-comparator.rb
CHANGED
@@ -153,7 +153,11 @@ class BacterialComparator
|
|
153
153
|
stats = {}
|
154
154
|
stats[:syntenic] = []
|
155
155
|
fout = File.open("#{@outdir}/cds-synteny.tsv", "w")
|
156
|
-
|
156
|
+
genomes_name = []
|
157
|
+
@genomes_list.each do |g|
|
158
|
+
genomes_name.push(File.basename(g))
|
159
|
+
end
|
160
|
+
fout.write("Gene\t"+genomes_name.join("\t")+"\n")
|
157
161
|
|
158
162
|
to_build_multifasta = []
|
159
163
|
|
@@ -386,7 +390,6 @@ class BacterialComparator
|
|
386
390
|
|
387
391
|
end
|
388
392
|
|
389
|
-
|
390
393
|
def raxml_tree_dna bt
|
391
394
|
print "# Genes DNA tree creation (RAXML).."
|
392
395
|
start_time = Time.now
|
@@ -423,7 +426,6 @@ class BacterialComparator
|
|
423
426
|
print "done (#{c_time})\n"
|
424
427
|
end
|
425
428
|
|
426
|
-
|
427
429
|
def raxml_tree aln_opt, bt
|
428
430
|
|
429
431
|
if aln_opt == "both"
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
# author: maxime déraspe
|
3
|
+
# email: maximilien1er@gmail.com
|
4
|
+
# review:
|
5
|
+
# date: 17-10-12
|
6
|
+
# version: 0.0.1
|
7
|
+
# licence:
|
8
|
+
|
9
|
+
require 'bio'
|
10
|
+
require 'fileutils'
|
11
|
+
require 'parallel'
|
12
|
+
require 'helper'
|
13
|
+
|
14
|
+
# Identifies genomes by comparing each input fasta against a Mash sketch
# ("species-sequences.msh") stored in the database directory.
#
# For every genome a "<genome>.msh_dist" file is written next to the input,
# holding the Mash hits ordered from the smallest to the largest distance.
class BacterialIdentificator

  attr_reader :genomes_list, :stats

  # Build the identificator and immediately run Mash on every genome.
  #
  # options - Hash read for :database (directory holding
  #           species-sequences.msh), :genomes_list (Array of fasta paths)
  #           and :proc (converted with to_i).
  # root    - directory that contains the mash.linux executable.
  def initialize options, root

    @root = root
    @db_path = options[:database]
    @genomes_list = options[:genomes_list]
    @proc = options[:proc].to_i

    @genomes_list.each do |g|
      mash_genome g
    end

  end


  # Run `mash dist` for one genome and write the sorted hits to
  # "<genome>.msh_dist" (tab-separated, one hit per line).
  #
  # Output columns, as produced by Mash:
  # Reference-ID, Query-ID, Mash-distance, P-value, and Matching-hashes
  def mash_genome genome

    # Array form of popen bypasses the shell, so paths containing spaces or
    # shell metacharacters are passed to mash untouched.
    command = ["#{@root}/mash.linux", "dist", "#{@db_path}/species-sequences.msh", genome]
    results_raw = IO.popen(command, &:read)
    warn "mash dist failed for #{genome}" unless $?.success?

    File.open("#{genome}.msh_dist", "w") do |fout|
      sorted_hits(results_raw).each do |fields|
        fout.write(fields.join("\t"))
        fout.write("\n")
      end
    end

  end

  private

  # Turn raw `mash dist` output into an Array of field Arrays.
  #
  # Blank lines and hits whose Matching-hashes column starts with "0/" (no
  # shared hash) are dropped. The remaining hits are ordered by distance
  # compared as a number: a string comparison would misplace values that
  # Mash prints in scientific notation (e.g. "6.4e-05" vs "0.2").
  def sorted_hits results_raw

    hits = results_raw.to_s.each_line.map { |line| line.chomp.split("\t") }
    hits = hits.reject { |fields| fields.empty? || fields.last.split("/").first == "0" }
    hits.sort_by { |fields| fields[2].to_f }

  end

end
|
64
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bacterial-annotator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maxime Deraspe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -83,6 +83,7 @@ executables:
|
|
83
83
|
- ba_fasta36
|
84
84
|
- ba_cdhit
|
85
85
|
- ba_fasttree
|
86
|
+
- ba_mash
|
86
87
|
extensions: []
|
87
88
|
extra_rdoc_files: []
|
88
89
|
files:
|
@@ -92,6 +93,7 @@ files:
|
|
92
93
|
- bin/ba_fasta36
|
93
94
|
- bin/ba_fasttree
|
94
95
|
- bin/ba_mafft
|
96
|
+
- bin/ba_mash
|
95
97
|
- bin/ba_prodigal
|
96
98
|
- bin/ba_raxml
|
97
99
|
- bin/bacterial-annotator
|
@@ -100,6 +102,7 @@ files:
|
|
100
102
|
- lib/bacterial-annotator/sequence-fasta.rb
|
101
103
|
- lib/bacterial-annotator/sequence-synteny.rb
|
102
104
|
- lib/bacterial-comparator.rb
|
105
|
+
- lib/bacterial-identificator.rb
|
103
106
|
- lib/helper.rb
|
104
107
|
homepage: http://rubygems.org/gems/bacterial-annotator
|
105
108
|
licenses:
|