bacterial-annotator 0.6.6 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ba_mash +45 -0
- data/bin/bacterial-annotator +72 -5
- data/lib/bacterial-annotator/sequence-fasta.rb +1 -3
- data/lib/bacterial-annotator/sequence-synteny.rb +2 -2
- data/lib/bacterial-annotator.rb +25 -6
- data/lib/bacterial-comparator.rb +5 -3
- data/lib/bacterial-identificator.rb +64 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0228aafd97af13b8756df42db362e4a53a30f4f0
|
4
|
+
data.tar.gz: 858942624597354dd0f52ad98ea1e94373289174
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 82c36c4fba00b437e721991c739517b7cfeb5edaa7e1ac49849e59d3ffac2165f1ef39f9961aa756ff8ad691fec36a8b3424cf8ce4d0e1125d486fa2e2a38593
|
7
|
+
data.tar.gz: e8b569f61f2dcb7309c6587ce619f7432e2588a717f3017053adcb693327ac2f21850785883eae4477226d057dde18a308a88d85d2a0558561d567865d1348cc
|
data/bin/ba_mash
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
# author: maxime déraspe
|
4
|
+
# email: maximilien1er@gmail.com
|
5
|
+
# review:
|
6
|
+
# date: 17-10-12
|
7
|
+
# version: 0.01
|
8
|
+
# licence:
|
9
|
+
|
10
|
+
require 'open-uri'
|
11
|
+
|
12
|
+
ROOT_path = File.dirname(__FILE__)
|
13
|
+
# url = "https://github.com/marbl/Mash/releases/download/v2.0/mash-Linux64-v2.0.tar"
|
14
|
+
|
15
|
+
# Install Mash on the user system.
#
# Downloads the Mash v2.0 Linux release tarball from GitHub into ROOT_path,
# unpacks it, keeps only the executable as "#{ROOT_path}/mash.linux"
# (mode 0755) and removes the tarball and the unpacked directory.
#
# Aborts the program with an error message if any step fails.
def installMash

  url = "https://github.com/marbl/Mash/releases/download/v2.0/mash-Linux64-v2.0.tar"
  tarball = "#{ROOT_path}/mash.tar"
  unpacked = "#{ROOT_path}/mash-Linux64-v2.0"
  binary = "#{ROOT_path}/mash.linux"

  begin
    # Kernel#open stopped accepting URLs in Ruby 3.0; URI.open is the
    # open-uri entry point there. Older rubies only offer the Kernel form.
    remote = URI.respond_to?(:open) ? URI.open(url) : open(url)
    begin
      File.open(tarball, "wb") { |file| file.write(remote.read) }
    ensure
      remote.close
    end

    # Explicit paths keep the process working directory untouched, and
    # argument lists bypass the shell so odd characters in ROOT_path are safe.
    raise "could not extract #{tarball}" unless system("tar", "-xf", tarball, "-C", ROOT_path)
    raise "could not copy the mash executable" unless system("cp", "#{unpacked}/mash", binary)
    File.chmod(0755, binary)
  rescue StandardError => e
    abort "Problem installing Mash, aborting (#{e.message})"
  ensure
    # Always clean the temporary download artefacts, even after a failure.
    system("rm", "-fr", tarball, unpacked)
  end

end
|
33
|
+
|
34
|
+
|
35
|
+
# Install Mash if it is not already installed.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/mash.linux"

  puts "Installing Mash the MinHash sequence comparator.."
  puts "See https://github.com/marbl/Mash/"
  puts "The Licence is Custom"
  installMash
  puts "Mash successfully installed in #{ROOT_path}/mash.linux"
  puts ""

end
|
data/bin/bacterial-annotator
CHANGED
@@ -10,6 +10,7 @@
|
|
10
10
|
|
11
11
|
require 'bacterial-annotator'
|
12
12
|
require 'bacterial-comparator'
|
13
|
+
require 'bacterial-identificator'
|
13
14
|
|
14
15
|
VERSION = "0.6.6"
|
15
16
|
|
@@ -22,6 +23,7 @@ def print_version
|
|
22
23
|
version += " -- fasta36 v36.3.8d\n"
|
23
24
|
version += " -- RAxML v8.2.11\n"
|
24
25
|
version += " -- FastTree v2.1.10\n"
|
26
|
+
version += " -- MASH v2.0\n"
|
25
27
|
puts version
|
26
28
|
end
|
27
29
|
|
@@ -36,6 +38,9 @@ bacterial-annotator [annotate | compare] [OPTIONS]
|
|
36
38
|
|
37
39
|
# Choose either to annotate a genome or compare several genome annotations
|
38
40
|
|
41
|
+
identify [OPTIONS]
|
42
|
+
.. see identify -h for OPTIONS
|
43
|
+
|
39
44
|
annotate [OPTIONS]
|
40
45
|
.. see annotate -h for OPTIONS
|
41
46
|
|
@@ -64,7 +69,7 @@ annotate [OPTIONS]
|
|
64
69
|
--name/-n <name> Sample name
|
65
70
|
|
66
71
|
// MERGEM-based Annotation (Recommended)
|
67
|
-
--db/-d
|
72
|
+
--db/-d <directory> MERGEM database directory
|
68
73
|
|
69
74
|
// Reference-Based Annotation
|
70
75
|
--refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
|
@@ -122,7 +127,6 @@ def parseOptions_annotate
|
|
122
127
|
when "--externaldb"
|
123
128
|
options[:external_db] = ARGV.shift
|
124
129
|
when "--help", "-h"
|
125
|
-
|
126
130
|
usage_annotate
|
127
131
|
abort
|
128
132
|
when "--version", "-v"
|
@@ -213,6 +217,58 @@ def parseOptions_compare
|
|
213
217
|
|
214
218
|
end
|
215
219
|
|
220
|
+
|
221
|
+
# Print the help text of the "identify" sub-command on standard output.
#
# The option list mirrors what parseOptions_identify accepts, including
# the short -p alias of --proc.
def usage_identify

  print <<OEM

identify [OPTIONS] genome_1.fasta genome_2.fasta genome_x.fasta

  //MERGEM Database
    --db/-d         <database directory>

  //IO
    --proc/-p       <nb of process>     Number of processes to run the comparison

OEM

end
|
236
|
+
|
237
|
+
# Parse the command line of the "identify" sub-command.
#
# Consumes ARGV entirely. Recognised switches (matched case-insensitively):
#   --db / -d    next argument is stored under :database
#   --proc / -p  next argument is stored (as given) under :proc
#   --help / -h  prints the usage and aborts
# Any other argument must be the path of an existing file and is appended
# to :genomes_list; otherwise an error and the usage are printed and the
# program aborts.
#
# Returns the options Hash; :proc defaults to 2 and :genomes_list to [].
def parseOptions_identify

  options = {}
  options[:proc] = 2
  options[:genomes_list] = []

  while x = ARGV.shift

    case x.downcase
    when "--db", "-d"
      options[:database] = ARGV.shift
    when "--proc", "-p"
      options[:proc] = ARGV.shift
    when "--help", "-h"
      usage_identify
      abort
    else
      # File.exist? instead of File.exists?, which was removed in Ruby 3.2
      unless File.exist? x
        puts "#{x} file doesn't exist"
        usage_identify
        abort
      end
      options[:genomes_list] << x
    end

  end

  options

end
|
268
|
+
|
269
|
+
|
270
|
+
|
271
|
+
|
216
272
|
########
|
217
273
|
# MAIN #
|
218
274
|
########
|
@@ -230,9 +286,9 @@ if ARGV.size >= 1
|
|
230
286
|
system("ba_fasta36")
|
231
287
|
system("ba_cdhit")
|
232
288
|
system("ba_fasttree")
|
289
|
+
system("ba_mash")
|
233
290
|
|
234
291
|
options = {}
|
235
|
-
genomes_list = [] # TODO multiple input genomes
|
236
292
|
|
237
293
|
if ARGV[0] == "annotate"
|
238
294
|
|
@@ -240,7 +296,8 @@ if ARGV.size >= 1
|
|
240
296
|
options = parseOptions_annotate
|
241
297
|
|
242
298
|
if ! File.exist? ("#{ROOT}/blat.linux")
|
243
|
-
|
299
|
+
puts "Exiting program because blat is missing"
|
300
|
+
abort
|
244
301
|
end
|
245
302
|
|
246
303
|
# Check Options
|
@@ -264,6 +321,17 @@ if ARGV.size >= 1
|
|
264
321
|
bcomp.mafft_aln aln_opt
|
265
322
|
bcomp.raxml_tree aln_opt, options[:bootstrap] if options[:phylogeny] == 1
|
266
323
|
|
324
|
+
elsif ARGV[0] == "identify"
|
325
|
+
|
326
|
+
ARGV.shift
|
327
|
+
options = parseOptions_identify
|
328
|
+
if options[:genomes_list].empty?
|
329
|
+
puts "You need at least 1 genome fasta to identify !!"
|
330
|
+
usage_identify
|
331
|
+
abort
|
332
|
+
end
|
333
|
+
bident = BacterialIdentificator.new(options, ROOT)
|
334
|
+
|
267
335
|
elsif ARGV[0] == "--version" or ARGV[0] == "-v"
|
268
336
|
|
269
337
|
print_version
|
@@ -273,7 +341,6 @@ if ARGV.size >= 1
|
|
273
341
|
usage
|
274
342
|
end
|
275
343
|
|
276
|
-
|
277
344
|
else
|
278
345
|
usage
|
279
346
|
end
|
@@ -22,8 +22,6 @@ class SequenceFasta
|
|
22
22
|
abort "Aborting : The input sequence is not a fasta file !"
|
23
23
|
end
|
24
24
|
|
25
|
-
# @contigs = extract_contigs(@fasta_flat)
|
26
|
-
|
27
25
|
@meta = meta
|
28
26
|
|
29
27
|
@annotation_files = nil
|
@@ -38,7 +36,7 @@ class SequenceFasta
|
|
38
36
|
|
39
37
|
@annotation_files = {}
|
40
38
|
Dir.mkdir "#{outdir}" if ! Dir.exists? "#{outdir}"
|
41
|
-
if @meta
|
39
|
+
if @meta==1
|
42
40
|
system("#{root}/prodigal.linux -p meta -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
43
41
|
else
|
44
42
|
system("#{root}/prodigal.linux -i #{@fasta_file} -a #{outdir}/Proteins.fa -d #{outdir}/Genes.fa -o #{outdir}/Genbanks.gbk -q")
|
@@ -100,7 +100,7 @@ class SequenceSynteny
|
|
100
100
|
# first hit for query
|
101
101
|
if ! @query_sequences[key].has_key? :homology
|
102
102
|
@query_sequences[key][:conserved] = true
|
103
|
-
@subject_sequences[key][:conserved] = true
|
103
|
+
# @subject_sequences[key][:conserved] = true
|
104
104
|
@query_sequences[key][:homology] = {
|
105
105
|
pId: lA[2].to_f.round(2),
|
106
106
|
cov_query: cov_query,
|
@@ -119,7 +119,7 @@ class SequenceSynteny
|
|
119
119
|
# query already got at least 1 hit and new_score > last_score
|
120
120
|
elsif lA[11].to_f > @query_sequences[key][:homology][:score]
|
121
121
|
@query_sequences[key][:conserved] = true
|
122
|
-
@subject_sequences[key][:conserved] = true
|
122
|
+
# @subject_sequences[key][:conserved] = true
|
123
123
|
@query_sequences[key][:homology] = {
|
124
124
|
pId: lA[2].to_f.round(2),
|
125
125
|
cov_query: cov_query,
|
data/lib/bacterial-annotator.rb
CHANGED
@@ -26,7 +26,7 @@ class BacterialAnnotator
|
|
26
26
|
@minlength = @options[:minlength].to_i
|
27
27
|
@options[:minlength] = @options[:minlength].to_i
|
28
28
|
@options[:pidentity] = @options[:pidentity].to_f
|
29
|
-
@options[:
|
29
|
+
@options[:pidentity] = @options[:pidentity] * 100 if @options[:pidentity] <= 1.00
|
30
30
|
@options[:pcoverage] = @options[:pcoverage].to_f
|
31
31
|
@options[:pcoverage] = @options[:pcoverage] / 100 if @options[:pcoverage] > 1.00
|
32
32
|
|
@@ -200,8 +200,12 @@ class BacterialAnnotator
|
|
200
200
|
"Prot-ExternalDB", @options[:pidentity],
|
201
201
|
@options[:pcoverage], "prot")
|
202
202
|
|
203
|
-
|
203
|
+
print "# Running BLAT alignment with External Database.."
|
204
|
+
start_time = Time.now
|
204
205
|
@externaldb_synteny.run_blat @root, @options[:outdir]
|
206
|
+
end_time = Time.now
|
207
|
+
c_time = Helper.sec2str(end_time-start_time)
|
208
|
+
print "done (#{c_time})\n"
|
205
209
|
@externaldb_synteny.extract_hits :externaldb
|
206
210
|
|
207
211
|
@externaldb_synteny.query_sequences.each do |k, v|
|
@@ -214,6 +218,9 @@ class BacterialAnnotator
|
|
214
218
|
|
215
219
|
next if ! v.has_key? :homology
|
216
220
|
|
221
|
+
if ! @contig_annotations_cds.has_key? contig_of_protein
|
222
|
+
@contig_annotations_cds[contig_of_protein] = []
|
223
|
+
end
|
217
224
|
@contig_annotations_cds[contig_of_protein] << k
|
218
225
|
|
219
226
|
hit_gi = v[:homology][:hits][0]
|
@@ -238,7 +245,6 @@ class BacterialAnnotator
|
|
238
245
|
inference: inference
|
239
246
|
}
|
240
247
|
|
241
|
-
|
242
248
|
end
|
243
249
|
|
244
250
|
end
|
@@ -256,12 +262,17 @@ class BacterialAnnotator
|
|
256
262
|
gbk_path = @query_fasta.annotation_files[:gbk_path]
|
257
263
|
gbk_to_annotate = SequenceAnnotation.new("#{gbk_path}/#{contig}.gbk", "#{gbk_path}")
|
258
264
|
|
259
|
-
if @with_external_db
|
265
|
+
if @with_external_db and @with_refence_genome
|
260
266
|
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
261
267
|
(@prot_synteny_refgenome.query_sequences.merge(@externaldb_synteny.query_sequences)),
|
262
268
|
@contig_annotations_externaldb[contig].merge(@ref_genome.coding_seq),
|
263
269
|
(File.basename @options[:refgenome]).gsub(/.gb.*/,"")
|
264
270
|
)
|
271
|
+
elsif @with_external_db
|
272
|
+
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
273
|
+
@externaldb_synteny.query_sequences,
|
274
|
+
@contig_annotations_externaldb[contig]
|
275
|
+
)
|
265
276
|
else
|
266
277
|
gbk_to_annotate.add_annotation_ref_synteny_prot(
|
267
278
|
@prot_synteny_refgenome.query_sequences,
|
@@ -270,7 +281,7 @@ class BacterialAnnotator
|
|
270
281
|
)
|
271
282
|
end
|
272
283
|
|
273
|
-
if @contig_annotations_rna.has_key? contig
|
284
|
+
if @contig_annotations_rna and @contig_annotations_rna.has_key? contig
|
274
285
|
# puts "RNA annotation"
|
275
286
|
gbk_to_annotate.add_annotations @contig_annotations_rna[contig], "new"
|
276
287
|
end
|
@@ -419,6 +430,8 @@ class BacterialAnnotator
|
|
419
430
|
# >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae GN=blaNDM-1 PE=1 SV=1
|
420
431
|
# TrEMBL
|
421
432
|
# >tr|E5KIY2|E5KIY2_ECOLX Beta-lactamase NDM-1 OS=Escherichia coli GN=blaNDM-1 PE=1 SV=1
|
433
|
+
# MERGEM
|
434
|
+
# >Genome_ID|location|Protein_ID|LocusTag|Gene|Protein_Product
|
422
435
|
|
423
436
|
ref_cds = {}
|
424
437
|
|
@@ -436,7 +449,10 @@ class BacterialAnnotator
|
|
436
449
|
product = ""
|
437
450
|
db_source = "[DBSource]"
|
438
451
|
|
439
|
-
if product_long.
|
452
|
+
if product_long.scan(/|/).count >= 5 # MERGEM
|
453
|
+
product = product_long
|
454
|
+
db_source = "RefSeq"
|
455
|
+
elsif product_long.include? " [" and product_long.include? "]" # NCBI
|
440
456
|
organism = product_long[/\[.*?\]/]
|
441
457
|
product = product_long.split(" [")[0].strip
|
442
458
|
elsif product_long.include? "OS=" # Swissprot / TrEMBL
|
@@ -465,6 +481,9 @@ class BacterialAnnotator
|
|
465
481
|
db_source = "UniProtKB"
|
466
482
|
end
|
467
483
|
prot_id = lA[1]
|
484
|
+
elsif key_gi.count("|") == 5
|
485
|
+
db_source = "RefSeq"
|
486
|
+
prot_id = lA[2]
|
468
487
|
end
|
469
488
|
|
470
489
|
ref_cds[key_gi] = {product: product, org: org, prot_id: prot_id, db_source: db_source}
|
data/lib/bacterial-comparator.rb
CHANGED
@@ -153,7 +153,11 @@ class BacterialComparator
|
|
153
153
|
stats = {}
|
154
154
|
stats[:syntenic] = []
|
155
155
|
fout = File.open("#{@outdir}/cds-synteny.tsv", "w")
|
156
|
-
|
156
|
+
genomes_name = []
|
157
|
+
@genomes_list.each do |g|
|
158
|
+
genomes_name.push(File.basename(g))
|
159
|
+
end
|
160
|
+
fout.write("Gene\t"+genomes_name.join("\t")+"\n")
|
157
161
|
|
158
162
|
to_build_multifasta = []
|
159
163
|
|
@@ -386,7 +390,6 @@ class BacterialComparator
|
|
386
390
|
|
387
391
|
end
|
388
392
|
|
389
|
-
|
390
393
|
def raxml_tree_dna bt
|
391
394
|
print "# Genes DNA tree creation (RAXML).."
|
392
395
|
start_time = Time.now
|
@@ -423,7 +426,6 @@ class BacterialComparator
|
|
423
426
|
print "done (#{c_time})\n"
|
424
427
|
end
|
425
428
|
|
426
|
-
|
427
429
|
def raxml_tree aln_opt, bt
|
428
430
|
|
429
431
|
if aln_opt == "both"
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
# author: maxime déraspe
|
3
|
+
# email: maximilien1er@gmail.com
|
4
|
+
# review:
|
5
|
+
# date: 17-10-12
|
6
|
+
# version: 0.0.1
|
7
|
+
# licence:
|
8
|
+
|
9
|
+
require 'bio'
|
10
|
+
require 'fileutils'
|
11
|
+
require 'parallel'
|
12
|
+
require 'helper'
|
13
|
+
|
14
|
+
# Compares genomes against the species sketch of a database directory
# using the Mash executable and writes the sorted hits next to each genome.
class BacterialIdentificator

  attr_reader :genomes_list, :stats

  # Initialize BacterialIdentificator and immediately run Mash on every
  # genome of the list.
  #
  # options - Hash read for :database (directory holding
  #           species-sequences.msh), :genomes_list (fasta paths) and
  #           :proc (converted with to_i).
  # root    - directory that contains the mash.linux executable.
  def initialize options, root

    @root = root
    @db_path = options[:database]
    @genomes_list = options[:genomes_list]
    @proc = options[:proc].to_i

    @genomes_list.each do |g|
      mash_genome g
    end

  end


  # Run "mash dist" for one genome and write the hits to
  # "<genome>.msh_dist", one tab-separated line per hit, ordered by
  # increasing Mash distance.
  #
  # genome - path of the fasta file to compare.
  def mash_genome genome

    # Reference-ID, Query-ID, Mash-distance, P-value, and Matching-hashes
    # fields = ["hit","query","distance","pvalue","match"]

    # Argument-list form: no shell is involved, so spaces or shell
    # metacharacters in the paths cannot break or alter the command.
    command = ["#{@root}/mash.linux", "dist", "#{@db_path}/species-sequences.msh", genome.to_s]
    results_raw = IO.popen(command, &:read)

    hits = []
    results_raw.each_line do |l|
      fields = l.chomp.split("\t")
      next if fields.empty?                        # blank line
      next if fields.last.split("/").first == '0'  # no match
      hits << fields
    end

    # Distances are text and may use scientific notation ("5e-05"), so
    # they must be compared as numbers; the index keeps ties in Mash order.
    results_sorted = hits.each_with_index
                         .sort_by { |fields, i| [fields[2].to_f, i] }
                         .map(&:first)

    File.open("#{genome}.msh_dist", "w") do |fout|
      results_sorted.each { |fields| fout.puts fields.join("\t") }
    end

  end

end
|
64
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bacterial-annotator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maxime Deraspe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -83,6 +83,7 @@ executables:
|
|
83
83
|
- ba_fasta36
|
84
84
|
- ba_cdhit
|
85
85
|
- ba_fasttree
|
86
|
+
- ba_mash
|
86
87
|
extensions: []
|
87
88
|
extra_rdoc_files: []
|
88
89
|
files:
|
@@ -92,6 +93,7 @@ files:
|
|
92
93
|
- bin/ba_fasta36
|
93
94
|
- bin/ba_fasttree
|
94
95
|
- bin/ba_mafft
|
96
|
+
- bin/ba_mash
|
95
97
|
- bin/ba_prodigal
|
96
98
|
- bin/ba_raxml
|
97
99
|
- bin/bacterial-annotator
|
@@ -100,6 +102,7 @@ files:
|
|
100
102
|
- lib/bacterial-annotator/sequence-fasta.rb
|
101
103
|
- lib/bacterial-annotator/sequence-synteny.rb
|
102
104
|
- lib/bacterial-comparator.rb
|
105
|
+
- lib/bacterial-identificator.rb
|
103
106
|
- lib/helper.rb
|
104
107
|
homepage: http://rubygems.org/gems/bacterial-annotator
|
105
108
|
licenses:
|