bacterial-annotator 0.5.6 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 88abe4fa7d11fe6afe5719a8fd91b512dc0f3b2c
4
- data.tar.gz: 4cc2535a8aa6cc65cee81500cd7683cb3d06fb9c
3
+ metadata.gz: 1ef156f21da883b7175715d94f734fe2cadb3d02
4
+ data.tar.gz: 2620a56039162760e875242c1225bac333d78be1
5
5
  SHA512:
6
- metadata.gz: f263ec699bf5c7996c1dab3703eaa76abab0637b0edd6a3653a9382b93200d002095f10c7ef17c16ec727910616f20afeb20da5075341469b37862aaa38477a2
7
- data.tar.gz: 5b5e56947ec630d095a89cf9628aef046d4d0443673bf6d627d1616fe9c647557797ff9cee31516b348fd029b6075b1a0fd2ec831d445e76c2d12b966a25654c
6
+ metadata.gz: 44d3900f5b482a0bf66b87662384553603b7862d8c85b10c6eb71c498df18b9aba98fc792f590601faf6b50f03d638786b8e6515b05dc9444ef15c44412f1824
7
+ data.tar.gz: dc65c44722a63fc21b3a87a3bad9770942788eb16da89d3c79d4a1ec154a095fa0304465fe1c5e5f076b8c57d04d8cddb765492678e48e212d4f32a805ddb52f
data/bin/ba_cdhit ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 17-08-24
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
ROOT_path = File.dirname(__FILE__)
# cdhit URL = "https://github.com/weizhongli/cdhit/releases/download/V4.6.8/cd-hit-v4.6.8-2017-0621-source.tar.gz"

# Install cdhit on the user system.
#
# Downloads the cd-hit 4.6.8 source tarball into ROOT_path, unpacks it, runs
# `make` in the extracted directory and moves the resulting `cd-hit` and
# `cd-hit-2d` binaries to ROOT_path as `cdhit.linux` and `cdhit2d.linux`
# (mode 0755). The downloaded archive is always removed afterwards.
#
# On any failure (download, tar, make, missing binary) the cause is printed
# on stderr and the process aborts with the original message.
def installcdhit

  # Absolute paths: ROOT_path may be relative, and nothing below should depend
  # on (or change) the process working directory.
  root    = File.expand_path(ROOT_path)
  url     = "https://github.com/weizhongli/cdhit/releases/download/V4.6.8/cd-hit-v4.6.8-2017-0621-source.tar.gz"
  archive = File.join(root, "cdhit-4.6.8.tar.gz")
  src_dir = File.join(root, "cd-hit-v4.6.8-2017-0621")

  begin
    # URI#open (open-uri) rather than Kernel#open, which no longer accepts
    # URLs on Ruby >= 3.0. The block form closes the response for us.
    URI.parse(url).open do |resp|
      File.open(archive, "wb") { |file| IO.copy_stream(resp, file) }
    end

    # system reports failure through its return value, unlike backticks.
    system("tar", "xf", archive, chdir: root) or raise "tar failed on #{archive}"
    system("make", chdir: src_dir, out: File::NULL) or raise "make failed in #{src_dir}"

    File.rename(File.join(src_dir, "cd-hit"), File.join(root, "cdhit.linux"))
    File.rename(File.join(src_dir, "cd-hit-2d"), File.join(root, "cdhit2d.linux"))
    File.chmod(0755, File.join(root, "cdhit.linux"))
    File.chmod(0755, File.join(root, "cdhit2d.linux"))
  rescue => e
    warn "#{e.class}: #{e.message}"
    abort "Problem installing cdhit, aborting"
  ensure
    File.delete(archive) if File.exist?(archive)
  end

end
37
+
38
+
39
# Install cdhit if not already installed.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/cdhit.linux"

  puts "Installing cd-hit tools.."
  puts "Requirements: SSE+ instructions, OpenMP"
  puts "See https://github.com/weizhongli/cdhit/"
  puts "The Licence is GPLv2"
  installcdhit
  puts "cd-hit successfully installed in #{ROOT_path}/cdhit.linux and cdhit2d.linux"
  puts ""

end
51
+
data/bin/ba_diamond ADDED
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 17-08-24
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
ROOT_path = File.dirname(__FILE__)
# diamond URL = "https://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz"

# Install diamond on the user system.
#
# Downloads the Diamond v0.9.10 linux64 tarball into ROOT_path, unpacks it,
# renames the extracted `diamond` binary to `diamond.linux` (mode 0755) and
# removes the bundled `diamond_manual.pdf` if present. The downloaded archive
# is always removed afterwards.
#
# On any failure the cause is printed on stderr and the process aborts with
# the original message.
def installDiamond

  # Absolute paths: ROOT_path may be relative, and nothing below should depend
  # on (or change) the process working directory.
  root    = File.expand_path(ROOT_path)
  url     = "https://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz"
  archive = File.join(root, "diamond-linux64.tar.gz")
  binary  = File.join(root, "diamond.linux")
  manual  = File.join(root, "diamond_manual.pdf")

  begin
    # URI#open (open-uri) rather than Kernel#open, which no longer accepts
    # URLs on Ruby >= 3.0. The block form closes the response for us.
    URI.parse(url).open do |resp|
      File.open(archive, "wb") { |file| IO.copy_stream(resp, file) }
    end

    # system reports failure through its return value, unlike backticks.
    system("tar", "xf", archive, chdir: root) or raise "tar failed on #{archive}"

    File.rename(File.join(root, "diamond"), binary)
    File.delete(manual) if File.exist?(manual)
    File.chmod(0755, binary)
  rescue => e
    warn "#{e.class}: #{e.message}"
    abort "Problem installing Diamond, aborting"
  ensure
    File.delete(archive) if File.exist?(archive)
  end

end
33
+
34
+
35
# Install diamond if not already installed.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/diamond.linux"

  puts "Installing Diamond aligner.."
  puts "See https://github.com/bbuchfink/diamond/"
  puts "The Licence is GPLv3"
  installDiamond
  puts "Diamond successfully installed in #{ROOT_path}/diamond.linux"
  puts ""

end
46
+
data/bin/ba_fasta36 ADDED
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 17-08-24
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
ROOT_path = File.dirname(__FILE__)
# fasta36 URL = "https://github.com/wrpearson/fasta36/releases/download/v36.3.8d_13Apr16/fasta-36.3.8d-linux64.tar.gz"

# Install fasta36 on the user system.
#
# Downloads the fasta-36.3.8d linux64 tarball into ROOT_path, unpacks it and
# copies the `fasta36`, `glsearch36` and `tfastx36` binaries from the
# extracted `fasta-36.3.8d/bin` directory to ROOT_path as `<name>.linux`
# (mode 0755). The downloaded archive is always removed afterwards.
#
# On any failure the cause is printed on stderr and the process aborts with
# the original message.
def installFasta36

  # Absolute paths: ROOT_path may be relative, and nothing below should depend
  # on (or change) the process working directory.
  root    = File.expand_path(ROOT_path)
  url     = "https://github.com/wrpearson/fasta36/releases/download/v36.3.8d_13Apr16/fasta-36.3.8d-linux64.tar.gz"
  archive = File.join(root, "fasta-36.3.8d-linux64.tar.gz")
  bin_dir = File.join(root, "fasta-36.3.8d", "bin")

  begin
    # URI#open (open-uri) rather than Kernel#open, which no longer accepts
    # URLs on Ruby >= 3.0. The block form closes the response for us.
    URI.parse(url).open do |resp|
      File.open(archive, "wb") { |file| IO.copy_stream(resp, file) }
    end

    # system reports failure through its return value, unlike backticks.
    system("tar", "xf", archive, chdir: root) or raise "tar failed on #{archive}"

    ["fasta36", "glsearch36", "tfastx36"].each do |tool|
      target = File.join(root, "#{tool}.linux")
      # A missing source binary raises here instead of being silently ignored.
      IO.copy_stream(File.join(bin_dir, tool), target)
      # Every copied tool must be executable, not only fasta36.
      File.chmod(0755, target)
    end
  rescue => e
    warn "#{e.class}: #{e.message}"
    abort "Problem installing Fasta36, aborting"
  ensure
    File.delete(archive) if File.exist?(archive)
  end

end
34
+
35
+
36
# Install fasta36 if not already installed.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/fasta36.linux"

  puts "Installing Fasta36 aligner.."
  puts "See https://github.com/wrpearson/fasta36/"
  puts "The Licence is Apache 2"
  installFasta36
  puts "Fasta36 successfully installed in #{ROOT_path}/fasta36.linux"
  puts ""

end
47
+
data/bin/ba_fasttree ADDED
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 15-02-24
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
ROOT_path = File.dirname(__FILE__)
# fasttree_url = http://www.microbesonline.org/fasttree/FastTreeMP

# Install FastTree on the user system.
#
# Downloads the FastTreeMP binary and stores it in ROOT_path as
# `fasttree.linux` (mode 0755).
#
# On any failure the cause is printed on stderr, a partially written binary
# is removed, and the process aborts.
def installFastTree

  url    = "http://www.microbesonline.org/fasttree/FastTreeMP"
  target = File.join(File.expand_path(ROOT_path), "fasttree.linux")

  begin
    # URI#open (open-uri) rather than Kernel#open, which no longer accepts
    # URLs on Ruby >= 3.0. The block form closes the response for us.
    URI.parse(url).open do |resp|
      File.open(target, "wb") { |file| IO.copy_stream(resp, file) }
    end
    File.chmod(0755, target)
  rescue => e
    warn "#{e.class}: #{e.message}"
    # Do not leave a broken binary behind: its mere presence would make the
    # "already installed" check skip the next installation attempt.
    File.delete(target) if File.exist?(target)
    abort "Problem installing FastTree, aborting"
  end

end
30
+
31
+
32
# Install FastTree if not already installed.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
unless File.exist? "#{ROOT_path}/fasttree.linux"

  puts "Installing FastTree .."
  puts "See http://www.microbesonline.org/fasttree/#Install"
  puts "License - OpenSource (unknown)"
  installFastTree
  puts "FastTree successfully installed in #{ROOT_path}/fasttree.linux"
  puts ""

end
data/bin/ba_prodigal CHANGED
@@ -24,7 +24,8 @@ def installProdigal
24
24
  `tar xvf v2.6.2.tar.gz; rm v2.6.2.tar.gz`
25
25
  Dir.chdir("#{ROOT_path}/Prodigal-2.6.2")
26
26
  `make`
27
- `cp #{ROOT_path}/Prodigal-2.6.2/prodigal #{ROOT_path}/prodigal.linux`
27
+ Dir.chdir("../")
28
+ `cp Prodigal-2.6.2/prodigal prodigal.linux`
28
29
  File.chmod(0755, "#{ROOT_path}/prodigal.linux")
29
30
  # Dir.chir("#{ROOT_path}")
30
31
  # File.chmod(0755, "#{ROOT_path}/prodigal.linux")
@@ -44,22 +44,21 @@ annotate [OPTIONS]
44
44
  --input/-i <fasta_file> Provide the fasta file to annotate
45
45
  --outdir/-o <outdir> Output directory [default=BAnnotation]
46
46
  --force/-f Force to overwrite the output directory
47
+ --name/-n <name> Sample name
47
48
 
48
- // Dataset
49
- --refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
50
- --guessref Will guess the best reference genome to use for the annotation.
49
+ // MERGEM-based Annotation (Recommended)
50
+ --db MERGEM database directory
51
51
 
52
+ // Reference-Based Annotation
53
+ --refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
52
54
  --externaldb <proteins fasta_file>
53
55
  Finish or do a complete annotation with this sequence database (a protein fasta file).
54
56
  Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
55
57
  >gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
56
58
  >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
57
-
58
- // Other options
59
59
  --pidentity <% identity> Minimum percentage identity to incorporate a CDS annotation [default=0.7]
60
60
  --pcoverage <% identity> Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.7]
61
61
  .. otherwise hint for a non-functional protein
62
-
63
62
  --minlength <length> Minimum contig length for annotation [default=500]
64
63
 
65
64
  --meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
@@ -131,9 +130,10 @@ compare [OPTIONS]
131
130
  --align [dna|prot|both] by default align only proteins
132
131
  --concat <nb of genes> by default 0=all
133
132
 
134
- //Phylogeny (RAXML)
133
+ //Phylogeny (Maximum Likelihood)
135
134
  --phylogeny will build phylogenetic tree from the alignments files (pep or dna)
136
- --bootstrap <nb of bootstrap> by default 100
135
+ --software [raxml|fasttree] (default fasttree)
136
+ --bootstrap <nb of bootstrap> (default 100)
137
137
 
138
138
  OEM
139
139
 
@@ -197,11 +197,15 @@ if ARGV.size > 1
197
197
 
198
198
  ROOT = File.dirname(__FILE__)
199
199
 
200
- # Check for 3rd party dependencies : Prodigal, Blat, MAFFT
200
+ # Check for 3rd party dependencies : Prodigal, Blat, MAFFT, ...
201
201
  system("ba_prodigal")
202
202
  system("ba_blat")
203
203
  system("ba_mafft")
204
204
  system("ba_raxml")
205
+ system("ba_diamond")
206
+ system("ba_fasta36")
207
+ system("ba_cdhit")
208
+ system("ba_fasttree")
205
209
 
206
210
  options = {}
207
211
  genomes_list = [] # TODO multiple input genomes
@@ -35,6 +35,13 @@ class SequenceSynteny
35
35
  flat.each_entry do |s|
36
36
  s_name = s.definition.chomp.split(" ")[0]
37
37
  sequences[s_name] = {}
38
+ properties = s.definition.chomp.split(";")
39
+ partial = false
40
+ if properties.length >= 2 and properties[1].include? "partial"
41
+ partial = (properties[1].gsub("partial=","")=='01')
42
+ puts "partial:" + partial.to_s
43
+ end
44
+ sequences[s_name][:partial] = partial
38
45
  sequences[s_name][:length] = s.seq.length
39
46
  sequences[s_name][:conserved] = false
40
47
  sequences[s_name][:contig] = s_name.split("_")[0..-2].join("_") if s_name.include? "_"
@@ -88,7 +95,7 @@ class SequenceSynteny
88
95
  assert_cutoff = [1,1,1]
89
96
  assert_cutoff[0] = 0 if lA[2].to_f < @pidentity
90
97
  assert_cutoff[1] = 0 if cov_query < @min_coverage
91
- assert_cutoff[2] = 0 if cov_subject < @min_coverage
98
+ assert_cutoff[2] = 0 if cov_subject < @min_coverage and @query_sequences[key][:partial] == false
92
99
 
93
100
  # first hit for query
94
101
  if ! @query_sequences[key].has_key? :homology
@@ -24,7 +24,6 @@ class BacterialComparator
24
24
  @genomes_list = options[:genomes_list]
25
25
  @proc = options[:proc].to_i
26
26
  @phylo_nb_genes = options[:phylo_nb_genes]
27
-
28
27
  min_cov = options[:min_cov].to_f
29
28
  min_pid = options[:pidentity].to_f
30
29
  if min_cov > 1
@@ -209,7 +208,7 @@ class BacterialComparator
209
208
  else
210
209
  status = "OK"
211
210
  status = "FAILED" if cmd != true
212
- puts "Alignment #{f} : #{status}"
211
+ # puts "Alignment #{f} : #{status}"
213
212
  end
214
213
  rescue
215
214
  if trying < 3
@@ -248,7 +247,11 @@ class BacterialComparator
248
247
  puts "..Prot alignment files already exists, skipping."
249
248
  end
250
249
 
251
- concat_alignments "align-genes-pep.all.fasta"
250
+ # ugly hack to find out the reference genome
251
+ ref_id = Dir["#{ori_dir}/#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
252
+
253
+ concat_alignments "align-genes-pep.all.fasta", ref_id
254
+
252
255
  Dir.chdir(ori_dir)
253
256
 
254
257
  end
@@ -277,13 +280,17 @@ class BacterialComparator
277
280
  puts "..Gene alignment files already exists, skipping."
278
281
  end
279
282
 
280
- concat_alignments "align-genes-dna.all.fasta"
283
+ # ugly hack to find out the reference genome
284
+ ref_id = Dir["#{ori_dir}/#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
285
+
286
+ concat_alignments "align-genes-dna.all.fasta", ref_id
287
+
281
288
  Dir.chdir(ori_dir)
282
289
 
283
290
  end
284
291
 
285
292
 
286
- def concat_alignments outfile
293
+ def concat_alignments outfile, ref_id
287
294
 
288
295
  if File.exists?("../#{outfile}") and File.size("../#{outfile}") > 0
289
296
  puts "..Alignment concatenated file already exists, skipping."
@@ -292,8 +299,6 @@ class BacterialComparator
292
299
 
293
300
  fout = File.open("../#{outfile}", "w")
294
301
 
295
- ref_id = Dir["../../#{@genomes_list[0]}/*.pep"][0].gsub(/.*\//,"").gsub(".pep","")
296
-
297
302
  seq = ""
298
303
  Dir["*.aln"].each do |f|
299
304
  flat = Bio::FlatFile.auto(f)
@@ -303,7 +308,7 @@ class BacterialComparator
303
308
  end
304
309
 
305
310
  bioseq = Bio::Sequence.auto(seq)
306
- out = bioseq.output_fasta("#{ref_id}",60)
311
+ out = bioseq.output_fasta("#{ref_id}", 60)
307
312
  fout.write(out)
308
313
 
309
314
  for i in 1..@genomes_list.length
@@ -325,7 +330,9 @@ class BacterialComparator
325
330
  flat.close
326
331
  end
327
332
  bioseq = Bio::Sequence.auto(seq)
328
- out = bioseq.output_fasta("#{@genomes_list[i-1]}",60)
333
+ # get the file name without path prefix and extension
334
+ genome_name = genomes_list[i-1].split("/")[-1].split(".")[0]
335
+ out = bioseq.output_fasta(genome_name,60)
329
336
  fout.write(out)
330
337
  end
331
338
 
@@ -358,7 +365,7 @@ class BacterialComparator
358
365
  tree_dir = "#{current_dir}/tree-genes-dna"
359
366
  cmd = system("#{@root}/raxml.linux -T #{@proc} -f d -N #{bt} -s align-genes-dna.all.fasta -m GTRGAMMA -p 123454321 -n DnaTree -w #{tree_dir}")
360
367
  cmd = system("cat #{tree_dir}/RAxML_result.DnaTree.RUN.* >> #{tree_dir}/RAxML_result.BS")
361
- cmd = system("#{@root}/raxml.linux -T 3 -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.DnaTree -m GTRGAMMA -n DNA_BS_TREE -w #{tree_dir}")
368
+ cmd = system("#{@root}/raxml.linux -T #{@proc} -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.DnaTree -m GTRGAMMA -n DNA_BS_TREE -w #{tree_dir}")
362
369
  cmd = system("ln -s #{tree_dir}/RAxML_bipartitionsBranchLabels.DNA_BS_TREE #{tree_dir}/../")
363
370
  Dir.chdir(ori_dir)
364
371
  end
@@ -374,7 +381,7 @@ class BacterialComparator
374
381
  tree_dir = "#{current_dir}/tree-genes-pep"
375
382
  cmd = system("#{@root}/raxml.linux -T #{@proc} -f d -N #{bt} -s align-genes-pep.all.fasta -m PROTGAMMAAUTO -p 123454321 -n PepTree -w #{tree_dir}")
376
383
  cmd = system("cat #{tree_dir}/RAxML_result.PepTree.RUN.* >> #{tree_dir}/RAxML_result.BS")
377
- cmd = system("#{@root}/raxml.linux -T 3 -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.PepTree -m PROTGAMMAAUTO -n PEP_BS_TREE -w #{tree_dir}")
384
+ cmd = system("#{@root}/raxml.linux -T #{@proc} -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.PepTree -m PROTGAMMAAUTO -n PEP_BS_TREE -w #{tree_dir}")
378
385
  cmd = system("ln -s #{tree_dir}/RAxML_bipartitionsBranchLabels.PEP_BS_TREE #{tree_dir}/../")
379
386
  Dir.chdir(ori_dir)
380
387
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.6
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-12 00:00:00.000000000 Z
11
+ date: 2017-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -79,10 +79,18 @@ executables:
79
79
  - ba_blat
80
80
  - ba_mafft
81
81
  - ba_raxml
82
+ - ba_diamond
83
+ - ba_fasta36
84
+ - ba_cdhit
85
+ - ba_fasttree
82
86
  extensions: []
83
87
  extra_rdoc_files: []
84
88
  files:
85
89
  - bin/ba_blat
90
+ - bin/ba_cdhit
91
+ - bin/ba_diamond
92
+ - bin/ba_fasta36
93
+ - bin/ba_fasttree
86
94
  - bin/ba_mafft
87
95
  - bin/ba_prodigal
88
96
  - bin/ba_raxml