bacterial-annotator 0.5.6 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 88abe4fa7d11fe6afe5719a8fd91b512dc0f3b2c
4
- data.tar.gz: 4cc2535a8aa6cc65cee81500cd7683cb3d06fb9c
3
+ metadata.gz: 1ef156f21da883b7175715d94f734fe2cadb3d02
4
+ data.tar.gz: 2620a56039162760e875242c1225bac333d78be1
5
5
  SHA512:
6
- metadata.gz: f263ec699bf5c7996c1dab3703eaa76abab0637b0edd6a3653a9382b93200d002095f10c7ef17c16ec727910616f20afeb20da5075341469b37862aaa38477a2
7
- data.tar.gz: 5b5e56947ec630d095a89cf9628aef046d4d0443673bf6d627d1616fe9c647557797ff9cee31516b348fd029b6075b1a0fd2ec831d445e76c2d12b966a25654c
6
+ metadata.gz: 44d3900f5b482a0bf66b87662384553603b7862d8c85b10c6eb71c498df18b9aba98fc792f590601faf6b50f03d638786b8e6515b05dc9444ef15c44412f1824
7
+ data.tar.gz: dc65c44722a63fc21b3a87a3bad9770942788eb16da89d3c79d4a1ec154a095fa0304465fe1c5e5f076b8c57d04d8cddb765492678e48e212d4f32a805ddb52f
data/bin/ba_cdhit ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 17-08-24
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
+ ROOT_path = File.dirname(__FILE__)
13
+ # cdhit URL = "https://github.com/weizhongli/cdhit/releases/download/V4.6.8/cd-hit-v4.6.8-2017-0621-source.tar.gz"
14
+
15
+ # Install cdhit on the user system
16
+ def installcdhit
17
+
18
+ begin
19
+ resp = open("https://github.com/weizhongli/cdhit/releases/download/V4.6.8/cd-hit-v4.6.8-2017-0621-source.tar.gz")
20
+ open("#{ROOT_path}/cdhit-4.6.8.tar.gz", "wb") do |file|
21
+ file.write(resp.read)
22
+ end
23
+ Dir.chdir("#{ROOT_path}/")
24
+ `tar xvf cdhit-4.6.8.tar.gz; rm cdhit-4.6.8.tar.gz`
25
+ Dir.chdir("#{ROOT_path}/cd-hit-v4.6.8-2017-0621")
26
+ `make`
27
+ Dir.chdir("../")
28
+ `mv ./cd-hit-v4.6.8-2017-0621/cd-hit ./cdhit.linux`
29
+ `mv ./cd-hit-v4.6.8-2017-0621/cd-hit-2d ./cdhit2d.linux`
30
+ File.chmod(0755, "#{ROOT_path}/cdhit.linux")
31
+ File.chmod(0755, "#{ROOT_path}/cdhit2d.linux")
32
+ rescue
33
+ abort "Problem installing cdhit, aborting"
34
+ end
35
+
36
+ end
37
+
38
+
39
+ # Install cdhit if not already installed
40
+ if ! File.exists? "#{ROOT_path}/cdhit.linux"
41
+
42
+ puts "Installing cd-hit tools.."
43
+ puts "Requirements: SSE+ instructions, OpenMP"
44
+ puts "See https://github.com/weizhongli/cdhit/"
45
+ puts "The Licence is GPLv2"
46
+ installcdhit
47
+ puts "cd-hit successfully installed in #{ROOT_path}/cdhit.linux and cdhit2d.linux"
48
+ puts ""
49
+
50
+ end
51
+
data/bin/ba_diamond ADDED
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 17-08-24
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
+ ROOT_path = File.dirname(__FILE__)
13
+ # diamond URL = "https://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz"
14
+
15
+ # Install diamond on the user system
16
+ def installDiamond
17
+
18
+ begin
19
+ resp = open("https://github.com/bbuchfink/diamond/releases/download/v0.9.10/diamond-linux64.tar.gz")
20
+ open("#{ROOT_path}/diamond-linux64.tar.gz", "wb") do |file|
21
+ file.write(resp.read)
22
+ end
23
+ Dir.chdir("#{ROOT_path}/")
24
+ `tar xvf diamond-linux64.tar.gz; rm diamond-linux64.tar.gz`
25
+ `mv diamond ./diamond.linux`
26
+ `rm diamond_manual.pdf`
27
+ File.chmod(0755, "#{ROOT_path}/diamond.linux")
28
+ rescue
29
+ abort "Problem installing Diamond, aborting"
30
+ end
31
+
32
+ end
33
+
34
+
35
+ # Install diamond if not already installed
36
+ if ! File.exists? "#{ROOT_path}/diamond.linux"
37
+
38
+ puts "Installing Diamond aligner.."
39
+ puts "See https://github.com/bbuchfink/diamond/"
40
+ puts "The Licence is GPLv3"
41
+ installDiamond
42
+ puts "Diamond successfully installed in #{ROOT_path}/diamond.linux"
43
+ puts ""
44
+
45
+ end
46
+
data/bin/ba_fasta36 ADDED
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 17-08-24
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
+ ROOT_path = File.dirname(__FILE__)
13
+ # fasta36 URL = "https://github.com/wrpearson/fasta36/releases/download/v36.3.8d_13Apr16/fasta-36.3.8d-linux64.tar.gz"
14
+
15
+ # Install fasta36 on the user system
16
+ def installFasta36
17
+
18
+ begin
19
+ resp = open("https://github.com/wrpearson/fasta36/releases/download/v36.3.8d_13Apr16/fasta-36.3.8d-linux64.tar.gz")
20
+ open("#{ROOT_path}/fasta-36.3.8d-linux64.tar.gz", "wb") do |file|
21
+ file.write(resp.read)
22
+ end
23
+ Dir.chdir("#{ROOT_path}/")
24
+ `tar xvf fasta-36.3.8d-linux64.tar.gz; rm fasta-36.3.8d-linux64.tar.gz`
25
+ `cp fasta-36.3.8d/bin/fasta36 ./fasta36.linux`
26
+ `cp fasta-36.3.8d/bin/glsearch36 ./glsearch36.linux`
27
+ `cp fasta-36.3.8d/bin/tfastx36 ./tfastx36.linux`
28
+ File.chmod(0755, "#{ROOT_path}/fasta36.linux")
29
+ rescue
30
+ abort "Problem installing Fasta36, aborting"
31
+ end
32
+
33
+ end
34
+
35
+
36
+ # Install fasta36 if not already installed
37
+ if ! File.exists? "#{ROOT_path}/fasta36.linux"
38
+
39
+ puts "Installing Fasta36 aligner.."
40
+ puts "See https://github.com/wrpearson/fasta36/"
41
+ puts "The Licence is Apache 2"
42
+ installFasta36
43
+ puts "Fasta36 successfully installed in #{ROOT_path}/fasta36.linux"
44
+ puts ""
45
+
46
+ end
47
+
data/bin/ba_fasttree ADDED
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+ # author: maxime déraspe
4
+ # email: maximilien1er@gmail.com
5
+ # review:
6
+ # date: 15-02-24
7
+ # version: 0.01
8
+ # licence:
9
+
10
+ require 'open-uri'
11
+
12
+ ROOT_path = File.dirname(__FILE__)
13
+ # fasttree_url = http://www.microbesonline.org/fasttree/FastTreeMP
14
+ # Install FASTTREE on the user system
15
+
16
+ def installFastTree
17
+
18
+ begin
19
+ resp = open("http://www.microbesonline.org/fasttree/FastTreeMP")
20
+ open("#{ROOT_path}/fasttree.linux", "wb") do |file|
21
+ file.write(resp.read)
22
+ end
23
+ Dir.chdir("#{ROOT_path}/")
24
+ File.chmod(0755, "#{ROOT_path}/fasttree.linux")
25
+ rescue
26
+ abort "Problem in stalling FastTree, aborting"
27
+ end
28
+
29
+ end
30
+
31
+
32
+ # Install FastTree if not already installed
33
+ if ! File.exists? "#{ROOT_path}/fasttree.linux"
34
+
35
+ puts "Installing FastTree .."
36
+ puts "See http://www.microbesonline.org/fasttree/#Install"
37
+ puts "License - OpenSource (unknown)"
38
+ installFastTree
39
+ puts "FastTree successfully installed in #{ROOT_path}/fasttree.linux"
40
+ puts ""
41
+
42
+ end
data/bin/ba_prodigal CHANGED
@@ -24,7 +24,8 @@ def installProdigal
24
24
  `tar xvf v2.6.2.tar.gz; rm v2.6.2.tar.gz`
25
25
  Dir.chdir("#{ROOT_path}/Prodigal-2.6.2")
26
26
  `make`
27
- `cp #{ROOT_path}/Prodigal-2.6.2/prodigal #{ROOT_path}/prodigal.linux`
27
+ Dir.chdir("../")
28
+ `cp Prodigal-2.6.2/prodigal prodigal.linux`
28
29
  File.chmod(0755, "#{ROOT_path}/prodigal.linux")
29
30
  # Dir.chir("#{ROOT_path}")
30
31
  # File.chmod(0755, "#{ROOT_path}/prodigal.linux")
@@ -44,22 +44,21 @@ annotate [OPTIONS]
44
44
  --input/-i <fasta_file> Provide the fasta file to annotate
45
45
  --outdir/-o <outdir> Output directory [default=BAnnotation]
46
46
  --force/-f Force to overwrite the output directory
47
+ --name/-n <name> Sample name
47
48
 
48
- // Dataset
49
- --refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
50
- --guessref Will guess the best reference genome to use for the annotation.
49
+ // MERGEM-based Annotation (Recommended)
50
+ --db MERGEM database directory
51
51
 
52
+ // Reference-Based Annotation
53
+ --refgenome/-g <GBK_ID> Provide a Genbank file or a Gbk Accession ID.
52
54
  --externaldb <proteins fasta_file>
53
55
  Finish or do a complete annotation with this sequence database (a protein fasta file).
54
56
  Fasta headers need to look similar to NCBI or EBI fasta headers, ex.:
55
57
  >gi|385721352|gb|AFI72857.1| NDM-1 [Escherichia coli]
56
58
  >sp|C7C422|BLAN1_KLEPN Beta-lactamase NDM-1 OS=Klebsiella pneumoniae..
57
-
58
- // Other options
59
59
  --pidentity <% identity> Minimum percentage identity to incorporate a CDS annotation [default=0.7]
60
60
  --pcoverage <% identity> Minimum percentage of coverage over protein alignment to incorporate a CDS annotation [default=0.7]
61
61
  .. otherwise hint for a non-functional protein
62
-
63
62
  --minlength <length> Minimum contig length for annotation [default=500]
64
63
 
65
64
  --meta Better for metagenome and plasmid annotations because of disparate codon usage [default=off]
@@ -131,9 +130,10 @@ compare [OPTIONS]
131
130
  --align [dna|prot|both] by default align only proteins
132
131
  --concat <nb of genes> by default 0=all
133
132
 
134
- //Phylogeny (RAXML)
133
+ //Phylogeny (Maximum Likelihood)
135
134
  --phylogeny will build phylogenetic tree from the alignments files (pep or dna)
136
- --bootstrap <nb of bootstrap> by default 100
135
+ --software [raxml|fasttree] (default fasttree)
136
+ --bootstrap <nb of bootstrap> (default 100)
137
137
 
138
138
  OEM
139
139
 
@@ -197,11 +197,15 @@ if ARGV.size > 1
197
197
 
198
198
  ROOT = File.dirname(__FILE__)
199
199
 
200
- # Check for 3rd party dependencies : Prodigal, Blat, MAFFT
200
+ # Check for 3rd party dependencies : Prodigal, Blat, MAFFT, ...
201
201
  system("ba_prodigal")
202
202
  system("ba_blat")
203
203
  system("ba_mafft")
204
204
  system("ba_raxml")
205
+ system("ba_diamond")
206
+ system("ba_fasta36")
207
+ system("ba_cdhit")
208
+ system("ba_fasttree")
205
209
 
206
210
  options = {}
207
211
  genomes_list = [] # TODO multiple input genomes
@@ -35,6 +35,13 @@ class SequenceSynteny
35
35
  flat.each_entry do |s|
36
36
  s_name = s.definition.chomp.split(" ")[0]
37
37
  sequences[s_name] = {}
38
+ properties = s.definition.chomp.split(";")
39
+ partial = false
40
+ if properties.length >= 2 and properties[1].include? "partial"
41
+ partial = (properties[1].gsub("partial=","")=='01')
42
+ puts "partial:" + partial.to_s
43
+ end
44
+ sequences[s_name][:partial] = partial
38
45
  sequences[s_name][:length] = s.seq.length
39
46
  sequences[s_name][:conserved] = false
40
47
  sequences[s_name][:contig] = s_name.split("_")[0..-2].join("_") if s_name.include? "_"
@@ -88,7 +95,7 @@ class SequenceSynteny
88
95
  assert_cutoff = [1,1,1]
89
96
  assert_cutoff[0] = 0 if lA[2].to_f < @pidentity
90
97
  assert_cutoff[1] = 0 if cov_query < @min_coverage
91
- assert_cutoff[2] = 0 if cov_subject < @min_coverage
98
+ assert_cutoff[2] = 0 if cov_subject < @min_coverage and @query_sequences[key][:partial] == false
92
99
 
93
100
  # first hit for query
94
101
  if ! @query_sequences[key].has_key? :homology
@@ -24,7 +24,6 @@ class BacterialComparator
24
24
  @genomes_list = options[:genomes_list]
25
25
  @proc = options[:proc].to_i
26
26
  @phylo_nb_genes = options[:phylo_nb_genes]
27
-
28
27
  min_cov = options[:min_cov].to_f
29
28
  min_pid = options[:pidentity].to_f
30
29
  if min_cov > 1
@@ -209,7 +208,7 @@ class BacterialComparator
209
208
  else
210
209
  status = "OK"
211
210
  status = "FAILED" if cmd != true
212
- puts "Alignment #{f} : #{status}"
211
+ # puts "Alignment #{f} : #{status}"
213
212
  end
214
213
  rescue
215
214
  if trying < 3
@@ -248,7 +247,11 @@ class BacterialComparator
248
247
  puts "..Prot alignment files already exists, skipping."
249
248
  end
250
249
 
251
- concat_alignments "align-genes-pep.all.fasta"
250
+ # ugly hack to find out the reference genome
251
+ ref_id = Dir["#{ori_dir}/#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
252
+
253
+ concat_alignments "align-genes-pep.all.fasta", ref_id
254
+
252
255
  Dir.chdir(ori_dir)
253
256
 
254
257
  end
@@ -277,13 +280,17 @@ class BacterialComparator
277
280
  puts "..Gene alignment files already exists, skipping."
278
281
  end
279
282
 
280
- concat_alignments "align-genes-dna.all.fasta"
283
+ # ugly hack to find out the reference genome
284
+ ref_id = Dir["#{ori_dir}/#{@genomes_list[0]}/*.pep"][0].split('/')[-1].gsub(".pep","")
285
+
286
+ concat_alignments "align-genes-dna.all.fasta", ref_id
287
+
281
288
  Dir.chdir(ori_dir)
282
289
 
283
290
  end
284
291
 
285
292
 
286
- def concat_alignments outfile
293
+ def concat_alignments outfile, ref_id
287
294
 
288
295
  if File.exists?("../#{outfile}") and File.size("../#{outfile}") > 0
289
296
  puts "..Alignment concatenated file already exists, skipping."
@@ -292,8 +299,6 @@ class BacterialComparator
292
299
 
293
300
  fout = File.open("../#{outfile}", "w")
294
301
 
295
- ref_id = Dir["../../#{@genomes_list[0]}/*.pep"][0].gsub(/.*\//,"").gsub(".pep","")
296
-
297
302
  seq = ""
298
303
  Dir["*.aln"].each do |f|
299
304
  flat = Bio::FlatFile.auto(f)
@@ -303,7 +308,7 @@ class BacterialComparator
303
308
  end
304
309
 
305
310
  bioseq = Bio::Sequence.auto(seq)
306
- out = bioseq.output_fasta("#{ref_id}",60)
311
+ out = bioseq.output_fasta("#{ref_id}", 60)
307
312
  fout.write(out)
308
313
 
309
314
  for i in 1..@genomes_list.length
@@ -325,7 +330,9 @@ class BacterialComparator
325
330
  flat.close
326
331
  end
327
332
  bioseq = Bio::Sequence.auto(seq)
328
- out = bioseq.output_fasta("#{@genomes_list[i-1]}",60)
333
+ # get the file name without path prefix and extension
334
+ genome_name = genomes_list[i-1].split("/")[-1].split(".")[0]
335
+ out = bioseq.output_fasta(genome_name,60)
329
336
  fout.write(out)
330
337
  end
331
338
 
@@ -358,7 +365,7 @@ class BacterialComparator
358
365
  tree_dir = "#{current_dir}/tree-genes-dna"
359
366
  cmd = system("#{@root}/raxml.linux -T #{@proc} -f d -N #{bt} -s align-genes-dna.all.fasta -m GTRGAMMA -p 123454321 -n DnaTree -w #{tree_dir}")
360
367
  cmd = system("cat #{tree_dir}/RAxML_result.DnaTree.RUN.* >> #{tree_dir}/RAxML_result.BS")
361
- cmd = system("#{@root}/raxml.linux -T 3 -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.DnaTree -m GTRGAMMA -n DNA_BS_TREE -w #{tree_dir}")
368
+ cmd = system("#{@root}/raxml.linux -T #{@proc} -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.DnaTree -m GTRGAMMA -n DNA_BS_TREE -w #{tree_dir}")
362
369
  cmd = system("ln -s #{tree_dir}/RAxML_bipartitionsBranchLabels.DNA_BS_TREE #{tree_dir}/../")
363
370
  Dir.chdir(ori_dir)
364
371
  end
@@ -374,7 +381,7 @@ class BacterialComparator
374
381
  tree_dir = "#{current_dir}/tree-genes-pep"
375
382
  cmd = system("#{@root}/raxml.linux -T #{@proc} -f d -N #{bt} -s align-genes-pep.all.fasta -m PROTGAMMAAUTO -p 123454321 -n PepTree -w #{tree_dir}")
376
383
  cmd = system("cat #{tree_dir}/RAxML_result.PepTree.RUN.* >> #{tree_dir}/RAxML_result.BS")
377
- cmd = system("#{@root}/raxml.linux -T 3 -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.PepTree -m PROTGAMMAAUTO -n PEP_BS_TREE -w #{tree_dir}")
384
+ cmd = system("#{@root}/raxml.linux -T #{@proc} -f b -z #{tree_dir}/RAxML_result.BS -t #{tree_dir}/RAxML_bestTree.PepTree -m PROTGAMMAAUTO -n PEP_BS_TREE -w #{tree_dir}")
378
385
  cmd = system("ln -s #{tree_dir}/RAxML_bipartitionsBranchLabels.PEP_BS_TREE #{tree_dir}/../")
379
386
  Dir.chdir(ori_dir)
380
387
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bacterial-annotator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.6
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maxime Deraspe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-12 00:00:00.000000000 Z
11
+ date: 2017-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -79,10 +79,18 @@ executables:
79
79
  - ba_blat
80
80
  - ba_mafft
81
81
  - ba_raxml
82
+ - ba_diamond
83
+ - ba_fasta36
84
+ - ba_cdhit
85
+ - ba_fasttree
82
86
  extensions: []
83
87
  extra_rdoc_files: []
84
88
  files:
85
89
  - bin/ba_blat
90
+ - bin/ba_cdhit
91
+ - bin/ba_diamond
92
+ - bin/ba_fasta36
93
+ - bin/ba_fasttree
86
94
  - bin/ba_mafft
87
95
  - bin/ba_prodigal
88
96
  - bin/ba_raxml