aai 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 27ae62f0f9faca6746e563c76f8ca78376cd7e06
4
- data.tar.gz: 0766cf35cf6654147d7a07261f2e4c9cef610062
3
+ metadata.gz: b2cbffaec2af6b021831515c48b227eeb9115b95
4
+ data.tar.gz: 537374f723948c2b42d118e3bd2cfc702b0f39ad
5
5
  SHA512:
6
- metadata.gz: 4ab936b2f2a91c5b1415e2adf552a809b65487e0d6b3b611cc6551b28686109e6d402e9452cb1e6229b3c78fa16356837e1013bde573954db0f77ecfd53027e3
7
- data.tar.gz: ed52d87efbcdb2aed8bd9ecb7f8da62cceb029e44a4e4f1cc61d55b777dbeb6f711583f51e065a527267edbcba381cb1f511c982785bd05624532124e6ed8a8a
6
+ metadata.gz: 9bc4dcd6f1ad1401d96734d7e97f6bf8a614dd474e9d61eb6973f8f148c948cbbfe47fd4671a30d80eb26345b56315912a7b94f26e1b4a2dad4f2afbbead313e
7
+ data.tar.gz: 6ab74991df2c29db77f8b0ddfd3c72b69428c01cfd3c2f9952f4d6687503d33caa8dcc472d1349938f41cd849639afbd4d5f4d855799d2759605529a45edef14
@@ -0,0 +1,6 @@
1
+ # Change log
2
+
3
+ Only key changes are listed here.
4
+
5
+ - 0.4.0: Switch to using DIAMOND instead of BLAST
6
+ - 0.3.0: Last version that uses blast.
data/README.md CHANGED
@@ -1,7 +1,5 @@
1
1
  # AAI
2
2
 
3
- [![Build Status](https://travis-ci.org/mooreryan/aai.svg?branch=master)](https://travis-ci.org/mooreryan/aai)
4
- [![Coverage Status](https://coveralls.io/repos/github/mooreryan/aai/badge.svg?branch=master)](https://coveralls.io/github/mooreryan/aai?branch=master)
5
3
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
4
 
7
5
  Calculate Seanie's multi-genome (or genome bin, or metagenome sample) amino acid similarity.
@@ -10,8 +8,13 @@ Calculate Seanie's multi-genome (or genome bin, or metagenome sample) amino acid
10
8
 
11
9
  The following programs must be installed and on your `PATH` for `aai` to work.
12
10
 
13
- - GNU Parallel
14
- - NCBI Blast suite
11
+ For versions `>= 0.4`
12
+
13
+ - [DIAMOND](https://github.com/bbuchfink/diamond/)
14
+
15
+ For versions `< 0.4`
16
+
17
+ - [NCBI Blast suite](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download)
15
18
 
16
19
  ## Installation
17
20
 
@@ -38,13 +41,14 @@ And then execute:
38
41
  ### Example
39
42
 
40
43
  ```
41
- $ ruby exe/aai.rb --infiles *.fa
44
+ $ ruby exe/aai.rb --infiles *.fa --outdir aai_output
42
45
  ```
43
46
 
44
47
  ### Options
45
48
 
46
49
  ```
47
50
  Options:
51
+ -c, --cpus=<i> Number of CPUs to use (default: 1)
48
52
  -i, --infiles=<s+> Input files
49
53
  -o, --outdir=<s> Output directory (default: .)
50
54
  -b, --basename=<s> Base name for output file (default: aai_scores)
@@ -28,7 +28,7 @@ Gem::Specification.new do |spec|
28
28
  spec.add_development_dependency "yard", "~> 0.9.9"
29
29
 
30
30
  spec.add_runtime_dependency "abort_if", "~> 0.2.0"
31
- spec.add_runtime_dependency "parallel", "~> 1.6", ">= 1.6.1"
31
+ # spec.add_runtime_dependency "parallel", "~> 1.6", ">= 1.6.1"
32
32
  spec.add_runtime_dependency "parse_fasta", "~> 2.2"
33
33
  spec.add_runtime_dependency "systemu", "~> 2.6", ">= 2.6.5"
34
34
  spec.add_runtime_dependency "trollop", "~> 2.1", ">= 2.1.2"
data/lib/aai.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  require "abort_if"
2
2
  require "systemu"
3
- require "parallel"
4
3
  require "parse_fasta"
5
4
 
6
5
  require "aai/core_extensions"
@@ -21,12 +20,17 @@ module Aai
21
20
  EVALUE_CUTOFF = 1e-3
22
21
  LENGTH_CUTOFF = 70 # actually is 70 percent
23
22
 
23
+ # If a blast job fails, it will retry once. If it fails again, it
24
+ # will be ignored by the rest of the pipeline.
24
25
  def blast_permutations! fastas, blast_dbs, cpus=4
25
26
  file_permutations = one_way_combinations fastas, blast_dbs, true
26
27
  file_permutations = file_permutations.select do |f1, f2|
27
28
  genome_from_fname(f1) != genome_from_fname(f2)
28
29
  end
29
30
 
31
+ completed_outf_names = []
32
+ failed_jobs = []
33
+
30
34
  first_files = file_permutations.map(&:first)
31
35
  second_files = file_permutations.map(&:last)
32
36
 
@@ -49,22 +53,61 @@ module Aai
49
53
  "#{f1}____#{f2}.aai_blastp"
50
54
  end
51
55
 
52
- parallel_args = first_files.length.times.map do |idx|
56
+ args = first_files.length.times.map do |idx|
53
57
  [first_files[idx], second_files[idx], outf_names[idx]]
54
58
  end
55
59
 
56
60
  Time.time_it "Running blast jobs" do
57
- Parallel.each(parallel_args, in_processes: cpus) do |infiles|
61
+ args.each_with_index do |infiles, idx|
58
62
  query = infiles[0]
59
63
  db = infiles[1]
60
64
  out = infiles[2]
61
65
 
62
- cmd = "blastp -outfmt 6 -query #{query} -db #{db} -out #{out} -evalue #{EVALUE_CUTOFF}"
63
- Process.run_it! cmd
66
+ cmd = "diamond blastp --threads #{cpus} --outfmt 6 " +
67
+ "--query #{query} --db #{db} --out #{out} " +
68
+ "--evalue #{EVALUE_CUTOFF}"
69
+
70
+ exit_status = Process.run_it cmd
71
+
72
+ if exit_status.zero?
73
+ completed_outf_names << out
74
+ else
75
+ failed_jobs << idx
76
+ AbortIf.logger.warn { "Blast job failed. Non-zero exit status " +
77
+ "(#{exit_status}) " +
78
+ "when running '#{cmd}'. " +
79
+ "Will retry at end." }
80
+ end
81
+ end
82
+ end
83
+
84
+ if failed_jobs.count > 0
85
+ Time.time_it "Retrying failed blast jobs" do
86
+ # retry failed jobs once
87
+ failed_jobs.each do |idx|
88
+ query = args[idx][0]
89
+ db = args[idx][1]
90
+ out = args[idx][2]
91
+
92
+ cmd = "diamond blastp --threads #{cpus} --outfmt 6 " +
93
+ "--query #{query} --db #{db} --out #{out} " +
94
+ "--evalue #{EVALUE_CUTOFF}"
95
+
96
+ exit_status = Process.run_it cmd
97
+
98
+ if exit_status.zero?
99
+ completed_outf_names << out
100
+ else
101
+ AbortIf.logger.error { "Retrying blast job failed. " +
102
+ "Non-zero exit status " +
103
+ "(#{exit_status}) " +
104
+ "when running '#{cmd}'." }
105
+ end
106
+ end
64
107
  end
65
108
  end
66
109
 
67
- outf_names
110
+ completed_outf_names
68
111
  end
69
112
 
70
113
  # Make blast dbs given an array of filenames.
@@ -78,8 +121,9 @@ module Aai
78
121
  outfiles = fnames.map { |fname| fname + suffix }
79
122
 
80
123
  Time.time_it "Making blast databases" do
81
- Parallel.each(fnames, in_processes: cpus) do |fname|
82
- cmd = "makeblastdb -in #{fname} -out #{fname}#{BLAST_DB_SUFFIX} -dbtype prot"
124
+ fnames.each do |fname|
125
+ cmd = "diamond makedb --threads #{cpus} --in #{fname} " +
126
+ "--db #{fname}#{BLAST_DB_SUFFIX}"
83
127
 
84
128
  Process.run_it! cmd
85
129
  end
@@ -103,13 +147,15 @@ module Aai
103
147
  clean_fnames << clean_fname
104
148
  File.open(clean_fname, "w") do |f|
105
149
  Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
106
- header =
107
- annotate_header clean_header(rec.header),
108
- File.basename(fname)
150
+ unless bad_seq? rec.seq
151
+ header =
152
+ annotate_header clean_header(rec.header),
153
+ File.basename(fname)
109
154
 
110
- seq_lengths[header] = rec.seq.length
155
+ seq_lengths[header] = rec.seq.length
111
156
 
112
- f.puts ">#{header}\n#{rec.seq}"
157
+ f.puts ">#{header}\n#{rec.seq}"
158
+ end
113
159
  end
114
160
  end
115
161
  end
@@ -269,6 +315,13 @@ module Aai
269
315
 
270
316
  private
271
317
 
318
+ # this is to account for the weird IMG error. Some seqs will
319
+ # not have an actual protein, rather it will be "No sequence
320
+ # found"
321
+ def bad_seq? seq
322
+ seq.downcase.include? "nosequencefound"
323
+ end
324
+
272
325
  def two_way_hit? hit1, hit2
273
326
  hit1[:query_name] == hit2[:target_name] &&
274
327
  hit1[:query_genome] == hit2[:target_genome]
@@ -1,5 +1,9 @@
1
1
  module Aai
2
2
  module Utils
3
+ def clean_str str
4
+ str.strip.gsub(/[^\p{Alnum}_]+/, "_").gsub(/_+/, "_")
5
+ end
6
+
3
7
  # Raises SystemExit if one of the fnames does not exist.
4
8
  def check_files fnames
5
9
  fnames.each do |fname|
@@ -1,5 +1,5 @@
1
1
  module Aai
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  COPYRIGHT = "2017 Ryan Moore"
4
4
  CONTACT = "moorer@udel.edu"
5
5
  WEBSITE = "https://github.com/mooreryan/aai"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aai
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-06-23 00:00:00.000000000 Z
11
+ date: 2017-06-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -94,26 +94,6 @@ dependencies:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
96
  version: 0.2.0
97
- - !ruby/object:Gem::Dependency
98
- name: parallel
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - "~>"
102
- - !ruby/object:Gem::Version
103
- version: '1.6'
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- version: 1.6.1
107
- type: :runtime
108
- prerelease: false
109
- version_requirements: !ruby/object:Gem::Requirement
110
- requirements:
111
- - - "~>"
112
- - !ruby/object:Gem::Version
113
- version: '1.6'
114
- - - ">="
115
- - !ruby/object:Gem::Version
116
- version: 1.6.1
117
97
  - !ruby/object:Gem::Dependency
118
98
  name: parse_fasta
119
99
  requirement: !ruby/object:Gem::Requirement
@@ -180,6 +160,7 @@ files:
180
160
  - ".gitignore"
181
161
  - ".rspec"
182
162
  - ".travis.yml"
163
+ - CHANGELOG.md
183
164
  - CODE_OF_CONDUCT.md
184
165
  - Gemfile
185
166
  - LICENSE