aai 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 27ae62f0f9faca6746e563c76f8ca78376cd7e06
4
- data.tar.gz: 0766cf35cf6654147d7a07261f2e4c9cef610062
3
+ metadata.gz: b2cbffaec2af6b021831515c48b227eeb9115b95
4
+ data.tar.gz: 537374f723948c2b42d118e3bd2cfc702b0f39ad
5
5
  SHA512:
6
- metadata.gz: 4ab936b2f2a91c5b1415e2adf552a809b65487e0d6b3b611cc6551b28686109e6d402e9452cb1e6229b3c78fa16356837e1013bde573954db0f77ecfd53027e3
7
- data.tar.gz: ed52d87efbcdb2aed8bd9ecb7f8da62cceb029e44a4e4f1cc61d55b777dbeb6f711583f51e065a527267edbcba381cb1f511c982785bd05624532124e6ed8a8a
6
+ metadata.gz: 9bc4dcd6f1ad1401d96734d7e97f6bf8a614dd474e9d61eb6973f8f148c948cbbfe47fd4671a30d80eb26345b56315912a7b94f26e1b4a2dad4f2afbbead313e
7
+ data.tar.gz: 6ab74991df2c29db77f8b0ddfd3c72b69428c01cfd3c2f9952f4d6687503d33caa8dcc472d1349938f41cd849639afbd4d5f4d855799d2759605529a45edef14
@@ -0,0 +1,6 @@
1
+ # Change log
2
+
3
+ Only key changes are listed here.
4
+
5
+ - 0.4.0: Switch to using DIAMOND instead of BLAST
6
+ - 0.3.0: Last version that uses BLAST.
data/README.md CHANGED
@@ -1,7 +1,5 @@
1
1
  # AAI
2
2
 
3
- [![Build Status](https://travis-ci.org/mooreryan/aai.svg?branch=master)](https://travis-ci.org/mooreryan/aai)
4
- [![Coverage Status](https://coveralls.io/repos/github/mooreryan/aai/badge.svg?branch=master)](https://coveralls.io/github/mooreryan/aai?branch=master)
5
3
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
4
 
7
5
  Calculate Seanie's multi-genome (or genome bin, or metagenome sample) amino acid similarity.
@@ -10,8 +8,13 @@ Calculate Seanie's multi-genome (or genome bin, or metagenome sample) amino acid
10
8
 
11
9
  The following programs must be installed and on your `PATH` for `aai` to work.
12
10
 
13
- - GNU Parallel
14
- - NCBI Blast suite
11
+ For versions `>= 0.4`
12
+
13
+ - [DIAMOND](https://github.com/bbuchfink/diamond/)
14
+
15
+ For versions `< 0.4`
16
+
17
+ - [NCBI Blast suite](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download)
15
18
 
16
19
  ## Installation
17
20
 
@@ -38,13 +41,14 @@ And then execute:
38
41
  ### Example
39
42
 
40
43
  ```
41
- $ ruby exe/aai.rb --infiles *.fa
44
+ $ ruby exe/aai.rb --infiles *.fa --outdir aai_output
42
45
  ```
43
46
 
44
47
  ### Options
45
48
 
46
49
  ```
47
50
  Options:
51
+ -c, --cpus=<i> Number of CPUs to use (default: 1)
48
52
  -i, --infiles=<s+> Input files
49
53
  -o, --outdir=<s> Output directory (default: .)
50
54
  -b, --basename=<s> Base name for output file (default: aai_scores)
@@ -28,7 +28,7 @@ Gem::Specification.new do |spec|
28
28
  spec.add_development_dependency "yard", "~> 0.9.9"
29
29
 
30
30
  spec.add_runtime_dependency "abort_if", "~> 0.2.0"
31
- spec.add_runtime_dependency "parallel", "~> 1.6", ">= 1.6.1"
31
+ # spec.add_runtime_dependency "parallel", "~> 1.6", ">= 1.6.1"
32
32
  spec.add_runtime_dependency "parse_fasta", "~> 2.2"
33
33
  spec.add_runtime_dependency "systemu", "~> 2.6", ">= 2.6.5"
34
34
  spec.add_runtime_dependency "trollop", "~> 2.1", ">= 2.1.2"
data/lib/aai.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  require "abort_if"
2
2
  require "systemu"
3
- require "parallel"
4
3
  require "parse_fasta"
5
4
 
6
5
  require "aai/core_extensions"
@@ -21,12 +20,17 @@ module Aai
21
20
  EVALUE_CUTOFF = 1e-3
22
21
  LENGTH_CUTOFF = 70 # actually is 70 percent
23
22
 
23
+ # If a blast job fails, it will retry once. If it fails again, it
24
+ # will be ignored by the rest of the pipeline.
24
25
  def blast_permutations! fastas, blast_dbs, cpus=4
25
26
  file_permutations = one_way_combinations fastas, blast_dbs, true
26
27
  file_permutations = file_permutations.select do |f1, f2|
27
28
  genome_from_fname(f1) != genome_from_fname(f2)
28
29
  end
29
30
 
31
+ completed_outf_names = []
32
+ failed_jobs = []
33
+
30
34
  first_files = file_permutations.map(&:first)
31
35
  second_files = file_permutations.map(&:last)
32
36
 
@@ -49,22 +53,61 @@ module Aai
49
53
  "#{f1}____#{f2}.aai_blastp"
50
54
  end
51
55
 
52
- parallel_args = first_files.length.times.map do |idx|
56
+ args = first_files.length.times.map do |idx|
53
57
  [first_files[idx], second_files[idx], outf_names[idx]]
54
58
  end
55
59
 
56
60
  Time.time_it "Running blast jobs" do
57
- Parallel.each(parallel_args, in_processes: cpus) do |infiles|
61
+ args.each_with_index do |infiles, idx|
58
62
  query = infiles[0]
59
63
  db = infiles[1]
60
64
  out = infiles[2]
61
65
 
62
- cmd = "blastp -outfmt 6 -query #{query} -db #{db} -out #{out} -evalue #{EVALUE_CUTOFF}"
63
- Process.run_it! cmd
66
+ cmd = "diamond blastp --threads #{cpus} --outfmt 6 " +
67
+ "--query #{query} --db #{db} --out #{out} " +
68
+ "--evalue #{EVALUE_CUTOFF}"
69
+
70
+ exit_status = Process.run_it cmd
71
+
72
+ if exit_status.zero?
73
+ completed_outf_names << out
74
+ else
75
+ failed_jobs << idx
76
+ AbortIf.logger.warn { "Blast job failed. Non-zero exit status " +
77
+ "(#{exit_status}) " +
78
+ "when running '#{cmd}'. " +
79
+ "Will retry at end." }
80
+ end
81
+ end
82
+ end
83
+
84
+ if failed_jobs.count > 0
85
+ Time.time_it "Retrying failed blast jobs" do
86
+ # retry failed jobs once
87
+ failed_jobs.each do |idx|
88
+ query = args[idx][0]
89
+ db = args[idx][1]
90
+ out = args[idx][2]
91
+
92
+ cmd = "diamond blastp --threads #{cpus} --outfmt 6 " +
93
+ "--query #{query} --db #{db} --out #{out} " +
94
+ "--evalue #{EVALUE_CUTOFF}"
95
+
96
+ exit_status = Process.run_it cmd
97
+
98
+ if exit_status.zero?
99
+ completed_outf_names << out
100
+ else
101
+ AbortIf.logger.error { "Retrying blast job failed. " +
102
+ "Non-zero exit status " +
103
+ "(#{exit_status}) " +
104
+ "when running '#{cmd}'." }
105
+ end
106
+ end
64
107
  end
65
108
  end
66
109
 
67
- outf_names
110
+ completed_outf_names
68
111
  end
69
112
 
70
113
  # Make blast dbs given an array of filenames.
@@ -78,8 +121,9 @@ module Aai
78
121
  outfiles = fnames.map { |fname| fname + suffix }
79
122
 
80
123
  Time.time_it "Making blast databases" do
81
- Parallel.each(fnames, in_processes: cpus) do |fname|
82
- cmd = "makeblastdb -in #{fname} -out #{fname}#{BLAST_DB_SUFFIX} -dbtype prot"
124
+ fnames.each do |fname|
125
+ cmd = "diamond makedb --threads #{cpus} --in #{fname} " +
126
+ "--db #{fname}#{BLAST_DB_SUFFIX}"
83
127
 
84
128
  Process.run_it! cmd
85
129
  end
@@ -103,13 +147,15 @@ module Aai
103
147
  clean_fnames << clean_fname
104
148
  File.open(clean_fname, "w") do |f|
105
149
  Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
106
- header =
107
- annotate_header clean_header(rec.header),
108
- File.basename(fname)
150
+ unless bad_seq? rec.seq
151
+ header =
152
+ annotate_header clean_header(rec.header),
153
+ File.basename(fname)
109
154
 
110
- seq_lengths[header] = rec.seq.length
155
+ seq_lengths[header] = rec.seq.length
111
156
 
112
- f.puts ">#{header}\n#{rec.seq}"
157
+ f.puts ">#{header}\n#{rec.seq}"
158
+ end
113
159
  end
114
160
  end
115
161
  end
@@ -269,6 +315,13 @@ module Aai
269
315
 
270
316
  private
271
317
 
318
+ # This accounts for the weird IMG error: some records do not
319
+ # contain an actual protein sequence; instead, the sequence
320
+ # is the text "No sequence found".
321
+ def bad_seq? seq
322
+ seq.downcase.include? "nosequencefound"
323
+ end
324
+
272
325
  def two_way_hit? hit1, hit2
273
326
  hit1[:query_name] == hit2[:target_name] &&
274
327
  hit1[:query_genome] == hit2[:target_genome]
@@ -1,5 +1,9 @@
1
1
  module Aai
2
2
  module Utils
3
+ def clean_str str
4
+ str.strip.gsub(/[^\p{Alnum}_]+/, "_").gsub(/_+/, "_")
5
+ end
6
+
3
7
  # Raises SystemExit if one of the fnames does not exist.
4
8
  def check_files fnames
5
9
  fnames.each do |fname|
@@ -1,5 +1,5 @@
1
1
  module Aai
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  COPYRIGHT = "2017 Ryan Moore"
4
4
  CONTACT = "moorer@udel.edu"
5
5
  WEBSITE = "https://github.com/mooreryan/aai"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: aai
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-06-23 00:00:00.000000000 Z
11
+ date: 2017-06-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -94,26 +94,6 @@ dependencies:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
96
  version: 0.2.0
97
- - !ruby/object:Gem::Dependency
98
- name: parallel
99
- requirement: !ruby/object:Gem::Requirement
100
- requirements:
101
- - - "~>"
102
- - !ruby/object:Gem::Version
103
- version: '1.6'
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- version: 1.6.1
107
- type: :runtime
108
- prerelease: false
109
- version_requirements: !ruby/object:Gem::Requirement
110
- requirements:
111
- - - "~>"
112
- - !ruby/object:Gem::Version
113
- version: '1.6'
114
- - - ">="
115
- - !ruby/object:Gem::Version
116
- version: 1.6.1
117
97
  - !ruby/object:Gem::Dependency
118
98
  name: parse_fasta
119
99
  requirement: !ruby/object:Gem::Requirement
@@ -180,6 +160,7 @@ files:
180
160
  - ".gitignore"
181
161
  - ".rspec"
182
162
  - ".travis.yml"
163
+ - CHANGELOG.md
183
164
  - CODE_OF_CONDUCT.md
184
165
  - Gemfile
185
166
  - LICENSE