lederhosen 2.0.8 → 3.1.0

data/.travis.yml ADDED
@@ -0,0 +1,3 @@
+ rvm:
+ - '1.9.7'
+ script: bundle exec rspec
data/lederhosen.gemspec CHANGED
@@ -5,11 +5,11 @@
 
  Gem::Specification.new do |s|
  s.name = "lederhosen"
- s.version = "2.0.8"
+ s.version = "3.1.0"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Austin G. Davis-Richardson"]
- s.date = "2013-03-14"
+ s.date = "2013-07-03"
  s.description = "Various tools for OTU clustering"
  s.email = "harekrishna@gmail.com"
  s.executables = ["lederhosen"]
@@ -18,6 +18,7 @@ Gem::Specification.new do |s|
  ]
  s.files = [
  ".rspec",
+ ".travis.yml",
  "Gemfile",
  "LICENSE.txt",
  "Rakefile",
@@ -40,6 +41,7 @@ Gem::Specification.new do |s|
  "lib/lederhosen/version.rb",
  "logo.png",
  "readme.md",
+ "scripts/count_taxonomies.go",
  "scripts/illumina_pipeline/.gitignore",
  "scripts/illumina_pipeline/Makefile",
  "scripts/illumina_pipeline/pipeline.sh",
@@ -56,7 +58,7 @@ Gem::Specification.new do |s|
  s.homepage = "http://audy.github.com/lederhosen"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
- s.rubygems_version = "1.8.24"
+ s.rubygems_version = "1.8.25"
  s.summary = "OTU Clustering"
 
  if s.respond_to? :specification_version then
@@ -33,4 +33,4 @@ module Lederhosen
 
  end # module
 
- Dir.glob(File.join(File.dirname(__FILE__), 'tasks', '*.rb')).each { |f| require f }
+ Dir.glob(File.join(File.dirname(__FILE__), 'tasks', '*.rb')).each { |f| require f }
@@ -36,6 +36,8 @@ module Lederhosen
  # taxcollector taxonomy starts with an open square bracket
  if taxonomy =~ /^\[/
  :taxcollector
+ elsif taxonomy =~ /s__/
+ :greengenes_135
  elsif taxonomy =~ /^\d/
  :greengenes
  elsif taxonomy.nil?
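
Read in order, the new branch means any description containing `s__` is detected as GreenGenes 13.5 before the leading-digit check used for the older GreenGenes headers. A minimal sketch (not part of the gem; only the branches visible in this hunk, with sample strings taken from the specs further down):

```ruby
# Illustrative only: the detection order added in this hunk.
detect = lambda do |taxonomy|
  if    taxonomy =~ /^\[/ then :taxcollector
  elsif taxonomy =~ /s__/ then :greengenes_135
  elsif taxonomy =~ /^\d/ then :greengenes
  end
end

detect.call('[0]domain;[1]phylum;[2]class;[3]order;[4]family;[5]genus;[6]species;[7]strain;[8]Genus_species_strain_id')
# => :taxcollector
detect.call('k__domain; p__phylum; c__class; o__order; f__family; g__genus; s__species')
# => :greengenes_135
detect.call('124 U55236.1 Methanobrevibacter thaueri str. CW k__domain; p__phylum; c__class; o__order; f__family; g__genus; species; otu_127')
# => :greengenes
```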
@@ -51,6 +53,8 @@ module Lederhosen
  case @taxonomy_format
  when :greengenes
  parse_taxonomy_greengenes(taxonomy)
+ when :greengenes_135
+ parse_taxonomy_greengenes_135(taxonomy)
  when :taxcollector
  parse_taxonomy_taxcollector(taxonomy)
  when :qiime
@@ -62,6 +66,7 @@ module Lederhosen
 
  RE_TAXCOLLECTOR = /^\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/
  RE_GREENGENES = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?(.*);/
+ RE_GREENGENES_135 = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?s__(.*)/
  RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
 
  def parse_taxonomy_qiime(taxonomy)
@@ -90,6 +95,19 @@ module Lederhosen
  names
  end
 
+ def parse_taxonomy_greengenes_135(taxonomy)
+ levels = %w{domain phylum class order family genus species}
+ match_data = taxonomy.match(RE_GREENGENES_135)
+ match_data = match_data[1..-1]
+
+ names = Hash.new
+ # for some reason Hash[*levels.zip(match_data)] ain't working
+ levels.zip(match_data).each { |l, n| names[l] = n }
+
+ names['original'] = taxonomy
+ names
+ end
+
  # parse a taxonomic description using the
  # taxcollector format returning name at each level (genus, etc...)
  #
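
For context, a minimal sketch (not part of the diff) of what `RE_GREENGENES_135` and the new parser do with a GreenGenes 13.5 description; the sample string is the one used in the specs below:

```ruby
# Illustrative only: the seven capture groups line up with the seven ranks.
RE_GREENGENES_135 = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?s__(.*)/

taxonomy = 'k__domain; p__phylum; c__class; o__order; f__family; g__genus; s__species'
levels   = %w{domain phylum class order family genus species}

names = {}
levels.zip(taxonomy.match(RE_GREENGENES_135)[1..-1]).each { |level, name| names[level] = name }
names['original'] = taxonomy

puts names['genus']   # => genus
puts names['species'] # => species
```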
@@ -4,22 +4,24 @@ module Lederhosen
 
  desc 'cluster', 'reference-based clustering using usearch'
 
- method_option :input, :type => :string, :required => true
- method_option :database, :type => :string, :required => true
- method_option :threads, :type => :numeric, :default => false
- method_option :identity, :type => :numeric, :required => true
- method_option :output, :type => :string, :required => true
- method_option :strand, :type => :string, :default => 'plus'
- method_option :dry_run, :type => :boolean, :default => false
+ method_option :input, :type => :string, :required => true
+ method_option :database, :type => :string, :required => true
+ method_option :threads, :type => :numeric, :default => false
+ method_option :identity, :type => :numeric, :required => true
+ method_option :output, :type => :string, :required => true
+ method_option :strand, :type => :string, :default => 'plus'
+ method_option :dry_run, :type => :boolean, :default => false
+ method_option :query_cov, :type => :numeric, :required => false, :default => 0.95
 
  def cluster
- input = File.expand_path(options[:input])
- database = File.expand_path(options[:database])
- threads = options[:threads]
- identity = options[:identity]
- output = File.expand_path(options[:output])
- strand = options[:strand]
- dry_run = options[:dry_run]
+ input = File.expand_path(options[:input])
+ database = File.expand_path(options[:database])
+ threads = options[:threads]
+ identity = options[:identity]
+ output = File.expand_path(options[:output])
+ strand = options[:strand]
+ dry_run = options[:dry_run]
+ query_cov = options[:query_cov]
 
  ohai "#{'(dry run)' if dry_run} clustering #{input} to #{database} and saving to #{output}"
 
@@ -32,7 +34,8 @@ module Lederhosen
  "--id #{identity}",
  "--uc #{output}",
  "--db #{database}",
- "--strand #{strand}"
+ "--strand #{strand}",
+ "--query_cov #{query_cov}"
  ]
 
  # threads = False : use all threads (default)
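
To see what the new option changes in practice, here is a rough sketch (values assumed, not taken from the diff; only the flags visible in this hunk are shown) of the argument string this hunk assembles:

```ruby
# Illustrative only: joining the visible option fragments, now including --query_cov.
identity  = 0.95
output    = 'clusters_95.uc'
database  = 'taxcollector.udb'
strand    = 'plus'
query_cov = 0.95

cmd = [
  "--id #{identity}",
  "--uc #{output}",
  "--db #{database}",
  "--strand #{strand}",
  "--query_cov #{query_cov}"
].join(' ')

puts cmd
# => --id 0.95 --uc clusters_95.uc --db taxcollector.udb --strand plus --query_cov 0.95
```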
@@ -5,27 +5,16 @@ module Lederhosen
 
  method_option :input, :type => :string, :required => true
  method_option :output, :type => :string, :required => true
- method_option :strict, :type => :string, :default => false,
- :banner => '<level> only count reads where both taxonomies are in agreement at <level>'
 
  def count_taxonomies
  input = options[:input]
  output = options[:output]
- strict = options[:strict]
 
  ohai "generating #{output} from #{input}"
 
  handle = File.open(input)
  uc = UCParser.new(handle)
-
- taxonomy_count =
- if not strict
- get_taxonomy_count(uc)
-
- elsif strict
- get_strict_taxonomy_count(uc, strict)
- end
-
+ taxonomy_count = get_taxonomy_count(uc)
  handle.close
 
  out = File.open(output, 'w')
@@ -51,34 +40,6 @@ module Lederhosen
  taxonomy_count
  end
 
- # returns Hash of taxonomy => number_of_reads
- # if a pair of reads do not agree at a taxonomic level,
- # or if at least one is unclassified, bot reads are counted
- # as unclassified_reads
- def get_strict_taxonomy_count(uc, level)
- taxonomy_count = Hash.new { |h, k| h[k] = 0 }
- # TODO: I'm making a block for results because I don't know how to
- # make results return an Enumerator when not given a block
- uc.each_slice(2) do |left, right|
- if left.miss? or right.miss? # at least one is a miss
- taxonomy_count['unclassified_reads'] += 2
- # both are hits, check taxonomies
- else
- ta = parse_taxonomy(left.target)
- tb = parse_taxonomy(right.target)
- # they match up, count both separately
- if ta[level] == tb[level]
- taxonomy_count[left.target] += 1
- taxonomy_count[right.target] += 1
- # they don't match up, count as unclassified
- else
- taxonomy_count['unclassified_reads'] += 2
- end
- end
- end # results.each_slice
- taxonomy_count
- end
-
  end
  end
  end
@@ -10,12 +10,14 @@ module Lederhosen
  input = options[:input]
  output = options[:output]
  word_length = options[:word_length]
+ db_step = options[:db_step]
 
  ohai "making udb w/ #{input}, saving as #{output}."
 
  cmd = ['usearch',
  "-makeudb_usearch #{input}",
- "-output #{output}"]
+ "-output #{output}",
+ ]
 
  cmd = cmd.join(' ')
 
@@ -70,7 +70,7 @@ module Lederhosen
 
  kept_counts = counts.zip(mask).map { |c, m| c if m }.compact
  noise = counts.zip(mask).map { |c, m| c unless m }.compact.inject(:+)
- filtered_reads += noise
+ filtered_reads += noise || 0
 
  output.puts "#{sample_name},#{kept_counts.join(',')},#{noise}"
 
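A quick illustration (not part of the diff) of why the `|| 0` guard is needed: when nothing is masked out, the noise collection is empty and `inject(:+)` returns nil, which cannot be added to an integer.

```ruby
# Illustrative only: inject(:+) over an empty array yields nil, not 0.
[].inject(:+)        # => nil
[].inject(:+) || 0   # => 0
[3, 4].inject(:+)    # => 7
```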
@@ -1,10 +1,12 @@
  module Lederhosen
  module Version
- MAJOR = 2
- MINOR = 0
- CODENAME = 'Schnittlauchbrot' # changes for minor versions
- PATCH = 8
+ MAJOR = 3
+ MINOR = 1
+ CODENAME = 'Hauptbahnhof' # changes for minor versions
+ PATCH = 0
 
- STRING = [MAJOR, MINOR, PATCH].join('.')
+ string = [MAJOR, MINOR, PATCH].join('.')
+
+ STRING = string
  end
  end
data/readme.md CHANGED
@@ -1,52 +1,38 @@
  <img src="https://raw.github.com/audy/lederhosen/master/logo.png" align="right">
 
- # Lederhosen
-
- Lederhosen is a set of tools for OTU clustering rRNA amplicons using Robert Edgar's USEARCH.
-
- It's used to run USEARCH and create and filter tables. Unlike most of the software in Bioinformatics,
- It is meant to be UNIX-y: do one thing and do it well.
-
- Do you want to run Lederhosen on a cluster? Use `--dry-run` and feed it to your cluster's queue management system.
+ [![Build
+ Status](https://travis-ci.org/audy/lederhosen.png)](https://travis-ci.org/audy/lederhosen)
 
- Lederhosen is not a pipeline but rather a set of tools broken up into tasks. Tasks are invoked by running `lederhosen TASK ...`.
-
- Lederhosen is designed with the following "pipeline" in mind:
-
- 1. Clustering sequences to reference sequences (read: database) and/or _de novo_ OTU clustering.
- - `lederhosen cluster ...`
- 2. Generating tables from USEARCH output.
- - `lederhosen count_taxonomies ...`
- - `lederhosen otu_table ...`
- 3. Filtering tables to remove small or insignificant OTUs.
- - `lederhosen otu_filter ...`
+ # Lederhosen
 
+ Lederhosen is a set of tools for OTU clustering of rRNA amplicons using
+ Robert Edgar's USEARCH, and it is simple, robust, and fast.
+ Lederhosen was designed from the beginning to handle lots of data from
+ lots of samples, specifically data generated by multiplexed
+ Illumina HiSeq/MiSeq sequencing.
 
- ### About
+ No assumptions are made about the design of your experiment.
+ Therefore, there are no tools for read pre-processing, data analysis,
+ or statistics. Insert reads, receive data.
 
- - Lederhosen is a project born out of the Triplett Lab at the University of Florida.
- - Lederhosen is designed to be a fast and **simple** (~700 SLOC) tool to aid in clustering 16S rRNA amplicons sequenced
- using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq), Ion Torrent, or Roche-454.
- - Lederhosen uses [Semantic Versioning](http://semver.org/), is free and open source under the
- [MIT open source license](http://opensource.org/licenses/mit-license.php/).
- - Except for USEARCH which requires a license, Lederhosen is available for commercial use.
+ Lederhosen is free and open source under the MIT license. Except for
+ the USEARCH license, Lederhosen is free for commercial use.
 
  ### Features
 
- - Closed/Open/Mixed OTU clustering to TaxCollector or GreenGenes via USEARCH.
- - Parallel support (pipe commands into [parallel](http://savannah.gnu.org/projects/parallel/), or use your cluster's queue).
- - Support for RDP, TaxCollector or GreenGenes 16S rRNA databases.
+ - Reference-based OTU clustering via USEARCH.
+ - Multiple database support (RDP, GreenGenes, TaxCollector, Silva).
+ - Parallel support (USEARCH, MapReduce, or a compute cluster).
  - Generation and filtering of OTU abundance matrices.
- - Support for paired end reads (considers taxonomic assignment for both reads in a pair).
 
  ### Installation
 
- 0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine for non-commercial use)
- 2. Get a database:
+ 0. Obtain & Install [USEARCH](http://www.drive5.com/).
+ 1. Get a database:
  - [TaxCollector](http://github.com/audy/taxcollector)
  - [GreenGenes](http://greengenes.lbl.gov) 16S database
  - File an [issue report](https://github.com/audy/lederhosen/issues) or pull request ;) to request support for a different database.
- 3. Install Lederhosen by typing:
+ 2. Install Lederhosen by typing:
 
  `sudo gem install lederhosen`
  4. Check installation by typing `lederhosen`. You should see some help text.
@@ -61,11 +47,17 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
 
  ### Trim Reads
 
- Trimming removed. I think you should use [Sickle](https://github.com/najoshi/sickle).
+ Trimming removed. I think you should use
+ [Sickle](https://github.com/najoshi/sickle) or
+ [Trimmomatic](http://www.usadellab.org/cms/index.php?page=trimmomatic).
+ You can use
+ [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to inspect read quality.
 
  ### Create Database
 
- Create UDB database required by usearch from TaxCollector
+ The 16S database can optionally be in USEARCH database format (udb).
+ This speeds things up if you are clustering sequences in multiple FASTA
+ files.
 
  ```bash
  lederhosen make_udb \
@@ -73,22 +65,21 @@ lederhosen make_udb \
  --output=taxcollector.udb
  ```
 
- (not actually required but will make batch searching a lot faster)
-
  ### Cluster Reads using USEARCH
 
  Cluster reads using USEARCH. Output is a uc file.
 
  ```bash
  lederhosen cluster \
- --input=trimmed/*.fasta \
+ --input=trimmed/sequences.fasta \
  --identity=0.95 \
  --output=clusters_95.uc \
  --database=taxcollector.udb
  ```
 
- The optional `--dry-run` parameter outputs the usearch command to standard out.
- This is useful if you want to run usearch on a cluster.
+ The optional `--dry-run` parameter prints the USEARCH command to
+ standard out instead of actually running it. This is useful if
+ you want to run jobs in parallel and/or on a cluster.
 
  ```bash
  for reads_file in reads/*.fasta;
@@ -108,7 +99,7 @@ cat jobs.sh | parallel -j 24 # run 24 parallel jobs
 
  ### Generate taxonomy counts tables
 
- Before generating OTU tables, you must generate taxonomy counts tables.
+ Before generating OTU tables, you must generate taxonomy counts (`.tax`) tables.
 
  A taxonomy count table looks something like this
 
@@ -125,19 +116,6 @@ lederhosen count_taxonomies \
  --output=clusters_taxonomies.txt
  ```
 
- If you did paired-end sequencing, you can generate strict taxonomy tables that only count reads when *both pairs* have the *same*
- taxonomic description at a certain taxonomic level. This is useful for leveraging the increased length of having pairs and also
- acts as a sort of chimera filter. You will, however, end up using less of your reads as the level goes from domain to species.
-
- ```bash
- lederhosen count_taxonomies \
- --input=clusters.uc \
- --strict=genus \
- --output=clusters_taxonomies.strict.genus.txt
- ```
-
- Reads that do not have the same phylogeny at `level` will become `unclassified_reads`
-
 
  ### Generate OTU tables
  Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
@@ -152,8 +130,8 @@ lederhosen otu_table \
  This will create the file `my_poop_samples_genus_strict.95.txt` containing the clusters
  as columns and the samples as rows.
 
- You now will apply advanced data mining and statistical techniques to this table to make
- interesting biological inferences and cure diseases.
+ If your database doesn't have taxonomic descriptions, use
+ `--level=original`.
 
  ### Filter OTU tables
 
@@ -175,7 +153,6 @@ lederhosen otu_filter \
  This will remove any clusters that do not appear in at least 10 samples with at least 50 reads. The read counts
  for filtered clusters will be moved to the `noise` pseudocluster.
 
-
  ### Get representative sequences
 
  You can get the representative sequences for each cluster using the `get_reps` task.
@@ -219,9 +196,9 @@ lederhosen separate_unclassified \
 
  ## Acknowledgements
 
- - Lexi, Vinnie and Kevin for beta-testing and putting up with bugs
- - The QIIME project for inspiration
- - Sinbad Richardson for the Lederhosen Guy artwork
+ - [Sinbad Richardson](http://viennapitts.com/) for the Lederhosen Guy artwork
+ - Lexi and Kevin for beta-testing and putting up with bugs.
+ - The QIIME project for inspiration.
 
  ## Please Cite
 
data/scripts/count_taxonomies.go ADDED
@@ -0,0 +1,68 @@
+ package main
+
+ //
+ // count_taxonomies.go
+ // a faster alternative to lederhosen count_taxonomies
+ // (c2013) Austin G. Davis-Richardson
+ // MIT v3 LICENSE
+ //
+ // COMPILATION:
+ //
+ // 1.) Install Go (http://golang.org)
+ // 2.) go build count_taxonomies.go
+ // 3.) At this point you're ready to go
+ //
+ // USAGE:
+ // count_taxonomies input.uc > output.tax
+ //
+
+ import (
+ "encoding/csv"
+ "fmt"
+ "io"
+ "os"
+ )
+
+ func main() {
+
+ table := map[string]int64{}
+
+ infile := os.Args[1]
+
+ file, err := os.Open(infile)
+
+ if err != nil {
+ panic(err)
+ }
+
+ defer file.Close()
+
+ reader := csv.NewReader(file)
+ reader.Comma = '\t'
+
+ // count items
+ for {
+ record, err := reader.Read()
+ if err == io.EOF {
+ break
+ } else if err != nil {
+ panic(err)
+ }
+
+ // key is the name of the target sequence.
+ // column 10 in the uc file (9 if you start
+ // counting at 0)
+ key := record[9]
+
+ if _, present := table[key]; present {
+ table[key] = table[key] + 1
+ } else {
+ table[key] = 1
+ }
+
+ }
+
+ for k, _ := range table {
+ fmt.Printf("%v,%v\n", k, table[k])
+ }
+ }
data/spec/cli_spec.rb CHANGED
@@ -1,6 +1,6 @@
  require 'spec_helper'
 
- describe Lederhosen::CLI do
+ describe Lederhosen::CLI, :requires_usearch => true do
 
  it 'should have an executable' do
  `./bin/lederhosen`
@@ -39,10 +39,10 @@ describe Lederhosen::CLI do
  unclassified_reads = File.readlines("#{$test_dir}/unclassified.fasta")\
  .select { |x| x =~ /^>/ }\
  .size
-
+
  unclassified_results.should == unclassified_reads
  end
-
+
  it 'can separate unclassified reads from usearch output using strict pairing' do
  `./bin/lederhosen separate_unclassified --strict=genus --uc-file=spec/data/test.uc --reads=spec/data/trimmed/ILT_L_9_B_001.fasta --output=#{$test_dir}/unclassified.strict_genus.fasta`
  $?.success?.should be_true
@@ -52,42 +52,22 @@ describe Lederhosen::CLI do
  end
 
  it 'can create taxonomy count tables' do
- `./bin/lederhosen count_taxonomies --input=spec/data/test.uc --output=#{$test_dir}/taxonomy_count.txt`
+ `./bin/lederhosen count_taxonomies --input=spec/data/test.uc --output=#{$test_dir}/taxonomy_count.tax`
  $?.success?.should be_true
- File.exists?(File.join($test_dir, 'taxonomy_count.txt')).should be_true
+ File.exists?(File.join($test_dir, 'taxonomy_count.tax')).should be_true
  end
 
  it 'generates taxonomy tables w/ comma-free taxonomic descriptions' do
- File.readlines(File.join($test_dir, 'taxonomy_count.txt'))\
+ File.readlines(File.join($test_dir, 'taxonomy_count.tax'))\
  .map(&:strip)\
  .map { |x| x.count(',') }\
  .uniq\
  .should == [1]
  end
 
- %w{domain phylum class order family genus species}.each do |level|
- it "generates taxonomy tables only counting pairs that agree at level: #{level}" do
- `./bin/lederhosen count_taxonomies --input=spec/data/test.uc --output=#{$test_dir}/taxonomy_count.strict.#{level}.txt --strict=#{level}`
- $?.success?.should be_true
-
- lines = File.readlines(File.join($test_dir, "taxonomy_count.strict.#{level}.txt"))
-
- # make sure total number of reads is even
- # requires that there should be an odd number if classification is not strict
- lines.select { |x| !(x =~ /^#/) }\
- .map(&:strip)\
- .map { |x| x.split(',') }\
- .map(&:last)\
- .map(&:to_i)\
- .inject(:+).should be_even
- end
- end
-
- %w{domain phylum class order family genus species}.each do |level|
- it "should create OTU abundance matrices from taxonomy count tables at level: #{level}" do
- `./bin/lederhosen otu_table --files=#{$test_dir}/taxonomy_count.strict.*.txt --level=#{level} --output=#{$test_dir}/otus_genus.strict.csv`
- $?.success?.should be_true
- end
+ it 'can create OTU abundance matrices' do
+ `./bin/lederhosen otu_table --files=#{$test_dir}/taxonomy_count.tax --output=#{$test_dir}/otus.genus.csv --level=genus`
+ $?.success?.should be_true
  end
 
  it 'should filter OTU abundance matrices' do
@@ -95,7 +75,7 @@ describe Lederhosen::CLI do
  # filtering should move filtered reads to 'unclassified_reads' so that we maintain
  # our knowledge of depth of coverage throughout
  # this makes normalization better later.
- `./bin/lederhosen otu_filter --input=#{$test_dir}/otus_genus.strict.csv --output=#{$test_dir}/otu_table.filtered.csv --reads 1 --samples 1`
+ `./bin/lederhosen otu_filter --input=#{$test_dir}/otus.genus.csv --output=#{$test_dir}/otus_genus.filtered.csv --reads 1 --samples 1`
  $?.success?.should be_true
  end
 
@@ -3,6 +3,7 @@ require 'spec_helper'
  describe 'no_tasks' do
 
  let(:greengenes_taxonomies) { ['124 U55236.1 Methanobrevibacter thaueri str. CW k__domain; p__phylum; c__class; o__order; f__family; g__genus; species; otu_127']}
+ let(:greengenes135_taxonomies) { ['k__domain; p__phylum; c__class; o__order; f__family; g__genus; s__species']}
  let(:qiime_taxonomies) { [ 'k__domain;p__phylum;c__class;o__order;f__family;g__genus;s__species' ]}
  let(:taxcollector_taxonomies) { ['[0]domain;[1]phylum;[2]class;[3]order;[4]family;[5]genus;[6]species;[7]strain;[8]Genus_species_strain_id'] }
  let(:lederhosen) { Lederhosen::CLI.new }
@@ -15,6 +16,12 @@ describe 'no_tasks' do
  end
  end
 
+ it '#detect_taxonomy_format should recognize GreenGenes v13.5' do
+ greengenes135_taxonomies.each do |greengenes_taxonomy|
+ lederhosen.detect_taxonomy_format(greengenes_taxonomy).should == :greengenes_135
+ end
+ end
+
  it '#detect_taxonomy_format should recognize TaxCollector' do
  taxcollector_taxonomies.each do |taxcollector_taxonomy|
  lederhosen.detect_taxonomy_format(taxcollector_taxonomy).should == :taxcollector
@@ -33,28 +40,34 @@ describe 'no_tasks' do
  taxonomy[level].should == level
  end
  end
-
+
  it "#parse_taxonomy_greengenes should parse greengenes taxonomy (#{level})" do
  greengenes_taxonomies.each do |greengenes_taxonomy|
  taxonomy = lederhosen.parse_taxonomy_greengenes(greengenes_taxonomy)
  taxonomy[level].should == level
  end
  end
-
+
+ it "#parse_taxonomy_greengenes_135 should parse greengenes v13.5 taxonomy (#{level})" do
+ greengenes135_taxonomies.each do |greengenes_taxonomy|
+ taxonomy = lederhosen.parse_taxonomy_greengenes_135(greengenes_taxonomy)
+ taxonomy[level].should == level
+ end
+ end
+
  it "#parse_taxonomy_greengenes should parse qiime taxonomy (#{level})" do
  qiime_taxonomies.each do |qiime_taxonomy|
  taxonomy = lederhosen.parse_taxonomy_qiime(qiime_taxonomy)
  taxonomy[level].should == level
  end
  end
-
+
  end
-
+
  it '#parse_taxonomy_taxcollector should return original taxonomy' do
  lederhosen.parse_taxonomy_taxcollector(taxcollector_taxonomies[0])['original'].should == taxcollector_taxonomies[0]
  end
 
-
  it '#parse_taxonomy should automatically detect and parse greengenes taxonomy' do
  greengenes_taxonomies.each do |greengenes_taxonomy|
  lederhosen.parse_taxonomy(greengenes_taxonomy).should_not be_nil
@@ -67,6 +80,12 @@ describe 'no_tasks' do
  end
  end
 
+ it '#parse_taxonomy should automatically detect and parse greengenes 13.5 taxonomy' do
+ greengenes135_taxonomies.each do |greengenes_taxonomy|
+ lederhosen.parse_taxonomy(greengenes_taxonomy).should_not be_nil
+ end
+ end
+
  it '#parse_taxonomy_taxcollector should replace unclassified species names with strain name' do
  t = '[0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Actinomycetales;[4]test;[5]null;[6]Propionibacterineae_bacterium;[7]Propionibacterineae_bacterium_870BRRJ;[8]Propionibacterineae_bacterium_870BRRJ|genus'
  tax = lederhosen.parse_taxonomy(t)
data/spec/spec_helper.rb CHANGED
@@ -7,3 +7,12 @@ Bundler.require :test, :development
  $test_dir = ENV['TEST_DIR'] || "/tmp/lederhosen_test_#{(0...8).map{65.+(rand(25)).chr}.join}/"
  `mkdir -p #{$test_dir}`
  $stderr.puts "test dir: #{$test_dir}"
+
+ RSpec.configure do |c|
+ # check if usearch is in $PATH
+ # if not, skip usearch tests.
+ usearch = `which usearch`
+ if usearch == ''
+ c.filter_run_excluding :requires_usearch => true
+ end
+ end
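
As a hedged illustration of how this filter is consumed (mirroring the `cli_spec.rb` change above; the group name and example body here are made up), an example group opts in with the `:requires_usearch` tag and is skipped when usearch is absent:

```ruby
# Sketch only: groups tagged :requires_usearch are excluded by the
# filter_run_excluding call above when `which usearch` returns nothing.
describe 'tasks that shell out to usearch', :requires_usearch => true do
  it 'finds usearch on the PATH' do
    `which usearch`
    $?.success?.should be_true
  end
end
```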
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: lederhosen
  version: !ruby/object:Gem::Version
- version: 2.0.8
+ version: 3.1.0
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-03-14 00:00:00.000000000 Z
+ date: 2013-07-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: dna
@@ -100,6 +100,7 @@ extra_rdoc_files:
  - LICENSE.txt
  files:
  - .rspec
+ - .travis.yml
  - Gemfile
  - LICENSE.txt
  - Rakefile
@@ -122,6 +123,7 @@ files:
  - lib/lederhosen/version.rb
  - logo.png
  - readme.md
+ - scripts/count_taxonomies.go
  - scripts/illumina_pipeline/.gitignore
  - scripts/illumina_pipeline/Makefile
  - scripts/illumina_pipeline/pipeline.sh
@@ -149,7 +151,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: -1050380685311720987
+ hash: -391146498945924903
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
@@ -158,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.24
+ rubygems_version: 1.8.25
  signing_key:
  specification_version: 3
  summary: OTU Clustering