lederhosen 2.0.8 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +3 -0
- data/lederhosen.gemspec +5 -3
- data/lib/lederhosen/cli.rb +1 -1
- data/lib/lederhosen/no_tasks.rb +18 -0
- data/lib/lederhosen/tasks/cluster.rb +18 -15
- data/lib/lederhosen/tasks/count_taxonomies.rb +1 -40
- data/lib/lederhosen/tasks/make_udb.rb +3 -1
- data/lib/lederhosen/tasks/otu_filter.rb +1 -1
- data/lib/lederhosen/version.rb +7 -5
- data/readme.md +37 -60
- data/scripts/count_taxonomies.go +68 -0
- data/spec/cli_spec.rb +10 -30
- data/spec/no_tasks_spec.rb +24 -5
- data/spec/spec_helper.rb +9 -0
- metadata +6 -4
data/.travis.yml
ADDED
data/lederhosen.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "lederhosen"
|
8
|
-
s.version = "
|
8
|
+
s.version = "3.1.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
12
|
-
s.date = "2013-03
|
12
|
+
s.date = "2013-07-03"
|
13
13
|
s.description = "Various tools for OTU clustering"
|
14
14
|
s.email = "harekrishna@gmail.com"
|
15
15
|
s.executables = ["lederhosen"]
|
@@ -18,6 +18,7 @@ Gem::Specification.new do |s|
|
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".rspec",
|
21
|
+
".travis.yml",
|
21
22
|
"Gemfile",
|
22
23
|
"LICENSE.txt",
|
23
24
|
"Rakefile",
|
@@ -40,6 +41,7 @@ Gem::Specification.new do |s|
|
|
40
41
|
"lib/lederhosen/version.rb",
|
41
42
|
"logo.png",
|
42
43
|
"readme.md",
|
44
|
+
"scripts/count_taxonomies.go",
|
43
45
|
"scripts/illumina_pipeline/.gitignore",
|
44
46
|
"scripts/illumina_pipeline/Makefile",
|
45
47
|
"scripts/illumina_pipeline/pipeline.sh",
|
@@ -56,7 +58,7 @@ Gem::Specification.new do |s|
|
|
56
58
|
s.homepage = "http://audy.github.com/lederhosen"
|
57
59
|
s.licenses = ["MIT"]
|
58
60
|
s.require_paths = ["lib"]
|
59
|
-
s.rubygems_version = "1.8.
|
61
|
+
s.rubygems_version = "1.8.25"
|
60
62
|
s.summary = "OTU Clustering"
|
61
63
|
|
62
64
|
if s.respond_to? :specification_version then
|
data/lib/lederhosen/cli.rb
CHANGED
data/lib/lederhosen/no_tasks.rb
CHANGED
@@ -36,6 +36,8 @@ module Lederhosen
|
|
36
36
|
# taxcollector taxonomy starts with an open square bracket
|
37
37
|
if taxonomy =~ /^\[/
|
38
38
|
:taxcollector
|
39
|
+
elsif taxonomy =~ /s__/
|
40
|
+
:greengenes_135
|
39
41
|
elsif taxonomy =~ /^\d/
|
40
42
|
:greengenes
|
41
43
|
elsif taxonomy.nil?
|
@@ -51,6 +53,8 @@ module Lederhosen
|
|
51
53
|
case @taxonomy_format
|
52
54
|
when :greengenes
|
53
55
|
parse_taxonomy_greengenes(taxonomy)
|
56
|
+
when :greengenes_135
|
57
|
+
parse_taxonomy_greengenes_135(taxonomy)
|
54
58
|
when :taxcollector
|
55
59
|
parse_taxonomy_taxcollector(taxonomy)
|
56
60
|
when :qiime
|
@@ -62,6 +66,7 @@ module Lederhosen
|
|
62
66
|
|
63
67
|
RE_TAXCOLLECTOR = /^\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/
|
64
68
|
RE_GREENGENES = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?(.*);/
|
69
|
+
RE_GREENGENES_135 = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?s__(.*)/
|
65
70
|
RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
|
66
71
|
|
67
72
|
def parse_taxonomy_qiime(taxonomy)
|
@@ -90,6 +95,19 @@ module Lederhosen
|
|
90
95
|
names
|
91
96
|
end
|
92
97
|
|
98
|
+
def parse_taxonomy_greengenes_135(taxonomy)
|
99
|
+
levels = %w{domain phylum class order family genus species}
|
100
|
+
match_data = taxonomy.match(RE_GREENGENES_135)
|
101
|
+
match_data = match_data[1..-1]
|
102
|
+
|
103
|
+
names = Hash.new
|
104
|
+
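# Hash[*levels.zip(match_data)] does not work here: the splat passes each [level, name] pair as a separate argument instead of one list of pairs, so build the hash pair by pair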
|
105
|
+
levels.zip(match_data).each { |l, n| names[l] = n }
|
106
|
+
|
107
|
+
names['original'] = taxonomy
|
108
|
+
names
|
109
|
+
end
|
110
|
+
|
93
111
|
# parse a taxonomic description using the
|
94
112
|
# taxcollector format returning name at each level (genus, etc...)
|
95
113
|
#
|
@@ -4,22 +4,24 @@ module Lederhosen
|
|
4
4
|
|
5
5
|
desc 'cluster', 'reference-based clustering using usearch'
|
6
6
|
|
7
|
-
method_option :input, :type
|
8
|
-
method_option :database, :type
|
9
|
-
method_option :threads, :type
|
10
|
-
method_option :identity, :type
|
11
|
-
method_option :output, :type
|
12
|
-
method_option :strand, :type
|
13
|
-
method_option :dry_run, :type
|
7
|
+
method_option :input, :type => :string, :required => true
|
8
|
+
method_option :database, :type => :string, :required => true
|
9
|
+
method_option :threads, :type => :numeric, :default => false
|
10
|
+
method_option :identity, :type => :numeric, :required => true
|
11
|
+
method_option :output, :type => :string, :required => true
|
12
|
+
method_option :strand, :type => :string, :default => 'plus'
|
13
|
+
method_option :dry_run, :type => :boolean, :default => false
|
14
|
+
method_option :query_cov, :type => :numeric, :required => false, :default => 0.95
|
14
15
|
|
15
16
|
def cluster
|
16
|
-
input
|
17
|
-
database
|
18
|
-
threads
|
19
|
-
identity
|
20
|
-
output
|
21
|
-
strand
|
22
|
-
dry_run
|
17
|
+
input = File.expand_path(options[:input])
|
18
|
+
database = File.expand_path(options[:database])
|
19
|
+
threads = options[:threads]
|
20
|
+
identity = options[:identity]
|
21
|
+
output = File.expand_path(options[:output])
|
22
|
+
strand = options[:strand]
|
23
|
+
dry_run = options[:dry_run]
|
24
|
+
query_cov = options[:query_cov]
|
23
25
|
|
24
26
|
ohai "#{'(dry run)' if dry_run} clustering #{input} to #{database} and saving to #{output}"
|
25
27
|
|
@@ -32,7 +34,8 @@ module Lederhosen
|
|
32
34
|
"--id #{identity}",
|
33
35
|
"--uc #{output}",
|
34
36
|
"--db #{database}",
|
35
|
-
"--strand #{strand}"
|
37
|
+
"--strand #{strand}",
|
38
|
+
"--query_cov #{query_cov}"
|
36
39
|
]
|
37
40
|
|
38
41
|
# threads = False : use all threads (default)
|
@@ -5,27 +5,16 @@ module Lederhosen
|
|
5
5
|
|
6
6
|
method_option :input, :type => :string, :required => true
|
7
7
|
method_option :output, :type => :string, :required => true
|
8
|
-
method_option :strict, :type => :string, :default => false,
|
9
|
-
:banner => '<level> only count reads where both taxonomies are in agreement at <level>'
|
10
8
|
|
11
9
|
def count_taxonomies
|
12
10
|
input = options[:input]
|
13
11
|
output = options[:output]
|
14
|
-
strict = options[:strict]
|
15
12
|
|
16
13
|
ohai "generating #{output} from #{input}"
|
17
14
|
|
18
15
|
handle = File.open(input)
|
19
16
|
uc = UCParser.new(handle)
|
20
|
-
|
21
|
-
taxonomy_count =
|
22
|
-
if not strict
|
23
|
-
get_taxonomy_count(uc)
|
24
|
-
|
25
|
-
elsif strict
|
26
|
-
get_strict_taxonomy_count(uc, strict)
|
27
|
-
end
|
28
|
-
|
17
|
+
taxonomy_count = get_taxonomy_count(uc)
|
29
18
|
handle.close
|
30
19
|
|
31
20
|
out = File.open(output, 'w')
|
@@ -51,34 +40,6 @@ module Lederhosen
|
|
51
40
|
taxonomy_count
|
52
41
|
end
|
53
42
|
|
54
|
-
# returns Hash of taxonomy => number_of_reads
|
55
|
-
# if a pair of reads do not agree at a taxonomic level,
|
56
|
-
# or if at least one is unclassified, both reads are counted
|
57
|
-
# as unclassified_reads
|
58
|
-
def get_strict_taxonomy_count(uc, level)
|
59
|
-
taxonomy_count = Hash.new { |h, k| h[k] = 0 }
|
60
|
-
# TODO: I'm making a block for results because I don't know how to
|
61
|
-
# make results return an Enumerator when not given a block
|
62
|
-
uc.each_slice(2) do |left, right|
|
63
|
-
if left.miss? or right.miss? # at least one is a miss
|
64
|
-
taxonomy_count['unclassified_reads'] += 2
|
65
|
-
# both are hits, check taxonomies
|
66
|
-
else
|
67
|
-
ta = parse_taxonomy(left.target)
|
68
|
-
tb = parse_taxonomy(right.target)
|
69
|
-
# they match up, count both separately
|
70
|
-
if ta[level] == tb[level]
|
71
|
-
taxonomy_count[left.target] += 1
|
72
|
-
taxonomy_count[right.target] += 1
|
73
|
-
# they don't match up, count as unclassified
|
74
|
-
else
|
75
|
-
taxonomy_count['unclassified_reads'] += 2
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end # results.each_slice
|
79
|
-
taxonomy_count
|
80
|
-
end
|
81
|
-
|
82
43
|
end
|
83
44
|
end
|
84
45
|
end
|
@@ -10,12 +10,14 @@ module Lederhosen
|
|
10
10
|
input = options[:input]
|
11
11
|
output = options[:output]
|
12
12
|
word_length = options[:word_length]
|
13
|
+
db_step = options[:db_step]
|
13
14
|
|
14
15
|
ohai "making udb w/ #{input}, saving as #{output}."
|
15
16
|
|
16
17
|
cmd = ['usearch',
|
17
18
|
"-makeudb_usearch #{input}",
|
18
|
-
"-output #{output}"
|
19
|
+
"-output #{output}",
|
20
|
+
]
|
19
21
|
|
20
22
|
cmd = cmd.join(' ')
|
21
23
|
|
@@ -70,7 +70,7 @@ module Lederhosen
|
|
70
70
|
|
71
71
|
kept_counts = counts.zip(mask).map { |c, m| c if m }.compact
|
72
72
|
noise = counts.zip(mask).map { |c, m| c unless m }.compact.inject(:+)
|
73
|
-
filtered_reads += noise
|
73
|
+
filtered_reads += noise || 0
|
74
74
|
|
75
75
|
output.puts "#{sample_name},#{kept_counts.join(',')},#{noise}"
|
76
76
|
|
data/lib/lederhosen/version.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
module Lederhosen
|
2
2
|
module Version
|
3
|
-
MAJOR =
|
4
|
-
MINOR =
|
5
|
-
CODENAME = '
|
6
|
-
PATCH =
|
3
|
+
MAJOR = 3
|
4
|
+
MINOR = 1
|
5
|
+
CODENAME = 'Hauptbahnhof' # changes for minor versions
|
6
|
+
PATCH = 0
|
7
7
|
|
8
|
-
|
8
|
+
string = [MAJOR, MINOR, PATCH].join('.')
|
9
|
+
|
10
|
+
STRING = string
|
9
11
|
end
|
10
12
|
end
|
data/readme.md
CHANGED
@@ -1,52 +1,38 @@
|
|
1
1
|
<img src="https://raw.github.com/audy/lederhosen/master/logo.png" align="right">
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
Lederhosen is a set of tools for OTU clustering rRNA amplicons using Robert Edgar's USEARCH.
|
6
|
-
|
7
|
-
It's used to run USEARCH and create and filter tables. Unlike most of the software in Bioinformatics,
|
8
|
-
It is meant to be UNIX-y: do one thing and do it well.
|
9
|
-
|
10
|
-
Do you want to run Lederhosen on a cluster? Use `--dry-run` and feed it to your cluster's queue management system.
|
3
|
+
[![Build
|
4
|
+
Status](https://travis-ci.org/audy/lederhosen.png)](https://travis-ci.org/audy/lederhosen)
|
11
5
|
|
12
|
-
Lederhosen
|
13
|
-
|
14
|
-
Lederhosen is designed with the following "pipeline" in mind:
|
15
|
-
|
16
|
-
1. Clustering sequences to reference sequences (read: database) and/or _de novo_ OTU clustering.
|
17
|
-
- `lederhosen cluster ...`
|
18
|
-
2. Generating tables from USEARCH output.
|
19
|
-
- `lederhosen count_taxonomies ...`
|
20
|
-
- `lederhosen otu_table ...`
|
21
|
-
3. Filtering tables to remove small or insignificant OTUs.
|
22
|
-
- `lederhosen otu_filter ...`
|
6
|
+
# Lederhosen
|
23
7
|
|
8
|
+
Lederhosen is a set of tools for OTU clustering rRNA amplicons using
|
9
|
+
Robert Edgar's USEARCH and is simple, robust, and fast.
|
10
|
+
Lederhosen was designed from the beginning to handle lots of data from
|
11
|
+
lots of samples, specifically from data generated by multiplexed
|
12
|
+
Illumina Hi/Mi-Seq sequencing.
|
24
13
|
|
25
|
-
|
14
|
+
No assumptions are made about the design of your experiment.
|
15
|
+
Therefore, there are no tools for read pre-processing and data analysis
|
16
|
+
or statistics. Insert reads, receive data.
|
26
17
|
|
27
|
-
|
28
|
-
|
29
|
-
using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq), Ion Torrent, or Roche-454.
|
30
|
-
- Lederhosen uses [Semantic Versioning](http://semver.org/), is free and open source under the
|
31
|
-
[MIT open source license](http://opensource.org/licenses/mit-license.php/).
|
32
|
-
- Except for USEARCH which requires a license, Lederhosen is available for commercial use.
|
18
|
+
Lederhosen is free and open source under the MIT license. Except for
|
19
|
+
the USEARCH license, Lederhosen is free for commercial use.
|
33
20
|
|
34
21
|
### Features
|
35
22
|
|
36
|
-
-
|
37
|
-
-
|
38
|
-
-
|
23
|
+
- Reference-based OTU clustering via USEARCH.
|
24
|
+
- Multiple Database Support (RDP, GreenGenes, TaxCollector, Silva).
|
25
|
+
- Parallel support (USEARCH, MapReduce or Compute Cluster).
|
39
26
|
- Generation and filtering of OTU abundance matrices.
|
40
|
-
-. Support for paired end reads (considers taxonomic assignment for both reads in a pair).
|
41
27
|
|
42
28
|
### Installation
|
43
29
|
|
44
|
-
0. Obtain & Install [USEARCH](http://www.drive5.com/)
|
45
|
-
|
30
|
+
0. Obtain & Install [USEARCH](http://www.drive5.com/).
|
31
|
+
1. Get a database:
|
46
32
|
- [TaxCollector](http://github.com/audy/taxcollector)
|
47
33
|
- [GreenGenes](http://greengenes.lbl.gov) 16S database
|
48
34
|
- File an [issue report](https://github.com/audy/lederhosen/issues) or pull request ;) to request support for a different database.
|
49
|
-
|
35
|
+
2. Install Lederhosen by typing:
|
50
36
|
|
51
37
|
`sudo gem install lederhosen`
|
52
38
|
3. Check installation by typing `lederhosen`. You should see some help text.
|
@@ -61,11 +47,17 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
|
|
61
47
|
|
62
48
|
### Trim Reads
|
63
49
|
|
64
|
-
Trimming removed. I think you should use
|
50
|
+
Trimming removed. I think you should use
|
51
|
+
[Sickle](https://github.com/najoshi/sickle), or
|
52
|
+
[Trimmomatic](http://www.usadellab.org/cms/index.php?page=trimmomatic).
|
53
|
+
You can use
|
54
|
+
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to inspect read quality.
|
65
55
|
|
66
56
|
### Create Database
|
67
57
|
|
68
|
-
|
58
|
+
The 16S database can optionally be in USEARCH database format (udb).
|
59
|
+
This speeds things up if you are clustering sequences in multiple FASTA
|
60
|
+
files.
|
69
61
|
|
70
62
|
```bash
|
71
63
|
lederhosen make_udb \
|
@@ -73,22 +65,21 @@ lederhosen make_udb \
|
|
73
65
|
--output=taxcollector.udb
|
74
66
|
```
|
75
67
|
|
76
|
-
(not actually required but will make batch searching a lot faster)
|
77
|
-
|
78
68
|
### Cluster Reads using USEARCH
|
79
69
|
|
80
70
|
Cluster reads using USEARCH. Output is a uc file.
|
81
71
|
|
82
72
|
```bash
|
83
73
|
lederhosen cluster \
|
84
|
-
--input=trimmed
|
74
|
+
--input=trimmed/sequences.fasta \
|
85
75
|
--identity=0.95 \
|
86
76
|
--output=clusters_95.uc \
|
87
77
|
--database=taxcollector.udb
|
88
78
|
```
|
89
79
|
|
90
|
-
The optional `--dry-run` parameter
|
91
|
-
|
80
|
+
The optional `--dry-run` parameter prints the USEARCH command to
|
81
|
+
standard out instead of actually running the command. This is useful if
|
82
|
+
you want to run jobs in parallel and/or on a cluster.
|
92
83
|
|
93
84
|
```bash
|
94
85
|
for reads_file in reads/*.fasta;
|
@@ -108,7 +99,7 @@ cat jobs.sh | parallel -j 24 # run 24 parallel jobs
|
|
108
99
|
|
109
100
|
### Generate taxonomy counts tables
|
110
101
|
|
111
|
-
Before generating OTU tables, you must generate taxonomy counts tables.
|
102
|
+
Before generating OTU tables, you must generate taxonomy counts (`.tax`) tables.
|
112
103
|
|
113
104
|
A taxonomy count table looks something like this
|
114
105
|
|
@@ -125,19 +116,6 @@ lederhosen count_taxonomies \
|
|
125
116
|
--output=clusters_taxonomies.txt
|
126
117
|
```
|
127
118
|
|
128
|
-
If you did paired-end sequencing, you can generate strict taxonomy tables that only count reads when *both pairs* have the *same*
|
129
|
-
taxonomic description at a certain taxonomic level. This is useful for leveraging the increased length of having pairs and also
|
130
|
-
acts as a sort of chimera filter. You will, however, end up using less of your reads as the level goes from domain to species.
|
131
|
-
|
132
|
-
```bash
|
133
|
-
lederhosen count_taxonomies \
|
134
|
-
--input=clusters.uc \
|
135
|
-
--strict=genus \
|
136
|
-
--output=clusters_taxonomies.strict.genus.txt
|
137
|
-
```
|
138
|
-
|
139
|
-
Reads that do not have the same phylogeny at `level` will become `unclassified_reads`
|
140
|
-
|
141
119
|
### Generate OTU tables
|
142
120
|
|
143
121
|
Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
|
@@ -152,8 +130,8 @@ lederhosen otu_table \
|
|
152
130
|
This will create the file `my_poop_samples_genus_strict.95.txt` containing the clusters
|
153
131
|
as columns and the samples as rows.
|
154
132
|
|
155
|
-
|
156
|
-
|
133
|
+
If your database doesn't have taxonomic descriptions, use
|
134
|
+
`--level=original`.
|
157
135
|
|
158
136
|
### Filter OTU tables
|
159
137
|
|
@@ -175,7 +153,6 @@ lederhosen otu_filter \
|
|
175
153
|
This will remove any clusters that do not appear in at least 10 samples with at least 50 reads. The read counts
|
176
154
|
for filtered clusters will be moved to the `noise` pseudocluster.
|
177
155
|
|
178
|
-
|
179
156
|
### Get representative sequences
|
180
157
|
|
181
158
|
You can get the representative sequences for each cluster using the `get_reps` task.
|
@@ -219,9 +196,9 @@ lederhosen separate_unclassified \
|
|
219
196
|
|
220
197
|
## Acknowledgements
|
221
198
|
|
222
|
-
-
|
223
|
-
-
|
224
|
-
-
|
199
|
+
- [Sinbad Richardson](http://viennapitts.com/) for the Lederhosen Guy artwork
|
200
|
+
- Lexi and Kevin for beta-testing and putting up with bugs.
|
201
|
+
- The QIIME project for inspiration.
|
225
202
|
|
226
203
|
## Please Cite
|
227
204
|
|
@@ -0,0 +1,68 @@
|
|
1
|
+
package main
|
2
|
+
|
3
|
+
//
|
4
|
+
// count_taxonomies.go
|
5
|
+
// a faster alternative to lederhosen count_taxonomies
|
6
|
+
// (c2013) Austin G. Davis-Richardson
|
7
|
+
// MIT v3 LICENSE
|
8
|
+
//
|
9
|
+
// COMPILATION:
|
10
|
+
//
|
11
|
+
// 1.) Install Go (http://golang.org)
|
12
|
+
// 2.) go build count_taxonomies.go
|
13
|
+
// 3.) At this point you're ready to go
|
14
|
+
//
|
15
|
+
// USAGE:
|
16
|
+
// count_taxonomies input.uc > output.tax
|
17
|
+
//
|
18
|
+
|
19
|
+
import (
|
20
|
+
"encoding/csv"
|
21
|
+
"fmt"
|
22
|
+
"io"
|
23
|
+
"os"
|
24
|
+
)
|
25
|
+
|
26
|
+
func main() {
|
27
|
+
|
28
|
+
table := map[string]int64{}
|
29
|
+
|
30
|
+
infile := os.Args[1]
|
31
|
+
|
32
|
+
file, err := os.Open(infile)
|
33
|
+
|
34
|
+
if err != nil {
|
35
|
+
panic(err)
|
36
|
+
}
|
37
|
+
|
38
|
+
defer file.Close()
|
39
|
+
|
40
|
+
reader := csv.NewReader(file)
|
41
|
+
reader.Comma = '\t'
|
42
|
+
|
43
|
+
// count items
|
44
|
+
for {
|
45
|
+
record, err := reader.Read()
|
46
|
+
if err == io.EOF {
|
47
|
+
break
|
48
|
+
} else if err != nil {
|
49
|
+
panic(err)
|
50
|
+
}
|
51
|
+
|
52
|
+
// key is the name of the target sequence.
|
53
|
+
// column 10 in the uc file (9 if you start
|
54
|
+
// counting at 0)
|
55
|
+
key := record[9]
|
56
|
+
|
57
|
+
if _, present := table[key]; present {
|
58
|
+
table[key] = table[key] + 1
|
59
|
+
} else {
|
60
|
+
table[key] = 1
|
61
|
+
}
|
62
|
+
|
63
|
+
}
|
64
|
+
|
65
|
+
for k, _ := range table {
|
66
|
+
fmt.Printf("%v,%v\n", k, table[k])
|
67
|
+
}
|
68
|
+
}
|
data/spec/cli_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
describe Lederhosen::CLI do
|
3
|
+
describe Lederhosen::CLI, :requires_usearch => true do
|
4
4
|
|
5
5
|
it 'should have an executable' do
|
6
6
|
`./bin/lederhosen`
|
@@ -39,10 +39,10 @@ describe Lederhosen::CLI do
|
|
39
39
|
unclassified_reads = File.readlines("#{$test_dir}/unclassified.fasta")\
|
40
40
|
.select { |x| x =~ /^>/ }\
|
41
41
|
.size
|
42
|
-
|
42
|
+
|
43
43
|
unclassified_results.should == unclassified_reads
|
44
44
|
end
|
45
|
-
|
45
|
+
|
46
46
|
it 'can separate unclassified reads from usearch output using strict pairing' do
|
47
47
|
`./bin/lederhosen separate_unclassified --strict=genus --uc-file=spec/data/test.uc --reads=spec/data/trimmed/ILT_L_9_B_001.fasta --output=#{$test_dir}/unclassified.strict_genus.fasta`
|
48
48
|
$?.success?.should be_true
|
@@ -52,42 +52,22 @@ describe Lederhosen::CLI do
|
|
52
52
|
end
|
53
53
|
|
54
54
|
it 'can create taxonomy count tables' do
|
55
|
-
`./bin/lederhosen count_taxonomies --input=spec/data/test.uc --output=#{$test_dir}/taxonomy_count.
|
55
|
+
`./bin/lederhosen count_taxonomies --input=spec/data/test.uc --output=#{$test_dir}/taxonomy_count.tax`
|
56
56
|
$?.success?.should be_true
|
57
|
-
File.exists?(File.join($test_dir, 'taxonomy_count.
|
57
|
+
File.exists?(File.join($test_dir, 'taxonomy_count.tax')).should be_true
|
58
58
|
end
|
59
59
|
|
60
60
|
it 'generates taxonomy tables w/ comma-free taxonomic descriptions' do
|
61
|
-
File.readlines(File.join($test_dir, 'taxonomy_count.
|
61
|
+
File.readlines(File.join($test_dir, 'taxonomy_count.tax'))\
|
62
62
|
.map(&:strip)\
|
63
63
|
.map { |x| x.count(',') }\
|
64
64
|
.uniq\
|
65
65
|
.should == [1]
|
66
66
|
end
|
67
67
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
$?.success?.should be_true
|
72
|
-
|
73
|
-
lines = File.readlines(File.join($test_dir, "taxonomy_count.strict.#{level}.txt"))
|
74
|
-
|
75
|
-
# make sure total number of reads is even
|
76
|
-
# requires that there should be an odd number if classification is not strict
|
77
|
-
lines.select { |x| !(x =~ /^#/) }\
|
78
|
-
.map(&:strip)\
|
79
|
-
.map { |x| x.split(',') }\
|
80
|
-
.map(&:last)\
|
81
|
-
.map(&:to_i)\
|
82
|
-
.inject(:+).should be_even
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
%w{domain phylum class order family genus species}.each do |level|
|
87
|
-
it "should create OTU abundance matrices from taxonomy count tables at level: #{level}" do
|
88
|
-
`./bin/lederhosen otu_table --files=#{$test_dir}/taxonomy_count.strict.*.txt --level=#{level} --output=#{$test_dir}/otus_genus.strict.csv`
|
89
|
-
$?.success?.should be_true
|
90
|
-
end
|
68
|
+
it 'can create OTU abundance matrices' do
|
69
|
+
`./bin/lederhosen otu_table --files=#{$test_dir}/taxonomy_count.tax --output=#{$test_dir}/otus.genus.csv --level=genus`
|
70
|
+
$?.success?.should be_true
|
91
71
|
end
|
92
72
|
|
93
73
|
it 'should filter OTU abundance matrices' do
|
@@ -95,7 +75,7 @@ describe Lederhosen::CLI do
|
|
95
75
|
# filtering should move filtered reads to 'unclassified_reads' so that we maintain
|
96
76
|
# our knowledge of depth of coverage throughout
|
97
77
|
# this makes normalization better later.
|
98
|
-
`./bin/lederhosen otu_filter --input=#{$test_dir}/
|
78
|
+
`./bin/lederhosen otu_filter --input=#{$test_dir}/otus.genus.csv --output=#{$test_dir}/otus_genus.filtered.csv --reads 1 --samples 1`
|
99
79
|
$?.success?.should be_true
|
100
80
|
end
|
101
81
|
|
data/spec/no_tasks_spec.rb
CHANGED
@@ -3,6 +3,7 @@ require 'spec_helper'
|
|
3
3
|
describe 'no_tasks' do
|
4
4
|
|
5
5
|
let(:greengenes_taxonomies) { ['124 U55236.1 Methanobrevibacter thaueri str. CW k__domain; p__phylum; c__class; o__order; f__family; g__genus; species; otu_127']}
|
6
|
+
let(:greengenes135_taxonomies) { ['k__domain; p__phylum; c__class; o__order; f__family; g__genus; s__species']}
|
6
7
|
let(:qiime_taxonomies) { [ 'k__domain;p__phylum;c__class;o__order;f__family;g__genus;s__species' ]}
|
7
8
|
let(:taxcollector_taxonomies) { ['[0]domain;[1]phylum;[2]class;[3]order;[4]family;[5]genus;[6]species;[7]strain;[8]Genus_species_strain_id'] }
|
8
9
|
let(:lederhosen) { Lederhosen::CLI.new }
|
@@ -15,6 +16,12 @@ describe 'no_tasks' do
|
|
15
16
|
end
|
16
17
|
end
|
17
18
|
|
19
|
+
it '#detect_taxonomy_format should recognize GreenGenes v13.5' do
|
20
|
+
greengenes135_taxonomies.each do |greengenes_taxonomy|
|
21
|
+
lederhosen.detect_taxonomy_format(greengenes_taxonomy).should == :greengenes_135
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
18
25
|
it '#detect_taxonomy_format should recognize TaxCollector' do
|
19
26
|
taxcollector_taxonomies.each do |taxcollector_taxonomy|
|
20
27
|
lederhosen.detect_taxonomy_format(taxcollector_taxonomy).should == :taxcollector
|
@@ -33,28 +40,34 @@ describe 'no_tasks' do
|
|
33
40
|
taxonomy[level].should == level
|
34
41
|
end
|
35
42
|
end
|
36
|
-
|
43
|
+
|
37
44
|
it "#parse_taxonomy_greengenes should parse greengenes taxonomy (#{level})" do
|
38
45
|
greengenes_taxonomies.each do |greengenes_taxonomy|
|
39
46
|
taxonomy = lederhosen.parse_taxonomy_greengenes(greengenes_taxonomy)
|
40
47
|
taxonomy[level].should == level
|
41
48
|
end
|
42
49
|
end
|
43
|
-
|
50
|
+
|
51
|
+
it "#parse_taxonomy_greengenes_135 should parse greengenes v13.5 taxonomy (#{level})" do
|
52
|
+
greengenes135_taxonomies.each do |greengenes_taxonomy|
|
53
|
+
taxonomy = lederhosen.parse_taxonomy_greengenes_135(greengenes_taxonomy)
|
54
|
+
taxonomy[level].should == level
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
44
58
|
it "#parse_taxonomy_greengenes should parse qiime taxonomy (#{level})" do
|
45
59
|
qiime_taxonomies.each do |qiime_taxonomy|
|
46
60
|
taxonomy = lederhosen.parse_taxonomy_qiime(qiime_taxonomy)
|
47
61
|
taxonomy[level].should == level
|
48
62
|
end
|
49
63
|
end
|
50
|
-
|
64
|
+
|
51
65
|
end
|
52
|
-
|
66
|
+
|
53
67
|
it '#parse_taxonomy_taxcollector should return original taxonomy' do
|
54
68
|
lederhosen.parse_taxonomy_taxcollector(taxcollector_taxonomies[0])['original'].should == taxcollector_taxonomies[0]
|
55
69
|
end
|
56
70
|
|
57
|
-
|
58
71
|
it '#parse_taxonomy should automatically detect and parse greengenes taxonomy' do
|
59
72
|
greengenes_taxonomies.each do |greengenes_taxonomy|
|
60
73
|
lederhosen.parse_taxonomy(greengenes_taxonomy).should_not be_nil
|
@@ -67,6 +80,12 @@ describe 'no_tasks' do
|
|
67
80
|
end
|
68
81
|
end
|
69
82
|
|
83
|
+
it '#parse_taxonomy should automatically detect and parse greengenes 13.5 taxonomy' do
|
84
|
+
greengenes135_taxonomies.each do |greengenes_taxonomy|
|
85
|
+
lederhosen.parse_taxonomy(greengenes_taxonomy).should_not be_nil
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
70
89
|
it '#parse_taxonomy_taxcollector should replace unclassified species names with strain name' do
|
71
90
|
t = '[0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Actinomycetales;[4]test;[5]null;[6]Propionibacterineae_bacterium;[7]Propionibacterineae_bacterium_870BRRJ;[8]Propionibacterineae_bacterium_870BRRJ|genus'
|
72
91
|
tax = lederhosen.parse_taxonomy(t)
|
data/spec/spec_helper.rb
CHANGED
@@ -7,3 +7,12 @@ Bundler.require :test, :development
|
|
7
7
|
$test_dir = ENV['TEST_DIR'] || "/tmp/lederhosen_test_#{(0...8).map{65.+(rand(25)).chr}.join}/"
|
8
8
|
`mkdir -p #{$test_dir}`
|
9
9
|
$stderr.puts "test dir: #{$test_dir}"
|
10
|
+
|
11
|
+
RSpec.configure do |c|
|
12
|
+
# check if usearch is in $PATH
|
13
|
+
# if not, skip usearch tests.
|
14
|
+
usearch = `which usearch`
|
15
|
+
if usearch == ''
|
16
|
+
c.filter_run_excluding :requires_usearch => true
|
17
|
+
end
|
18
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03
|
12
|
+
date: 2013-07-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: dna
|
@@ -100,6 +100,7 @@ extra_rdoc_files:
|
|
100
100
|
- LICENSE.txt
|
101
101
|
files:
|
102
102
|
- .rspec
|
103
|
+
- .travis.yml
|
103
104
|
- Gemfile
|
104
105
|
- LICENSE.txt
|
105
106
|
- Rakefile
|
@@ -122,6 +123,7 @@ files:
|
|
122
123
|
- lib/lederhosen/version.rb
|
123
124
|
- logo.png
|
124
125
|
- readme.md
|
126
|
+
- scripts/count_taxonomies.go
|
125
127
|
- scripts/illumina_pipeline/.gitignore
|
126
128
|
- scripts/illumina_pipeline/Makefile
|
127
129
|
- scripts/illumina_pipeline/pipeline.sh
|
@@ -149,7 +151,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
149
151
|
version: '0'
|
150
152
|
segments:
|
151
153
|
- 0
|
152
|
-
hash: -
|
154
|
+
hash: -391146498945924903
|
153
155
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
154
156
|
none: false
|
155
157
|
requirements:
|
@@ -158,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
158
160
|
version: '0'
|
159
161
|
requirements: []
|
160
162
|
rubyforge_project:
|
161
|
-
rubygems_version: 1.8.
|
163
|
+
rubygems_version: 1.8.25
|
162
164
|
signing_key:
|
163
165
|
specification_version: 3
|
164
166
|
summary: OTU Clustering
|