lederhosen 2.0.8 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +3 -0
- data/lederhosen.gemspec +5 -3
- data/lib/lederhosen/cli.rb +1 -1
- data/lib/lederhosen/no_tasks.rb +18 -0
- data/lib/lederhosen/tasks/cluster.rb +18 -15
- data/lib/lederhosen/tasks/count_taxonomies.rb +1 -40
- data/lib/lederhosen/tasks/make_udb.rb +3 -1
- data/lib/lederhosen/tasks/otu_filter.rb +1 -1
- data/lib/lederhosen/version.rb +7 -5
- data/readme.md +37 -60
- data/scripts/count_taxonomies.go +68 -0
- data/spec/cli_spec.rb +10 -30
- data/spec/no_tasks_spec.rb +24 -5
- data/spec/spec_helper.rb +9 -0
- metadata +6 -4
data/.travis.yml
ADDED
data/lederhosen.gemspec
CHANGED
|
@@ -5,11 +5,11 @@
|
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |s|
|
|
7
7
|
s.name = "lederhosen"
|
|
8
|
-
s.version = "
|
|
8
|
+
s.version = "3.1.0"
|
|
9
9
|
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
|
12
|
-
s.date = "2013-03
|
|
12
|
+
s.date = "2013-07-03"
|
|
13
13
|
s.description = "Various tools for OTU clustering"
|
|
14
14
|
s.email = "harekrishna@gmail.com"
|
|
15
15
|
s.executables = ["lederhosen"]
|
|
@@ -18,6 +18,7 @@ Gem::Specification.new do |s|
|
|
|
18
18
|
]
|
|
19
19
|
s.files = [
|
|
20
20
|
".rspec",
|
|
21
|
+
".travis.yml",
|
|
21
22
|
"Gemfile",
|
|
22
23
|
"LICENSE.txt",
|
|
23
24
|
"Rakefile",
|
|
@@ -40,6 +41,7 @@ Gem::Specification.new do |s|
|
|
|
40
41
|
"lib/lederhosen/version.rb",
|
|
41
42
|
"logo.png",
|
|
42
43
|
"readme.md",
|
|
44
|
+
"scripts/count_taxonomies.go",
|
|
43
45
|
"scripts/illumina_pipeline/.gitignore",
|
|
44
46
|
"scripts/illumina_pipeline/Makefile",
|
|
45
47
|
"scripts/illumina_pipeline/pipeline.sh",
|
|
@@ -56,7 +58,7 @@ Gem::Specification.new do |s|
|
|
|
56
58
|
s.homepage = "http://audy.github.com/lederhosen"
|
|
57
59
|
s.licenses = ["MIT"]
|
|
58
60
|
s.require_paths = ["lib"]
|
|
59
|
-
s.rubygems_version = "1.8.
|
|
61
|
+
s.rubygems_version = "1.8.25"
|
|
60
62
|
s.summary = "OTU Clustering"
|
|
61
63
|
|
|
62
64
|
if s.respond_to? :specification_version then
|
data/lib/lederhosen/cli.rb
CHANGED
data/lib/lederhosen/no_tasks.rb
CHANGED
|
@@ -36,6 +36,8 @@ module Lederhosen
|
|
|
36
36
|
# taxcollector taxonomy starts with an open square bracket
|
|
37
37
|
if taxonomy =~ /^\[/
|
|
38
38
|
:taxcollector
|
|
39
|
+
elsif taxonomy =~ /s__/
|
|
40
|
+
:greengenes_135
|
|
39
41
|
elsif taxonomy =~ /^\d/
|
|
40
42
|
:greengenes
|
|
41
43
|
elsif taxonomy.nil?
|
|
@@ -51,6 +53,8 @@ module Lederhosen
|
|
|
51
53
|
case @taxonomy_format
|
|
52
54
|
when :greengenes
|
|
53
55
|
parse_taxonomy_greengenes(taxonomy)
|
|
56
|
+
when :greengenes_135
|
|
57
|
+
parse_taxonomy_greengenes_135(taxonomy)
|
|
54
58
|
when :taxcollector
|
|
55
59
|
parse_taxonomy_taxcollector(taxonomy)
|
|
56
60
|
when :qiime
|
|
@@ -62,6 +66,7 @@ module Lederhosen
|
|
|
62
66
|
|
|
63
67
|
RE_TAXCOLLECTOR = /^\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/
|
|
64
68
|
RE_GREENGENES = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?(.*);/
|
|
69
|
+
RE_GREENGENES_135 = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?s__(.*)/
|
|
65
70
|
RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
|
|
66
71
|
|
|
67
72
|
def parse_taxonomy_qiime(taxonomy)
|
|
@@ -90,6 +95,19 @@ module Lederhosen
|
|
|
90
95
|
names
|
|
91
96
|
end
|
|
92
97
|
|
|
98
|
+
def parse_taxonomy_greengenes_135(taxonomy)
|
|
99
|
+
levels = %w{domain phylum class order family genus species}
|
|
100
|
+
match_data = taxonomy.match(RE_GREENGENES_135)
|
|
101
|
+
match_data = match_data[1..-1]
|
|
102
|
+
|
|
103
|
+
names = Hash.new
|
|
104
|
+
# for some reason Hash[*levels.zip(match_data)] ain't working
|
|
105
|
+
levels.zip(match_data).each { |l, n| names[l] = n }
|
|
106
|
+
|
|
107
|
+
names['original'] = taxonomy
|
|
108
|
+
names
|
|
109
|
+
end
|
|
110
|
+
|
|
93
111
|
# parse a taxonomic description using the
|
|
94
112
|
# taxcollector format returning name at each level (genus, etc...)
|
|
95
113
|
#
|
|
@@ -4,22 +4,24 @@ module Lederhosen
|
|
|
4
4
|
|
|
5
5
|
desc 'cluster', 'reference-based clustering using usearch'
|
|
6
6
|
|
|
7
|
-
method_option :input, :type
|
|
8
|
-
method_option :database, :type
|
|
9
|
-
method_option :threads, :type
|
|
10
|
-
method_option :identity, :type
|
|
11
|
-
method_option :output, :type
|
|
12
|
-
method_option :strand, :type
|
|
13
|
-
method_option :dry_run, :type
|
|
7
|
+
method_option :input, :type => :string, :required => true
|
|
8
|
+
method_option :database, :type => :string, :required => true
|
|
9
|
+
method_option :threads, :type => :numeric, :default => false
|
|
10
|
+
method_option :identity, :type => :numeric, :required => true
|
|
11
|
+
method_option :output, :type => :string, :required => true
|
|
12
|
+
method_option :strand, :type => :string, :default => 'plus'
|
|
13
|
+
method_option :dry_run, :type => :boolean, :default => false
|
|
14
|
+
method_option :query_cov, :type => :numeric, :required => false, :default => 0.95
|
|
14
15
|
|
|
15
16
|
def cluster
|
|
16
|
-
input
|
|
17
|
-
database
|
|
18
|
-
threads
|
|
19
|
-
identity
|
|
20
|
-
output
|
|
21
|
-
strand
|
|
22
|
-
dry_run
|
|
17
|
+
input = File.expand_path(options[:input])
|
|
18
|
+
database = File.expand_path(options[:database])
|
|
19
|
+
threads = options[:threads]
|
|
20
|
+
identity = options[:identity]
|
|
21
|
+
output = File.expand_path(options[:output])
|
|
22
|
+
strand = options[:strand]
|
|
23
|
+
dry_run = options[:dry_run]
|
|
24
|
+
query_cov = options[:query_cov]
|
|
23
25
|
|
|
24
26
|
ohai "#{'(dry run)' if dry_run} clustering #{input} to #{database} and saving to #{output}"
|
|
25
27
|
|
|
@@ -32,7 +34,8 @@ module Lederhosen
|
|
|
32
34
|
"--id #{identity}",
|
|
33
35
|
"--uc #{output}",
|
|
34
36
|
"--db #{database}",
|
|
35
|
-
"--strand #{strand}"
|
|
37
|
+
"--strand #{strand}",
|
|
38
|
+
"--query_cov #{query_cov}"
|
|
36
39
|
]
|
|
37
40
|
|
|
38
41
|
# threads = False : use all threads (default)
|
|
@@ -5,27 +5,16 @@ module Lederhosen
|
|
|
5
5
|
|
|
6
6
|
method_option :input, :type => :string, :required => true
|
|
7
7
|
method_option :output, :type => :string, :required => true
|
|
8
|
-
method_option :strict, :type => :string, :default => false,
|
|
9
|
-
:banner => '<level> only count reads where both taxonomies are in agreement at <level>'
|
|
10
8
|
|
|
11
9
|
def count_taxonomies
|
|
12
10
|
input = options[:input]
|
|
13
11
|
output = options[:output]
|
|
14
|
-
strict = options[:strict]
|
|
15
12
|
|
|
16
13
|
ohai "generating #{output} from #{input}"
|
|
17
14
|
|
|
18
15
|
handle = File.open(input)
|
|
19
16
|
uc = UCParser.new(handle)
|
|
20
|
-
|
|
21
|
-
taxonomy_count =
|
|
22
|
-
if not strict
|
|
23
|
-
get_taxonomy_count(uc)
|
|
24
|
-
|
|
25
|
-
elsif strict
|
|
26
|
-
get_strict_taxonomy_count(uc, strict)
|
|
27
|
-
end
|
|
28
|
-
|
|
17
|
+
taxonomy_count = get_taxonomy_count(uc)
|
|
29
18
|
handle.close
|
|
30
19
|
|
|
31
20
|
out = File.open(output, 'w')
|
|
@@ -51,34 +40,6 @@ module Lederhosen
|
|
|
51
40
|
taxonomy_count
|
|
52
41
|
end
|
|
53
42
|
|
|
54
|
-
# returns Hash of taxonomy => number_of_reads
|
|
55
|
-
# if a pair of reads do not agree at a taxonomic level,
|
|
56
|
-
# or if at least one is unclassified, bot reads are counted
|
|
57
|
-
# as unclassified_reads
|
|
58
|
-
def get_strict_taxonomy_count(uc, level)
|
|
59
|
-
taxonomy_count = Hash.new { |h, k| h[k] = 0 }
|
|
60
|
-
# TODO: I'm making a block for results because I don't know how to
|
|
61
|
-
# make results return an Enumerator when not given a block
|
|
62
|
-
uc.each_slice(2) do |left, right|
|
|
63
|
-
if left.miss? or right.miss? # at least one is a miss
|
|
64
|
-
taxonomy_count['unclassified_reads'] += 2
|
|
65
|
-
# both are hits, check taxonomies
|
|
66
|
-
else
|
|
67
|
-
ta = parse_taxonomy(left.target)
|
|
68
|
-
tb = parse_taxonomy(right.target)
|
|
69
|
-
# they match up, count both separately
|
|
70
|
-
if ta[level] == tb[level]
|
|
71
|
-
taxonomy_count[left.target] += 1
|
|
72
|
-
taxonomy_count[right.target] += 1
|
|
73
|
-
# they don't match up, count as unclassified
|
|
74
|
-
else
|
|
75
|
-
taxonomy_count['unclassified_reads'] += 2
|
|
76
|
-
end
|
|
77
|
-
end
|
|
78
|
-
end # results.each_slice
|
|
79
|
-
taxonomy_count
|
|
80
|
-
end
|
|
81
|
-
|
|
82
43
|
end
|
|
83
44
|
end
|
|
84
45
|
end
|
|
@@ -10,12 +10,14 @@ module Lederhosen
|
|
|
10
10
|
input = options[:input]
|
|
11
11
|
output = options[:output]
|
|
12
12
|
word_length = options[:word_length]
|
|
13
|
+
db_step = options[:db_step]
|
|
13
14
|
|
|
14
15
|
ohai "making udb w/ #{input}, saving as #{output}."
|
|
15
16
|
|
|
16
17
|
cmd = ['usearch',
|
|
17
18
|
"-makeudb_usearch #{input}",
|
|
18
|
-
"-output #{output}"
|
|
19
|
+
"-output #{output}",
|
|
20
|
+
]
|
|
19
21
|
|
|
20
22
|
cmd = cmd.join(' ')
|
|
21
23
|
|
|
@@ -70,7 +70,7 @@ module Lederhosen
|
|
|
70
70
|
|
|
71
71
|
kept_counts = counts.zip(mask).map { |c, m| c if m }.compact
|
|
72
72
|
noise = counts.zip(mask).map { |c, m| c unless m }.compact.inject(:+)
|
|
73
|
-
filtered_reads += noise
|
|
73
|
+
filtered_reads += noise || 0
|
|
74
74
|
|
|
75
75
|
output.puts "#{sample_name},#{kept_counts.join(',')},#{noise}"
|
|
76
76
|
|
data/lib/lederhosen/version.rb
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
module Lederhosen
|
|
2
2
|
module Version
|
|
3
|
-
MAJOR =
|
|
4
|
-
MINOR =
|
|
5
|
-
CODENAME = '
|
|
6
|
-
PATCH =
|
|
3
|
+
MAJOR = 3
|
|
4
|
+
MINOR = 1
|
|
5
|
+
CODENAME = 'Hauptbahnhof' # changes for minor versions
|
|
6
|
+
PATCH = 0
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
string = [MAJOR, MINOR, PATCH].join('.')
|
|
9
|
+
|
|
10
|
+
STRING = string
|
|
9
11
|
end
|
|
10
12
|
end
|
data/readme.md
CHANGED
|
@@ -1,52 +1,38 @@
|
|
|
1
1
|
<img src="https://raw.github.com/audy/lederhosen/master/logo.png" align="right">
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
Lederhosen is a set of tools for OTU clustering rRNA amplicons using Robert Edgar's USEARCH.
|
|
6
|
-
|
|
7
|
-
It's used to run USEARCH and create and filter tables. Unlike most of the software in Bioinformatics,
|
|
8
|
-
It is meant to be UNIX-y: do one thing and do it well.
|
|
9
|
-
|
|
10
|
-
Do you want to run Lederhosen on a cluster? Use `--dry-run` and feed it to your cluster's queue management system.
|
|
3
|
+
[Build Status](https://travis-ci.org/audy/lederhosen)
|
|
11
5
|
|
|
12
|
-
Lederhosen
|
|
13
|
-
|
|
14
|
-
Lederhosen is designed with the following "pipeline" in mind:
|
|
15
|
-
|
|
16
|
-
1. Clustering sequences to reference sequences (read: database) and/or _de novo_ OTU clustering.
|
|
17
|
-
- `lederhosen cluster ...`
|
|
18
|
-
2. Generating tables from USEARCH output.
|
|
19
|
-
- `lederhosen count_taxonomies ...`
|
|
20
|
-
- `lederhosen otu_table ...`
|
|
21
|
-
3. Filtering tables to remove small or insignificant OTUs.
|
|
22
|
-
- `lederhosen otu_filter ...`
|
|
6
|
+
# Lederhosen
|
|
23
7
|
|
|
8
|
+
Lederhosen is a set of tools for OTU clustering rRNA amplicons using
|
|
9
|
+
Robert Edgar's USEARCH and is simple, robust, and fast.
|
|
10
|
+
Lederhosen was designed from the beginning to handle lots of data from
|
|
11
|
+
lots of samples, specifically from data generated by multiplexed
|
|
12
|
+
Illumina Hi/Mi-Seq sequencing.
|
|
24
13
|
|
|
25
|
-
|
|
14
|
+
No assumptions are made about the design of your experiment.
|
|
15
|
+
Therefore, there are no tools for read pre-processing and data analysis
|
|
16
|
+
or statistics. Insert reads, receive data.
|
|
26
17
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq), Ion Torrent, or Roche-454.
|
|
30
|
-
- Lederhosen uses [Semantic Versioning](http://semver.org/), is free and open source under the
|
|
31
|
-
[MIT open source license](http://opensource.org/licenses/mit-license.php/).
|
|
32
|
-
- Except for USEARCH which requires a license, Lederhosen is available for commercial use.
|
|
18
|
+
Lederhosen is free and open source under the MIT license. Except for
|
|
19
|
+
the USEARCH license, Lederhosen is free for commercial use.
|
|
33
20
|
|
|
34
21
|
### Features
|
|
35
22
|
|
|
36
|
-
-
|
|
37
|
-
-
|
|
38
|
-
-
|
|
23
|
+
- Reference-based OTU clustering via USEARCH.
|
|
24
|
+
- Multiple Database Support (RDP, GreenGenes, TaxCollector, Silva).
|
|
25
|
+
- Parallel support (USEARCH, MapReduce or Compute Cluster).
|
|
39
26
|
- Generation and filtering of OTU abundance matrices.
|
|
40
|
-
-. Support for paired end reads (considers taxonomic assignment for both reads in a pair).
|
|
41
27
|
|
|
42
28
|
### Installation
|
|
43
29
|
|
|
44
|
-
0. Obtain & Install [USEARCH](http://www.drive5.com/)
|
|
45
|
-
|
|
30
|
+
0. Obtain & Install [USEARCH](http://www.drive5.com/).
|
|
31
|
+
1. Get a database:
|
|
46
32
|
- [TaxCollector](http://github.com/audy/taxcollector)
|
|
47
33
|
- [GreenGenes](http://greengenes.lbl.gov) 16S database
|
|
48
34
|
- File an [issue report](https://github.com/audy/lederhosen/issues) or pull request ;) to request support for a different database.
|
|
49
|
-
|
|
35
|
+
2. Install Lederhosen by typing:
|
|
50
36
|
|
|
51
37
|
`sudo gem install lederhosen`
|
|
52
38
|
3. Check installation by typing `lederhosen`. You should see some help text.
|
|
@@ -61,11 +47,17 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
|
|
|
61
47
|
|
|
62
48
|
### Trim Reads
|
|
63
49
|
|
|
64
|
-
Trimming removed. I think you should use
|
|
50
|
+
Trimming removed. I think you should use
|
|
51
|
+
[Sickle](https://github.com/najoshi/sickle), or
|
|
52
|
+
[Trimmomatic](http://www.usadellab.org/cms/index.php?page=trimmomatic).
|
|
53
|
+
You can use
|
|
54
|
+
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) to inspect read quality.
|
|
65
55
|
|
|
66
56
|
### Create Database
|
|
67
57
|
|
|
68
|
-
|
|
58
|
+
The 16S database can optionally be in USEARCH database format (udb).
|
|
59
|
+
This speeds things up if you are clustering sequences in multiple FASTA
|
|
60
|
+
files.
|
|
69
61
|
|
|
70
62
|
```bash
|
|
71
63
|
lederhosen make_udb \
|
|
@@ -73,22 +65,21 @@ lederhosen make_udb \
|
|
|
73
65
|
--output=taxcollector.udb
|
|
74
66
|
```
|
|
75
67
|
|
|
76
|
-
(not actually required but will make batch searching a lot faster)
|
|
77
|
-
|
|
78
68
|
### Cluster Reads using USEARCH
|
|
79
69
|
|
|
80
70
|
Cluster reads using USEARCH. Output is a uc file.
|
|
81
71
|
|
|
82
72
|
```bash
|
|
83
73
|
lederhosen cluster \
|
|
84
|
-
--input=trimmed
|
|
74
|
+
--input=trimmed/sequences.fasta \
|
|
85
75
|
--identity=0.95 \
|
|
86
76
|
--output=clusters_95.uc \
|
|
87
77
|
--database=taxcollector.udb
|
|
88
78
|
```
|
|
89
79
|
|
|
90
|
-
The optional `--dry-run` parameter
|
|
91
|
-
|
|
80
|
+
The optional `--dry-run` parameter prints the USEARCH command to
|
|
81
|
+
standard out instead of actually running the command. This is useful if
|
|
82
|
+
you want to run jobs in parallel and/or on a cluster.
|
|
92
83
|
|
|
93
84
|
```bash
|
|
94
85
|
for reads_file in reads/*.fasta;
|
|
@@ -108,7 +99,7 @@ cat jobs.sh | parallel -j 24 # run 24 parallel jobs
|
|
|
108
99
|
|
|
109
100
|
### Generate taxonomy counts tables
|
|
110
101
|
|
|
111
|
-
Before generating OTU tables, you must generate taxonomy counts tables.
|
|
102
|
+
Before generating OTU tables, you must generate taxonomy counts (`.tax`) tables.
|
|
112
103
|
|
|
113
104
|
A taxonomy count table looks something like this
|
|
114
105
|
|
|
@@ -125,19 +116,6 @@ lederhosen count_taxonomies \
|
|
|
125
116
|
--output=clusters_taxonomies.txt
|
|
126
117
|
```
|
|
127
118
|
|
|
128
|
-
If you did paired-end sequencing, you can generate strict taxonomy tables that only count reads when *both pairs* have the *same*
|
|
129
|
-
taxonomic description at a certain taxonomic level. This is useful for leveraging the increased length of having pairs and also
|
|
130
|
-
acts as a sort of chimera filter. You will, however, end up using less of your reads as the level goes from domain to species.
|
|
131
|
-
|
|
132
|
-
```bash
|
|
133
|
-
lederhosen count_taxonomies \
|
|
134
|
-
--input=clusters.uc \
|
|
135
|
-
--strict=genus \
|
|
136
|
-
--output=clusters_taxonomies.strict.genus.txt
|
|
137
|
-
```
|
|
138
|
-
|
|
139
|
-
Reads that do not have the same phylogeny at `level` will become `unclassified_reads`
|
|
140
|
-
|
|
141
119
|
### Generate OTU tables
|
|
142
120
|
|
|
143
121
|
Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
|
|
@@ -152,8 +130,8 @@ lederhosen otu_table \
|
|
|
152
130
|
This will create the file `my_poop_samples_genus_strict.95.txt` containing the clusters
|
|
153
131
|
as columns and the samples as rows.
|
|
154
132
|
|
|
155
|
-
|
|
156
|
-
|
|
133
|
+
If your database doesn't have taxonomic descriptions, use
|
|
134
|
+
`--level=original`.
|
|
157
135
|
|
|
158
136
|
### Filter OTU tables
|
|
159
137
|
|
|
@@ -175,7 +153,6 @@ lederhosen otu_filter \
|
|
|
175
153
|
This will remove any clusters that do not appear in at least 10 samples with at least 50 reads. The read counts
|
|
176
154
|
for filtered clusters will be moved to the `noise` pseudocluster.
|
|
177
155
|
|
|
178
|
-
|
|
179
156
|
### Get representative sequences
|
|
180
157
|
|
|
181
158
|
You can get the representative sequences for each cluster using the `get_reps` tasks.
|
|
@@ -219,9 +196,9 @@ lederhosen separate_unclassified \
|
|
|
219
196
|
|
|
220
197
|
## Acknowledgements
|
|
221
198
|
|
|
222
|
-
-
|
|
223
|
-
-
|
|
224
|
-
-
|
|
199
|
+
- [Sinbad Richardson](http://viennapitts.com/) for the Lederhosen Guy artwork
|
|
200
|
+
- Lexi, and Kevin for beta-testing and putting up with bugs.
|
|
201
|
+
- The QIIME project for inspiration.
|
|
225
202
|
|
|
226
203
|
## Please Cite
|
|
227
204
|
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
package main
|
|
2
|
+
|
|
3
|
+
//
|
|
4
|
+
// count_taxonomies.go
|
|
5
|
+
// a faster alternative to lederhosen count_taxonomies
|
|
6
|
+
// (c2013) Austin G. Davis-Richardson
|
|
7
|
+
// MIT v3 LICENSE
|
|
8
|
+
//
|
|
9
|
+
// COMPILATION:
|
|
10
|
+
//
|
|
11
|
+
// 1.) Install Go (http://golang.org)
|
|
12
|
+
// 2.) go build count_taxonomies.go
|
|
13
|
+
// 3.) At this point you're ready to go
|
|
14
|
+
//
|
|
15
|
+
// USAGE:
|
|
16
|
+
// count_taxonomies input.uc > output.tax
|
|
17
|
+
//
|
|
18
|
+
|
|
19
|
+
import (
|
|
20
|
+
"encoding/csv"
|
|
21
|
+
"fmt"
|
|
22
|
+
"io"
|
|
23
|
+
"os"
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
func main() {
|
|
27
|
+
|
|
28
|
+
table := map[string]int64{}
|
|
29
|
+
|
|
30
|
+
infile := os.Args[1]
|
|
31
|
+
|
|
32
|
+
file, err := os.Open(infile)
|
|
33
|
+
|
|
34
|
+
if err != nil {
|
|
35
|
+
panic(err)
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
defer file.Close()
|
|
39
|
+
|
|
40
|
+
reader := csv.NewReader(file)
|
|
41
|
+
reader.Comma = '\t'
|
|
42
|
+
|
|
43
|
+
// count items
|
|
44
|
+
for {
|
|
45
|
+
record, err := reader.Read()
|
|
46
|
+
if err == io.EOF {
|
|
47
|
+
break
|
|
48
|
+
} else if err != nil {
|
|
49
|
+
panic(err)
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
// key is the name of the target sequence.
|
|
53
|
+
// column 10 in the uc file (9 if you start
|
|
54
|
+
// counting at 0)
|
|
55
|
+
key := record[9]
|
|
56
|
+
|
|
57
|
+
if _, present := table[key]; present {
|
|
58
|
+
table[key] = table[key] + 1
|
|
59
|
+
} else {
|
|
60
|
+
table[key] = 1
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
for k, _ := range table {
|
|
66
|
+
fmt.Printf("%v,%v\n", k, table[k])
|
|
67
|
+
}
|
|
68
|
+
}
|
data/spec/cli_spec.rb
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
require 'spec_helper'
|
|
2
2
|
|
|
3
|
-
describe Lederhosen::CLI do
|
|
3
|
+
describe Lederhosen::CLI, :requires_usearch => true do
|
|
4
4
|
|
|
5
5
|
it 'should have an executable' do
|
|
6
6
|
`./bin/lederhosen`
|
|
@@ -39,10 +39,10 @@ describe Lederhosen::CLI do
|
|
|
39
39
|
unclassified_reads = File.readlines("#{$test_dir}/unclassified.fasta")\
|
|
40
40
|
.select { |x| x =~ /^>/ }\
|
|
41
41
|
.size
|
|
42
|
-
|
|
42
|
+
|
|
43
43
|
unclassified_results.should == unclassified_reads
|
|
44
44
|
end
|
|
45
|
-
|
|
45
|
+
|
|
46
46
|
it 'can separate unclassified reads from usearch output using strict pairing' do
|
|
47
47
|
`./bin/lederhosen separate_unclassified --strict=genus --uc-file=spec/data/test.uc --reads=spec/data/trimmed/ILT_L_9_B_001.fasta --output=#{$test_dir}/unclassified.strict_genus.fasta`
|
|
48
48
|
$?.success?.should be_true
|
|
@@ -52,42 +52,22 @@ describe Lederhosen::CLI do
|
|
|
52
52
|
end
|
|
53
53
|
|
|
54
54
|
it 'can create taxonomy count tables' do
|
|
55
|
-
`./bin/lederhosen count_taxonomies --input=spec/data/test.uc --output=#{$test_dir}/taxonomy_count.
|
|
55
|
+
`./bin/lederhosen count_taxonomies --input=spec/data/test.uc --output=#{$test_dir}/taxonomy_count.tax`
|
|
56
56
|
$?.success?.should be_true
|
|
57
|
-
File.exists?(File.join($test_dir, 'taxonomy_count.
|
|
57
|
+
File.exists?(File.join($test_dir, 'taxonomy_count.tax')).should be_true
|
|
58
58
|
end
|
|
59
59
|
|
|
60
60
|
it 'generates taxonomy tables w/ comma-free taxonomic descriptions' do
|
|
61
|
-
File.readlines(File.join($test_dir, 'taxonomy_count.
|
|
61
|
+
File.readlines(File.join($test_dir, 'taxonomy_count.tax'))\
|
|
62
62
|
.map(&:strip)\
|
|
63
63
|
.map { |x| x.count(',') }\
|
|
64
64
|
.uniq\
|
|
65
65
|
.should == [1]
|
|
66
66
|
end
|
|
67
67
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
$?.success?.should be_true
|
|
72
|
-
|
|
73
|
-
lines = File.readlines(File.join($test_dir, "taxonomy_count.strict.#{level}.txt"))
|
|
74
|
-
|
|
75
|
-
# make sure total number of reads is even
|
|
76
|
-
# requires that there should be an odd number if classification is not strict
|
|
77
|
-
lines.select { |x| !(x =~ /^#/) }\
|
|
78
|
-
.map(&:strip)\
|
|
79
|
-
.map { |x| x.split(',') }\
|
|
80
|
-
.map(&:last)\
|
|
81
|
-
.map(&:to_i)\
|
|
82
|
-
.inject(:+).should be_even
|
|
83
|
-
end
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
%w{domain phylum class order family genus species}.each do |level|
|
|
87
|
-
it "should create OTU abundance matrices from taxonomy count tables at level: #{level}" do
|
|
88
|
-
`./bin/lederhosen otu_table --files=#{$test_dir}/taxonomy_count.strict.*.txt --level=#{level} --output=#{$test_dir}/otus_genus.strict.csv`
|
|
89
|
-
$?.success?.should be_true
|
|
90
|
-
end
|
|
68
|
+
it 'can create OTU abundance matrices' do
|
|
69
|
+
`./bin/lederhosen otu_table --files=#{$test_dir}/taxonomy_count.tax --output=#{$test_dir}/otus.genus.csv --level=genus`
|
|
70
|
+
$?.success?.should be_true
|
|
91
71
|
end
|
|
92
72
|
|
|
93
73
|
it 'should filter OTU abundance matrices' do
|
|
@@ -95,7 +75,7 @@ describe Lederhosen::CLI do
|
|
|
95
75
|
# filtering should move filtered reads to 'unclassified_reads' so that we maintain
|
|
96
76
|
# our knowledge of depth of coverage throughout
|
|
97
77
|
# this makes normalization better later.
|
|
98
|
-
`./bin/lederhosen otu_filter --input=#{$test_dir}/
|
|
78
|
+
`./bin/lederhosen otu_filter --input=#{$test_dir}/otus.genus.csv --output=#{$test_dir}/otus_genus.filtered.csv --reads 1 --samples 1`
|
|
99
79
|
$?.success?.should be_true
|
|
100
80
|
end
|
|
101
81
|
|
data/spec/no_tasks_spec.rb
CHANGED
|
@@ -3,6 +3,7 @@ require 'spec_helper'
|
|
|
3
3
|
describe 'no_tasks' do
|
|
4
4
|
|
|
5
5
|
let(:greengenes_taxonomies) { ['124 U55236.1 Methanobrevibacter thaueri str. CW k__domain; p__phylum; c__class; o__order; f__family; g__genus; species; otu_127']}
|
|
6
|
+
let(:greengenes135_taxonomies) { ['k__domain; p__phylum; c__class; o__order; f__family; g__genus; s__species']}
|
|
6
7
|
let(:qiime_taxonomies) { [ 'k__domain;p__phylum;c__class;o__order;f__family;g__genus;s__species' ]}
|
|
7
8
|
let(:taxcollector_taxonomies) { ['[0]domain;[1]phylum;[2]class;[3]order;[4]family;[5]genus;[6]species;[7]strain;[8]Genus_species_strain_id'] }
|
|
8
9
|
let(:lederhosen) { Lederhosen::CLI.new }
|
|
@@ -15,6 +16,12 @@ describe 'no_tasks' do
|
|
|
15
16
|
end
|
|
16
17
|
end
|
|
17
18
|
|
|
19
|
+
it '#detect_taxonomy_format should recognize GreenGenes v13.5' do
|
|
20
|
+
greengenes135_taxonomies.each do |greengenes_taxonomy|
|
|
21
|
+
lederhosen.detect_taxonomy_format(greengenes_taxonomy).should == :greengenes_135
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
18
25
|
it '#detect_taxonomy_format should recognize TaxCollector' do
|
|
19
26
|
taxcollector_taxonomies.each do |taxcollector_taxonomy|
|
|
20
27
|
lederhosen.detect_taxonomy_format(taxcollector_taxonomy).should == :taxcollector
|
|
@@ -33,28 +40,34 @@ describe 'no_tasks' do
|
|
|
33
40
|
taxonomy[level].should == level
|
|
34
41
|
end
|
|
35
42
|
end
|
|
36
|
-
|
|
43
|
+
|
|
37
44
|
it "#parse_taxonomy_greengenes should parse greengenes taxonomy (#{level})" do
|
|
38
45
|
greengenes_taxonomies.each do |greengenes_taxonomy|
|
|
39
46
|
taxonomy = lederhosen.parse_taxonomy_greengenes(greengenes_taxonomy)
|
|
40
47
|
taxonomy[level].should == level
|
|
41
48
|
end
|
|
42
49
|
end
|
|
43
|
-
|
|
50
|
+
|
|
51
|
+
it "#parse_taxonomy_greengenes_135 should parse greengenes v13.5 taxonomy (#{level})" do
|
|
52
|
+
greengenes135_taxonomies.each do |greengenes_taxonomy|
|
|
53
|
+
taxonomy = lederhosen.parse_taxonomy_greengenes_135(greengenes_taxonomy)
|
|
54
|
+
taxonomy[level].should == level
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
|
|
44
58
|
it "#parse_taxonomy_greengenes should parse qiime taxonomy (#{level})" do
|
|
45
59
|
qiime_taxonomies.each do |qiime_taxonomy|
|
|
46
60
|
taxonomy = lederhosen.parse_taxonomy_qiime(qiime_taxonomy)
|
|
47
61
|
taxonomy[level].should == level
|
|
48
62
|
end
|
|
49
63
|
end
|
|
50
|
-
|
|
64
|
+
|
|
51
65
|
end
|
|
52
|
-
|
|
66
|
+
|
|
53
67
|
it '#parse_taxonomy_taxcollector should return original taxonomy' do
|
|
54
68
|
lederhosen.parse_taxonomy_taxcollector(taxcollector_taxonomies[0])['original'].should == taxcollector_taxonomies[0]
|
|
55
69
|
end
|
|
56
70
|
|
|
57
|
-
|
|
58
71
|
it '#parse_taxonomy should automatically detect and parse greengenes taxonomy' do
|
|
59
72
|
greengenes_taxonomies.each do |greengenes_taxonomy|
|
|
60
73
|
lederhosen.parse_taxonomy(greengenes_taxonomy).should_not be_nil
|
|
@@ -67,6 +80,12 @@ describe 'no_tasks' do
|
|
|
67
80
|
end
|
|
68
81
|
end
|
|
69
82
|
|
|
83
|
+
it '#parse_taxonomy should automatically detect and parse greengenes 13.5 taxonomy' do
|
|
84
|
+
greengenes135_taxonomies.each do |greengenes_taxonomy|
|
|
85
|
+
lederhosen.parse_taxonomy(greengenes_taxonomy).should_not be_nil
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
|
|
70
89
|
it '#parse_taxonomy_taxcollector should replace unclassified species names with strain name' do
|
|
71
90
|
t = '[0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Actinomycetales;[4]test;[5]null;[6]Propionibacterineae_bacterium;[7]Propionibacterineae_bacterium_870BRRJ;[8]Propionibacterineae_bacterium_870BRRJ|genus'
|
|
72
91
|
tax = lederhosen.parse_taxonomy(t)
|
data/spec/spec_helper.rb
CHANGED
|
@@ -7,3 +7,12 @@ Bundler.require :test, :development
|
|
|
7
7
|
$test_dir = ENV['TEST_DIR'] || "/tmp/lederhosen_test_#{(0...8).map{65.+(rand(25)).chr}.join}/"
|
|
8
8
|
`mkdir -p #{$test_dir}`
|
|
9
9
|
$stderr.puts "test dir: #{$test_dir}"
|
|
10
|
+
|
|
11
|
+
RSpec.configure do |c|
|
|
12
|
+
# check if usearch is in $PATH
|
|
13
|
+
# if not, skip usearch tests.
|
|
14
|
+
usearch = `which usearch`
|
|
15
|
+
if usearch == ''
|
|
16
|
+
c.filter_run_excluding :requires_usearch => true
|
|
17
|
+
end
|
|
18
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lederhosen
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 3.1.0
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2013-03
|
|
12
|
+
date: 2013-07-03 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: dna
|
|
@@ -100,6 +100,7 @@ extra_rdoc_files:
|
|
|
100
100
|
- LICENSE.txt
|
|
101
101
|
files:
|
|
102
102
|
- .rspec
|
|
103
|
+
- .travis.yml
|
|
103
104
|
- Gemfile
|
|
104
105
|
- LICENSE.txt
|
|
105
106
|
- Rakefile
|
|
@@ -122,6 +123,7 @@ files:
|
|
|
122
123
|
- lib/lederhosen/version.rb
|
|
123
124
|
- logo.png
|
|
124
125
|
- readme.md
|
|
126
|
+
- scripts/count_taxonomies.go
|
|
125
127
|
- scripts/illumina_pipeline/.gitignore
|
|
126
128
|
- scripts/illumina_pipeline/Makefile
|
|
127
129
|
- scripts/illumina_pipeline/pipeline.sh
|
|
@@ -149,7 +151,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
149
151
|
version: '0'
|
|
150
152
|
segments:
|
|
151
153
|
- 0
|
|
152
|
-
hash: -
|
|
154
|
+
hash: -391146498945924903
|
|
153
155
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
156
|
none: false
|
|
155
157
|
requirements:
|
|
@@ -158,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
158
160
|
version: '0'
|
|
159
161
|
requirements: []
|
|
160
162
|
rubyforge_project:
|
|
161
|
-
rubygems_version: 1.8.
|
|
163
|
+
rubygems_version: 1.8.25
|
|
162
164
|
signing_key:
|
|
163
165
|
specification_version: 3
|
|
164
166
|
summary: OTU Clustering
|