lederhosen 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/{pipeline.sh → examples/pipeline.sh} +10 -10
- data/lib/lederhosen/tasks/add_names.rb +86 -0
- data/lib/lederhosen/tasks/cluster.rb +1 -1
- data/lib/lederhosen/tasks/hierarchical.rb +40 -0
- data/lib/lederhosen/tasks/join.rb +1 -1
- data/lib/lederhosen/tasks/k_filter.rb +1 -1
- data/lib/lederhosen/tasks/name.rb +2 -2
- data/lib/lederhosen/tasks/otu_table.rb +1 -1
- data/lib/lederhosen/tasks/rep_reads.rb +1 -1
- data/lib/lederhosen/tasks/sort.rb +1 -1
- data/lib/lederhosen/tasks/split.rb +3 -3
- data/lib/lederhosen/tasks/squish.rb +48 -0
- data/lib/lederhosen/tasks/trim.rb +1 -1
- data/lib/lederhosen/tasks/uc_filter.rb +1 -1
- data/lib/version.rb +1 -1
- data/readme.md +83 -7
- metadata +8 -5
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/
|
1
|
+
#!/bash
|
2
2
|
|
3
3
|
# An example OTU clustering pipeline
|
4
4
|
# Austin G. Davis-Richardson
|
@@ -14,55 +14,55 @@ min_reads=50
|
|
14
14
|
min_samples=10
|
15
15
|
|
16
16
|
# trim reads
|
17
|
-
|
17
|
+
lederhosen trim \
|
18
18
|
--reads-dir=$raw_reads \
|
19
19
|
--out-dir=$out_dir/trimmed
|
20
20
|
|
21
21
|
# join reads
|
22
|
-
|
22
|
+
lederhosen join \
|
23
23
|
--trimmed=$out_dir/trimmed/*.fasta \
|
24
24
|
--output=$out_dir/joined.fasta
|
25
25
|
|
26
26
|
# filter reads
|
27
|
-
|
27
|
+
lederhosen k_filter \
|
28
28
|
--input=$out_dir/joined.fasta \
|
29
29
|
--output=$out_dir/filtered.fasta \
|
30
30
|
-k=10 \
|
31
31
|
--cutoff=50
|
32
32
|
|
33
33
|
# sort
|
34
|
-
|
34
|
+
lederhosen sort \
|
35
35
|
--input=$out_dir/filtered.fasta \
|
36
36
|
--output=$out_dir/sorted.fasta
|
37
37
|
|
38
38
|
for i in 0.80 0.90 0.95
|
39
39
|
do
|
40
40
|
# cluster
|
41
|
-
|
41
|
+
lederhosen cluster \
|
42
42
|
--input=$out_dir/sorted.fasta \
|
43
43
|
--output=$out_dir/clusters_"$i".uc \
|
44
44
|
--identity=$i
|
45
45
|
|
46
46
|
# filter uc file
|
47
|
-
|
47
|
+
lederhosen uc_filter \
|
48
48
|
--input=$out_dir/clusters_"$i".uc \
|
49
49
|
--output=$out_dir/clusters_"$i".uc.filtered \
|
50
50
|
--reads=$min_reads \
|
51
51
|
--samples=$min_samples \
|
52
52
|
|
53
53
|
# generate otu table
|
54
|
-
|
54
|
+
lederhosen otu_table \
|
55
55
|
--clusters=$out_dir/clusters_"$i".uc.filtered \
|
56
56
|
--output=$out_dir/otus_"$i"
|
57
57
|
|
58
58
|
# get representative reads
|
59
|
-
|
59
|
+
lederhosen rep_reads \
|
60
60
|
--clusters=$out_dir/clusters_"$i".uc.filtered \
|
61
61
|
--joined=$out_dir/sorted.fasta \
|
62
62
|
--output=$out_dir/representatives_"$i".fasta
|
63
63
|
|
64
64
|
# blast representative reads
|
65
|
-
|
65
|
+
lederhosen name \
|
66
66
|
--reps=$out_dir/representatives_"$i".fasta \
|
67
67
|
--output=$out_dir/taxonomies_"$i".txt \
|
68
68
|
--database=$taxcollector
|
@@ -0,0 +1,86 @@
|
|
1
|
+
##
|
2
|
+
# ADD TAXONOMIC DESCRIPTIONS TO OTU TABLE
|
3
|
+
#
|
4
|
+
|
5
|
+
module Lederhosen
|
6
|
+
class CLI
|
7
|
+
|
8
|
+
desc "add_names",
|
9
|
+
"--blat=blat_output.txt --table=cluster_table.csv --level=taxonomic level (i.e 6 genus)"
|
10
|
+
|
11
|
+
method_option :blat, :type => :string, :required => true
|
12
|
+
method_option :table, :type => :string, :required => true
|
13
|
+
method_option :level, :type => :string, :required => true
|
14
|
+
method_option :output, :type => :string, :required => false
|
15
|
+
|
16
|
+
# Replace the cluster IDs in the header row of an OTU table with the
# taxonomic name found for each cluster in a BLAT output file.
#
# Options read (via +options+):
#   :blat   - path to whitespace-delimited BLAT output. Column 0 is a
#             ':'-separated read header whose 4th field is the cluster ID;
#             column 1 is a TaxCollector description such as
#             "[0]Bacteria;[1]Actinobacteria;...".
#   :table  - path to the comma-separated OTU table; the first row is the header.
#   :level  - one of kingdom, domain, phylum, class, order, family, genus, species.
#   :output - path of the file to write; defaults to $stdout.
#
# Clusters with no usable BLAT hit keep their cluster ID as the column name.
# All rows after the header are copied through unchanged.
#
# Raises RuntimeError for an unknown level or an empty table.
def add_names
  blat   = options[:blat]
  table  = options[:table]
  level  = options[:level]
  output = options[:output] || $stdout

  # Corresponds with the numbers used in the TaxCollector database
  # taxonomic descriptions
  levels = { 'kingdom' => 0,
             'domain'  => 0,
             'phylum'  => 1,
             'class'   => 2,
             'order'   => 3,
             'family'  => 4,
             'genus'   => 5,
             'species' => 6 }

  fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level

  level_no = levels[level]

  # Example description:
  #   [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;...
  # With level_no = 2 this captures "Actinobacteria".
  level_pattern = /\[#{level_no}\](\w*)[;\[]/

  # map cluster_id to taxonomic description; the default is the cluster_id
  # itself so unclassified clusters keep their original column name.
  clusterid_to_name = Hash.new { |h, k| h[k] = k }

  ohai "loading BLAT output from #{blat}"
  File.open(blat) do |handle|
    handle.each do |line|
      fields = line.strip.split
      next if fields.empty? # blank line

      cluster_id = fields[0].split(':')[3]

      # Only the first usable hit per cluster is kept.
      # TODO something smarter here
      next if cluster_id.nil? || clusterid_to_name.include?(cluster_id)

      # Skip hits whose description is missing or lacks the requested level
      # instead of rescuing every possible error on this line.
      match = fields[1] && fields[1].match(level_pattern)
      next unless match

      clusterid_to_name[cluster_id] = match[1]
    end
  end

  # load table, replace cluster names with taxonomic descriptions
  out = output == $stdout ? output : File.open(output, 'w')
  begin
    ohai "replacing names in #{table}"
    File.open(table) do |handle|
      header_line = handle.gets
      fail "empty table: #{table}" if header_line.nil?

      # first column is the sample-name column; the rest are cluster ids
      header = header_line.strip.split(',')
      renamed = [header.first] + (header[1..-1] || []).map { |id| clusterid_to_name[id] }
      out.puts renamed.join(',')

      # copy the rest of the table through untouched
      handle.each { |l| out.print l }
    end
  ensure
    # flush and release the file we opened; never close the caller's $stdout
    out.close unless out == $stdout
  end

  # A cluster counts as named when its entry differs from its own ID
  # (unclassified clusters map to themselves).
  named = clusterid_to_name.count { |id, name| id != name }
  ohai "Got #{named} names (#{clusterid_to_name.size} total)"
end
|
84
|
+
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
##
|
2
|
+
# HIERARCHICAL CLUSTERING FTW
|
3
|
+
#
|
4
|
+
|
5
|
+
module Lederhosen
|
6
|
+
class CLI
|
7
|
+
|
8
|
+
desc "h_cluster",
|
9
|
+
"--input=sorted.fasta --identity=0.80 --output=clusters.uc --identities=0.80 0.90 0.95"
|
10
|
+
|
11
|
+
method_option :input, :type => :string, :required => true
|
12
|
+
method_option :out_dir, :type => :string, :required => true
|
13
|
+
method_option :identities, :type => :array, :required => true
|
14
|
+
|
15
|
+
# Run the first round of a hierarchical clustering: cluster the input at
# the lowest requested identity, filter the resulting uc file, then split
# the reads by cluster.
#
# Options read (via +options+):
#   :input      - path to the reads to cluster.
#   :out_dir    - directory that receives all generated files (created if missing).
#   :identities - array of identity thresholds; values are converted with
#                 to_f and the smallest is used for this round.
#
# Raises RuntimeError when no identities are given or the output
# directory cannot be created.
#
# NOTE(review): only the lowest identity is processed here; the remaining
# values are sorted but not used yet.
def h_cluster
  out_dir    = options[:out_dir]
  input      = options[:input]
  identities = options[:identities].map(&:to_f).sort

  fail "no identities given" if identities.empty?

  # The multi-argument form of system bypasses the shell, so paths with
  # spaces or shell metacharacters are passed through verbatim.
  fail "could not create #{out_dir}" unless system('mkdir', '-p', out_dir)

  # initial clustering uses the lowest identity
  i = identities.shift
  clusters          = File.join(out_dir, "clusters_#{i}.uc")
  clusters_filtered = File.join(out_dir, "clusters_#{i}.uc.filtered")

  # cluster
  invoke :cluster, [], { :input => input, :output => clusters, :identity => i }

  # filter
  invoke :uc_filter, [], { :input => clusters, :output => clusters_filtered }

  # get reads for each cluster; the split task declares out_dir as a
  # required option, so give it a per-identity subdirectory.
  # NOTE(review): the subdirectory name is a choice made here — confirm.
  invoke :split, [], { :clusters => clusters_filtered,
                       :reads    => input,
                       :out_dir  => File.join(out_dir, "split_#{i}") }
end
|
38
|
+
|
39
|
+
end
|
40
|
+
end
|
@@ -5,8 +5,8 @@
|
|
5
5
|
module Lederhosen
|
6
6
|
class CLI
|
7
7
|
|
8
|
-
desc "name
|
9
|
-
"--reps representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
|
8
|
+
desc "name",
|
9
|
+
"--reps --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
|
10
10
|
|
11
11
|
method_option :reps, :type => :string, :required => true
|
12
12
|
method_option :database, :type => :string, :required => true
|
@@ -5,7 +5,7 @@
|
|
5
5
|
module Lederhosen
|
6
6
|
class CLI
|
7
7
|
|
8
|
-
desc "rep_reads
|
8
|
+
desc "rep_reads",
|
9
9
|
"--clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta"
|
10
10
|
|
11
11
|
method_option :clusters, :type => :string, :required => true
|
@@ -5,14 +5,14 @@
|
|
5
5
|
module Lederhosen
|
6
6
|
class CLI
|
7
7
|
|
8
|
-
desc "
|
9
|
-
"--clusters=clusters.uc --reads=joined.fasta --min-clst-size=
|
8
|
+
desc "split",
|
9
|
+
"--clusters=clusters.uc --reads=joined.fasta --min-clst-size=1 --out-dir=output_directory"
|
10
10
|
|
11
11
|
method_option :clusters, :type => :string, :required => true
|
12
12
|
method_option :reads, :type => :string, :required => true
|
13
13
|
method_option :out_dir, :type => :string, :required => true
|
14
14
|
method_option :buffer_size, :type => :numeric, :default => 1000
|
15
|
-
method_option :min_clst_size, :type => :numeric, :default =>
|
15
|
+
method_option :min_clst_size, :type => :numeric, :default => 1
|
16
16
|
|
17
17
|
def split
|
18
18
|
clusters = options[:clusters]
|
@@ -0,0 +1,48 @@
|
|
1
|
+
##
|
2
|
+
# SQUISH A CSV FILE BY COLUMN NAME
|
3
|
+
#
|
4
|
+
|
5
|
+
module Lederhosen
|
6
|
+
class CLI
|
7
|
+
|
8
|
+
desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'
|
9
|
+
|
10
|
+
method_option :csv_file, :type => :string, :required => true
|
11
|
+
method_option :output, :type => :string, :required => false
|
12
|
+
|
13
|
+
# Merge the columns of a CSV count table that share the same name by
# summing their cell values per row.
#
# Options read (via +options+):
#   :csv_file - path to a comma-separated table. The first row is the
#               header (its first cell is ignored); in every other row the
#               first cell is the sample name and the rest are read counts
#               (converted with to_i).
#   :output   - path of the file to write; defaults to $stdout.
#
# The result has a "-" corner cell, the unique column names in sorted
# order, and one row per sample in the order samples first appear.
#
# Raises RuntimeError when the CSV file is empty.
def squish
  csv_file = options[:csv_file]
  output   = options[:output] || $stdout

  # sample_name -> column name -> total number of reads
  total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new(0) }
  column_names = []

  # Load CSV file, merge counts in columns with the same name
  File.open(csv_file) do |handle|
    header = handle.gets
    fail "empty csv file: #{csv_file}" if header.nil?

    column_names = header.strip.split(',')[1..-1] || []

    handle.each do |line|
      cells = line.strip.split(',')
      next if cells.empty? # blank line

      sample = cells[0]
      cells[1..-1].zip(column_names) do |reads, column_name|
        total_by_sample_by_column[sample][column_name] += reads.to_i
      end
    end
  end

  # uniq! returns nil when nothing was removed, so use the non-bang forms.
  column_names = column_names.uniq.sort

  # Open for writing, and only when a path was actually given.
  out = output == $stdout ? output : File.open(output, 'w')
  begin
    # print the new, squished csv file
    out.puts "-,#{column_names.join(',')}"
    total_by_sample_by_column.each_pair do |sample_id, row|
      counts = column_names.map { |column_name| row[column_name] }
      out.puts(([sample_id] + counts).join(','))
    end
  ensure
    # never close the caller's $stdout
    out.close unless out == $stdout
  end
end
|
47
|
+
end
|
48
|
+
end
|
data/lib/version.rb
CHANGED
data/readme.md
CHANGED
@@ -12,12 +12,6 @@ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own ri
|
|
12
12
|
`sudo gem install lederhosen`
|
13
13
|
4. Check installation by typing `lederhosen`. You should see some help text.
|
14
14
|
|
15
|
-
## How do I use Lederhosen?
|
16
|
-
|
17
|
-
Type `lederhosen help` for complete instructions
|
18
|
-
|
19
|
-
See pipeline.sh for example usage.
|
20
|
-
|
21
15
|
## Features
|
22
16
|
|
23
17
|
- Sequence trimming (paired-end Illumina).
|
@@ -27,4 +21,86 @@ See pipeline.sh for example usage.
|
|
27
21
|
- Separation of representative reads.
|
28
22
|
- Separation of all reads belonging to each cluster.
|
29
23
|
- Identification of clusters using TaxCollector.
|
30
|
-
- Generation of OTU abundancy matrices.
|
24
|
+
- Generation of OTU abundancy matrices.
|
25
|
+
|
26
|
+
## How do I use Lederhosen?
|
27
|
+
|
28
|
+
Lederhosen is just a convenient wrapper for UCLUST and BLAT with some scripts for quality filtering, de-noising of data as well as creation of nice tables. It is similar to QIIME but meant for paired-end Illumina data rather than single-end 454. The basic lederhosen pipeline consists of: trimming, joining, sorting, filtering, clustering, more filtering, and output generation (OTU tables, representative reads, reads by cluster, and taxonomic descriptions for clusters). See the example pipeline in `examples/pipeline.sh`.
|
29
|
+
|
30
|
+
## Tasks
|
31
|
+
|
32
|
+
Lederhosen is invoked by typing `lederhosen [TASK]`
|
33
|
+
|
34
|
+
### trim
|
35
|
+
|
36
|
+
Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
|
37
|
+
|
38
|
+
lederhosen trim --reads_dir=reads/* --out_dir=trimmed/
|
39
|
+
|
40
|
+
### join
|
41
|
+
|
42
|
+
Join paired reads from all samples end-to-end. This method enables the use of uclust with paired-end data. Output will be a single fasta file.
|
43
|
+
|
44
|
+
lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
|
45
|
+
|
46
|
+
### sort
|
47
|
+
|
48
|
+
Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithm.
|
49
|
+
|
50
|
+
lederhosen sort --input=joined.fasta --output=sorted.fasta
|
51
|
+
|
52
|
+
### k_filter
|
53
|
+
|
54
|
+
K-mer abundance noise filtering. This step is experimental and optional. It may reduce the time it takes to perform the clustering.
|
55
|
+
|
56
|
+
lederhosen k_filter --input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50
|
57
|
+
|
58
|
+
### cluster
|
59
|
+
|
60
|
+
Cluster reads using UCLUST. Output is a uc file.
|
61
|
+
|
62
|
+
lederhosen cluster --input=sorted.fasta --identity=0.80 --output=clusters.uc
|
63
|
+
|
64
|
+
### uc_filter
|
65
|
+
|
66
|
+
Filter UC file removing singleton clusters or clusters that are only present in a few samples. This greatly reduces the noise of the data without removing many of the reads.
|
67
|
+
|
68
|
+
lederhosen uc_filter --input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10
|
69
|
+
|
70
|
+
### otu_table
|
71
|
+
|
72
|
+
Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
|
73
|
+
|
74
|
+
lederhosen otu_table --clusters=clusters.uc --output=otu_prefix.csv
|
75
|
+
|
76
|
+
### rep_reads
|
77
|
+
|
78
|
+
Get representative reads for each cluster. Output is a single fasta file.
|
79
|
+
|
80
|
+
lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta
|
81
|
+
|
82
|
+
### split
|
83
|
+
|
84
|
+
Get all reads belonging to each cluster. Output is a directory containing a fasta file for each cluster. The fasta file contains the joined reads.
|
85
|
+
|
86
|
+
lederhosen split --clusters=clusters.uc --reads=joined.fasta --min-clst-size=100 --out-dir=output_directory
|
87
|
+
|
88
|
+
### name
|
89
|
+
|
90
|
+
Identify clusters in a database using the representative reads. This is a simple wrapper for BLAT. The output is a tab-delimited file similar to a BLAST output file. For this step you need to have BLAT installed and also a [TaxCollector](http://github.com/audy/taxcollector) database.
|
91
|
+
|
92
|
+
lederhosen name --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt
|
93
|
+
|
94
|
+
### add_names
|
95
|
+
|
96
|
+
Add phylogenetic classification of clusters to OTU abundance file.
|
97
|
+
|
98
|
+
lederhosen add_names --blat=blat_output.txt --level=taxonomic_level --table=otu_file.csv --output=named_out_file.csv
|
99
|
+
|
100
|
+
Where `taxonomic_level` can be: kingdom, domain, phylum, class, order, family, genus or species. This method only works with a TaxCollector database.
|
101
|
+
|
102
|
+
### squish
|
103
|
+
|
104
|
+
Squish an OTU abundance file by column name (phylogenetic description)
|
105
|
+
|
106
|
+
lederhosen squish --csv-file=named_out_file.csv --output=squished_named_out_file.csv
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 29
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 3
|
10
|
+
version: 0.1.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Austin G. Davis-Richardson
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-
|
18
|
+
date: 2012-07-13 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: dna
|
@@ -129,12 +129,15 @@ files:
|
|
129
129
|
- .rvmrc
|
130
130
|
- Gemfile
|
131
131
|
- bin/lederhosen
|
132
|
+
- examples/pipeline.sh
|
132
133
|
- lederhosen.gemspec
|
133
134
|
- lib/lederhosen.rb
|
134
135
|
- lib/lederhosen/buffer.rb
|
135
136
|
- lib/lederhosen/cli.rb
|
136
137
|
- lib/lederhosen/helpers.rb
|
138
|
+
- lib/lederhosen/tasks/add_names.rb
|
137
139
|
- lib/lederhosen/tasks/cluster.rb
|
140
|
+
- lib/lederhosen/tasks/hierarchical.rb
|
138
141
|
- lib/lederhosen/tasks/join.rb
|
139
142
|
- lib/lederhosen/tasks/k_filter.rb
|
140
143
|
- lib/lederhosen/tasks/name.rb
|
@@ -142,10 +145,10 @@ files:
|
|
142
145
|
- lib/lederhosen/tasks/rep_reads.rb
|
143
146
|
- lib/lederhosen/tasks/sort.rb
|
144
147
|
- lib/lederhosen/tasks/split.rb
|
148
|
+
- lib/lederhosen/tasks/squish.rb
|
145
149
|
- lib/lederhosen/tasks/trim.rb
|
146
150
|
- lib/lederhosen/tasks/uc_filter.rb
|
147
151
|
- lib/version.rb
|
148
|
-
- pipeline.sh
|
149
152
|
- readme.md
|
150
153
|
- spec/data/ILT_L_9_B_001_1.txt
|
151
154
|
- spec/data/ILT_L_9_B_001_3.txt
|