lederhosen 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/{pipeline.sh → examples/pipeline.sh} +10 -10
- data/lib/lederhosen/tasks/add_names.rb +86 -0
- data/lib/lederhosen/tasks/cluster.rb +1 -1
- data/lib/lederhosen/tasks/hierarchical.rb +40 -0
- data/lib/lederhosen/tasks/join.rb +1 -1
- data/lib/lederhosen/tasks/k_filter.rb +1 -1
- data/lib/lederhosen/tasks/name.rb +2 -2
- data/lib/lederhosen/tasks/otu_table.rb +1 -1
- data/lib/lederhosen/tasks/rep_reads.rb +1 -1
- data/lib/lederhosen/tasks/sort.rb +1 -1
- data/lib/lederhosen/tasks/split.rb +3 -3
- data/lib/lederhosen/tasks/squish.rb +48 -0
- data/lib/lederhosen/tasks/trim.rb +1 -1
- data/lib/lederhosen/tasks/uc_filter.rb +1 -1
- data/lib/version.rb +1 -1
- data/readme.md +83 -7
- metadata +8 -5
data/{pipeline.sh → examples/pipeline.sh} CHANGED

@@ -1,4 +1,4 @@
-#!/
+#!/bash
 
 # An example OTU clustering pipeline
 # Austin G. Davis-Richardson

@@ -14,55 +14,55 @@ min_reads=50
 min_samples=10
 
 # trim reads
-
+lederhosen trim \
 --reads-dir=$raw_reads \
 --out-dir=$out_dir/trimmed
 
 # join reads
-
+lederhosen join \
 --trimmed=$out_dir/trimmed/*.fasta \
 --output=$out_dir/joined.fasta
 
 # filter reads
-
+lederhosen k_filter \
 --input=$out_dir/joined.fasta \
 --output=$out_dir/filtered.fasta \
 -k=10 \
 --cutoff=50
 
 # sort
-
+lederhosen sort \
 --input=$out_dir/filtered.fasta \
 --output=$out_dir/sorted.fasta
 
 for i in 0.80 0.90 0.95
 do
 # cluster
-
+lederhosen cluster \
 --input=$out_dir/sorted.fasta \
 --output=$out_dir/clusters_"$i".uc \
 --identity=$i
 
 # filter uc file
-
+lederhosen uc_filter \
 --input=$out_dir/clusters_"$i".uc \
 --output=$out_dir/clusters_"$i".uc.filtered \
 --reads=$min_reads \
 --samples=$min_samples \
 
 # generate otu table
-
+lederhosen otu_table \
 --clusters=$out_dir/clusters_"$i".uc.filtered \
 --output=$out_dir/otus_"$i"
 
 # get representative reads
-
+lederhosen rep_reads \
 --clusters=$out_dir/clusters_"$i".uc.filtered \
 --joined=$out_dir/sorted.fasta \
 --output=$out_dir/representatives_"$i".fasta
 
 # blast representative reads
-
+lederhosen name \
 --reps=$out_dir/representatives_"$i".fasta \
 --output=$out_dir/taxonomies_"$i".txt \
 --database=$taxcollector
data/lib/lederhosen/tasks/add_names.rb ADDED

@@ -0,0 +1,86 @@
+##
+# ADD TAXONOMIC DESCRIPTIONS TO OTU TABLE
+#
+
+module Lederhosen
+  class CLI
+
+    desc "add_names",
+         "--blat=blat_output.txt --table=cluster_table.csv --level=taxonomic level (i.e 6 genus)"
+
+    method_option :blat, :type => :string, :required => true
+    method_option :table, :type => :string, :required => true
+    method_option :level, :type => :string, :required => true
+    method_option :output, :type => :string, :required => false
+
+    def add_names
+      blat = options[:blat]
+      table = options[:table]
+      level = options[:level]
+      output = options[:output] || $stdout
+
+      levels = { 'kingdom' => 0,
+                 'domain' => 0,
+                 'phylum' => 1,
+                 'class' => 2,
+                 'order' => 3,
+                 'family' => 4,
+                 'genus' => 5,
+                 'species' => 6 }
+
+      fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level
+
+      # Corresponds with the numbers used in the TaxCollector database
+      # taxonomic descriptions
+      level_no = levels[level]
+
+      # map cluster_id to taxonomic description
+      # default is the cluster_id itself in case
+      # the cluster was not classified.
+      clusterid_to_name = Hash.new { |h, k| h[k] = k }
+
+      # map clusterid to name using blat output
+      ohai "loading BLAT output from #{blat}"
+      File.open(blat) do |handle|
+        handle.each do |line|
+          line = line.strip.split
+
+          # Only get first match
+          # TODO something smarter here
+          cluster_id = line[0].split(':')[3]
+          next if clusterid_to_name.include? cluster_id
+
+          taxonomic_description = line[1]
+
+          # match by level_no
+          # Example:
+          # [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
+          # I want to match Actinobacteria given level_no = 2
+          level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1] rescue next
+
+          clusterid_to_name[cluster_id] = level_name
+        end
+      end
+
+      # load table, replace cluster names with taxonomic descriptions
+      output = File.open(output, 'w') unless output == $stdout
+      ohai "replacing names in #{table}"
+      File.open(table) do |handle|
+
+        # read in header, replace clusterids to names
+        header = handle.gets.strip.split(',')
+        header[1..-1] = header[1..-1].map { |x| clusterid_to_name[x] }
+
+        # print new header
+        output.puts header.join(',')
+
+        # print rest of table
+        handle.each { |l| output.print l }
+      end
+
+      # print status message
+      ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
+    end
+
+  end
+end
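The lookup above hinges on two pieces: the level regex applied to a TaxCollector header, and the default-to-key hash that leaves unclassified clusters named after themselves. A minimal, self-contained Ruby sketch of just those two pieces, using illustrative values (the header string comes from the comment in the task; the cluster IDs are made up):

    # Unclassified clusters fall back to their own ID.
    clusterid_to_name = Hash.new { |h, k| h[k] = k }

    # A TaxCollector description, as in the comment above.
    taxonomic_description = '[0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;' \
                            '[3]Acidimicrobiales;[4]Acidimicrobiaceae;' \
                            '[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;'

    level_no = 2 # 'class'

    # Grab the name between "[2]" and the next ";" or "[".
    level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1]
    puts level_name                      # => "Actinobacteria"

    clusterid_to_name['cluster-1'] = level_name
    puts clusterid_to_name['cluster-1'] # => "Actinobacteria"
    puts clusterid_to_name['cluster-2'] # => "cluster-2" (never classified)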
data/lib/lederhosen/tasks/hierarchical.rb ADDED

@@ -0,0 +1,40 @@
+##
+# HIERARCHICAL CLUSTERING FTW
+#
+
+module Lederhosen
+  class CLI
+
+    desc "h_cluster",
+         "--input=sorted.fasta --identity=0.80 --output=clusters.uc --identities=0.80 0.90 0.95"
+
+    method_option :input, :type => :string, :required => true
+    method_option :out_dir, :type => :string, :required => true
+    method_option :identities, :type => :array, :required => true
+
+    def h_cluster
+      out_dir = options[:out_dir]
+      input = options[:input]
+      identities = options[:identities].map(&:to_f).sort
+
+      `mkdir -p #{out_dir}`
+
+      # initial clustering
+      i = identities.shift
+      clusters = File.join(out_dir, "clusters_#{i}.uc")
+      clusters_filtered = File.join(out_dir, "clusters_#{i}.uc.filtered")
+
+      # cluster
+      invoke :cluster, [], { :input => input, :output => clusters, :identity => i }
+
+      # filter
+      invoke :uc_filter, [], { :input => clusters, :output => clusters_filtered }
+
+      # get reads for each cluster
+      invoke :split, [], { :clusters => clusters_filtered, :reads => input }
+
+      [t1, t2, t3].map(&:call)
+    end
+
+  end
+end
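The h_cluster task chains other tasks with what appears to be Thor's command DSL (`desc`, `method_option`, `invoke`); note that the released `[t1, t2, t3].map(&:call)` line refers to names that never appear in the file. For readers unfamiliar with the pattern, here is a hypothetical, minimal Thor CLI (not lederhosen's code; it assumes the thor gem) showing the `invoke(task, args, options)` call used above:

    require 'thor'

    class Demo < Thor
      desc 'cluster', 'pretend to cluster reads'
      method_option :identity, :type => :numeric, :default => 0.8
      def cluster
        puts "clustering at identity #{options[:identity]}"
      end

      desc 'uc_filter', 'pretend to filter a .uc file'
      method_option :input, :type => :string, :default => 'clusters.uc'
      def uc_filter
        puts "filtering #{options[:input]}"
      end

      desc 'pipeline', 'chain the tasks, as h_cluster chains cluster, uc_filter and split'
      def pipeline
        # Thor runs each invoked task at most once per command invocation,
        # so this is a linear chain rather than something to call in a loop.
        invoke :cluster, [], { :identity => 0.95 }
        invoke :uc_filter, [], { :input => 'clusters_0.95.uc' }
      end
    end

    Demo.start(ARGV)

Running `ruby demo.rb pipeline` would print both messages in order, assuming the thor gem is installed.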
data/lib/lederhosen/tasks/name.rb CHANGED

@@ -5,8 +5,8 @@
 module Lederhosen
   class CLI
 
-    desc "name
-         "--reps representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
+    desc "name",
+         "--reps --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
 
     method_option :reps, :type => :string, :required => true
     method_option :database, :type => :string, :required => true
data/lib/lederhosen/tasks/rep_reads.rb CHANGED

@@ -5,7 +5,7 @@
 module Lederhosen
   class CLI
 
-    desc "rep_reads
+    desc "rep_reads",
         "--clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta"
 
     method_option :clusters, :type => :string, :required => true
data/lib/lederhosen/tasks/split.rb CHANGED

@@ -5,14 +5,14 @@
 module Lederhosen
   class CLI
 
-    desc "
-         "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=
+    desc "split",
+         "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=1 --out-dir=output_directory"
 
     method_option :clusters, :type => :string, :required => true
     method_option :reads, :type => :string, :required => true
     method_option :out_dir, :type => :string, :required => true
     method_option :buffer_size, :type => :numeric, :default => 1000
-    method_option :min_clst_size, :type => :numeric, :default =>
+    method_option :min_clst_size, :type => :numeric, :default => 1
 
     def split
       clusters = options[:clusters]
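The name, rep_reads, and split hunks above are variants of the same small fix: the command name and its usage string now reach `desc` as two separate arguments. A tiny, hypothetical fragment (Thor gem assumed; names are illustrative) showing the two-argument form that 0.1.3 settles on:

    require 'thor'

    class Example < Thor
      # Two arguments: the command name/usage, then its description string.
      desc 'split', '--clusters=clusters.uc --reads=joined.fasta --min-clst-size=1'
      method_option :min_clst_size, :type => :numeric, :default => 1
      def split
        puts "min cluster size: #{options[:min_clst_size]}"
      end
    end

    Example.start(ARGV)

The comma is what keeps the two strings as separate arguments to `desc`, so the command gets both a name and a description.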
data/lib/lederhosen/tasks/squish.rb ADDED

@@ -0,0 +1,48 @@
+##
+# SQUISH A CSV FILE BY COLUMN NAME
+#
+
+module Lederhosen
+  class CLI
+
+    desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'
+
+    method_option :csv_file, :type => :string, :required => true
+    method_option :output, :type => :string, :required => false
+
+    def squish
+      csv_file = options[:csv_file]
+      output = options[:output] || $stdout
+
+      # sample_name -> column name -> total number of reads
+      total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
+      column_names = '' # scope
+      # Load CSV file, merge counts in columns with the same name
+      File.open(csv_file) do |handle|
+        column_names = handle.gets.strip.split(',')[1..-1]
+        handle.each do |line|
+          line = line.strip.split(',')
+          sample = line[0]
+          line[1..-1].zip(column_names) do |reads, column_name|
+            total_by_sample_by_column[sample][column_name] += reads.to_i
+          end
+        end
+      end
+
+      output = File.open(output) rescue $stdout
+
+      # print the new, squished csv file
+      column_names.uniq!.sort!
+      puts "-,#{column_names.join(',')}"
+      total_by_sample_by_column.each_pair do |sample_id, row|
+        print "#{sample_id}"
+        column_names.each do |column_name|
+          print ",#{row[column_name]}"
+        end
+        print "\n"
+      end
+
+      output.close
+    end
+  end
+end
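The core of squish is the nested counting hash plus `zip`, which folds together columns that share a name, for example two clusters that add_names both renamed to the same genus. A standalone Ruby sketch of that merging step, on made-up data:

    # Illustrative data only: an OTU table whose header has duplicate column names.
    header = %w[Acidimicrobium Acidimicrobium Clostridium]
    rows   = { 'sample_A' => [3, 4, 10],
               'sample_B' => [0, 2, 5] }

    # column name -> 0 by default, so duplicate columns simply accumulate.
    totals = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = 0 } }

    rows.each do |sample, counts|
      counts.zip(header) do |reads, column_name|
        totals[sample][column_name] += reads.to_i
      end
    end

    p totals
    # => {"sample_A"=>{"Acidimicrobium"=>7, "Clostridium"=>10},
    #     "sample_B"=>{"Acidimicrobium"=>2, "Clostridium"=>5}}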
data/lib/version.rb CHANGED
data/readme.md CHANGED

@@ -12,12 +12,6 @@ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own ri
 `sudo gem install lederhosen`
 4. Check installation by typing `lederhosen`. You should see some help text.
 
-## How do I use Lederhosen?
-
-Type `lederhosen help` for complete instructions
-
-See pipeline.sh for example usage.
-
 ## Features
 
 - Sequence trimming (paired-end Illumina).

@@ -27,4 +21,86 @@ See pipeline.sh for example usage.
 - Separation of representative reads.
 - Separation of all reads belonging to each cluster.
 - Identification of clusters using TaxCollector.
-- Generation of OTU abundancy matrices.
+- Generation of OTU abundancy matrices.
+
+## How do I use Lederhosen?
+
+Lederhosen is just a convenient wrapper for UCLUST and BLAT with some scripts for quality filtering, de-noising of data as well as creation of nice tables. It is similar to QIIME but meant for paired-end Illumina data rather than single-end 454. The basic lederhosen pipeline consists of: trimming, joining, sorting, filtering, clustering, more filtering, and output generation (OTU tables, representative reads, reads by cluster, and taxonomic descriptions for clusters). See the example pipeline in `pipeline.sh`.
+
+## Tasks
+
+Lederhosen is invoked by typing `lederhosen [TASK]`
+
+### trim
+
+Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
+
+    lederhosen trim --reads_dir=reads/* --out_dir=trimmed/
+
+### join
+
+Join paired reads from all samples end-to-end. This method enables the use of uclust with paired-end data. Output will be a single fasta file.
+
+    lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
+
+### sort
+
+Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithim.
+
+    lederhosen sort --input=joined.fasta --output=sorted.fasta
+
+### k_filter
+
+K-mer abundance noise filtering. This step is experimental and optional. It may reduce the time it takes to perform the clustering.
+
+    lederhosen k_filter --input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50
+
+### cluster
+
+Cluster reads using UCLUST. Output is a uc file.
+
+    lederhosen cluster --input=sorted.fasta --identity=0.80 --output=clusters.uc
+
+### uc_filter
+
+Filter UC file removing singleton clusters or clusters that are only present in a few samples. This greatly reduces the noise of the data without removing many of the reads.
+
+    lederhosen uc_filter --input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10
+
+### otu_table
+
+Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
+
+    lederhosen otu_table --clusters=clusters.uc --output=otu_prefix.csv
+
+### rep_reads
+
+Get representative reads for each cluster. Output is a single fasta file.
+
+    lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta
+
+### split
+
+Get all reads belonging to each cluster. Output is a directory containing a fasta file for each cluster. The fasta file contains the joined reads.
+
+    lederhosen split --clusters=clusters.uc --reads=joined.fasta --min-clst-size=100
+
+### name
+
+Identify clusters in a database using the representative reads. This is a simple wrapper for BLAT. The output is a tab-delimited file similar to a BLAST output file. For this step you need to have BLAT installed and also a [TaxCollector](http://github.com/audy/taxcollector) database.
+
+    lederhosen name --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt
+
+### add_names
+
+Add phylogenetic classification of clusters to OTU abundance file.
+
+    lederhosen add_names --blat=blat_output.txt --level=taxonomic_level --table=otu_file.csv --output=named_out_file.csv
+
+Where `taxonomic_level` can be: kingdom, domain, phylum, class, order, family, genus or species. This method only works with a TaxCollector database.
+
+### squish
+
+Squish an OTU abundance file by column name (phylogenetic description)
+
+    lederhosen squish --csv-file=named_out_file.csv --output=squished_named_out_file.csv
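The new readme sections end with the classification steps that 0.1.3 adds on top of the existing clustering tasks. A hedged Ruby driver sketch of that tail end of the pipeline, chaining the commands exactly as documented above (the file names are placeholders, not outputs guaranteed by the tasks, and `genus` is only one of the documented `--level` values):

    def run(cmd)
      puts cmd
      system(cmd) or abort "failed: #{cmd}"
    end

    # 1. BLAT the representative reads against a TaxCollector database.
    run 'lederhosen name --reps=representative_reads.fasta --database=taxcollector.fa --output=blat_output.txt'

    # 2. Swap cluster IDs in the OTU table for genus names (new in 0.1.3).
    run 'lederhosen add_names --blat=blat_output.txt --level=genus --table=otu_file.csv --output=named_otus.csv'

    # 3. Merge columns that ended up with the same genus name (new in 0.1.3).
    run 'lederhosen squish --csv-file=named_otus.csv --output=squished_otus.csv'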
metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: lederhosen
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 29
   prerelease:
   segments:
   - 0
   - 1
-  -
-  version: 0.1.
+  - 3
+  version: 0.1.3
 platform: ruby
 authors:
 - Austin G. Davis-Richardson

@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2012-
+date: 2012-07-13 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dna

@@ -129,12 +129,15 @@ files:
 - .rvmrc
 - Gemfile
 - bin/lederhosen
+- examples/pipeline.sh
 - lederhosen.gemspec
 - lib/lederhosen.rb
 - lib/lederhosen/buffer.rb
 - lib/lederhosen/cli.rb
 - lib/lederhosen/helpers.rb
+- lib/lederhosen/tasks/add_names.rb
 - lib/lederhosen/tasks/cluster.rb
+- lib/lederhosen/tasks/hierarchical.rb
 - lib/lederhosen/tasks/join.rb
 - lib/lederhosen/tasks/k_filter.rb
 - lib/lederhosen/tasks/name.rb

@@ -142,10 +145,10 @@ files:
 - lib/lederhosen/tasks/rep_reads.rb
 - lib/lederhosen/tasks/sort.rb
 - lib/lederhosen/tasks/split.rb
+- lib/lederhosen/tasks/squish.rb
 - lib/lederhosen/tasks/trim.rb
 - lib/lederhosen/tasks/uc_filter.rb
 - lib/version.rb
-- pipeline.sh
 - readme.md
 - spec/data/ILT_L_9_B_001_1.txt
 - spec/data/ILT_L_9_B_001_3.txt