lederhosen 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -1
- data/bin/lederhosen +1 -1
- data/examples/hierarchical_clustering.sh +51 -0
- data/examples/pipeline.sh +1 -1
- data/lederhosen.gemspec +1 -1
- data/lib/lederhosen/buffer.rb +2 -2
- data/lib/lederhosen/helpers.rb +10 -5
- data/lib/lederhosen/tasks/add_names.rb +80 -80
- data/lib/lederhosen/tasks/cluster.rb +1 -1
- data/lib/lederhosen/tasks/name.rb +2 -2
- data/lib/lederhosen/tasks/otu_filter.rb +43 -43
- data/lib/lederhosen/tasks/otu_table.rb +13 -13
- data/lib/lederhosen/tasks/rep_reads.rb +2 -2
- data/lib/lederhosen/tasks/squish.rb +42 -42
- data/lib/lederhosen.rb +2 -1
- data/lib/version.rb +1 -1
- data/readme.md +7 -3
- data/spec/data/ILT_L_9_B_001_1.txt.gz +0 -0
- data/spec/data/ILT_L_9_B_001_3.txt.gz +0 -0
- data/spec/data/ILT_L_9_B_002_1.txt.gz +0 -0
- data/spec/data/ILT_L_9_B_002_3.txt.gz +0 -0
- data/spec/helpers_spec.rb +7 -7
- data/spec/misc_spec.rb +1 -1
- data/spec/pipeline_spec.rb +18 -22
- data/spec/spec_helper.rb +5 -1
- metadata +13 -16
- data/spec/data/ILT_L_9_B_001_1.txt +0 -400
- data/spec/data/ILT_L_9_B_001_3.txt +0 -400
- data/spec/data/ILT_L_9_B_002_1.txt +0 -400
- data/spec/data/ILT_L_9_B_002_3.txt +0 -400
- data/spec/data/blast_out.txt +0 -10
- data/spec/data/blat.txt +0 -86
data/Gemfile
CHANGED
data/bin/lederhosen
CHANGED
#!/bin/bash

# Hierarchical OTU clustering
# Austin G. Davis-Richardson
# <harekrishna at gmail dot com>
# http://github.com/audy/lederhosen

set -e
set -x

reads='sorted.fasta'
out='h_clustering'

mkdir -p $out

# initial clustering at 80%
lederhosen cluster --input=$reads --output=$out/clusters_0.80.uc --identity=0.80

# filter UC file
lederhosen uc_filter --input=$out/clusters_0.80.uc --output=$out/clusters_0.80.uc.filtered --reads=1 --samples=1

# get reads for each cluster
mkdir -p $out/split_80
lederhosen split --clusters=$out/clusters_0.80.uc.filtered --reads=$reads --out-dir=$out/split_80/

# now cluster each of those at 90%
for fasta in $out/split_80/*.fasta
do

    # sort (awww, do I really have to do this again?)
    lederhosen sort --input=$fasta --output=$fasta.sorted

    # cluster
    lederhosen cluster --input=$fasta.sorted --output=$fasta.uc --identity=0.90

    # split
    split=$out/split_80.90_$(basename $fasta .fasta)
    lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split
done

# Do it again at 95%
# fixed: glob now matches the split_80.90_* directories created above
# (the old pattern split_80/split_*_90.fasta/*.fasta matched nothing)
for fasta in $out/split_80.90_*/*.fasta
do
    # sort (uclust requires length-sorted input before clustering)
    lederhosen sort --input=$fasta --output=$fasta.sorted

    # cluster
    # fixed: identity was "90" (wrong scale) — identities are fractions,
    # and this pass is the 95% level per the comment above
    lederhosen cluster --input=$fasta.sorted --output=$fasta.uc --identity=0.95

    # split
    # fixed: used undefined $outdir and $input; use $out and $fasta
    split=$out/split_80.90.95_$(basename $fasta .fasta)
    mkdir -p $split
    lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split
done
data/examples/pipeline.sh
CHANGED
data/lederhosen.gemspec
CHANGED
data/lib/lederhosen/buffer.rb
CHANGED
data/lib/lederhosen/helpers.rb
CHANGED
@@ -17,8 +17,13 @@ module Lederhosen
|
|
17
17
|
cutoff = args[:cutoff] || 20
|
18
18
|
min_length = args[:min_length] || 70
|
19
19
|
|
20
|
-
left_handle
|
21
|
-
|
20
|
+
left_handle, right_handle =
|
21
|
+
begin
|
22
|
+
[ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
|
23
|
+
rescue Zlib::GzipFile::Error
|
24
|
+
[ File.open(left), File.open(right) ]
|
25
|
+
end
|
26
|
+
|
22
27
|
out_handle = File.open out, 'w'
|
23
28
|
|
24
29
|
left_reads = Dna.new left_handle
|
@@ -57,9 +62,9 @@ module Lederhosen
|
|
57
62
|
min = args[:min] || 20
|
58
63
|
offset = args[:cutoff] || 64
|
59
64
|
|
60
|
-
|
61
|
-
|
62
|
-
|
65
|
+
_sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
|
66
|
+
|
67
|
+
dna.quality.each_byte.each_with_index do |b, a|
|
63
68
|
_sum += (b - offset - min)
|
64
69
|
if _sum > _max
|
65
70
|
_max = _sum
|
module Lederhosen

  class CLI

    desc "add_names",
         "add names to otu abundance matrix using blat output"

    method_option :blat,   :type => :string, :required => true
    method_option :table,  :type => :string, :required => true
    method_option :level,  :type => :string, :required => true
    method_option :output, :type => :string, :required => false

    # Replace cluster IDs in the header of an OTU abundance table with
    # taxonomic names at the requested level, looked up from BLAT output.
    # Writes to --output if given, otherwise to stdout.
    def add_names
      blat   = options[:blat]
      table  = options[:table]
      level  = options[:level]
      output = options[:output] || $stdout

      # Corresponds with the numbers used in the TaxCollector database
      # taxonomic descriptions.
      levels = { 'kingdom' => 0,
                 'domain'  => 0,
                 'phylum'  => 1,
                 'class'   => 2,
                 'order'   => 3,
                 'family'  => 4,
                 'genus'   => 5,
                 'species' => 6 }

      fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level

      level_no = levels[level]

      # map cluster_id to taxonomic description
      # default is the cluster_id itself in case
      # the cluster was not classified.
      clusterid_to_name = Hash.new { |h, k| h[k] = k }

      # map clusterid to name using blat output
      ohai "loading BLAT output from #{blat}"
      File.open(blat) do |handle|
        handle.each do |line|
          line = line.strip.split

          # Only get first match
          # TODO something smarter here
          cluster_id = line[0].split(':')[3]
          next if clusterid_to_name.include? cluster_id

          taxonomic_description = line[1]

          # match by level_no
          # Example:
          # [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
          # I want to match Actinobacteria given level_no = 2
          level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1] rescue next

          clusterid_to_name[cluster_id] = level_name
        end
      end

      # load table, replace cluster names with taxonomic descriptions
      output = File.open(output, 'w') unless output == $stdout
      ohai "replacing names in #{table}"
      File.open(table) do |handle|

        # read in header, replace clusterids to names
        header = handle.gets.strip.split(',')
        header[1..-1] = header[1..-1].map { |x| clusterid_to_name[x] }

        # print new header
        output.puts header.join(',')

        # print rest of table
        handle.each { |l| output.print l }
      end

      # fixed: close the output file when one was opened (previously the
      # handle leaked and buffered output could be lost on exit)
      output.close unless output == $stdout

      # print status message
      ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
    end

  end
end
|
module Lederhosen
  class CLI

    desc 'otu_filter', 'works like uc_filter but uses an OTU table as input'

    method_option :input,   :type => :string,  :required => true
    method_option :output,  :type => :string,  :required => true
    method_option :reads,   :type => :numeric, :required => true
    method_option :samples, :type => :numeric, :required => true

    # Filter an OTU table, keeping only clusters that have at least
    # --reads reads in more than --samples samples.
    def otu_filter
      input   = options[:input]
      output  = options[:output]
      reads   = options[:reads]
      samples = options[:samples]

      ##
      # Iterate over otu table line by line.
      # Only print if cluster meets criteria
      #
      kept = 0
      File.open(input) do |handle|
        header = handle.gets.strip
        header = header.split(',')
        # fixed: this used to reassign `samples`, clobbering the numeric
        # --samples threshold with the header's sample-name array and
        # breaking the Integer comparison below
        sample_names = header[1..-1]

        puts header.join(',')

        handle.each do |line|
          line = line.strip.split(',')
          cluster_no = line[0]
          counts = line[1..-1].collect { |x| x.to_i }

          # should be the same as uc_filter
          if counts.reject { |x| x < reads }.length > samples
            puts line.join(',')
            kept += 1
          end
        end
      end
      ohai "kept #{kept} clusters."
    end

  end
end
|
@@ -17,32 +17,32 @@ module Lederhosen
|
|
17
17
|
input = options[:clusters]
|
18
18
|
output = options[:output]
|
19
19
|
joined_reads = options[:joined]
|
20
|
-
|
20
|
+
|
21
21
|
# Load cluster table
|
22
22
|
|
23
|
-
clstr_info = Helpers.load_uc_file input
|
23
|
+
clstr_info = Helpers.load_uc_file input
|
24
24
|
clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
|
25
25
|
clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
|
26
26
|
samples = clstr_info[:samples]
|
27
27
|
|
28
28
|
# print OTU abundance matrix
|
29
|
-
|
30
|
-
|
29
|
+
# clusters as columns
|
30
|
+
# samples as rows
|
31
31
|
|
32
32
|
File.open("#{output}.csv", 'w') do |h|
|
33
33
|
samples = samples.sort
|
34
34
|
clusters = clstr_counts.keys
|
35
35
|
|
36
36
|
# print header (cluster names)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
37
|
+
h.puts '-' + SEP + clusters.map { |x| "cluster-#{x}" }.join(SEP)
|
38
|
+
|
39
|
+
samples.each do |sample|
|
40
|
+
h.print sample
|
41
|
+
clusters.each do |cluster|
|
42
|
+
h.print "#{SEP}#{clstr_counts[cluster][sample]}"
|
43
|
+
end
|
44
|
+
h.print "\n"
|
45
|
+
end
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
@@ -19,7 +19,7 @@ module Lederhosen
|
|
19
19
|
|
20
20
|
|
21
21
|
# Load cluster table!
|
22
|
-
clstr_info = Helpers.load_uc_file input
|
22
|
+
clstr_info = Helpers.load_uc_file input
|
23
23
|
clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
|
24
24
|
seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
|
25
25
|
samples = clstr_info[:samples]
|
@@ -36,7 +36,7 @@ module Lederhosen
|
|
36
36
|
end
|
37
37
|
end
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
40
|
out_handle.close
|
41
41
|
end
|
42
42
|
|
module Lederhosen
  class CLI

    desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'

    method_option :csv_file, :type => :string, :required => true
    method_option :output,   :type => :string, :required => false

    # Collapse columns of a CSV abundance table that share the same
    # column name, summing their read counts per sample. Writes to
    # --output if given, otherwise to stdout.
    def squish
      csv_file = options[:csv_file]
      output   = options[:output] || $stdout

      # sample_name -> column name -> total number of reads
      total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = 0 } }
      column_names = '' # scope
      # Load CSV file, merge counts in columns with the same name
      File.open(csv_file) do |handle|
        column_names = handle.gets.strip.split(',')[1..-1]
        handle.each do |line|
          line = line.strip.split(',')
          sample = line[0]
          line[1..-1].zip(column_names) do |reads, column_name|
            total_by_sample_by_column[sample][column_name] += reads.to_i
          end
        end
      end

      # fixed: was `File.open(output, 'w') rescue $stdout`, which silently
      # swallowed real open errors; only open when a filename was given
      output = File.open(output, 'w') unless output == $stdout

      # print the new, squished csv file
      # fixed: `column_names.uniq!.sort!` raises NoMethodError when the
      # header has no duplicate names, because uniq! returns nil then
      column_names = column_names.uniq.sort
      output.puts "-,#{column_names.join(',')}"
      total_by_sample_by_column.each_pair do |sample_id, row|
        output.print "#{sample_id}"
        column_names.each do |column_name|
          output.print ",#{row[column_name]}"
        end
        output.print "\n"
      end

      # fixed: don't close $stdout when no --output file was given
      output.close unless output == $stdout
    end
  end
end
|
data/lib/lederhosen.rb
CHANGED
@@ -4,6 +4,7 @@ require 'dna'
|
|
4
4
|
require 'set'
|
5
5
|
require 'progressbar'
|
6
6
|
require 'awesome_print'
|
7
|
+
require 'zlib'
|
7
8
|
|
8
9
|
Dir.glob(File.join(File.dirname(__FILE__), 'lederhosen', '*.rb')).each { |f| require f }
|
9
10
|
|
@@ -13,4 +14,4 @@ class String
|
|
13
14
|
k -= 1
|
14
15
|
(0..(self.length-k-1)).collect { |i| self[i..i+k] }
|
15
16
|
end
|
16
|
-
end
|
17
|
+
end
|
data/lib/version.rb
CHANGED
data/readme.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Lederhosen
|
2
2
|
|
3
|
-
Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
|
3
|
+
Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
|
4
4
|
|
5
5
|
## How do I get Lederhosen?
|
6
6
|
|
@@ -33,9 +33,9 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
|
|
33
33
|
|
34
34
|
### trim
|
35
35
|
|
36
|
-
Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
|
36
|
+
Trim (Illumina) reads using quality scores. Output will be a directory of fasta files. Reads can optionally be gzipped.
|
37
37
|
|
38
|
-
lederhosen trim --reads_dir=reads
|
38
|
+
lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
|
39
39
|
|
40
40
|
### join
|
41
41
|
|
@@ -43,6 +43,10 @@ Join paired reads from all samples end-to-end. This method enables the use of uc
|
|
43
43
|
|
44
44
|
lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
|
45
45
|
|
46
|
+
If your reads are not paired, then you do not need to do this step. Instead, concatenate all of the trimmed reads files.
|
47
|
+
|
48
|
+
cat trimmed/*.fasta > joined.fasta
|
49
|
+
|
46
50
|
### sort
|
47
51
|
|
48
52
|
Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithim.
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/spec/helpers_spec.rb
CHANGED
@@ -1,26 +1,26 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Lederhosen::Helpers do
|
4
|
-
|
5
|
-
let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt') }
|
6
|
-
|
4
|
+
|
5
|
+
let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt.gz') }
|
6
|
+
|
7
7
|
it 'should have a method for grouping QSEQ files' do
|
8
8
|
groups.length.should == 2
|
9
9
|
end
|
10
10
|
|
11
11
|
it 'should have a method for trimming sequences' do
|
12
12
|
reads = groups.values.first.first
|
13
|
-
record =
|
13
|
+
record = Zlib::GzipReader.open(reads) do |handle|
|
14
14
|
Dna.new(handle).first
|
15
15
|
end
|
16
16
|
# I should probably test with a bad read
|
17
|
-
Lederhosen::Helpers.trim(record).length.should ==
|
17
|
+
Lederhosen::Helpers.trim(record).length.should == 58
|
18
18
|
end
|
19
19
|
|
20
20
|
it 'should be able to trim pairs of qseq files, outputting fasta file' do
|
21
21
|
reads = groups.values.first
|
22
|
-
Lederhosen::Helpers.trim_pairs reads[0], reads[1],
|
22
|
+
Lederhosen::Helpers.trim_pairs reads[0], reads[1], "#{$test_dir}/munchen_trim_test.fasta"
|
23
23
|
# this test will break if trim parameters change
|
24
|
-
File.read(
|
24
|
+
File.read("#{$test_dir}/munchen_trim_test.fasta").grep(/^>/).length.should be_even
|
25
25
|
end
|
26
26
|
end
|
data/spec/misc_spec.rb
CHANGED