lederhosen 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -1
- data/bin/lederhosen +1 -1
- data/examples/hierarchical_clustering.sh +51 -0
- data/examples/pipeline.sh +1 -1
- data/lederhosen.gemspec +1 -1
- data/lib/lederhosen/buffer.rb +2 -2
- data/lib/lederhosen/helpers.rb +10 -5
- data/lib/lederhosen/tasks/add_names.rb +80 -80
- data/lib/lederhosen/tasks/cluster.rb +1 -1
- data/lib/lederhosen/tasks/name.rb +2 -2
- data/lib/lederhosen/tasks/otu_filter.rb +43 -43
- data/lib/lederhosen/tasks/otu_table.rb +13 -13
- data/lib/lederhosen/tasks/rep_reads.rb +2 -2
- data/lib/lederhosen/tasks/squish.rb +42 -42
- data/lib/lederhosen.rb +2 -1
- data/lib/version.rb +1 -1
- data/readme.md +7 -3
- data/spec/data/ILT_L_9_B_001_1.txt.gz +0 -0
- data/spec/data/ILT_L_9_B_001_3.txt.gz +0 -0
- data/spec/data/ILT_L_9_B_002_1.txt.gz +0 -0
- data/spec/data/ILT_L_9_B_002_3.txt.gz +0 -0
- data/spec/helpers_spec.rb +7 -7
- data/spec/misc_spec.rb +1 -1
- data/spec/pipeline_spec.rb +18 -22
- data/spec/spec_helper.rb +5 -1
- metadata +13 -16
- data/spec/data/ILT_L_9_B_001_1.txt +0 -400
- data/spec/data/ILT_L_9_B_001_3.txt +0 -400
- data/spec/data/ILT_L_9_B_002_1.txt +0 -400
- data/spec/data/ILT_L_9_B_002_3.txt +0 -400
- data/spec/data/blast_out.txt +0 -10
- data/spec/data/blat.txt +0 -86
data/Gemfile
CHANGED
data/bin/lederhosen
CHANGED
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
set -e
|
4
|
+
set -x
|
5
|
+
|
6
|
+
# Hierarchical OTU clustering
|
7
|
+
# Austin G. Davis-Richardson
|
8
|
+
# <harekrishna at gmail dot com>
|
9
|
+
# http://github.com/audy/lederhosen
|
10
|
+
|
11
|
+
reads='sorted.fasta'
|
12
|
+
out='h_clustering'
|
13
|
+
|
14
|
+
mkdir -p $out
|
15
|
+
|
16
|
+
# initial clustering at 80%
|
17
|
+
lederhosen cluster --input=$reads --output=$out/clusters_0.80.uc --identity=0.80
|
18
|
+
|
19
|
+
# filter UC file
|
20
|
+
lederhosen uc_filter --input=$out/clusters_0.80.uc --output=$out/clusters_0.80.uc.filtered --reads=1 --samples=1
|
21
|
+
|
22
|
+
# get reads for each cluster
|
23
|
+
mkdir -p $out/split_80
|
24
|
+
lederhosen split --clusters=$out/clusters_0.80.uc.filtered --reads=$reads --out-dir=$out/split_80/
|
25
|
+
|
26
|
+
# now cluster each of those at 90%
|
27
|
+
for fasta in $out/split_80/*.fasta
|
28
|
+
do
|
29
|
+
|
30
|
+
# sort (awww, do I really have to do this again?)
|
31
|
+
lederhosen sort --input=$fasta --output=$fasta.sorted
|
32
|
+
|
33
|
+
# cluster
|
34
|
+
lederhosen cluster --input=$fasta.sorted --output=$fasta.uc --identity=0.90
|
35
|
+
|
36
|
+
# split
|
37
|
+
split=$out/split_80.90_$(basename $fasta .fasta)
|
38
|
+
lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split
|
39
|
+
done
|
40
|
+
|
41
|
+
# Do it again at 95%
|
42
|
+
for fasta in $out/split_80.90_*/*.fasta  # iterate over the per-cluster directories written by the 90% pass
|
43
|
+
do
|
44
|
+
# cluster
|
45
|
+
lederhosen cluster --input=$fasta --output=$fasta.uc --identity=0.95  # identity is a fraction, as in the 0.80 and 0.90 passes
|
46
|
+
|
47
|
+
# split
|
48
|
+
split=$(dirname $fasta)/split_95_$(basename $fasta .fasta)  # nest under the parent cluster dir so names cannot collide
|
49
|
+
mkdir -p $split
|
50
|
+
lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split  # reads come from the file that was just clustered
|
51
|
+
done
|
data/examples/pipeline.sh
CHANGED
data/lederhosen.gemspec
CHANGED
data/lib/lederhosen/buffer.rb
CHANGED
data/lib/lederhosen/helpers.rb
CHANGED
@@ -17,8 +17,13 @@ module Lederhosen
|
|
17
17
|
cutoff = args[:cutoff] || 20
|
18
18
|
min_length = args[:min_length] || 70
|
19
19
|
|
20
|
-
left_handle
|
21
|
-
|
20
|
+
left_handle, right_handle =
|
21
|
+
begin
|
22
|
+
[ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
|
23
|
+
rescue Zlib::GzipFile::Error
|
24
|
+
[ File.open(left), File.open(right) ]
|
25
|
+
end
|
26
|
+
|
22
27
|
out_handle = File.open out, 'w'
|
23
28
|
|
24
29
|
left_reads = Dna.new left_handle
|
@@ -57,9 +62,9 @@ module Lederhosen
|
|
57
62
|
min = args[:min] || 20
|
58
63
|
offset = args[:cutoff] || 64
|
59
64
|
|
60
|
-
|
61
|
-
|
62
|
-
|
65
|
+
_sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
|
66
|
+
|
67
|
+
dna.quality.each_byte.each_with_index do |b, a|
|
63
68
|
_sum += (b - offset - min)
|
64
69
|
if _sum > _max
|
65
70
|
_max = _sum
|
@@ -3,84 +3,84 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
module Lederhosen
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
6
|
+
class CLI
|
7
|
+
|
8
|
+
desc "add_names",
|
9
|
+
"add names to otu abundance matrix using blat output"
|
10
|
+
|
11
|
+
method_option :blat, :type => :string, :required => true
|
12
|
+
method_option :table, :type => :string, :required => true
|
13
|
+
method_option :level, :type => :string, :required => true
|
14
|
+
method_option :output, :type => :string, :required => false
|
15
|
+
|
16
|
+
def add_names
|
17
|
+
blat = options[:blat]
|
18
|
+
table = options[:table]
|
19
|
+
level = options[:level]
|
20
|
+
output = options[:output] || $stdout
|
21
|
+
|
22
|
+
levels = { 'kingdom' => 0,
|
23
|
+
'domain' => 0,
|
24
|
+
'phylum' => 1,
|
25
|
+
'class' => 2,
|
26
|
+
'order' => 3,
|
27
|
+
'family' => 4,
|
28
|
+
'genus' => 5,
|
29
|
+
'species' => 6 }
|
30
|
+
|
31
|
+
fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level
|
32
|
+
|
33
|
+
# Corresponds with the numbers used in the TaxCollector database
|
34
|
+
# taxonomic descriptions
|
35
|
+
level_no = levels[level]
|
36
|
+
|
37
|
+
# map cluster_id to taxonomic description
|
38
|
+
# default is the cluster_id itself in case
|
39
|
+
# the cluster was not classified.
|
40
|
+
clusterid_to_name = Hash.new { |h, k| h[k] = k }
|
41
|
+
|
42
|
+
# map clusterid to name using blat output
|
43
|
+
ohai "loading BLAT output from #{blat}"
|
44
|
+
File.open(blat) do |handle|
|
45
|
+
handle.each do |line|
|
46
|
+
line = line.strip.split
|
47
|
+
|
48
|
+
# Only get first match
|
49
|
+
# TODO something smarter here
|
50
|
+
cluster_id = line[0].split(':')[3]
|
51
|
+
next if clusterid_to_name.include? cluster_id
|
52
|
+
|
53
|
+
taxonomic_description = line[1]
|
54
|
+
|
55
|
+
# match by level_no
|
56
|
+
# Example:
|
57
|
+
# [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
|
58
|
+
# I want to match Actinobacteria given level_no = 2
|
59
|
+
level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1] rescue next
|
60
|
+
|
61
|
+
clusterid_to_name[cluster_id] = level_name
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
# load table, replace cluster names with taxonomic descriptions
|
66
|
+
output = File.open(output, 'w') unless output == $stdout
|
67
|
+
ohai "replacing names in #{table}"
|
68
|
+
File.open(table) do |handle|
|
69
|
+
|
70
|
+
# read in header, replace clusterids to names
|
71
|
+
header = handle.gets.strip.split(',')
|
72
|
+
header[1..-1] = header[1..-1].map { |x| clusterid_to_name[x] }
|
73
|
+
|
74
|
+
# print new header
|
75
|
+
output.puts header.join(',')
|
76
|
+
|
77
|
+
# print rest of table
|
78
|
+
handle.each { |l| output.print l }
|
79
|
+
end
|
80
|
+
|
81
|
+
# print status message
|
82
|
+
ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
86
|
end
|
@@ -1,45 +1,45 @@
|
|
1
1
|
module Lederhosen
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
2
|
+
class CLI
|
3
|
+
|
4
|
+
desc 'otu_filter', 'works like uc_filter but uses an OTU table as input'
|
5
|
+
|
6
|
+
method_option :input, :type => :string, :required => true
|
7
|
+
method_option :output, :type => :string, :required => true
|
8
|
+
method_option :reads, :type => :numeric, :required => true
|
9
|
+
method_option :samples, :type => :numeric, :required => true
|
10
|
+
|
11
|
+
def otu_filter
|
12
|
+
input = options[:input]
|
13
|
+
output = options[:output]
|
14
|
+
reads = options[:reads]
|
15
|
+
samples = options[:samples]
|
16
|
+
|
17
|
+
##
|
18
|
+
# Iterate over otu table line by line.
|
19
|
+
# Only print if cluster meets criteria
|
20
|
+
#
|
21
|
+
kept = 0
|
22
|
+
File.open(input) do |handle|
|
23
|
+
header = handle.gets.strip
|
24
|
+
header = header.split(',')
|
25
|
+
column_names = header[1..-1] # distinct name: `samples` must keep the numeric --samples threshold compared below
|
26
|
+
|
27
|
+
puts header.join(',')
|
28
|
+
|
29
|
+
handle.each do |line|
|
30
|
+
line = line.strip.split(',')
|
31
|
+
cluster_no = line[0]
|
32
|
+
counts = line[1..-1].collect { |x| x.to_i }
|
33
|
+
|
34
|
+
# should be the same as uc_filter
|
35
|
+
if counts.reject { |x| x < reads }.length > samples
|
36
|
+
puts line.join(',')
|
37
|
+
kept += 1
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
ohai "kept #{kept} clusters."
|
42
|
+
end
|
43
|
+
|
44
|
+
end
|
45
45
|
end
|
@@ -17,32 +17,32 @@ module Lederhosen
|
|
17
17
|
input = options[:clusters]
|
18
18
|
output = options[:output]
|
19
19
|
joined_reads = options[:joined]
|
20
|
-
|
20
|
+
|
21
21
|
# Load cluster table
|
22
22
|
|
23
|
-
clstr_info = Helpers.load_uc_file input
|
23
|
+
clstr_info = Helpers.load_uc_file input
|
24
24
|
clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
|
25
25
|
clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
|
26
26
|
samples = clstr_info[:samples]
|
27
27
|
|
28
28
|
# print OTU abundance matrix
|
29
|
-
|
30
|
-
|
29
|
+
# clusters as columns
|
30
|
+
# samples as rows
|
31
31
|
|
32
32
|
File.open("#{output}.csv", 'w') do |h|
|
33
33
|
samples = samples.sort
|
34
34
|
clusters = clstr_counts.keys
|
35
35
|
|
36
36
|
# print header (cluster names)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
37
|
+
h.puts '-' + SEP + clusters.map { |x| "cluster-#{x}" }.join(SEP)
|
38
|
+
|
39
|
+
samples.each do |sample|
|
40
|
+
h.print sample
|
41
|
+
clusters.each do |cluster|
|
42
|
+
h.print "#{SEP}#{clstr_counts[cluster][sample]}"
|
43
|
+
end
|
44
|
+
h.print "\n"
|
45
|
+
end
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
@@ -19,7 +19,7 @@ module Lederhosen
|
|
19
19
|
|
20
20
|
|
21
21
|
# Load cluster table!
|
22
|
-
clstr_info = Helpers.load_uc_file input
|
22
|
+
clstr_info = Helpers.load_uc_file input
|
23
23
|
clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
|
24
24
|
seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
|
25
25
|
samples = clstr_info[:samples]
|
@@ -36,7 +36,7 @@ module Lederhosen
|
|
36
36
|
end
|
37
37
|
end
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
40
|
out_handle.close
|
41
41
|
end
|
42
42
|
|
@@ -3,46 +3,46 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
module Lederhosen
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
6
|
+
class CLI
|
7
|
+
|
8
|
+
desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'
|
9
|
+
|
10
|
+
method_option :csv_file, :type => :string, :required => true
|
11
|
+
method_option :output, :type => :string, :required => false
|
12
|
+
|
13
|
+
def squish
|
14
|
+
csv_file = options[:csv_file]
|
15
|
+
output = options[:output] || $stdout
|
16
|
+
|
17
|
+
# sample_name -> column name -> total number of reads
|
18
|
+
total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
|
19
|
+
column_names = '' # scope
|
20
|
+
# Load CSV file, merge counts in columns with the same name
|
21
|
+
File.open(csv_file) do |handle|
|
22
|
+
column_names = handle.gets.strip.split(',')[1..-1]
|
23
|
+
handle.each do |line|
|
24
|
+
line = line.strip.split(',')
|
25
|
+
sample = line[0]
|
26
|
+
line[1..-1].zip(column_names) do |reads, column_name|
|
27
|
+
total_by_sample_by_column[sample][column_name] += reads.to_i
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
output = File.open(output, 'w') unless output == $stdout # explicit check; a failed open now raises instead of silently printing to stdout
|
33
|
+
|
34
|
+
# print the new, squished csv file
|
35
|
+
column_names = column_names.uniq.sort # uniq! returns nil when nothing is removed, so the bang chain could raise
|
36
|
+
output.puts "-,#{column_names.join(',')}"
|
37
|
+
total_by_sample_by_column.each_pair do |sample_id, row|
|
38
|
+
output.print "#{sample_id}"
|
39
|
+
column_names.each do |column_name|
|
40
|
+
output.print ",#{row[column_name]}"
|
41
|
+
end
|
42
|
+
output.print "\n"
|
43
|
+
end
|
44
|
+
|
45
|
+
output.close
|
46
|
+
end
|
47
|
+
end
|
48
48
|
end
|
data/lib/lederhosen.rb
CHANGED
@@ -4,6 +4,7 @@ require 'dna'
|
|
4
4
|
require 'set'
|
5
5
|
require 'progressbar'
|
6
6
|
require 'awesome_print'
|
7
|
+
require 'zlib'
|
7
8
|
|
8
9
|
Dir.glob(File.join(File.dirname(__FILE__), 'lederhosen', '*.rb')).each { |f| require f }
|
9
10
|
|
@@ -13,4 +14,4 @@ class String
|
|
13
14
|
k -= 1
|
14
15
|
(0..(self.length-k-1)).collect { |i| self[i..i+k] }
|
15
16
|
end
|
16
|
-
end
|
17
|
+
end
|
data/lib/version.rb
CHANGED
data/readme.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Lederhosen
|
2
2
|
|
3
|
-
Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
|
3
|
+
Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
|
4
4
|
|
5
5
|
## How do I get Lederhosen?
|
6
6
|
|
@@ -33,9 +33,9 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
|
|
33
33
|
|
34
34
|
### trim
|
35
35
|
|
36
|
-
Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
|
36
|
+
Trim (Illumina) reads using quality scores. Output will be a directory of fasta files. Reads can optionally be gzipped.
|
37
37
|
|
38
|
-
lederhosen trim --reads_dir=reads
|
38
|
+
lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
|
39
39
|
|
40
40
|
### join
|
41
41
|
|
@@ -43,6 +43,10 @@ Join paired reads from all samples end-to-end. This method enables the use of uc
|
|
43
43
|
|
44
44
|
lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
|
45
45
|
|
46
|
+
If your reads are not paired, then you do not need to do this step. Instead, concatenate all of the trimmed reads files.
|
47
|
+
|
48
|
+
cat trimmed/*.fasta > joined.fasta
|
49
|
+
|
46
50
|
### sort
|
47
51
|
|
48
52
|
Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithm.
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/spec/helpers_spec.rb
CHANGED
@@ -1,26 +1,26 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Lederhosen::Helpers do
|
4
|
-
|
5
|
-
let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt') }
|
6
|
-
|
4
|
+
|
5
|
+
let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt.gz') }
|
6
|
+
|
7
7
|
it 'should have a method for grouping QSEQ files' do
|
8
8
|
groups.length.should == 2
|
9
9
|
end
|
10
10
|
|
11
11
|
it 'should have a method for trimming sequences' do
|
12
12
|
reads = groups.values.first.first
|
13
|
-
record =
|
13
|
+
record = Zlib::GzipReader.open(reads) do |handle|
|
14
14
|
Dna.new(handle).first
|
15
15
|
end
|
16
16
|
# I should probably test with a bad read
|
17
|
-
Lederhosen::Helpers.trim(record).length.should ==
|
17
|
+
Lederhosen::Helpers.trim(record).length.should == 58
|
18
18
|
end
|
19
19
|
|
20
20
|
it 'should be able to trim pairs of qseq files, outputting fasta file' do
|
21
21
|
reads = groups.values.first
|
22
|
-
Lederhosen::Helpers.trim_pairs reads[0], reads[1],
|
22
|
+
Lederhosen::Helpers.trim_pairs reads[0], reads[1], "#{$test_dir}/munchen_trim_test.fasta"
|
23
23
|
# this test will break if trim parameters change
|
24
|
-
File.read(
|
24
|
+
File.readlines("#{$test_dir}/munchen_trim_test.fasta").grep(/^>/).length.should be_even
|
25
25
|
end
|
26
26
|
end
|
data/spec/misc_spec.rb
CHANGED