lederhosen 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/.rvmrc +1 -0
- data/Gemfile +5 -0
- data/bin/lederhosen +7 -0
- data/lederhosen.gemspec +27 -0
- data/lib/lederhosen/buffer.rb +54 -0
- data/lib/lederhosen/cli.rb +201 -0
- data/lib/lederhosen/helpers.rb +132 -0
- data/lib/lederhosen.rb +7 -0
- data/lib/version.rb +3 -0
- data/readme.md +41 -0
- data/setup.sh +16 -0
- data/spec/data/ILT_L_9_B_001_1.txt +400 -0
- data/spec/data/ILT_L_9_B_001_3.txt +400 -0
- data/spec/data/ILT_L_9_B_002_1.txt +400 -0
- data/spec/data/ILT_L_9_B_002_3.txt +400 -0
- data/spec/helpers_spec.rb +26 -0
- data/spec/pipeline_spec.rb +41 -0
- data/spec/spec_helper.rb +2 -0
- metadata +173 -0
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
-c
|
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use 1.8.7@lederhosen --create
|
data/Gemfile
ADDED
data/bin/lederhosen
ADDED
data/lederhosen.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require 'lib/version'
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = 'lederhosen'
|
7
|
+
s.version = Lederhosen::VERSION
|
8
|
+
s.authors = ["Austin G. Davis-Richardson"]
|
9
|
+
s.email = ["harekrishna@gmail.com"]
|
10
|
+
s.homepage = "http://github.com/audy/lederhosen"
|
11
|
+
s.summary = '16S rRNA clustering for paired-end Illumina'
|
12
|
+
s.description = 'Cluster 16S rRNA amplicon data sequenced by paired-end Illumina into OTUs. Also, quality control data first!'
|
13
|
+
|
14
|
+
s.rubyforge_project = "lederhosen"
|
15
|
+
|
16
|
+
s.files = `git ls-files`.split("\n")
|
17
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
|
+
s.require_paths = ["lib"]
|
20
|
+
|
21
|
+
s.add_dependency('dna')
|
22
|
+
s.add_dependency('thor')
|
23
|
+
s.add_dependency('rspec')
|
24
|
+
s.add_dependency('bundler')
|
25
|
+
s.add_dependency('progressbar')
|
26
|
+
s.add_dependency('bundler')
|
27
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module Lederhosen
|
2
|
+
|
3
|
+
class Buffer
|
4
|
+
# for when you need to write out to a shitload of files.
|
5
|
+
|
6
|
+
#
|
7
|
+
# Create a new buffer
|
8
|
+
#
|
9
|
+
def initialize(args={})
|
10
|
+
@buffer = Hash.new { |h, k| h[k] = Array.new }
|
11
|
+
@buffer_max = args[:buffer_max] || 100_000
|
12
|
+
end
|
13
|
+
|
14
|
+
#
|
15
|
+
# Add an object to the buffer
|
16
|
+
#
|
17
|
+
def add_to bucket, obj
|
18
|
+
|
19
|
+
@buffer[bucket] << obj.to_s
|
20
|
+
|
21
|
+
if @buffer[bucket].length > @buffer_max
|
22
|
+
# write out
|
23
|
+
File.open(bucket, 'a+') do |out|
|
24
|
+
@buffer[bucket].each do |v|
|
25
|
+
out.puts v
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
# clear that bucket
|
30
|
+
@buffer[bucket].clear
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def [] k
|
35
|
+
@buffer[k]
|
36
|
+
end
|
37
|
+
|
38
|
+
#
|
39
|
+
# Writes out leftover objects
|
40
|
+
#
|
41
|
+
def finalize
|
42
|
+
@buffer.each_key do |bucket|
|
43
|
+
File.open(bucket, 'a+') do |out|
|
44
|
+
@buffer[bucket].each do |v|
|
45
|
+
out.puts v
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
@buffer = Hash.new { |h, k| h[k] = Array.new }
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,201 @@
|
|
1
|
+
module Lederhosen
|
2
|
+
class CLI < Thor
|
3
|
+
|
4
|
+
##
|
5
|
+
# QUALITY TRIMMING
|
6
|
+
#
|
7
|
+
desc "trim Illumina QSEQ files", "--reads_dir=reads/* --out_dir=trimmed.fasta"
|
8
|
+
method_options :reads_dir => :string, :out_dir => :string
|
9
|
+
def trim
|
10
|
+
|
11
|
+
raw_reads = options[:reads_dir]
|
12
|
+
out_dir = options[:out_dir] || 'trimmed/'
|
13
|
+
|
14
|
+
`mkdir -p #{out_dir}`
|
15
|
+
|
16
|
+
raw_reads = Helpers.get_grouped_qseq_files raw_reads
|
17
|
+
puts "found #{raw_reads.length} pairs of reads"
|
18
|
+
puts "trimming!"
|
19
|
+
raw_reads.each do |a|
|
20
|
+
out = File.join(out_dir, "#{File.basename(a[0])}.fasta")
|
21
|
+
# TODO get total and trimmed
|
22
|
+
total, trimmed = Helpers.trim_pairs a[1][0], a[1][1], out, :min_length => 70
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
##
|
27
|
+
# PAIRED-END READ WORK-AROUND (JOIN THEM)
|
28
|
+
#
|
29
|
+
desc "join reads end-to-end", "--trimmed=trimmed/*.fasta --output=joined.fasta"
|
30
|
+
method_options :trimmed => :string, :output => :string
|
31
|
+
def join
|
32
|
+
puts "joining!"
|
33
|
+
|
34
|
+
trimmed = Dir[options[:trimmed] || 'trimmed/*.fasta']
|
35
|
+
output = options[:output] || 'joined.fasta'
|
36
|
+
|
37
|
+
fail "no reads in #{trimmed}" if trimmed.length == 0
|
38
|
+
|
39
|
+
output = File.open(output, 'w')
|
40
|
+
trimmed.each do |fasta_file|
|
41
|
+
records = Dna.new File.open(fasta_file)
|
42
|
+
records.each_slice(2) do |r, l|
|
43
|
+
output.puts ">#{r.name}:#{File.basename(fasta_file, '.fasta')}\n#{r.sequence.reverse+l.sequence}"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
##
|
49
|
+
# SORT JOINED READS BY LENGTH
|
50
|
+
#
|
51
|
+
desc "sort fasta file by length", "--input=joined.fasta --output=sorted.fasta"
|
52
|
+
method_options :input => :string, :output => :string
|
53
|
+
def sort
|
54
|
+
input = options[:input] || 'joined.fasta'
|
55
|
+
output = options[:output] || 'sorted.fasta'
|
56
|
+
`uclust --mergesort #{input} --output #{output}`
|
57
|
+
end
|
58
|
+
|
59
|
+
##
|
60
|
+
# FINALLY, CLUSTER!
|
61
|
+
#
|
62
|
+
desc "cluster fasta file", "--input=sorted.fasta --identity=0.80 --output=clusters.uc"
|
63
|
+
method_options :input => :string, :output => :string, :identity => :float
|
64
|
+
def cluster
|
65
|
+
identity = options[:identity] || 0.8
|
66
|
+
output = options[:output] || 'clusters.uc'
|
67
|
+
input = options[:input] || 'sorted.fasta'
|
68
|
+
|
69
|
+
cmd = [
|
70
|
+
'uclust',
|
71
|
+
"--input #{input}",
|
72
|
+
"--uc #{output}",
|
73
|
+
"--id #{identity}",
|
74
|
+
].join(' ')
|
75
|
+
exec cmd
|
76
|
+
end
|
77
|
+
|
78
|
+
##
|
79
|
+
# MAKE TABLES
|
80
|
+
#
|
81
|
+
desc "otu_tables generates otu tables & representative reads", "--clusters=clusters.uc --output=otu_prefix --joined=joined.fasta"
|
82
|
+
method_options :clusters => :string, :output => :string, :joined => :string
|
83
|
+
def otu_table
|
84
|
+
input = options[:clusters] || 'clusters.uc'
|
85
|
+
output = options[:output] || 'otus'
|
86
|
+
joined_reads = options[:joined] || 'joined.fasta'
|
87
|
+
|
88
|
+
clusters = Hash.new
|
89
|
+
|
90
|
+
# Load cluster table!
|
91
|
+
clusters = Helpers.load_uc_file(input)
|
92
|
+
|
93
|
+
clusters_total = clusters[:count_data].values.collect{ |x| x[:total] }.inject(:+)
|
94
|
+
|
95
|
+
# Get representative sequences!
|
96
|
+
reads_total = 0
|
97
|
+
representatives = {}
|
98
|
+
clusters[:count_data].each{ |k, x| representatives[x[:seed]] = k }
|
99
|
+
|
100
|
+
out_handle = File.open("#{output}.fasta", 'w')
|
101
|
+
|
102
|
+
File.open(joined_reads) do |handle|
|
103
|
+
records = Dna.new handle
|
104
|
+
records.each do |dna|
|
105
|
+
reads_total += 1
|
106
|
+
if !representatives[dna.name].nil?
|
107
|
+
dna.name = "#{dna.name}:cluster_#{representatives[dna.name]}"
|
108
|
+
out_handle.puts dna
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
out_handle.close
|
114
|
+
|
115
|
+
# Print some statistics
|
116
|
+
puts "reads in clusters: #{clusters_total}"
|
117
|
+
puts "number of reads: #{reads_total}"
|
118
|
+
puts "unique clusters: #{clusters.keys.length}"
|
119
|
+
|
120
|
+
# print OTU abundancy matrix
|
121
|
+
csv = Helpers.cluster_data_as_csv(clusters)
|
122
|
+
File.open("#{output}.csv", 'w') do |h|
|
123
|
+
h.puts csv
|
124
|
+
end
|
125
|
+
|
126
|
+
end
|
127
|
+
|
128
|
+
##
|
129
|
+
# Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
|
130
|
+
#
|
131
|
+
desc "output separate fasta file containing sequences belonging to each cluster", "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"
|
132
|
+
method_options :clusters => :string, :reads=> :string, :buffer_size => :int, :min_clst_size => :int, :out_dir => :string
|
133
|
+
def split
|
134
|
+
clusters = options[:clusters] || 'clusters.uc'
|
135
|
+
reads = options[:reads] || 'joined.fasta'
|
136
|
+
out_dir = options[:out_dir] || 'clusters_split'
|
137
|
+
buffer_size = (options[:buffer_size] || 1000).to_i
|
138
|
+
min_clst_size = (options[:min_clst_size] || 100).to_i
|
139
|
+
finalize_every = 100_000
|
140
|
+
|
141
|
+
`mkdir -p #{out_dir}/`
|
142
|
+
|
143
|
+
puts "loading #{clusters}"
|
144
|
+
|
145
|
+
# Load read id -> cluster
|
146
|
+
read_to_clusterid = Hash.new
|
147
|
+
|
148
|
+
# keep track of cluster sizes
|
149
|
+
cluster_counts = Hash.new { |h, k| h[k] = 0}
|
150
|
+
|
151
|
+
File.open(clusters)do |handle|
|
152
|
+
handle.each do |line|
|
153
|
+
line = line.strip.split
|
154
|
+
cluster_nr = line[1]
|
155
|
+
if line[0] == 'S' || line[0] == 'H'
|
156
|
+
read = line[8]
|
157
|
+
else
|
158
|
+
next
|
159
|
+
end
|
160
|
+
read_to_clusterid[read] = cluster_nr
|
161
|
+
cluster_counts[cluster_nr] += 1
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
read_to_clusterid.delete_if do |read, cluster_nr|
|
166
|
+
cluster_counts[cluster_nr] < min_clst_size
|
167
|
+
end
|
168
|
+
|
169
|
+
total_reads = read_to_clusterid.length
|
170
|
+
total_clusters = read_to_clusterid.values.uniq.length
|
171
|
+
puts "#{total_reads} reads in #{total_clusters} clusters"
|
172
|
+
|
173
|
+
puts "writing out fasta files"
|
174
|
+
|
175
|
+
pbar = ProgressBar.new "writing", total_reads
|
176
|
+
|
177
|
+
# Write reads to individual fasta files using Buffer
|
178
|
+
buffer = Buffer.new :buffer_max => buffer_size
|
179
|
+
File.open(reads) do |handle|
|
180
|
+
records = Dna.new handle
|
181
|
+
$stderr.puts "reads = #{reads}"
|
182
|
+
records.each_with_index do |record, i|
|
183
|
+
cluster_id = read_to_clusterid[record.name]
|
184
|
+
if cluster_id
|
185
|
+
pbar.inc
|
186
|
+
filename = File.join(out_dir, cluster_id + '.fasta')
|
187
|
+
buffer[filename] << record
|
188
|
+
buffer.finalize if (i%finalize_every == 0)
|
189
|
+
end
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
pbar.finish
|
194
|
+
puts "finalizing output"
|
195
|
+
buffer.finalize # finish writing out
|
196
|
+
|
197
|
+
puts "done"
|
198
|
+
end
|
199
|
+
|
200
|
+
end # class CLI
|
201
|
+
end # module
|
@@ -0,0 +1,132 @@
|
|
1
|
+
module Lederhosen
|
2
|
+
class Helpers
|
3
|
+
class << self
|
4
|
+
|
5
|
+
# Function for grouping qseq files produced by splitting illumina
|
6
|
+
# reads by barcode
|
7
|
+
#
|
8
|
+
# Filenames should look like this:
|
9
|
+
# IL5_L_1_B_007_1.txt
|
10
|
+
def get_grouped_qseq_files(glob='raw_reads/*.txt')
|
11
|
+
Dir.glob(glob).group_by { |x| x.split('_')[0..4].join('_') }
|
12
|
+
end
|
13
|
+
|
14
|
+
# Trim a pair of QSEQ files. Saves to a single,
|
15
|
+
# interleaved .fasta file
|
16
|
+
def trim_pairs(left, right, out, args={})
|
17
|
+
cutoff = args[:cutoff] || 20
|
18
|
+
min_length = args[:min_length] || 70
|
19
|
+
|
20
|
+
left_handle = File.open left
|
21
|
+
right_handle = File.open right
|
22
|
+
out_handle = File.open out, 'w'
|
23
|
+
|
24
|
+
left_reads = Dna.new left_handle
|
25
|
+
right_reads = Dna.new right_handle
|
26
|
+
|
27
|
+
i = 0
|
28
|
+
left_reads.zip(right_reads).each do |a, b|
|
29
|
+
i += 1
|
30
|
+
seqa = trim a
|
31
|
+
seqb = trim b
|
32
|
+
unless [seqa, seqb].include? nil
|
33
|
+
if seqb.length >= min_length && seqa.length >= min_length
|
34
|
+
out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
left_handle.close
|
39
|
+
right_handle.close
|
40
|
+
out_handle.close
|
41
|
+
end
|
42
|
+
|
43
|
+
# Return longest subsequence with quality scores
|
44
|
+
# greater than min. (Illumina PHRED)
|
45
|
+
# Trim2 from Huang, et. al
|
46
|
+
# returns just the sequence
|
47
|
+
def trim(dna, args={})
|
48
|
+
|
49
|
+
# trim primers off of sequence
|
50
|
+
# (THIS IS EXPERIMENT-SPECIFIC)
|
51
|
+
dna.sequence = dna.sequence[11..-1]
|
52
|
+
dna.quality = dna.quality[11..-1]
|
53
|
+
|
54
|
+
# throw away any read with an ambiguous primer
|
55
|
+
return nil if dna.sequence =~ /N/
|
56
|
+
|
57
|
+
min = args[:min] || 20
|
58
|
+
offset = args[:cutoff] || 64
|
59
|
+
_sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
|
60
|
+
dna.quality.each_byte.each_with_index do |b, a|
|
61
|
+
_sum += (b - offset - min)
|
62
|
+
if _sum > _max
|
63
|
+
_max = _sum
|
64
|
+
_end = a
|
65
|
+
start = first
|
66
|
+
elsif _sum < 0
|
67
|
+
_sum = 0
|
68
|
+
first = a
|
69
|
+
end
|
70
|
+
end
|
71
|
+
dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
|
72
|
+
end
|
73
|
+
|
74
|
+
# Load uc file from uclust
|
75
|
+
# returns hash with various data
|
76
|
+
def load_uc_file(input)
|
77
|
+
clusters = Hash.new
|
78
|
+
|
79
|
+
# store a list of samples
|
80
|
+
clusters[:samples] = Set.new
|
81
|
+
|
82
|
+
# data for each cluster
|
83
|
+
# - total size
|
84
|
+
# - size by sample
|
85
|
+
# - seed sequence
|
86
|
+
clusters[:count_data] = Hash.new
|
87
|
+
|
88
|
+
File.open(input) do |handle|
|
89
|
+
handle.each do |line|
|
90
|
+
|
91
|
+
# skip comments
|
92
|
+
next if line =~ /^#/
|
93
|
+
line = line.strip.split
|
94
|
+
|
95
|
+
# things we want to know
|
96
|
+
type = line[0]
|
97
|
+
clusternr = line[1]
|
98
|
+
querylabel = line[8]
|
99
|
+
targetlabel = line[9]
|
100
|
+
sample = line[8].split(':')[2]
|
101
|
+
|
102
|
+
# keep track of all samples
|
103
|
+
clusters[:samples] << sample
|
104
|
+
|
105
|
+
if type == 'S' # = Seed Sequence
|
106
|
+
clusters[:count_data][clusternr] = { :seed => querylabel, :total => 1, :counts => Hash.new{ |h, k| h[k] = 0 } }
|
107
|
+
elsif type == 'H' # = Seed Member
|
108
|
+
clusters[:count_data][clusternr][:total] += 1
|
109
|
+
clusters[:count_data][clusternr][:counts][sample] += 1
|
110
|
+
end
|
111
|
+
|
112
|
+
end
|
113
|
+
end
|
114
|
+
clusters
|
115
|
+
end
|
116
|
+
|
117
|
+
def cluster_data_as_csv(data)
|
118
|
+
samples = data[:samples].to_a
|
119
|
+
counts = data[:count_data]
|
120
|
+
|
121
|
+
sep = "\t"
|
122
|
+
csv = []
|
123
|
+
csv << ['-'] + samples
|
124
|
+
counts.keys.each do |cluster|
|
125
|
+
csv << ["cluster-#{cluster}"] + samples.collect { |x| "#{counts[cluster][:counts][x]}" }
|
126
|
+
end
|
127
|
+
csv.collect { |x| x.join("\t")}.join("\n")
|
128
|
+
end
|
129
|
+
|
130
|
+
end # class << self
|
131
|
+
end # class Helpers
|
132
|
+
end # Module
|
data/lib/lederhosen.rb
ADDED
data/lib/version.rb
ADDED
data/readme.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# Lederhosen
|
2
|
+
|
3
|
+
Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
|
4
|
+
|
5
|
+
## How do I get Lederhosen?
|
6
|
+
|
7
|
+
0. Obtain & Install uclust (64-bit)
|
8
|
+
1. Download & extract this repo.
|
9
|
+
2. `(sudo) sh setup.sh`
|
10
|
+
|
11
|
+
Alternatively, you may use Bundler to install dependencies.
|
12
|
+
|
13
|
+
## How do I use Lederhosen?
|
14
|
+
|
15
|
+
Type `lederhosen help` for complete instructions
|
16
|
+
|
17
|
+
### 1. Trim raw reads
|
18
|
+
|
19
|
+
`$ lederhosen trim --reads-dir=reads-dir/*.txt`
|
20
|
+
|
21
|
+
### 2. Join trimmed reads
|
22
|
+
|
23
|
+
`$ lederhosen join`
|
24
|
+
|
25
|
+
### 3. Sort trimmed reads
|
26
|
+
|
27
|
+
`$ lederhosen sort`
|
28
|
+
|
29
|
+
### 4. Cluster sorted reads
|
30
|
+
|
31
|
+
`$ lederhosen cluster --identity=0.975`
|
32
|
+
|
33
|
+
### 5. Make tables & Get representative sequences
|
34
|
+
|
35
|
+
`% lederhosen otu_table --clusters=clusters_97.5.txt`
|
36
|
+
|
37
|
+
### 6. Get fasta files with reads for each cluster
|
38
|
+
|
39
|
+
`% lederhosen split --clusters=clusters_97.5.txt --reads=joined.fasta --min-clst-size=100`
|
40
|
+
|
41
|
+
`--min-clst-size` is the minimum number of reads a cluster must have in order for a fasta file containing its reads to be created. This is needed because it is computationally prohibitive to randomly write millions of files, or to store all reads in memory, sort them, and output them non-randomly.
|
data/setup.sh
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
echo "Note: You may need to be root.\n\nIf you get an error try running this:\n $ sudo ./setup.sh\n"
|
4
|
+
|
5
|
+
if [ ! `which uclust` ]; then
|
6
|
+
echo "NOTE: You must have uclust installed and in your \$PATH \n"
|
7
|
+
fi
|
8
|
+
|
9
|
+
echo "Installing Lederhosen dependencies"
|
10
|
+
for gem in dna bundler rspec thor progressbar; do
|
11
|
+
gem install $gem --no-ri --no-rdoc > /dev/null
|
12
|
+
done
|
13
|
+
|
14
|
+
cp lederhosen.rb /usr/local/bin/lederhosen
|
15
|
+
|
16
|
+
echo "Installation complete.\n\nFor instructions, type\n\n $ lederhosen help\n\nThank you for choosing Lederhosen."
|