lederhosen 0.0.0

data/.rspec ADDED
@@ -0,0 +1 @@
+ -c
data/.rvmrc ADDED
@@ -0,0 +1 @@
+ rvm use 1.8.7@lederhosen --create
data/Gemfile ADDED
@@ -0,0 +1,5 @@
+ source :rubygems
+ gem 'thor'
+ gem 'rspec'
+ gem 'dna'
+ gem 'progressbar'
data/bin/lederhosen ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env ruby
+
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'lederhosen'))
+
+ fail "you need to install uclust and have it in your $PATH" if `which uclust` == ''
+
+ Lederhosen::CLI.start
@@ -0,0 +1,27 @@
+ # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require 'lib/version'
+
+ Gem::Specification.new do |s|
+   s.name        = 'lederhosen'
+   s.version     = Lederhosen::VERSION
+   s.authors     = ["Austin G. Davis-Richardson"]
+   s.email       = ["harekrishna@gmail.com"]
+   s.homepage    = "http://github.com/audy/lederhosen"
+   s.summary     = '16S rRNA clustering for paired-end Illumina'
+   s.description = 'Cluster 16S rRNA amplicon data sequenced by paired-end Illumina into OTUs. Also, quality control data first!'
+
+   s.rubyforge_project = "lederhosen"
+
+   s.files         = `git ls-files`.split("\n")
+   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   s.add_dependency('dna')
+   s.add_dependency('thor')
+   s.add_dependency('rspec')
+   s.add_dependency('bundler')
+   s.add_dependency('progressbar')
+   s.add_dependency('bundler')
+ end
@@ -0,0 +1,54 @@
+ module Lederhosen
+
+   class Buffer
+     # for when you need to write out to a shitload of files.
+
+     #
+     # Create a new buffer
+     #
+     def initialize(args={})
+       @buffer = Hash.new { |h, k| h[k] = Array.new }
+       @buffer_max = args[:buffer_max] || 100_000
+     end
+
+     #
+     # Add an object to the buffer
+     #
+     def add_to bucket, obj
+
+       @buffer[bucket] << obj.to_s
+
+       if @buffer[bucket].length > @buffer_max
+         # write out
+         File.open(bucket, 'a+') do |out|
+           @buffer[bucket].each do |v|
+             out.puts v
+           end
+         end
+
+         # clear that bucket
+         @buffer[bucket].clear
+       end
+     end
+
+     def [] k
+       @buffer[k]
+     end
+
+     #
+     # Writes out leftover objects
+     #
+     def finalize
+       @buffer.each_key do |bucket|
+         File.open(bucket, 'a+') do |out|
+           @buffer[bucket].each do |v|
+             out.puts v
+           end
+         end
+       end
+       @buffer = Hash.new { |h, k| h[k] = Array.new }
+     end
+
+   end
+
+ end
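
Buffer batches writes in memory and only touches a file once its bucket grows past `:buffer_max` (or when `finalize` is called), which is what lets the `split` task below write many per-cluster fasta files without holding everything in RAM. A minimal usage sketch, with hypothetical file names, assuming the gem (or `lib/`) is on the load path:

    require 'rubygems'
    require 'lederhosen'

    buffer = Lederhosen::Buffer.new :buffer_max => 2

    # lines accumulate in memory, keyed by output file name
    buffer.add_to 'cluster_1.fasta', ">read_1\nACGT"
    buffer.add_to 'cluster_1.fasta', ">read_2\nTTGA"
    buffer.add_to 'cluster_1.fasta', ">read_3\nGGCC" # bucket now exceeds :buffer_max, flushed to disk
    buffer.add_to 'cluster_2.fasta', ">read_4\nAACC"

    buffer.finalize # append whatever is still buffered and reset
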
@@ -0,0 +1,201 @@
+ module Lederhosen
+   class CLI < Thor
+
+     ##
+     # QUALITY TRIMMING
+     #
+     desc "trim Illumina QSEQ files", "--reads_dir=reads/* --out_dir=trimmed.fasta"
+     method_options :reads_dir => :string, :out_dir => :string
+     def trim
+
+       raw_reads = options[:reads_dir]
+       out_dir = options[:out_dir] || 'trimmed/'
+
+       `mkdir -p #{out_dir}`
+
+       raw_reads = Helpers.get_grouped_qseq_files raw_reads
+       puts "found #{raw_reads.length} pairs of reads"
+       puts "trimming!"
+       raw_reads.each do |a|
+         out = File.join(out_dir, "#{File.basename(a[0])}.fasta")
+         # TODO get total and trimmed
+         total, trimmed = Helpers.trim_pairs a[1][0], a[1][1], out, :min_length => 70
+       end
+     end
+
+     ##
+     # PAIRED-END READ WORK-AROUND (JOIN THEM)
+     #
+     desc "join reads end-to-end", "--trimmed=trimmed/*.fasta --output=joined.fasta"
+     method_options :trimmed => :string, :output => :string
+     def join
+       puts "joining!"
+
+       trimmed = Dir[options[:trimmed] || 'trimmed/*.fasta']
+       output = options[:output] || 'joined.fasta'
+
+       fail "no reads in #{trimmed}" if trimmed.length == 0
+
+       output = File.open(output, 'w')
+       trimmed.each do |fasta_file|
+         records = Dna.new File.open(fasta_file)
+         records.each_slice(2) do |r, l|
+           output.puts ">#{r.name}:#{File.basename(fasta_file, '.fasta')}\n#{r.sequence.reverse+l.sequence}"
+         end
+       end
+     end
+
+     ##
+     # SORT JOINED READS BY LENGTH
+     #
+     desc "sort fasta file by length", "--input=joined.fasta --output=sorted.fasta"
+     method_options :input => :string, :output => :string
+     def sort
+       input = options[:input] || 'joined.fasta'
+       output = options[:output] || 'sorted.fasta'
+       `uclust --mergesort #{input} --output #{output}`
+     end
+
+     ##
+     # FINALLY, CLUSTER!
+     #
+     desc "cluster fasta file", "--input=sorted.fasta --identity=0.80 --output=clusters.uc"
+     method_options :input => :string, :output => :string, :identity => :float
+     def cluster
+       identity = options[:identity] || 0.8
+       output = options[:output] || 'clusters.uc'
+       input = options[:input] || 'sorted.fasta'
+
+       cmd = [
+         'uclust',
+         "--input #{input}",
+         "--uc #{output}",
+         "--id #{identity}",
+       ].join(' ')
+       exec cmd
+     end
+
+     ##
+     # MAKE TABLES
+     #
+     desc "otu_tables generates otu tables & representative reads", "--clusters=clusters.uc --output=otu_prefix --joined=joined.fasta"
+     method_options :clusters => :string, :output => :string, :joined => :string
+     def otu_table
+       input = options[:clusters] || 'clusters.uc'
+       output = options[:output] || 'otus'
+       joined_reads = options[:joined] || 'joined.fasta'
+
+       clusters = Hash.new
+
+       # Load cluster table!
+       clusters = Helpers.load_uc_file(input)
+
+       clusters_total = clusters[:count_data].values.collect{ |x| x[:total] }.inject(:+)
+
+       # Get representative sequences!
+       reads_total = 0
+       representatives = {}
+       clusters[:count_data].each{ |k, x| representatives[x[:seed]] = k }
+
+       out_handle = File.open("#{output}.fasta", 'w')
+
+       File.open(joined_reads) do |handle|
+         records = Dna.new handle
+         records.each do |dna|
+           reads_total += 1
+           if !representatives[dna.name].nil?
+             dna.name = "#{dna.name}:cluster_#{representatives[dna.name]}"
+             out_handle.puts dna
+           end
+         end
+       end
+
+       out_handle.close
+
+       # Print some statistics
+       puts "reads in clusters: #{clusters_total}"
+       puts "number of reads: #{reads_total}"
+       puts "unique clusters: #{clusters.keys.length}"
+
+       # print OTU abundancy matrix
+       csv = Helpers.cluster_data_as_csv(clusters)
+       File.open("#{output}.csv", 'w') do |h|
+         h.puts csv
+       end
+
+     end
+
+     ##
+     # Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
+     #
+     desc "output separate fasta file containing sequences belonging to each cluster", "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"
+     method_options :clusters => :string, :reads => :string, :buffer_size => :int, :min_clst_size => :int, :out_dir => :string
+     def split
+       clusters = options[:clusters] || 'clusters.uc'
+       reads = options[:reads] || 'joined.fasta'
+       out_dir = options[:out_dir] || 'clusters_split'
+       buffer_size = (options[:buffer_size] || 1000).to_i
+       min_clst_size = (options[:min_clst_size] || 100).to_i
+       finalize_every = 100_000
+
+       `mkdir -p #{out_dir}/`
+
+       puts "loading #{clusters}"
+
+       # Load read id -> cluster
+       read_to_clusterid = Hash.new
+
+       # keep track of cluster sizes
+       cluster_counts = Hash.new { |h, k| h[k] = 0 }
+
+       File.open(clusters) do |handle|
+         handle.each do |line|
+           line = line.strip.split
+           cluster_nr = line[1]
+           if line[0] == 'S' || line[0] == 'H'
+             read = line[8]
+           else
+             next
+           end
+           read_to_clusterid[read] = cluster_nr
+           cluster_counts[cluster_nr] += 1
+         end
+       end
+
+       read_to_clusterid.delete_if do |read, cluster_nr|
+         cluster_counts[cluster_nr] < min_clst_size
+       end
+
+       total_reads = read_to_clusterid.length
+       total_clusters = read_to_clusterid.values.uniq.length
+       puts "#{total_reads} reads in #{total_clusters} clusters"
+
+       puts "writing out fasta files"
+
+       pbar = ProgressBar.new "writing", total_reads
+
+       # Write reads to individual fasta files using Buffer
+       buffer = Buffer.new :buffer_max => buffer_size
+       File.open(reads) do |handle|
+         records = Dna.new handle
+         $stderr.puts "reads = #{reads}"
+         records.each_with_index do |record, i|
+           cluster_id = read_to_clusterid[record.name]
+           if cluster_id
+             pbar.inc
+             filename = File.join(out_dir, cluster_id + '.fasta')
+             buffer[filename] << record
+             buffer.finalize if (i%finalize_every == 0)
+           end
+         end
+       end
+
+       pbar.finish
+       puts "finalizing output"
+       buffer.finalize # finish writing out
+
+       puts "done"
+     end
+
+   end # class CLI
+ end # module
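
The same tasks can also be driven from Ruby instead of the `bin/lederhosen` wrapper, since Thor's `start` accepts an ARGV-style array. This is only a sketch: the paths are hypothetical, the flags mirror the defaults above, and `uclust` still has to be on `$PATH`:

    require 'rubygems'
    require 'lederhosen'

    Lederhosen::CLI.start %w[trim --reads_dir=raw_reads/*.txt --out_dir=trimmed/]
    Lederhosen::CLI.start %w[join --trimmed=trimmed/*.fasta --output=joined.fasta]
    Lederhosen::CLI.start %w[sort --input=joined.fasta --output=sorted.fasta]

    # note: the cluster task calls Kernel#exec, which replaces the current process,
    # so run it last here (or from the shell) and do later steps in a new process
    Lederhosen::CLI.start %w[cluster --input=sorted.fasta --identity=0.975 --output=clusters.uc]
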
@@ -0,0 +1,132 @@
+ module Lederhosen
+   class Helpers
+     class << self
+
+       # Function for grouping qseq files produced by splitting illumina
+       # reads by barcode
+       #
+       # Filenames should look like this:
+       # IL5_L_1_B_007_1.txt
+       def get_grouped_qseq_files(glob='raw_reads/*.txt')
+         Dir.glob(glob).group_by { |x| x.split('_')[0..4].join('_') }
+       end
+
+       # Trim a pair of QSEQ files. Saves to a single,
+       # interleaved .fasta file
+       def trim_pairs(left, right, out, args={})
+         cutoff = args[:cutoff] || 20
+         min_length = args[:min_length] || 70
+
+         left_handle = File.open left
+         right_handle = File.open right
+         out_handle = File.open out, 'w'
+
+         left_reads = Dna.new left_handle
+         right_reads = Dna.new right_handle
+
+         i = 0
+         left_reads.zip(right_reads).each do |a, b|
+           i += 1
+           seqa = trim a
+           seqb = trim b
+           unless [seqa, seqb].include? nil
+             if seqb.length >= min_length && seqa.length >= min_length
+               out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
+             end
+           end
+         end
+         left_handle.close
+         right_handle.close
+         out_handle.close
+       end
+
+       # Return longest subsequence with quality scores
+       # greater than min. (Illumina PHRED)
+       # Trim2 from Huang, et. al
+       # returns just the sequence
+       def trim(dna, args={})
+
+         # trim primers off of sequence
+         # (THIS IS EXPERIMENT-SPECIFIC)
+         dna.sequence = dna.sequence[11..-1]
+         dna.quality = dna.quality[11..-1]
+
+         # throw away any read with an ambiguous primer
+         return nil if dna.sequence =~ /N/
+
+         min = args[:min] || 20
+         offset = args[:cutoff] || 64
+         _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0, 0
+         dna.quality.each_byte.each_with_index do |b, a|
+           _sum += (b - offset - min)
+           if _sum > _max
+             _max = _sum
+             _end = a
+             start = first
+           elsif _sum < 0
+             _sum = 0
+             first = a
+           end
+         end
+         dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
+       end
+
+       # Load uc file from uclust
+       # returns hash with various data
+       def load_uc_file(input)
+         clusters = Hash.new
+
+         # store a list of samples
+         clusters[:samples] = Set.new
+
+         # data for each cluster
+         # - total size
+         # - size by sample
+         # - seed sequence
+         clusters[:count_data] = Hash.new
+
+         File.open(input) do |handle|
+           handle.each do |line|
+
+             # skip comments
+             next if line =~ /^#/
+             line = line.strip.split
+
+             # things we want to know
+             type = line[0]
+             clusternr = line[1]
+             querylabel = line[8]
+             targetlabel = line[9]
+             sample = line[8].split(':')[2]
+
+             # keep track of all samples
+             clusters[:samples] << sample
+
+             if type == 'S' # = Seed Sequence
+               clusters[:count_data][clusternr] = { :seed => querylabel, :total => 1, :counts => Hash.new{ |h, k| h[k] = 0 } }
+             elsif type == 'H' # = Seed Member
+               clusters[:count_data][clusternr][:total] += 1
+               clusters[:count_data][clusternr][:counts][sample] += 1
+             end
+
+           end
+         end
+         clusters
+       end
+
+       def cluster_data_as_csv(data)
+         samples = data[:samples].to_a
+         counts = data[:count_data]
+
+         sep = "\t"
+         csv = []
+         csv << ['-'] + samples
+         counts.keys.each do |cluster|
+           csv << ["cluster-#{cluster}"] + samples.collect { |x| "#{counts[cluster][:counts][x]}" }
+         end
+         csv.collect { |x| x.join("\t")}.join("\n")
+       end
+
+     end # class << self
+   end # class Helpers
+ end # Module
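
These helpers can also be used on their own, for example to regenerate a table from an existing uclust output without re-running the whole pipeline. A small sketch with hypothetical file names; it assumes the read labels follow the `id:pair:sample` pattern produced by the `join` task, since `load_uc_file` takes the sample name from the third `:`-separated field:

    require 'rubygems'
    require 'lederhosen'

    clusters = Lederhosen::Helpers.load_uc_file('clusters.uc')

    puts "samples:  #{clusters[:samples].to_a.join(', ')}"
    puts "clusters: #{clusters[:count_data].size}"

    # same tab-delimited table the otu_table task writes to <output>.csv
    File.open('otus.csv', 'w') do |out|
      out.puts Lederhosen::Helpers.cluster_data_as_csv(clusters)
    end
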
data/lib/lederhosen.rb ADDED
@@ -0,0 +1,7 @@
+ require 'rubygems'
+ require 'thor'
+ require 'dna'
+ require 'set'
+ require 'progressbar'
+
+ Dir.glob(File.join(File.dirname(__FILE__), 'lederhosen', '*.rb')).each { |f| require f }
data/lib/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Lederhosen
+   VERSION = '0.0.0'
+ end
data/readme.md ADDED
@@ -0,0 +1,41 @@
+ # Lederhosen
+
+ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
+
+ ## How do I get Lederhosen?
+
+ 0. Obtain & Install uclust (64-bit)
+ 1. Download & extract this repo.
+ 2. `(sudo) sh setup.sh`
+
+ Alternatively, you may use Bundler to install dependencies.
+
+ ## How do I use Lederhosen?
+
+ Type `lederhosen help` for complete instructions
+
+ ### 1. Trim raw reads
+
+ `$ lederhosen trim --reads-dir=reads-dir/*.txt`
+
+ ### 2. Join trimmed reads
+
+ `$ lederhosen join`
+
+ ### 3. Sort trimmed reads
+
+ `$ lederhosen sort`
+
+ ### 4. Cluster sorted reads
+
+ `$ lederhosen cluster --identity=0.975`
+
+ ### 5. Make tables & Get representative sequences
+
+ `$ lederhosen otu_table --clusters=clusters_97.5.txt`
+
+ ### 6. Get fasta files with reads for each cluster
+
+ `$ lederhosen split --clusters=clusters_97.5.txt --reads=joined.fasta --min-clst-size=100`
+
+ `--min-clst-size` is the minimum number of reads a cluster must contain for a fasta file of its reads to be written. The cutoff is needed because it is computationally prohibitive to write millions of files in random order, or to keep every read in memory so they can be sorted and written out sequentially.
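
After `split`, each cluster that passes the cutoff ends up in its own fasta file under the output directory (`clusters_split/` by default). As a hypothetical follow-up, reads per cluster can be tallied from Ruby with the same `dna` gem the pipeline uses (assuming its reader is enumerable, as the pipeline's own use of `zip` and `each_slice` suggests):

    require 'rubygems'
    require 'dna'

    Dir['clusters_split/*.fasta'].each do |path|
      n = File.open(path) { |handle| Dna.new(handle).count }
      puts "#{File.basename(path, '.fasta')}\t#{n}"
    end
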
data/setup.sh ADDED
@@ -0,0 +1,16 @@
+ #!/bin/bash
+
+ echo -e "Note: You may need to be root.\n\nIf you get an error try running this:\n $ sudo ./setup.sh\n"
+
+ if [ ! `which uclust` ]; then
+   echo -e "NOTE: You must have uclust installed and in your \$PATH \n"
+ fi
+
+ echo "Installing Lederhosen dependencies"
+ for gem in dna bundler rspec thor progressbar; do
+   gem install $gem --no-ri --no-rdoc > /dev/null
+ done
+
+ cp lederhosen.rb /usr/local/bin/lederhosen
+
+ echo -e "Installation complete.\n\nFor instructions, type\n\n $ lederhosen help\n\nThank you for choosing Lederhosen."