lederhosen 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/lederhosen/helpers.rb +23 -25
- data/lib/lederhosen/tasks/cluster.rb +3 -3
- data/lib/lederhosen/tasks/join.rb +2 -2
- data/lib/lederhosen/tasks/name.rb +29 -0
- data/lib/lederhosen/tasks/otu_table.rb +52 -34
- data/lib/lederhosen/tasks/rep_reads.rb +44 -0
- data/lib/lederhosen/tasks/sort.rb +2 -3
- data/lib/lederhosen/tasks/split.rb +3 -3
- data/lib/lederhosen/tasks/trim.rb +1 -1
- data/lib/version.rb +1 -1
- data/readme.md +32 -11
- metadata +7 -5
data/lib/lederhosen/helpers.rb
CHANGED
@@ -76,56 +76,54 @@ module Lederhosen
|
|
76
76
|
def load_uc_file(input)
|
77
77
|
clusters = Hash.new
|
78
78
|
|
79
|
-
#
|
79
|
+
# keep track of samples
|
80
|
+
samples = Set.new
|
81
|
+
|
82
|
+
# store a list of all the sample IDs
|
80
83
|
clusters[:samples] = Set.new
|
81
84
|
|
82
85
|
# data for each cluster
|
83
|
-
#
|
84
|
-
|
85
|
-
# - seed sequence
|
86
|
-
clusters[:count_data] = Hash.new
|
86
|
+
# clstr_counts[:clstr][:sample] = number_of_reads
|
87
|
+
clstr_counts = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
|
87
88
|
|
89
|
+
# seed_to_clstrnr[seed_sequence_id] = clstr_nr
|
90
|
+
seed_to_clstrnr = Hash.new
|
88
91
|
File.open(input) do |handle|
|
89
92
|
handle.each do |line|
|
90
93
|
|
91
|
-
# skip comments
|
92
|
-
|
94
|
+
next if line =~ /^#/ # skip comments
|
95
|
+
|
93
96
|
line = line.strip.split
|
94
97
|
|
95
98
|
# things we want to know
|
96
99
|
type = line[0]
|
97
|
-
clusternr = line[1]
|
100
|
+
clusternr = line[1].to_i
|
98
101
|
querylabel = line[8]
|
99
102
|
targetlabel = line[9]
|
100
103
|
sample = line[8].split(':')[2]
|
104
|
+
|
105
|
+
# keep track of samples
|
106
|
+
samples.add sample
|
101
107
|
|
102
108
|
# keep track of all samples
|
103
|
-
clusters[:samples]
|
109
|
+
clusters[:samples].add sample
|
104
110
|
|
105
111
|
if type == 'S' # = Seed Sequence
|
106
|
-
|
112
|
+
clstr_counts[clusternr][sample] += 1
|
113
|
+
seed_to_clstrnr[querylabel] = clusternr
|
107
114
|
elsif type == 'H' # = Seed Member
|
108
|
-
|
109
|
-
clusters[:count_data][clusternr][:counts][sample] += 1
|
115
|
+
clstr_counts[clusternr][sample] += 1
|
110
116
|
end
|
111
117
|
|
112
118
|
end
|
113
119
|
end
|
114
|
-
|
120
|
+
return {
|
121
|
+
:clstr_counts => clstr_counts,
|
122
|
+
:seed_to_clstrnr => seed_to_clstrnr,
|
123
|
+
:samples => samples
|
124
|
+
}
|
115
125
|
end
|
116
126
|
|
117
|
-
def cluster_data_as_csv(data)
|
118
|
-
samples = data[:samples].to_a
|
119
|
-
counts = data[:count_data]
|
120
|
-
|
121
|
-
sep = ","
|
122
|
-
csv = []
|
123
|
-
csv << ['-'] + samples
|
124
|
-
counts.keys.each do |cluster|
|
125
|
-
csv << ["cluster-#{cluster}"] + samples.collect { |x| "#{counts[cluster][:counts][x]}" }
|
126
|
-
end
|
127
|
-
csv.collect { |x| x.join("\t")}.join("\n")
|
128
|
-
end
|
129
127
|
|
130
128
|
end # class << self
|
131
129
|
end # class Helpers
|
@@ -8,9 +8,9 @@ module Lederhosen
|
|
8
8
|
desc "cluster fasta file",
|
9
9
|
"--input=sorted.fasta --identity=0.80 --output=clusters.uc"
|
10
10
|
|
11
|
-
method_option :input, :type => :string, :
|
12
|
-
method_option :output, :type => :string, :
|
13
|
-
method_option :identity, :type => :numeric, :
|
11
|
+
method_option :input, :type => :string, :required => true
|
12
|
+
method_option :output, :type => :string, :required => true
|
13
|
+
method_option :identity, :type => :numeric, :required => true
|
14
14
|
|
15
15
|
def cluster
|
16
16
|
identity = options[:identity]
|
@@ -7,8 +7,8 @@ module Lederhosen
|
|
7
7
|
desc "join reads end-to-end",
|
8
8
|
"--trimmed=trimmed/*.fasta --output=joined.fasta"
|
9
9
|
|
10
|
-
method_option :trimmed, :type => :string, :
|
11
|
-
method_option :output, :type => :string, :
|
10
|
+
method_option :trimmed, :type => :string, :required => true
|
11
|
+
method_option :output, :type => :string, :required => true
|
12
12
|
|
13
13
|
def join
|
14
14
|
|
@@ -0,0 +1,29 @@
|
|
1
|
+
##
|
2
|
+
# IDENTIFY CLUSTERS IN A TAXCOLLECTOR DATABASE
|
3
|
+
#
|
4
|
+
|
5
|
+
module Lederhosen
|
6
|
+
class CLI
|
7
|
+
|
8
|
+
desc "name identify clusters in a taxcollector database",
|
9
|
+
"--reps representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
|
10
|
+
|
11
|
+
method_option :reps, :type => :string, :required => true
|
12
|
+
method_option :database, :type => :string, :required => true
|
13
|
+
method_option :output, :type => :string, :required => true
|
14
|
+
|
15
|
+
def name
|
16
|
+
reps = options[:reps]
|
17
|
+
database = options[:database]
|
18
|
+
output = options[:output]
|
19
|
+
|
20
|
+
# run blat/blast
|
21
|
+
cmd = [
|
22
|
+
'blat',
|
23
|
+
|
24
|
+
]
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
|
+
end
|
@@ -2,58 +2,76 @@
|
|
2
2
|
# MAKE TABLES
|
3
3
|
#
|
4
4
|
|
5
|
+
SEP = ','
|
6
|
+
|
5
7
|
module Lederhosen
|
6
8
|
class CLI
|
7
9
|
|
8
|
-
desc "otu_tables generates otu tables
|
9
|
-
"--clusters=clusters.uc --output=otu_prefix
|
10
|
+
desc "otu_tables generates otu tables",
|
11
|
+
"--clusters=clusters.uc --output=otu_prefix"
|
10
12
|
|
11
|
-
method_option :clusters, :type => :string, :
|
12
|
-
method_option :output, :type => :string, :
|
13
|
-
method_option :joined, :type => :string, :default => 'joined.fasta'
|
13
|
+
method_option :clusters, :type => :string, :required => true
|
14
|
+
method_option :output, :type => :string, :required => true
|
14
15
|
|
15
16
|
def otu_table
|
16
|
-
input
|
17
|
-
output
|
17
|
+
input = options[:clusters]
|
18
|
+
output = options[:output]
|
18
19
|
joined_reads = options[:joined]
|
19
|
-
|
20
|
-
clusters = Hash.new
|
20
|
+
|
21
21
|
|
22
22
|
# Load cluster table!
|
23
|
-
|
23
|
+
clstr_info = Helpers.load_uc_file input
|
24
|
+
clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
|
25
|
+
clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
|
26
|
+
samples = clstr_info[:samples]
|
24
27
|
|
25
|
-
|
26
|
-
|
27
|
-
#
|
28
|
-
|
29
|
-
|
30
|
-
clusters[:count_data].each{ |k, x| representatives[x[:seed]] = k }
|
28
|
+
# print OTU abundancy matrix
|
29
|
+
|
30
|
+
File.open("#{output}.csv", 'w') do |h|
|
31
|
+
samples = samples.sort
|
32
|
+
clusters = clstr_counts.keys
|
31
33
|
|
32
|
-
|
34
|
+
# print header
|
35
|
+
head = samples.join(SEP)
|
36
|
+
h.puts "-" + SEP + head
|
33
37
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
dna.name = "#{dna.name}:cluster_#{representatives[dna.name]}"
|
40
|
-
out_handle.puts dna
|
38
|
+
# start printing clusters
|
39
|
+
clusters.each do |cluster|
|
40
|
+
h.print "cluster-#{cluster}"
|
41
|
+
samples.each do |sample|
|
42
|
+
h.print "#{SEP}#{clstr_counts[cluster][sample]}"
|
41
43
|
end
|
44
|
+
h.print "\n"
|
42
45
|
end
|
46
|
+
|
43
47
|
end
|
44
48
|
|
45
|
-
|
49
|
+
# # Get representative sequences!
|
50
|
+
# reads_total = 0
|
51
|
+
# representatives = {}
|
52
|
+
# clusters[:count_data].each{ |k, x| representatives[x[:seed]] = k }
|
53
|
+
#
|
54
|
+
# out_handle = File.open("#{output}.fasta", 'w')
|
55
|
+
#
|
56
|
+
# File.open(joined_reads) do |handle|
|
57
|
+
# records = Dna.new handle
|
58
|
+
# records.each do |dna|
|
59
|
+
# reads_total += 1
|
60
|
+
# if !representatives[dna.name].nil?
|
61
|
+
# dna.name = "#{dna.name}:cluster_#{representatives[dna.name]}"
|
62
|
+
# out_handle.puts dna
|
63
|
+
# end
|
64
|
+
# end
|
65
|
+
# end
|
66
|
+
#
|
67
|
+
# out_handle.close
|
68
|
+
#
|
69
|
+
# # Print some statistics
|
70
|
+
# ohai "reads in clusters: #{clusters_total}"
|
71
|
+
# ohai "number of reads: #{reads_total}"
|
72
|
+
# ohai "unique clusters: #{clusters.keys.length}"
|
46
73
|
|
47
|
-
# Print some statistics
|
48
|
-
ohai "reads in clusters: #{clusters_total}"
|
49
|
-
ohai "number of reads: #{reads_total}"
|
50
|
-
ohai "unique clusters: #{clusters.keys.length}"
|
51
74
|
|
52
|
-
# print OTU abundancy matrix
|
53
|
-
csv = Helpers.cluster_data_as_csv(clusters)
|
54
|
-
File.open("#{output}.csv", 'w') do |h|
|
55
|
-
h.puts csv
|
56
|
-
end
|
57
75
|
|
58
76
|
end
|
59
77
|
|
@@ -0,0 +1,44 @@
|
|
1
|
+
##
|
2
|
+
# GET REPRESENTATIVE READS
|
3
|
+
#
|
4
|
+
|
5
|
+
module Lederhosen
|
6
|
+
class CLI
|
7
|
+
|
8
|
+
desc "rep_reads extract representative reads for each cluster to a fasta file",
|
9
|
+
"--clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta"
|
10
|
+
|
11
|
+
method_option :clusters, :type => :string, :required => true
|
12
|
+
method_option :output, :type => :string, :required => true
|
13
|
+
method_option :joined, :type => :string, :required => true
|
14
|
+
|
15
|
+
def rep_reads
|
16
|
+
input = options[:clusters]
|
17
|
+
output = options[:output]
|
18
|
+
joined_reads = options[:joined]
|
19
|
+
|
20
|
+
|
21
|
+
# Load cluster table!
|
22
|
+
clstr_info = Helpers.load_uc_file input
|
23
|
+
clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
|
24
|
+
seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
|
25
|
+
samples = clstr_info[:samples]
|
26
|
+
|
27
|
+
out_handle = File.open("#{output}", 'w')
|
28
|
+
|
29
|
+
File.open(joined_reads) do |handle|
|
30
|
+
records = Dna.new handle
|
31
|
+
records.each do |dna|
|
32
|
+
clstrnr = seed_to_clstrnr[dna.name]
|
33
|
+
unless clstrnr.nil?
|
34
|
+
dna.name = "#{dna.name}:cluster-#{clstrnr}"
|
35
|
+
out_handle.puts dna
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
out_handle.close
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
|
+
end
|
@@ -8,9 +8,8 @@ module Lederhosen
|
|
8
8
|
desc "sort fasta file by length",
|
9
9
|
"--input=joined.fasta --output=sorted.fasta"
|
10
10
|
|
11
|
-
|
12
|
-
method_option :
|
13
|
-
method_option :output, :type => :string, :default => 'sorted.fasta'
|
11
|
+
method_option :input, :type => :string, :required => true
|
12
|
+
method_option :output, :type => :string, :required => true
|
14
13
|
|
15
14
|
def sort
|
16
15
|
input = options[:input]
|
@@ -8,9 +8,9 @@ module Lederhosen
|
|
8
8
|
desc "output separate fasta file containing sequences belonging to each cluster",
|
9
9
|
"--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"
|
10
10
|
|
11
|
-
method_option :clusters, :type => :string,
|
12
|
-
method_option :reads, :type => :string,
|
13
|
-
method_option :out_dir, :type => :string,
|
11
|
+
method_option :clusters, :type => :string, :required => true
|
12
|
+
method_option :reads, :type => :string, :required => true
|
13
|
+
method_option :out_dir, :type => :string, :required => true
|
14
14
|
method_option :buffer_size, :type => :numeric, :default => 1000
|
15
15
|
method_option :min_clst_size, :type => :numeric, :default => 100
|
16
16
|
|
@@ -9,7 +9,7 @@ module Lederhosen
|
|
9
9
|
"--reads_dir=reads/* --out_dir=trimmed.fasta"
|
10
10
|
|
11
11
|
method_option :reads_dir, :type => :string, :required => true
|
12
|
-
method_option :out_dir, :type => :string, :
|
12
|
+
method_option :out_dir, :type => :string, :required => true
|
13
13
|
|
14
14
|
def trim
|
15
15
|
|
data/lib/version.rb
CHANGED
data/readme.md
CHANGED
@@ -1,11 +1,16 @@
|
|
1
1
|
# Lederhosen
|
2
2
|
|
3
|
-
Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
|
3
|
+
Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own risk.
|
4
4
|
|
5
5
|
## How do I get Lederhosen?
|
6
6
|
|
7
|
-
0. Obtain & Install
|
8
|
-
1.
|
7
|
+
0. Obtain & Install [UCLUST](http://www.drive5.com/) (64-bit)
|
8
|
+
1. Obtain & Install [BLAT](http://genome.ucsc.edu/FAQ/FAQblat.html#blat3)
|
9
|
+
2. Get a copy of [TaxCollector](http://github.com/audy/taxcollector)
|
10
|
+
3. Install Lederhosen by typing:
|
11
|
+
|
12
|
+
`sudo gem install lederhosen`
|
13
|
+
4. Check installation by typing `lederhosen`. You should see some help text.
|
9
14
|
|
10
15
|
## How do I use Lederhosen?
|
11
16
|
|
@@ -13,26 +18,42 @@ Type `lederhosen help` for complete instructions
|
|
13
18
|
|
14
19
|
### 1. Trim raw reads
|
15
20
|
|
16
|
-
`$ lederhosen trim --reads-dir=reads-dir/*.txt`
|
21
|
+
`$ lederhosen trim --reads-dir=reads-dir/*.txt --out-dir=trimmed`
|
17
22
|
|
18
23
|
### 2. Join trimmed reads
|
19
24
|
|
20
|
-
`$ lederhosen join`
|
25
|
+
`$ lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta`
|
21
26
|
|
22
27
|
### 3. Sort trimmed reads
|
23
28
|
|
24
|
-
`$ lederhosen sort`
|
29
|
+
`$ lederhosen sort --input=joined.fasta --output=sorted.fasta`
|
25
30
|
|
26
31
|
### 4. Cluster sorted reads
|
27
32
|
|
28
|
-
`$ lederhosen cluster --
|
33
|
+
`$ lederhosen cluster --identity=0.975 --input=sorted.fasta --output=clusters`
|
34
|
+
|
35
|
+
### 5. Make OTU tables
|
36
|
+
|
37
|
+
`% lederhosen otu_table --clusters=clusters.uc --output=clusters_975.csv`
|
38
|
+
|
39
|
+
This will output a csv (`clusters.975.csv`) and a fasta (`clusters.975.fasta`) file. The fasta file can be used to identify clusters in a 16S rRNA database using BLAST or something.
|
40
|
+
|
41
|
+
### 6. Get representative reads from each cluster
|
29
42
|
|
30
|
-
|
43
|
+
`% lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representatives.fasta`
|
31
44
|
|
32
|
-
|
45
|
+
### 6. Get a fasta file containing all reads for each cluster
|
33
46
|
|
34
|
-
|
47
|
+
(time consuming and probably not necessary)
|
35
48
|
|
36
49
|
`% lederhosen split --clusters=clusters_97.5.txt --reads=joined.fasta --min-clst-size=100`
|
37
50
|
|
38
|
-
`--min-clst-size` is the minimum reads a cluster must have in order to for a fasta file containing its reads to be created. The reason for needing this because it is computationally prohibitive to randomly write millions of files or store all reads in memory, sort, and output non-randomly.
|
51
|
+
`--min-clst-size` is the minimum number of reads a cluster must have in order for a fasta file containing its reads to be created. This is needed because it is computationally prohibitive to randomly write millions of files or store all reads in memory, sort, and output non-randomly.
|
52
|
+
|
53
|
+
### 7. Identifying Clusters
|
54
|
+
|
55
|
+
(Still under development)
|
56
|
+
|
57
|
+
You need BLAT (in your `$PATH`) & TaxCollector.
|
58
|
+
|
59
|
+
`$ lederhosen name --reps=representatives.fasta --db=taxcollector.fa --output=output_prefix`
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 13
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 9
|
10
|
+
version: 0.0.9
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Austin G. Davis-Richardson
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-
|
18
|
+
date: 2012-05-01 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: dna
|
@@ -122,7 +122,9 @@ files:
|
|
122
122
|
- lib/lederhosen/helpers.rb
|
123
123
|
- lib/lederhosen/tasks/cluster.rb
|
124
124
|
- lib/lederhosen/tasks/join.rb
|
125
|
+
- lib/lederhosen/tasks/name.rb
|
125
126
|
- lib/lederhosen/tasks/otu_table.rb
|
127
|
+
- lib/lederhosen/tasks/rep_reads.rb
|
126
128
|
- lib/lederhosen/tasks/sort.rb
|
127
129
|
- lib/lederhosen/tasks/split.rb
|
128
130
|
- lib/lederhosen/tasks/trim.rb
|
@@ -164,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
164
166
|
requirements: []
|
165
167
|
|
166
168
|
rubyforge_project: lederhosen
|
167
|
-
rubygems_version: 1.8.
|
169
|
+
rubygems_version: 1.8.24
|
168
170
|
signing_key:
|
169
171
|
specification_version: 3
|
170
172
|
summary: 16S rRNA clustering for paired-end Illumina
|