lederhosen 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/lederhosen/cli.rb +7 -204
- data/lib/version.rb +1 -1
- metadata +3 -3
data/lib/lederhosen/cli.rb
CHANGED
@@ -1,212 +1,15 @@
|
|
1
1
|
module Lederhosen
|
2
2
|
class CLI < Thor
|
3
|
-
|
4
|
-
##
|
5
|
-
# QUALITY TRIMMING
|
6
|
-
#
|
7
|
-
desc "trim Illumina QSEQ files", "--reads_dir=reads/* --out_dir=trimmed.fasta"
|
8
|
-
method_options :reads_dir => :string, :out_dir => :string
|
9
|
-
def trim
|
10
3
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
raw_reads = Helpers.get_grouped_qseq_files raw_reads
|
17
|
-
puts "found #{raw_reads.length} pairs of reads"
|
18
|
-
puts "trimming!"
|
19
|
-
|
20
|
-
pbar = ProgressBar.new "trimming", raw_reads.length
|
21
|
-
|
22
|
-
raw_reads.each do |a|
|
23
|
-
pbar.inc
|
24
|
-
out = File.join(out_dir, "#{File.basename(a[0])}.fasta")
|
25
|
-
# TODO get total and trimmed
|
26
|
-
total, trimmed = Helpers.trim_pairs a[1][0], a[1][1], out, :min_length => 70
|
27
|
-
end
|
28
|
-
|
29
|
-
pbar.finish
|
30
|
-
end
|
31
|
-
|
32
|
-
##
|
33
|
-
# PAIRED-END READ WORK-AROUND (JOIN THEM)
|
34
|
-
#
|
35
|
-
desc "join reads end-to-end", "--trimmed=trimmed/*.fasta --output=joined.fasta"
|
36
|
-
method_options :trimmed => :string, :output => :string
|
37
|
-
def join
|
38
|
-
puts "joining!"
|
39
|
-
|
40
|
-
trimmed = Dir[options[:trimmed] || 'trimmed/*.fasta']
|
41
|
-
output = options[:output] || 'joined.fasta'
|
42
|
-
|
43
|
-
fail "no reads in #{trimmed}" if trimmed.length == 0
|
44
|
-
|
45
|
-
output = File.open(output, 'w')
|
46
|
-
|
47
|
-
pbar = ProgressBar.new "joining", trimmed.length
|
48
|
-
|
49
|
-
trimmed.each do |fasta_file|
|
50
|
-
pbar.inc
|
51
|
-
records = Dna.new File.open(fasta_file)
|
52
|
-
records.each_slice(2) do |r, l|
|
53
|
-
output.puts ">#{r.name}:#{File.basename(fasta_file, '.fasta')}\n#{r.sequence.reverse+l.sequence}"
|
54
|
-
end
|
4
|
+
no_tasks do
|
5
|
+
# just print string to STDERR
|
6
|
+
def ohai(s)
|
7
|
+
$stderr.puts s
|
55
8
|
end
|
56
|
-
pbar.finish
|
57
9
|
end
|
58
10
|
|
59
|
-
|
60
|
-
# SORT JOINED READS BY LENGTH
|
61
|
-
#
|
62
|
-
desc "sort fasta file by length", "--input=joined.fasta --output=sorted.fasta"
|
63
|
-
method_options :input => :string, :output => :string
|
64
|
-
def sort
|
65
|
-
input = options[:input] || 'joined.fasta'
|
66
|
-
output = options[:output] || 'sorted.fasta'
|
67
|
-
`uclust --mergesort #{input} --output #{output}`
|
68
|
-
end
|
69
|
-
|
70
|
-
##
|
71
|
-
# FINALLY, CLUSTER!
|
72
|
-
#
|
73
|
-
desc "cluster fasta file", "--input=sorted.fasta --identity=0.80 --output=clusters.uc"
|
74
|
-
method_options :input => :string, :output => :string, :identity => :float
|
75
|
-
def cluster
|
76
|
-
identity = options[:identity] || 0.8
|
77
|
-
output = options[:output] || 'clusters.uc'
|
78
|
-
input = options[:input] || 'sorted.fasta'
|
79
|
-
|
80
|
-
cmd = [
|
81
|
-
'uclust',
|
82
|
-
"--input #{input}",
|
83
|
-
"--uc #{output}",
|
84
|
-
"--id #{identity}",
|
85
|
-
].join(' ')
|
86
|
-
exec cmd
|
87
|
-
end
|
88
|
-
|
89
|
-
##
|
90
|
-
# MAKE TABLES
|
91
|
-
#
|
92
|
-
desc "otu_tables generates otu tables & representative reads", "--clusters=clusters.uc --output=otu_prefix --joined=joined.fasta"
|
93
|
-
method_options :clusters => :string, :output => :string, :joined => :string
|
94
|
-
def otu_table
|
95
|
-
input = options[:clusters] || 'clusters.uc'
|
96
|
-
output = options[:output] || 'otus'
|
97
|
-
joined_reads = options[:joined] || 'joined.fasta'
|
98
|
-
|
99
|
-
clusters = Hash.new
|
100
|
-
|
101
|
-
# Load cluster table!
|
102
|
-
clusters = Helpers.load_uc_file(input)
|
103
|
-
|
104
|
-
clusters_total = clusters[:count_data].values.collect{ |x| x[:total] }.inject(:+)
|
105
|
-
|
106
|
-
# Get representative sequences!
|
107
|
-
reads_total = 0
|
108
|
-
representatives = {}
|
109
|
-
clusters[:count_data].each{ |k, x| representatives[x[:seed]] = k }
|
110
|
-
|
111
|
-
out_handle = File.open("#{output}.fasta", 'w')
|
112
|
-
|
113
|
-
File.open(joined_reads) do |handle|
|
114
|
-
records = Dna.new handle
|
115
|
-
records.each do |dna|
|
116
|
-
reads_total += 1
|
117
|
-
if !representatives[dna.name].nil?
|
118
|
-
dna.name = "#{dna.name}:cluster_#{representatives[dna.name]}"
|
119
|
-
out_handle.puts dna
|
120
|
-
end
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
out_handle.close
|
125
|
-
|
126
|
-
# Print some statistics
|
127
|
-
puts "reads in clusters: #{clusters_total}"
|
128
|
-
puts "number of reads: #{reads_total}"
|
129
|
-
puts "unique clusters: #{clusters.keys.length}"
|
130
|
-
|
131
|
-
# print OTU abundancy matrix
|
132
|
-
csv = Helpers.cluster_data_as_csv(clusters)
|
133
|
-
File.open("#{output}.csv", 'w') do |h|
|
134
|
-
h.puts csv
|
135
|
-
end
|
136
|
-
|
137
|
-
end
|
138
|
-
|
139
|
-
##
|
140
|
-
# Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
|
141
|
-
#
|
142
|
-
desc "output separate fasta file containing sequences belonging to each cluster", "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"
|
143
|
-
method_options :clusters => :string, :reads=> :string, :buffer_size => :int, :min_clst_size => :int, :out_dir => :string
|
144
|
-
def split
|
145
|
-
clusters = options[:clusters] || 'clusters.uc'
|
146
|
-
reads = options[:reads] || 'joined.fasta'
|
147
|
-
out_dir = options[:out_dir] || 'clusters_split'
|
148
|
-
buffer_size = (options[:buffer_size] || 1000).to_i
|
149
|
-
min_clst_size = (options[:min_clst_size] || 100).to_i
|
150
|
-
finalize_every = 100_000
|
151
|
-
|
152
|
-
`mkdir -p #{out_dir}/`
|
153
|
-
|
154
|
-
puts "loading #{clusters}"
|
155
|
-
|
156
|
-
# Load read id -> cluster
|
157
|
-
read_to_clusterid = Hash.new
|
158
|
-
|
159
|
-
# keep track of cluster sizes
|
160
|
-
cluster_counts = Hash.new { |h, k| h[k] = 0}
|
161
|
-
|
162
|
-
File.open(clusters)do |handle|
|
163
|
-
handle.each do |line|
|
164
|
-
line = line.strip.split
|
165
|
-
cluster_nr = line[1]
|
166
|
-
if line[0] == 'S' || line[0] == 'H'
|
167
|
-
read = line[8]
|
168
|
-
else
|
169
|
-
next
|
170
|
-
end
|
171
|
-
read_to_clusterid[read] = cluster_nr
|
172
|
-
cluster_counts[cluster_nr] += 1
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
read_to_clusterid.delete_if do |read, cluster_nr|
|
177
|
-
cluster_counts[cluster_nr] < min_clst_size
|
178
|
-
end
|
179
|
-
|
180
|
-
total_reads = read_to_clusterid.length
|
181
|
-
total_clusters = read_to_clusterid.values.uniq.length
|
182
|
-
puts "#{total_reads} reads in #{total_clusters} clusters"
|
183
|
-
|
184
|
-
puts "writing out fasta files"
|
185
|
-
|
186
|
-
pbar = ProgressBar.new "writing", total_reads
|
187
|
-
|
188
|
-
# Write reads to individual fasta files using Buffer
|
189
|
-
buffer = Buffer.new :buffer_max => buffer_size
|
190
|
-
File.open(reads) do |handle|
|
191
|
-
records = Dna.new handle
|
192
|
-
$stderr.puts "reads = #{reads}"
|
193
|
-
records.each_with_index do |record, i|
|
194
|
-
cluster_id = read_to_clusterid[record.name]
|
195
|
-
if cluster_id
|
196
|
-
pbar.inc
|
197
|
-
filename = File.join(out_dir, cluster_id + '.fasta')
|
198
|
-
buffer[filename] << record
|
199
|
-
buffer.finalize if (i%finalize_every == 0)
|
200
|
-
end
|
201
|
-
end
|
202
|
-
end
|
203
|
-
|
204
|
-
pbar.finish
|
205
|
-
puts "finalizing output"
|
206
|
-
buffer.finalize # finish writing out
|
11
|
+
end # class CLI
|
207
12
|
|
208
|
-
|
209
|
-
end
|
13
|
+
end # module
|
210
14
|
|
211
|
-
|
212
|
-
end # module
|
15
|
+
Dir.glob(File.join(File.dirname(__FILE__), 'tasks', '*.rb')).each { |f| require f }
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 2
|
10
|
-
version: 0.0.2
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Austin G. Davis-Richardson
|