lederhosen 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/lederhosen/tasks/cluster.rb +30 -0
- data/lib/lederhosen/tasks/join.rb +35 -0
- data/lib/lederhosen/tasks/otu_table.rb +61 -0
- data/lib/lederhosen/tasks/sort.rb +22 -0
- data/lib/lederhosen/tasks/split.rb +81 -0
- data/lib/lederhosen/tasks/trim.rb +36 -0
- data/lib/version.rb +1 -1
- metadata +9 -3
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
##
|
|
2
|
+
# FINALLY, CLUSTER!
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
module Lederhosen
  class CLI

    desc "cluster fasta file",
         "--input=sorted.fasta --identity=0.80 --output=clusters.uc"

    method_option :input,    :type => :string,  :default => 'sorted.fasta'
    method_option :output,   :type => :string,  :default => 'clusters.uc'
    method_option :identity, :type => :numeric, :default => 0.8

    # Runs uclust on the --input file, passing --output as the --uc
    # argument and --identity as the --id argument.
    #
    # The command is handed to Kernel#exec, which replaces the current Ruby
    # process with uclust; nothing after the exec call runs.
    def cluster
      identity = options[:identity]
      output   = options[:output]
      input    = options[:input]

      # Pass the arguments as a list rather than one interpolated string so
      # that no shell is involved: file names containing spaces or shell
      # metacharacters reach uclust unchanged.
      exec 'uclust',
           '--input', input.to_s,
           '--uc',    output.to_s,
           '--id',    identity.to_s
    end

  end
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
module Lederhosen
  class CLI

    ##
    # PAIRED-END READ WORK-AROUND (JOIN THEM)
    #
    desc "join reads end-to-end",
         "--trimmed=trimmed/*.fasta --output=joined.fasta"

    # The default glob previously read 'trimmed/*,fasta' (comma instead of
    # dot), which could never match the files named in the usage string.
    method_option :trimmed, :type => :string, :default => 'trimmed/*.fasta'
    method_option :output,  :type => :string, :default => 'joined.fasta'

    # Joins consecutive record pairs from every fasta file matching the
    # --trimmed glob and writes them to the --output file.
    #
    # For each pair (r, l) one record is written whose header is
    # ">" + r.name + ":" + the file's basename (without ".fasta") and whose
    # sequence is r.sequence reversed followed by l.sequence.
    #
    # Fails if the glob matches no files, or if a file holds an odd number
    # of records (the last record would have no mate).
    def join
      pattern = options[:trimmed]
      trimmed = Dir[pattern]

      # Report the pattern the user gave; the matched list is empty here.
      fail "no reads in #{pattern}" if trimmed.empty?

      pbar = ProgressBar.new "joining", trimmed.length

      # Block forms of File.open close both handles even if a read raises.
      File.open(options[:output], 'w') do |out|
        trimmed.each do |fasta_file|
          pbar.inc
          sample = File.basename(fasta_file, '.fasta')

          File.open(fasta_file) do |handle|
            Dna.new(handle).each_slice(2) do |r, l|
              fail "unpaired read #{r.name} in #{fasta_file}" if l.nil?
              out.puts ">#{r.name}:#{sample}\n#{r.sequence.reverse + l.sequence}"
            end
          end
        end
      end

      pbar.finish
    end

  end
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
##
|
|
2
|
+
# MAKE TABLES
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
module Lederhosen
  class CLI

    desc "otu_tables generates otu tables & representative reads",
         "--clusters=clusters.uc --output=otu_prefix --joined=joined.fasta"

    method_option :clusters, :type => :string, :default => 'clusters.uc'
    method_option :output,   :type => :string, :default => 'otus'
    method_option :joined,   :type => :string, :default => 'joined.fasta'

    # Writes two files derived from a cluster table and the joined reads:
    #
    # * "<output>.fasta" - every read in --joined whose name is the seed of
    #   a cluster, renamed to "<name>:cluster_<cluster key>".
    # * "<output>.csv"   - whatever Helpers.cluster_data_as_csv returns for
    #   the loaded cluster data.
    #
    # Also prints a few summary counts via ohai.
    def otu_table
      input        = options[:clusters]
      output       = options[:output]
      joined_reads = options[:joined]

      # Load cluster table!
      clusters   = Helpers.load_uc_file(input)
      count_data = clusters[:count_data]

      # Start from 0 so an empty table reports 0 rather than nil.
      clusters_total = count_data.values.map { |x| x[:total] }.inject(0, :+)

      # Get representative sequences!
      # Maps each cluster's seed read name to that cluster's key.
      representatives = {}
      count_data.each { |cluster_id, data| representatives[data[:seed]] = cluster_id }

      reads_total = 0

      # Block form closes the output even if reading the joined file raises.
      File.open("#{output}.fasta", 'w') do |out_handle|
        File.open(joined_reads) do |handle|
          Dna.new(handle).each do |dna|
            reads_total += 1
            cluster_id = representatives[dna.name]
            next if cluster_id.nil?

            dna.name = "#{dna.name}:cluster_#{cluster_id}"
            out_handle.puts dna
          end
        end
      end

      # Print some statistics
      ohai "reads in clusters:  #{clusters_total}"
      ohai "number of reads:    #{reads_total}"
      # NOTE(review): this counts the top-level keys of the loaded table
      # (one of which is :count_data), not the entries of :count_data.
      # Confirm against Helpers.load_uc_file whether that is the intended
      # "unique clusters" figure.
      ohai "unique clusters:    #{clusters.keys.length}"

      # print OTU abundance matrix
      csv = Helpers.cluster_data_as_csv(clusters)
      File.open("#{output}.csv", 'w') do |h|
        h.puts csv
      end
    end

  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
##
|
|
2
|
+
# SORT JOINED READS BY LENGTH
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
module Lederhosen
  class CLI

    desc "sort fasta file by length",
         "--input=joined.fasta --output=sorted.fasta"

    # A former `method_options :input => :string, :output => :string` line
    # was dropped: the two declarations below define the same option names.
    method_option :input,  :type => :string, :default => 'joined.fasta'
    method_option :output, :type => :string, :default => 'sorted.fasta'

    # Runs `uclust --mergesort <input> --output <output>`.
    #
    # Fails if uclust cannot be started or exits with a non-zero status,
    # instead of silently continuing with a missing or partial output file.
    def sort
      input  = options[:input]
      output = options[:output]

      # Argument-list form of system: no shell is involved, so file names
      # with spaces or shell metacharacters are passed through unchanged.
      succeeded = system('uclust', '--mergesort', input.to_s, '--output', output.to_s)
      fail "uclust --mergesort failed for #{input}" unless succeeded
    end

  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
##
|
|
2
|
+
# Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
require 'fileutils'

module Lederhosen
  class CLI

    desc "output separate fasta file containing sequences belonging to each cluster",
         "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"

    method_option :clusters,      :type => :string,  :default => 'clusters.uc'
    method_option :reads,         :type => :string,  :default => 'joined.fasta'
    method_option :out_dir,       :type => :string,  :default => 'clusters_split'
    method_option :buffer_size,   :type => :numeric, :default => 1000
    method_option :min_clst_size, :type => :numeric, :default => 100

    # Writes one fasta file per cluster into --out_dir, named
    # "<cluster number>.fasta", containing the reads from --reads that the
    # --clusters file assigns to that cluster.
    #
    # Only lines of the cluster file whose first whitespace-separated field
    # is 'S' or 'H' are used: field 2 is taken as the cluster number and
    # field 9 as the read name. Clusters with fewer than --min_clst_size
    # such lines are skipped.
    def split
      clusters       = options[:clusters]
      reads          = options[:reads]
      out_dir        = options[:out_dir]
      buffer_size    = options[:buffer_size]
      min_clst_size  = options[:min_clst_size]
      finalize_every = 100_000

      # Same effect as `mkdir -p`, without going through a shell.
      FileUtils.mkdir_p out_dir

      ohai "loading #{clusters}"

      # Load read id -> cluster
      read_to_clusterid = {}

      # keep track of cluster sizes
      cluster_counts = Hash.new { |h, k| h[k] = 0 }

      File.open(clusters) do |handle|
        handle.each do |line|
          fields = line.strip.split
          next unless fields[0] == 'S' || fields[0] == 'H'

          cluster_nr = fields[1]
          read_to_clusterid[fields[8]] = cluster_nr
          cluster_counts[cluster_nr] += 1
        end
      end

      # Drop reads that belong to clusters below the size cutoff.
      read_to_clusterid.delete_if do |_read, cluster_nr|
        cluster_counts[cluster_nr] < min_clst_size
      end

      total_reads    = read_to_clusterid.length
      total_clusters = read_to_clusterid.values.uniq.length
      ohai "#{total_reads} reads in #{total_clusters} clusters"

      pbar = ProgressBar.new "saving", total_reads

      # Write reads to individual fasta files using Buffer
      buffer  = Buffer.new :buffer_max => buffer_size
      written = 0

      File.open(reads) do |handle|
        Dna.new(handle).each do |record|
          cluster_id = read_to_clusterid[record.name]
          next unless cluster_id

          pbar.inc
          filename = File.join(out_dir, "#{cluster_id}.fasta")
          buffer[filename] << record

          # Finalize after every `finalize_every` records actually written.
          # Keying this on the raw record index only fired when a clustered
          # read happened to sit at a multiple of that index.
          written += 1
          buffer.finalize if (written % finalize_every).zero?
        end
      end

      pbar.finish
      ohai "finalizing output"
      buffer.finalize # finish writing out

      puts "done"
    end

  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
##
|
|
2
|
+
# QUALITY TRIMMING
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
require 'fileutils'

module Lederhosen
  class CLI

    desc "trim Illumina QSEQ files",
         "--reads_dir=reads/* --out_dir=trimmed.fasta"

    method_option :reads_dir,  :type => :string,  :required => true
    method_option :out_dir,    :type => :string,  :default => 'trimmed/'
    # Previously hard-coded as 70 in the call to Helpers.trim_pairs.
    method_option :min_length, :type => :numeric, :default => 70

    # Trims every group of reads found under --reads_dir and writes one
    # "<group name>.fasta" file per group into --out_dir.
    #
    # Groups come from Helpers.get_grouped_qseq_files; for each group the
    # first two files are passed to Helpers.trim_pairs together with the
    # output path and the :min_length option.
    def trim
      out_dir    = options[:out_dir]
      min_length = options[:min_length]

      # Same effect as `mkdir -p`, without going through a shell.
      FileUtils.mkdir_p out_dir

      raw_reads = Helpers.get_grouped_qseq_files options[:reads_dir]
      ohai "found #{raw_reads.length} pairs of reads"

      pbar = ProgressBar.new "trimming", raw_reads.length

      raw_reads.each do |group|
        pbar.inc
        group_name = group[0]
        files      = group[1]
        out = File.join(out_dir, "#{File.basename(group_name)}.fasta")

        # TODO get total and trimmed (the return value is currently unused)
        Helpers.trim_pairs files[0], files[1], out, :min_length => min_length
      end

      pbar.finish
    end

  end
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lederhosen
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
hash:
|
|
4
|
+
hash: 23
|
|
5
5
|
prerelease:
|
|
6
6
|
segments:
|
|
7
7
|
- 0
|
|
8
8
|
- 0
|
|
9
|
-
-
|
|
10
|
-
version: 0.0.
|
|
9
|
+
- 4
|
|
10
|
+
version: 0.0.4
|
|
11
11
|
platform: ruby
|
|
12
12
|
authors:
|
|
13
13
|
- Austin G. Davis-Richardson
|
|
@@ -120,6 +120,12 @@ files:
|
|
|
120
120
|
- lib/lederhosen/buffer.rb
|
|
121
121
|
- lib/lederhosen/cli.rb
|
|
122
122
|
- lib/lederhosen/helpers.rb
|
|
123
|
+
- lib/lederhosen/tasks/cluster.rb
|
|
124
|
+
- lib/lederhosen/tasks/join.rb
|
|
125
|
+
- lib/lederhosen/tasks/otu_table.rb
|
|
126
|
+
- lib/lederhosen/tasks/sort.rb
|
|
127
|
+
- lib/lederhosen/tasks/split.rb
|
|
128
|
+
- lib/lederhosen/tasks/trim.rb
|
|
123
129
|
- lib/version.rb
|
|
124
130
|
- readme.md
|
|
125
131
|
- spec/data/ILT_L_9_B_001_1.txt
|