lederhosen 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/lederhosen/tasks/cluster.rb +30 -0
- data/lib/lederhosen/tasks/join.rb +35 -0
- data/lib/lederhosen/tasks/otu_table.rb +61 -0
- data/lib/lederhosen/tasks/sort.rb +22 -0
- data/lib/lederhosen/tasks/split.rb +81 -0
- data/lib/lederhosen/tasks/trim.rb +36 -0
- data/lib/version.rb +1 -1
- metadata +9 -3
@@ -0,0 +1,30 @@
|
|
1
|
+
##
|
2
|
+
# FINALLY, CLUSTER!
|
3
|
+
#
|
4
|
+
|
5
|
+
module Lederhosen
  class CLI

    desc "cluster fasta file",
         "--input=sorted.fasta --identity=0.80 --output=clusters.uc"

    method_option :input,    :type => :string,  :default => 'sorted.fasta'
    method_option :output,   :type => :string,  :default => 'clusters.uc'
    method_option :identity, :type => :numeric, :default => 0.8

    # Cluster the reads of a FASTA file by handing off to the external
    # `uclust` program.
    #
    # Options read:
    #   :input    - value passed to uclust's --input flag
    #   :output   - value passed to uclust's --uc flag
    #   :identity - value passed to uclust's --id flag
    #
    # This method does not return on success: `exec` replaces the current
    # Ruby process with uclust.
    def cluster
      identity = options[:identity]
      output   = options[:output]
      input    = options[:input]

      # Hand the arguments over as a list rather than one interpolated string,
      # so that no shell is involved and file names containing spaces or shell
      # metacharacters reach uclust unchanged.
      exec 'uclust',
           '--input', input.to_s,
           '--uc',    output.to_s,
           '--id',    identity.to_s
    end

  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Lederhosen
  class CLI

    ##
    # PAIRED-END READ WORK-AROUND (JOIN THEM)
    #
    desc "join reads end-to-end",
         "--trimmed=trimmed/*.fasta --output=joined.fasta"

    # The default glob matches the usage string above ('*.fasta', not '*,fasta').
    method_option :trimmed, :type => :string, :default => 'trimmed/*.fasta'
    method_option :output,  :type => :string, :default => 'joined.fasta'

    # Join paired reads end-to-end and write them to a single FASTA file.
    #
    # Options read:
    #   :trimmed - glob pattern selecting the input FASTA files
    #   :output  - path of the FASTA file to write
    #
    # For every input file the records are consumed two at a time; each pair
    # is written as one record whose header is "<first name>:<file basename>"
    # and whose sequence is the first record's sequence reversed, followed by
    # the second record's sequence.
    #
    # Raises RuntimeError when the glob matches no files.
    def join
      pattern     = options[:trimmed]
      output_path = options[:output]

      trimmed = Dir[pattern]

      # Report the pattern that was searched; the list itself is empty here.
      fail "no reads in #{pattern}" if trimmed.empty?

      pbar = ProgressBar.new "joining", trimmed.length

      # Block forms make sure every handle is flushed and closed, even when
      # an error interrupts the loop.
      File.open(output_path, 'w') do |out|
        trimmed.each do |fasta_file|
          pbar.inc
          sample = File.basename(fasta_file, '.fasta')

          File.open(fasta_file) do |handle|
            records = Dna.new handle
            # NOTE(review): a file holding an odd number of records leaves `l`
            # nil for the last slice, and `l.sequence` then raises
            # NoMethodError - confirm inputs always hold complete pairs.
            records.each_slice(2) do |r, l|
              out.puts ">#{r.name}:#{sample}\n#{r.sequence.reverse+l.sequence}"
            end
          end
        end
      end

      pbar.finish
    end

  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
##
|
2
|
+
# MAKE TABLES
|
3
|
+
#
|
4
|
+
|
5
|
+
module Lederhosen
  class CLI

    desc "otu_tables generates otu tables & representative reads",
         "--clusters=clusters.uc --output=otu_prefix --joined=joined.fasta"

    method_option :clusters, :type => :string, :default => 'clusters.uc'
    method_option :output,   :type => :string, :default => 'otus'
    method_option :joined,   :type => :string, :default => 'joined.fasta'

    # Write the representative reads and the OTU abundance table.
    #
    # Options read:
    #   :clusters - cluster table, loaded through Helpers.load_uc_file
    #   :output   - prefix for the two files written:
    #               "<prefix>.fasta" and "<prefix>.csv"
    #   :joined   - FASTA file of joined reads to scan for representatives
    #
    # Every read whose name is the seed of a cluster is copied to
    # "<prefix>.fasta" with ":cluster_<id>" appended to its name. The CSV is
    # whatever Helpers.cluster_data_as_csv produces for the loaded clusters.
    def otu_table
      input        = options[:clusters]
      output       = options[:output]
      joined_reads = options[:joined]

      # Load cluster table!
      clusters = Helpers.load_uc_file(input)

      clusters_total = clusters[:count_data].values.map { |x| x[:total] }.inject(:+)

      # Get representative sequences!
      # Maps the seed read name of each cluster to that cluster's key.
      reads_total = 0
      representatives = {}
      clusters[:count_data].each { |k, x| representatives[x[:seed]] = k }

      # Block form closes the output even if reading the joined reads fails.
      File.open("#{output}.fasta", 'w') do |out_handle|
        File.open(joined_reads) do |handle|
          records = Dna.new handle
          records.each do |dna|
            reads_total += 1
            cluster_id = representatives[dna.name]
            next if cluster_id.nil? # not a cluster seed

            dna.name = "#{dna.name}:cluster_#{cluster_id}"
            out_handle.puts dna
          end
        end
      end

      # Print some statistics
      ohai "reads in clusters:  #{clusters_total}"
      ohai "number of reads:    #{reads_total}"
      # NOTE(review): this counts the top-level keys of the loaded structure,
      # not the entries of clusters[:count_data] - confirm which is intended.
      ohai "unique clusters:    #{clusters.keys.length}"

      # print OTU abundancy matrix
      csv = Helpers.cluster_data_as_csv(clusters)
      File.open("#{output}.csv", 'w') do |h|
        h.puts csv
      end

    end

  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
##
|
2
|
+
# SORT JOINED READS BY LENGTH
|
3
|
+
#
|
4
|
+
|
5
|
+
module Lederhosen
  class CLI

    desc "sort fasta file by length",
         "--input=joined.fasta --output=sorted.fasta"

    # Each option is declared once; a separate bulk `method_options`
    # declaration for the same names would only be overridden by these.
    method_option :input,  :type => :string, :default => 'joined.fasta'
    method_option :output, :type => :string, :default => 'sorted.fasta'

    # Sort a FASTA file by running the external `uclust --mergesort`.
    #
    # Options read:
    #   :input  - FASTA file handed to --mergesort
    #   :output - path handed to --output
    #
    # Raises RuntimeError when uclust cannot be started or exits with a
    # non-zero status, instead of silently carrying on without a sorted file.
    def sort
      input  = options[:input]
      output = options[:output]

      # Argument-list form: no shell is involved, so paths containing spaces
      # or metacharacters are passed through unchanged.
      succeeded = system('uclust', '--mergesort', input.to_s, '--output', output.to_s)
      fail "uclust --mergesort failed for #{input}" unless succeeded
    end

  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
##
|
2
|
+
# Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
|
3
|
+
#
|
4
|
+
|
5
|
+
require 'fileutils'

module Lederhosen
  class CLI

    desc "output separate fasta file containing sequences belonging to each cluster",
         "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"

    method_option :clusters,      :type => :string,  :default => 'clusters.uc'
    method_option :reads,         :type => :string,  :default => 'joined.fasta'
    method_option :out_dir,       :type => :string,  :default => 'clusters_split'
    method_option :buffer_size,   :type => :numeric, :default => 1000
    method_option :min_clst_size, :type => :numeric, :default => 100

    # Write one FASTA file per cluster, holding the reads assigned to it.
    #
    # Options read:
    #   :clusters      - cluster table to parse
    #   :reads         - FASTA file containing the reads
    #   :out_dir       - directory receiving "<cluster>.fasta" files
    #   :buffer_size   - passed to Buffer.new as :buffer_max
    #   :min_clst_size - clusters with fewer reads than this are dropped
    def split
      clusters       = options[:clusters]
      reads          = options[:reads]
      out_dir        = options[:out_dir]
      buffer_size    = options[:buffer_size]
      min_clst_size  = options[:min_clst_size]
      finalize_every = 100_000

      # Create the directory without going through a shell, so unusual
      # characters in the path cannot be misinterpreted.
      FileUtils.mkdir_p(out_dir)

      ohai "loading #{clusters}"

      # Load read id -> cluster
      read_to_clusterid = {}

      # keep track of cluster sizes
      cluster_counts = Hash.new(0)

      File.open(clusters) do |handle|
        handle.each do |line|
          fields = line.strip.split
          # Only lines whose first field is 'S' or 'H' are used.
          next unless fields[0] == 'S' || fields[0] == 'H'

          cluster_nr = fields[1]
          read       = fields[8]
          read_to_clusterid[read] = cluster_nr
          cluster_counts[cluster_nr] += 1
        end
      end

      # Forget reads that belong to clusters below the size cutoff.
      read_to_clusterid.delete_if do |_read, cluster_nr|
        cluster_counts[cluster_nr] < min_clst_size
      end

      total_reads    = read_to_clusterid.length
      total_clusters = read_to_clusterid.values.uniq.length
      ohai "#{total_reads} reads in #{total_clusters} clusters"

      pbar = ProgressBar.new "saving", total_reads

      # Write reads to individual fasta files using Buffer
      buffer = Buffer.new :buffer_max => buffer_size
      File.open(reads) do |handle|
        records = Dna.new handle
        records.each_with_index do |record, i|
          cluster_id = read_to_clusterid[record.name]
          next unless cluster_id

          pbar.inc
          filename = File.join(out_dir, "#{cluster_id}.fasta")
          buffer[filename] << record
          # NOTE(review): this periodic flush is only reached for reads that
          # were kept, so it fires only when a kept read lands exactly on a
          # multiple of finalize_every (including index 0) - confirm intent.
          buffer.finalize if (i % finalize_every == 0)
        end
      end

      pbar.finish
      ohai "finalizing output"
      buffer.finalize # finish writing out

      puts "done"
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
##
|
2
|
+
# QUALITY TRIMMING
|
3
|
+
#
|
4
|
+
|
5
|
+
require 'fileutils'

module Lederhosen
  class CLI

    desc "trim Illumina QSEQ files",
         "--reads_dir=reads/* --out_dir=trimmed.fasta"

    method_option :reads_dir, :type => :string, :required => true
    method_option :out_dir,   :type => :string, :default => 'trimmed/'

    # Quality-trim grouped read files, writing one FASTA file per group.
    #
    # Options read:
    #   :reads_dir - handed to Helpers.get_grouped_qseq_files to find the reads
    #   :out_dir   - directory receiving "<group name>.fasta" files
    #
    # Each group is trimmed by Helpers.trim_pairs with :min_length => 70.
    def trim
      reads_dir = options[:reads_dir]
      out_dir   = options[:out_dir]

      # Create the directory without going through a shell, so unusual
      # characters in the path cannot be misinterpreted.
      FileUtils.mkdir_p(out_dir)

      grouped_reads = Helpers.get_grouped_qseq_files reads_dir
      ohai "found #{grouped_reads.length} pairs of reads"

      pbar = ProgressBar.new "trimming", grouped_reads.length

      grouped_reads.each do |group|
        pbar.inc
        # group[0] names the group; group[1] holds its two read files.
        out = File.join(out_dir, "#{File.basename(group[0])}.fasta")
        # TODO get total and trimmed
        # (presumably the two values trim_pairs returns; they are currently
        # discarded - verify against Helpers.trim_pairs)
        Helpers.trim_pairs group[1][0], group[1][1], out, :min_length => 70
      end

      pbar.finish
    end
  end
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 4
|
10
|
+
version: 0.0.4
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Austin G. Davis-Richardson
|
@@ -120,6 +120,12 @@ files:
|
|
120
120
|
- lib/lederhosen/buffer.rb
|
121
121
|
- lib/lederhosen/cli.rb
|
122
122
|
- lib/lederhosen/helpers.rb
|
123
|
+
- lib/lederhosen/tasks/cluster.rb
|
124
|
+
- lib/lederhosen/tasks/join.rb
|
125
|
+
- lib/lederhosen/tasks/otu_table.rb
|
126
|
+
- lib/lederhosen/tasks/sort.rb
|
127
|
+
- lib/lederhosen/tasks/split.rb
|
128
|
+
- lib/lederhosen/tasks/trim.rb
|
123
129
|
- lib/version.rb
|
124
130
|
- readme.md
|
125
131
|
- spec/data/ILT_L_9_B_001_1.txt
|