lederhosen 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
##
# FINALLY, CLUSTER!
#
# Adds the `cluster` task to the command-line interface. The task hands
# control over to the external `uclust` program.
#

module Lederhosen
  class CLI

    desc "cluster fasta file",
         "--input=sorted.fasta --identity=0.80 --output=clusters.uc"

    method_option :input,    :type => :string,  :default => 'sorted.fasta'
    method_option :output,   :type => :string,  :default => 'clusters.uc'
    method_option :identity, :type => :numeric, :default => 0.8

    # Run `uclust` on the --input file, writing its result to --output (passed
    # to uclust as --uc) with --identity passed as the --id threshold.
    #
    # This method does not return: Kernel#exec replaces the current Ruby
    # process with uclust.
    def cluster
      identity = options[:identity]
      output   = options[:output]
      input    = options[:input]

      # Each argument is passed separately so that exec starts uclust directly
      # instead of going through a shell. File names containing spaces or shell
      # metacharacters are therefore handed to uclust unmodified.
      exec 'uclust',
           '--input', input.to_s,
           '--uc',    output.to_s,
           '--id',    identity.to_s
    end

  end
end
@@ -0,0 +1,35 @@
module Lederhosen
  class CLI

    ##
    # PAIRED-END READ WORK-AROUND (JOIN THEM)
    #
    desc "join reads end-to-end",
         "--trimmed=trimmed/*.fasta --output=joined.fasta"

    # The default glob must use a dot ("*.fasta"); a comma would match nothing.
    method_option :trimmed, :type => :string, :default => 'trimmed/*.fasta'
    method_option :output,  :type => :string, :default => 'joined.fasta'

    # Join consecutive pairs of records from every file matched by the
    # --trimmed glob into single records written to --output.
    #
    # For each pair (r, l) one FASTA record is emitted whose header is
    # "<r.name>:<input file basename without .fasta>" and whose sequence is
    # r.sequence reversed, followed by l.sequence.
    #
    # Raises RuntimeError when the glob matches no files or when a file holds
    # an odd number of records (a read without a mate).
    def join
      pattern = options[:trimmed]
      trimmed = Dir[pattern]
      output  = options[:output]

      # Report the glob itself; the list of matches is empty at this point.
      fail "no reads in #{pattern}" if trimmed.empty?

      pbar = ProgressBar.new "joining", trimmed.length

      # Block forms of File.open close every handle, even when an error is raised.
      File.open(output, 'w') do |out|
        trimmed.each do |fasta_file|
          pbar.inc
          sample = File.basename(fasta_file, '.fasta')

          File.open(fasta_file) do |handle|
            records = Dna.new handle
            records.each_slice(2) do |r, l|
              # each_slice yields a lone record (l == nil) when the count is odd.
              fail "unpaired read #{r.name} in #{fasta_file}" if l.nil?

              # NOTE(review): the first read is reversed, not reverse-complemented;
              # confirm this is the intended orientation.
              out.puts ">#{r.name}:#{sample}\n#{r.sequence.reverse+l.sequence}"
            end
          end
        end
      end

      pbar.finish
    end

  end
end
@@ -0,0 +1,61 @@
##
# MAKE TABLES
#

module Lederhosen
  class CLI

    # The usage string names the task as it is actually defined (otu_table).
    desc "otu_table generates otu tables & representative reads",
         "--clusters=clusters.uc --output=otu_prefix --joined=joined.fasta"

    method_option :clusters, :type => :string, :default => 'clusters.uc'
    method_option :output,   :type => :string, :default => 'otus'
    method_option :joined,   :type => :string, :default => 'joined.fasta'

    # Build the OTU outputs from a cluster file and the joined reads.
    #
    # Writes two files derived from the --output prefix:
    #   <output>.fasta  every read from --joined whose name is the seed of a
    #                   cluster, renamed to "<name>:cluster_<cluster id>"
    #   <output>.csv    whatever Helpers.cluster_data_as_csv returns for the
    #                   loaded cluster data
    #
    # Also prints summary statistics through `ohai`.
    def otu_table
      input        = options[:clusters]
      output       = options[:output]
      joined_reads = options[:joined]

      # Load cluster table!
      clusters   = Helpers.load_uc_file(input)
      count_data = clusters[:count_data]

      # Start from 0 so that an empty cluster table yields 0 rather than nil.
      clusters_total = count_data.values.inject(0) { |sum, x| sum + x[:total] }

      # Get representative sequences!
      # Maps the name of each cluster's seed read to that cluster's key.
      representatives = {}
      count_data.each { |k, x| representatives[x[:seed]] = k }

      reads_total = 0

      # Block form guarantees the output handle is closed on error as well.
      File.open("#{output}.fasta", 'w') do |out_handle|
        File.open(joined_reads) do |handle|
          records = Dna.new handle
          records.each do |dna|
            reads_total += 1
            cluster_id = representatives[dna.name]
            next if cluster_id.nil?

            dna.name = "#{dna.name}:cluster_#{cluster_id}"
            out_handle.puts dna
          end
        end
      end

      # Print some statistics
      ohai "reads in clusters:  #{clusters_total}"
      ohai "number of reads:    #{reads_total}"
      # Count the per-cluster entries, not the top-level keys of the loaded
      # structure (which include :count_data itself).
      ohai "unique clusters:    #{count_data.length}"

      # print OTU abundancy matrix
      csv = Helpers.cluster_data_as_csv(clusters)
      File.open("#{output}.csv", 'w') do |h|
        h.puts csv
      end
    end

  end
end
@@ -0,0 +1,22 @@
##
# SORT JOINED READS BY LENGTH
#

module Lederhosen
  class CLI

    desc "sort fasta file by length",
         "--input=joined.fasta --output=sorted.fasta"

    method_option :input,  :type => :string, :default => 'joined.fasta'
    method_option :output, :type => :string, :default => 'sorted.fasta'

    # Run `uclust --mergesort` on the --input file, writing to --output.
    #
    # Raises RuntimeError when uclust cannot be started or exits with a
    # non-zero status.
    def sort
      input  = options[:input]
      output = options[:output]

      # Multi-argument system starts uclust without a shell, so paths with
      # spaces or shell metacharacters are passed through untouched. It returns
      # false on a non-zero exit and nil when the command could not be run.
      ok = system('uclust', '--mergesort', input.to_s, '--output', output.to_s)
      fail "uclust --mergesort failed for #{input}" unless ok
    end

  end
end
@@ -0,0 +1,81 @@
##
# Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
#

require 'fileutils'

module Lederhosen
  class CLI

    desc "output separate fasta file containing sequences belonging to each cluster",
         "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"

    method_option :clusters,      :type => :string,  :default => 'clusters.uc'
    method_option :reads,         :type => :string,  :default => 'joined.fasta'
    method_option :out_dir,       :type => :string,  :default => 'clusters_split'
    method_option :buffer_size,   :type => :numeric, :default => 1000
    method_option :min_clst_size, :type => :numeric, :default => 100

    # Write the reads of every sufficiently large cluster to its own file,
    # <out_dir>/<cluster number>.fasta.
    #
    # The --clusters file is read line by line as whitespace-separated fields.
    # Only lines whose first field is 'S' or 'H' are used: field 1 is taken as
    # the cluster number and field 8 as the read name. Clusters with fewer
    # than --min-clst-size such lines are dropped. Reads from --reads are then
    # routed to their cluster's file through a Buffer.
    def split
      clusters       = options[:clusters]
      reads          = options[:reads]
      out_dir        = options[:out_dir]
      buffer_size    = options[:buffer_size]
      min_clst_size  = options[:min_clst_size]
      finalize_every = 100_000

      # Create the directory without going through a shell.
      FileUtils.mkdir_p out_dir

      ohai "loading #{clusters}"

      # Load read id -> cluster
      read_to_clusterid = Hash.new

      # keep track of cluster sizes
      cluster_counts = Hash.new { |h, k| h[k] = 0 }

      File.open(clusters) do |handle|
        handle.each do |line|
          fields = line.strip.split
          next unless fields[0] == 'S' || fields[0] == 'H'

          cluster_nr = fields[1]
          read_to_clusterid[fields[8]] = cluster_nr
          cluster_counts[cluster_nr] += 1
        end
      end

      # Forget reads that belong to clusters below the size cutoff.
      read_to_clusterid.delete_if do |read, cluster_nr|
        cluster_counts[cluster_nr] < min_clst_size
      end

      total_reads    = read_to_clusterid.length
      total_clusters = read_to_clusterid.values.uniq.length
      ohai "#{total_reads} reads in #{total_clusters} clusters"

      pbar = ProgressBar.new "saving", total_reads

      # Write reads to individual fasta files using Buffer
      buffer = Buffer.new :buffer_max => buffer_size
      File.open(reads) do |handle|
        records = Dna.new handle
        records.each_with_index do |record, i|
          cluster_id = read_to_clusterid[record.name]
          if cluster_id
            pbar.inc
            filename = File.join(out_dir, cluster_id + '.fasta')
            buffer[filename] << record
          end

          # Flush after every `finalize_every` records read, whether or not
          # this particular record was kept; otherwise the flush only happens
          # when the boundary record happens to belong to a kept cluster.
          buffer.finalize if (i + 1) % finalize_every == 0
        end
      end

      pbar.finish
      ohai "finalizing output"
      buffer.finalize # finish writing out

      puts "done"
    end
  end
end
@@ -0,0 +1,36 @@
##
# QUALITY TRIMMING
#

require 'fileutils'

module Lederhosen
  class CLI

    # --out_dir is a directory (it is created below), so the example shows one.
    desc "trim Illumina QSEQ files",
         "--reads_dir=reads/* --out_dir=trimmed/"

    method_option :reads_dir, :type => :string, :required => true
    method_option :out_dir,   :type => :string, :default => 'trimmed/'

    # Trim every group of QSEQ files found via --reads_dir and write one
    # FASTA file per group into --out_dir.
    #
    # Grouping is delegated to Helpers.get_grouped_qseq_files and trimming to
    # Helpers.trim_pairs, which is called with a minimum length of 70.
    def trim
      raw_reads = options[:reads_dir]
      out_dir   = options[:out_dir]

      # Create the directory without going through a shell.
      FileUtils.mkdir_p out_dir

      raw_reads = Helpers.get_grouped_qseq_files raw_reads
      ohai "found #{raw_reads.length} pairs of reads"

      pbar = ProgressBar.new "trimming", raw_reads.length

      # Each entry `a` is indexed as a[0] (used to name the output file) and
      # a[1][0], a[1][1] (the two inputs handed to trim_pairs).
      raw_reads.each do |a|
        pbar.inc
        out = File.join(out_dir, "#{File.basename(a[0])}.fasta")
        # TODO get total and trimmed
        # (the return value of trim_pairs is currently discarded)
        Helpers.trim_pairs a[1][0], a[1][1], out, :min_length => 70
      end

      pbar.finish
    end
  end
end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Lederhosen
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 23
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 3
10
- version: 0.0.3
9
+ - 4
10
+ version: 0.0.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Austin G. Davis-Richardson
@@ -120,6 +120,12 @@ files:
120
120
  - lib/lederhosen/buffer.rb
121
121
  - lib/lederhosen/cli.rb
122
122
  - lib/lederhosen/helpers.rb
123
+ - lib/lederhosen/tasks/cluster.rb
124
+ - lib/lederhosen/tasks/join.rb
125
+ - lib/lederhosen/tasks/otu_table.rb
126
+ - lib/lederhosen/tasks/sort.rb
127
+ - lib/lederhosen/tasks/split.rb
128
+ - lib/lederhosen/tasks/trim.rb
123
129
  - lib/version.rb
124
130
  - readme.md
125
131
  - spec/data/ILT_L_9_B_001_1.txt