lederhosen 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,30 @@
##
# FINALLY, CLUSTER!
#
# Thor task that hands a FASTA file to the external `uclust` program.

module Lederhosen
  class CLI

    desc "cluster fasta file",
         "--input=sorted.fasta --identity=0.80 --output=clusters.uc"

    method_option :input,    :type => :string,  :default => 'sorted.fasta'
    method_option :output,   :type => :string,  :default => 'clusters.uc'
    method_option :identity, :type => :numeric, :default => 0.8

    # Runs `uclust --input <input> --uc <output> --id <identity>`.
    #
    # Options (read from +options+):
    #   :input    - path given to uclust's --input flag
    #   :output   - path given to uclust's --uc flag
    #   :identity - number given to uclust's --id flag
    #
    # Does not return on success: Kernel#exec replaces the current process
    # with uclust.
    def cluster
      identity = options[:identity]
      output   = options[:output]
      input    = options[:input]

      # Each argument is passed as its own argv entry so that exec never goes
      # through the shell: paths containing spaces or shell metacharacters
      # reach uclust unchanged instead of being split or interpreted.
      exec 'uclust',
           '--input', input.to_s,
           '--uc',    output.to_s,
           '--id',    identity.to_s
    end

  end
end
@@ -0,0 +1,35 @@
module Lederhosen
  class CLI

    ##
    # PAIRED-END READ WORK-AROUND (JOIN THEM)
    #
    desc "join reads end-to-end",
         "--trimmed=trimmed/*.fasta --output=joined.fasta"

    # The default glob uses a dot ('*.fasta'), matching the help text above.
    method_option :trimmed, :type => :string, :default => 'trimmed/*.fasta'
    method_option :output,  :type => :string, :default => 'joined.fasta'

    # Joins consecutive pairs of records from every file matched by the
    # :trimmed glob and writes them to the :output file.
    #
    # For each pair (r, l) one record is written:
    #   header   - ">" + r.name + ":" + basename of the source file without
    #              its '.fasta' extension
    #   sequence - r.sequence reversed, followed by l.sequence
    #
    # Raises RuntimeError when the glob matches no files, or when a file
    # holds an odd number of records (the last read has no mate).
    def join
      pattern = options[:trimmed]
      trimmed = Dir[pattern]

      # Report the pattern, not the (empty) list of matches.
      fail "no reads in #{pattern}" if trimmed.empty?

      pbar = ProgressBar.new "joining", trimmed.length

      # Block forms of File.open close both handles even if an error is raised.
      File.open(options[:output], 'w') do |output|
        trimmed.each do |fasta_file|
          pbar.inc
          sample = File.basename(fasta_file, '.fasta')

          File.open(fasta_file) do |handle|
            records = Dna.new handle
            records.each_slice(2) do |r, l|
              # each_slice yields a lone record (l == nil) for an odd count
              fail "unpaired read #{r.name} in #{fasta_file}" if l.nil?
              output.puts ">#{r.name}:#{sample}\n#{r.sequence.reverse + l.sequence}"
            end
          end
        end
      end

      pbar.finish
    end

  end
end
@@ -0,0 +1,61 @@
##
# MAKE TABLES
#

module Lederhosen
  class CLI

    desc "otu_table generates otu tables & representative reads",
         "--clusters=clusters.uc --output=otu_prefix --joined=joined.fasta"

    method_option :clusters, :type => :string, :default => 'clusters.uc'
    method_option :output,   :type => :string, :default => 'otus'
    method_option :joined,   :type => :string, :default => 'joined.fasta'

    # Writes two files derived from a cluster table:
    #   <output>.fasta - every read from :joined whose name is the :seed of a
    #                    cluster, renamed to "<name>:cluster_<cluster key>"
    #   <output>.csv   - the result of Helpers.cluster_data_as_csv
    #
    # Also prints read and cluster counts via +ohai+.
    def otu_table
      input        = options[:clusters]
      output       = options[:output]
      joined_reads = options[:joined]

      # Load cluster table!
      clusters   = Helpers.load_uc_file(input)
      count_data = clusters[:count_data]

      # Start from 0 so an empty table reports 0 rather than nil.
      clusters_total = count_data.values.inject(0) { |sum, x| sum + x[:total] }

      # Get representative sequences!
      # Maps seed read name -> cluster key.
      representatives = {}
      count_data.each { |k, x| representatives[x[:seed]] = k }

      reads_total = 0

      # Block form closes the output even if reading the joined reads fails.
      File.open("#{output}.fasta", 'w') do |out_handle|
        File.open(joined_reads) do |handle|
          Dna.new(handle).each do |dna|
            reads_total += 1
            cluster_id = representatives[dna.name]
            next if cluster_id.nil?

            dna.name = "#{dna.name}:cluster_#{cluster_id}"
            out_handle.puts dna
          end
        end
      end

      # Print some statistics
      ohai "reads in clusters:  #{clusters_total}"
      ohai "number of reads:    #{reads_total}"
      # Count the per-cluster entries; the loaded table's own top-level keys
      # (such as :count_data) are not clusters.
      ohai "unique clusters:    #{count_data.length}"

      # print OTU abundancy matrix
      csv = Helpers.cluster_data_as_csv(clusters)
      File.open("#{output}.csv", 'w') do |h|
        h.puts csv
      end
    end

  end
end
@@ -0,0 +1,22 @@
##
# SORT JOINED READS BY LENGTH
#

module Lederhosen
  class CLI

    desc "sort fasta file by length",
         "--input=joined.fasta --output=sorted.fasta"

    # One declaration per option; a separate bulk `method_options` line for
    # the same two names is not needed.
    method_option :input,  :type => :string, :default => 'joined.fasta'
    method_option :output, :type => :string, :default => 'sorted.fasta'

    # Runs `uclust --mergesort <input> --output <output>`.
    #
    # Raises RuntimeError when uclust cannot be started or exits non-zero,
    # so a failed sort is not silently ignored.
    def sort
      input  = options[:input]
      output = options[:output]

      # Argv form: no shell is involved, so paths with spaces or shell
      # metacharacters are passed through unchanged. uclust's own output goes
      # to this process's stdout/stderr.
      succeeded = system('uclust', '--mergesort', input.to_s, '--output', output.to_s)
      fail "uclust --mergesort failed for #{input}" unless succeeded
    end

  end
end
@@ -0,0 +1,81 @@
##
# Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
#

require 'fileutils'

module Lederhosen
  class CLI

    desc "output separate fasta file containing sequences belonging to each cluster",
         "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"

    method_option :clusters,      :type => :string,  :default => 'clusters.uc'
    method_option :reads,         :type => :string,  :default => 'joined.fasta'
    method_option :out_dir,       :type => :string,  :default => 'clusters_split'
    method_option :buffer_size,   :type => :numeric, :default => 1000
    method_option :min_clst_size, :type => :numeric, :default => 100

    # Writes the reads of every sufficiently large cluster to
    # <out_dir>/<cluster number>.fasta.
    #
    # Options (read from +options+):
    #   :clusters      - cluster table to read
    #   :reads         - file with the reads to distribute
    #   :out_dir       - directory for the per-cluster files (created if missing)
    #   :buffer_size   - passed to Buffer.new as :buffer_max
    #   :min_clst_size - clusters with fewer reads than this are dropped
    def split
      clusters      = options[:clusters]
      reads         = options[:reads]
      out_dir       = options[:out_dir]
      buffer_size   = options[:buffer_size]
      min_clst_size = options[:min_clst_size]
      finalize_every = 100_000

      # stdlib call instead of shelling out to `mkdir -p` with an unescaped path
      FileUtils.mkdir_p(out_dir)

      ohai "loading #{clusters}"

      # Load read id -> cluster
      read_to_clusterid = {}

      # keep track of cluster sizes
      cluster_counts = Hash.new(0)

      File.open(clusters) do |handle|
        handle.each do |line|
          fields = line.strip.split
          # Only 'S' and 'H' rows are used (presumably the seed and hit
          # records of the .uc format - confirm against the uclust docs).
          next unless fields[0] == 'S' || fields[0] == 'H'

          cluster_nr = fields[1]
          read       = fields[8]
          read_to_clusterid[read] = cluster_nr
          cluster_counts[cluster_nr] += 1
        end
      end

      # Drop reads that belong to clusters below the size cutoff.
      read_to_clusterid.delete_if do |_read, cluster_nr|
        cluster_counts[cluster_nr] < min_clst_size
      end

      total_reads    = read_to_clusterid.length
      total_clusters = read_to_clusterid.values.uniq.length
      ohai "#{total_reads} reads in #{total_clusters} clusters"

      pbar = ProgressBar.new "saving", total_reads

      # Write reads to individual fasta files using Buffer
      buffer = Buffer.new :buffer_max => buffer_size
      File.open(reads) do |handle|
        records = Dna.new handle
        records.each_with_index do |record, i|
          cluster_id = read_to_clusterid[record.name]
          if cluster_id
            pbar.inc
            filename = File.join(out_dir, cluster_id + '.fasta')
            buffer[filename] << record
          end

          # Flush on a fixed cadence of reads seen. Keeping this outside the
          # branch above makes it independent of whether the read at that
          # index happens to belong to a kept cluster.
          buffer.finalize if i > 0 && (i % finalize_every).zero?
        end
      end

      pbar.finish
      ohai "finalizing output"
      buffer.finalize # finish writing out

      puts "done"
    end
  end
end
@@ -0,0 +1,36 @@
##
# QUALITY TRIMMING
#

require 'fileutils'

module Lederhosen
  class CLI

    desc "trim Illumina QSEQ files",
         "--reads_dir=reads/* --out_dir=trimmed/"

    method_option :reads_dir, :type => :string, :required => true
    method_option :out_dir,   :type => :string, :default => 'trimmed/'

    # Trims every group of reads found under :reads_dir and writes one
    # "<group name>.fasta" file per group into :out_dir.
    #
    # Options (read from +options+):
    #   :reads_dir - passed to Helpers.get_grouped_qseq_files (required)
    #   :out_dir   - output directory, created if missing
    def trim
      reads_dir = options[:reads_dir]
      out_dir   = options[:out_dir]

      # stdlib call instead of shelling out to `mkdir -p` with an unescaped path
      FileUtils.mkdir_p(out_dir)

      grouped_reads = Helpers.get_grouped_qseq_files reads_dir
      ohai "found #{grouped_reads.length} pairs of reads"

      pbar = ProgressBar.new "trimming", grouped_reads.length

      # Each entry is (name, files); files[0] and files[1] are the two inputs
      # handed to Helpers.trim_pairs.
      grouped_reads.each do |name, files|
        pbar.inc
        out = File.join(out_dir, "#{File.basename(name)}.fasta")
        # TODO get total and trimmed (the return value of trim_pairs is
        # currently unused)
        Helpers.trim_pairs files[0], files[1], out, :min_length => 70
      end

      pbar.finish
    end
  end
end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Lederhosen
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- hash: 25
4
+ hash: 23
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 3
10
- version: 0.0.3
9
+ - 4
10
+ version: 0.0.4
11
11
  platform: ruby
12
12
  authors:
13
13
  - Austin G. Davis-Richardson
@@ -120,6 +120,12 @@ files:
120
120
  - lib/lederhosen/buffer.rb
121
121
  - lib/lederhosen/cli.rb
122
122
  - lib/lederhosen/helpers.rb
123
+ - lib/lederhosen/tasks/cluster.rb
124
+ - lib/lederhosen/tasks/join.rb
125
+ - lib/lederhosen/tasks/otu_table.rb
126
+ - lib/lederhosen/tasks/sort.rb
127
+ - lib/lederhosen/tasks/split.rb
128
+ - lib/lederhosen/tasks/trim.rb
123
129
  - lib/version.rb
124
130
  - readme.md
125
131
  - spec/data/ILT_L_9_B_001_1.txt