lederhosen 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lederhosen.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "lederhosen"
8
- s.version = "1.0.1"
8
+ s.version = "1.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Austin G. Davis-Richardson"]
@@ -23,24 +23,15 @@ Gem::Specification.new do |s|
23
23
  "LICENSE.txt",
24
24
  "Rakefile",
25
25
  "bin/lederhosen",
26
- "examples/hierarchical_clustering.sh",
27
- "examples/pipeline.sh",
28
26
  "lederhosen.gemspec",
29
27
  "lib/lederhosen.rb",
30
- "lib/lederhosen/buffer.rb",
31
28
  "lib/lederhosen/cli.rb",
32
- "lib/lederhosen/helpers.rb",
33
29
  "lib/lederhosen/tasks/cluster.rb",
34
- "lib/lederhosen/tasks/k_filter.rb",
35
30
  "lib/lederhosen/tasks/make_udb.rb",
36
31
  "lib/lederhosen/tasks/otu_filter.rb",
37
32
  "lib/lederhosen/tasks/otu_table.rb",
38
- "lib/lederhosen/tasks/rep_reads.rb",
39
- "lib/lederhosen/tasks/split.rb",
40
33
  "lib/lederhosen/tasks/split_fasta.rb",
41
34
  "lib/lederhosen/tasks/trim.rb",
42
- "lib/lederhosen/tasks/uc_filter.rb",
43
- "lib/lederhosen/tasks/uc_stats.rb",
44
35
  "lib/lederhosen/tasks/version.rb",
45
36
  "lib/lederhosen/version.rb",
46
37
  "readme.md",
@@ -50,7 +41,6 @@ Gem::Specification.new do |s|
50
41
  "spec/data/ILT_L_9_B_002_1.txt.gz",
51
42
  "spec/data/ILT_L_9_B_002_3.txt.gz",
52
43
  "spec/data/test.uc",
53
- "spec/helpers_spec.rb",
54
44
  "spec/misc_spec.rb",
55
45
  "spec/spec_helper.rb"
56
46
  ]
@@ -26,9 +26,11 @@ module Lederhosen
26
26
  sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
27
27
 
28
28
  all_names = Set.new
29
+ pbar = ProgressBar.new "loading", input.size
29
30
 
30
31
  # Load cluster table
31
32
  input.each do |input_file|
33
+ pbar.inc
32
34
  File.open(input_file) do |handle|
33
35
  handle.each do |line|
34
36
  dat = parse_usearch_line(line.strip)
@@ -41,6 +43,8 @@ module Lederhosen
41
43
  end
42
44
  end
43
45
 
46
+ pbar.finish
47
+
44
48
  ohai "found #{all_names.size} unique taxa at #{level} level"
45
49
 
46
50
  # save to csv
@@ -19,7 +19,7 @@ module Lederhosen
19
19
 
20
20
  run "mkdir -p #{out_dir}"
21
21
 
22
- raw_reads = Helpers.get_grouped_qseq_files raw_reads
22
+ raw_reads = get_grouped_qseq_files raw_reads
23
23
 
24
24
  ohai "found #{raw_reads.length} pairs of reads"
25
25
 
@@ -28,10 +28,97 @@ module Lederhosen
28
28
  pbar.inc
29
29
  out = File.join(out_dir, "#{File.basename(a[0])}.fasta")
30
30
  # TODO get total and trimmed
31
- total, trimmed = Helpers.trim_pairs a[1][0], a[1][1], out, :min_length => 70
31
+ total, trimmed = trim_pairs a[1][0], a[1][1], out, :min_length => 70
32
32
  end
33
33
  pbar.finish
34
34
 
35
35
  end
36
+
37
+ no_tasks do
38
+
39
# Reverse complement a DNA sequence.
#
# Handles the plain nucleotides (G, A, T, C, N) as well as U and the IUPAC
# ambiguity codes (R/Y, K/M, B/V, D/H are swapped; S, W and N are their
# own complement). Case is preserved. Any other character (for example
# the '.' QSEQ uses for a no-call) is reversed but otherwise left as is.
#
# s - the sequence as a String
#
# Returns a new String; s itself is not modified.
def reverse_complement(s)
  s.reverse.tr('ACGTUNRYKMBVDHSWacgtunrykmbvdhsw',
               'TGCAANYRMKVBHDSWtgcaanyrmkvbhdsw')
end
44
+
45
# Group QSEQ files produced by splitting Illumina reads by barcode.
#
# Filenames should look like this:
#   IL5_L_1_B_007_1.txt
#
# Files are keyed by the first five underscore-separated fields of their
# basename (IL5_L_1_B_007 for the name above), so files that share those
# fields land in the same group.
#
# glob - glob pattern matching the QSEQ files
#
# Returns a Hash of group name => Array of paths. The paths are sorted
# first, so the order inside each group does not depend on the order in
# which the filesystem lists the files (callers index into the group to
# pick the two reads of a pair).
def get_grouped_qseq_files(glob='raw_reads/*.txt')
  Dir.glob(glob).sort.group_by { |x| File.basename(x).split('_')[0..4].join('_') }
end
53
+
54
# Trim a pair of QSEQ files. Saves to a single, interleaved .fasta file.
#
# left, right - paths of the two read files of a pair. Both are opened as
#               gzip first; if that fails, both are opened as plain files.
# out         - path of the FASTA file to write (opened with mode 'w')
# args        - options hash:
#               :cutoff     - minimum quality score, forwarded to trim_seq
#                             as its :min option (default 20)
#               :min_length - length BOTH trimmed reads must reach for the
#                             pair to be written (default 70)
#
# A pair is written as two records named ">i:0" and ">i:1", where i is the
# 1-based position of the pair in the input; the second read is reverse
# complemented before it is written.
#
# Returns [total, trimmed]: the number of pairs read and the number of
# pairs written.
def trim_pairs(left, right, out, args={})
  cutoff     = args[:cutoff] || 20
  min_length = args[:min_length] || 70

  left_handle = right_handle = nil
  total = trimmed = 0

  begin
    begin
      left_handle  = Zlib::GzipReader.open(left)
      right_handle = Zlib::GzipReader.open(right)
    rescue Zlib::GzipFile::Error
      # At least one input is not gzip. Release the gzip reader that may
      # already be open on the left file, then read both as plain files.
      left_handle.close if left_handle
      left_handle  = nil # keeps the ensure below from closing it twice
      left_handle  = File.open(left)
      right_handle = File.open(right)
    end

    File.open(out, 'w') do |out_handle|
      left_reads  = Dna.new left_handle
      right_reads = Dna.new right_handle

      left_reads.zip(right_reads).each do |a, b|
        total += 1
        seqa = trim_seq a, :min => cutoff
        seqb = trim_seq b, :min => cutoff

        # drop the pair when either read was rejected or came out too short
        next if seqa.nil? || seqb.nil?
        next if seqa.length < min_length || seqb.length < min_length

        out_handle.puts ">#{total}:0\n#{seqa}\n>#{total}:1\n#{reverse_complement(seqb)}"
        trimmed += 1
      end
    end
  ensure
    # close the inputs even when reading or trimming raised
    [left_handle, right_handle].each do |handle|
      handle.close if handle && !handle.closed?
    end
  end

  [total, trimmed]
end
88
+
89
# Return the best-scoring stretch of a read, judged by its quality string
# (Illumina PHRED). Trim2 from Huang, et. al.
#
# dna  - a record responding to sequence, sequence=, quality and quality=.
#        The record is modified in place: the first 11 positions are
#        removed from both its sequence and its quality.
# args - options hash:
#        :min    - quality score a base has to exceed to raise the running
#                  score (default 20)
#        :cutoff - value subtracted from every quality byte before it is
#                  compared with :min (default 64)
#
# Returns just the sequence (a String, with '.' replaced by 'N'), or nil
# when the read is shorter than the primer, contains an N after the primer
# was removed, has no positively scoring stretch, or when the selected
# slice starts beyond the end of the sequence.
def trim_seq(dna, args={})

  # trim primers off of sequence
  # (THIS IS EXPERIMENT-SPECIFIC)
  dna.sequence = dna.sequence[11..-1]
  dna.quality  = dna.quality[11..-1]

  # a read shorter than the primer leaves nothing to score
  return nil if dna.sequence.nil? || dna.quality.nil?

  # throw away any read with an ambiguous primer
  return nil if dna.sequence =~ /N/

  min    = args[:min] || 20
  offset = args[:cutoff] || 64

  _sum = _max = first = start = 0
  _end = nil # stays nil until some stretch scores above zero

  # Running-sum scan: every base adds (quality - offset - min); the best
  # total seen so far fixes [start, _end], a negative total restarts it.
  dna.quality.each_byte.each_with_index do |b, a|
    _sum += (b - offset - min)
    if _sum > _max
      _max = _sum
      _end = a
      start = first
    elsif _sum < 0
      _sum = 0
      first = a
    end
  end

  # no base scored above the threshold
  return nil if _end.nil?

  # NOTE(review): the primer was already cut off above, so the extra
  # "+ 11" shifts the slice relative to the quality window; the window
  # itself also starts on the base that reset the score and stops one
  # short of _end. Kept as is to preserve current output - confirm intent.
  best = dna.sequence[start + 11, _end - start]
  best && best.gsub('.', 'N')
end
121
+ end
122
+
36
123
  end
37
124
  end
@@ -3,7 +3,7 @@ module Lederhosen
3
3
  MAJOR = 1
4
4
  CODENAME = 'Hefeweizen'
5
5
  MINOR = 0
6
- PATCH = 1
6
+ PATCH = 2
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH].join('.')
9
9
  end
data/readme.md CHANGED
@@ -2,13 +2,18 @@
2
2
 
3
3
  Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
4
4
 
5
- ## Who can use Lederhosen?
5
+ ### About
6
6
 
7
- Lederhosen is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/)
7
+ - Lederhosen is a project born out of the Triplett Lab at the University of Florida.
8
+ - Lederhosen is designed to be a fast and simple method of clustering 16S rRNA amplicons sequenced
9
+ using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq).
10
+ - Lederhosen uses Semantic Versioning.
11
+ - Lederhosen is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/).
12
+ - Except for USEARCH which requires a license, Lederhosen is available for commercial use.
8
13
 
9
14
  ## How do I get Lederhosen?
10
15
 
11
- 0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine)
16
+ 0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine for non-commercial use)
12
17
  2. Get a copy of [TaxCollector](http://github.com/audy/taxcollector)
13
18
  3. Install Lederhosen by typing:
14
19
 
@@ -18,13 +23,8 @@ Lederhosen is free and open source under the [MIT open source license](http://op
18
23
  ## Features
19
24
 
20
25
  - Sequence trimming (paired-end Illumina).
21
- - K-mer filtering.
22
- - Clustering w/ UCLUST.
23
- - UCLUST output filtering.
24
- - Separation of representative reads.
25
- - Separation of all reads belonging to each cluster.
26
- - Identification of clusters using TaxCollector.
27
- - Generation of OTU abundancy matrices.
26
+ - Parallel, reference-based clustering to TaxCollector using USEARCH
27
+ - Generation and filtering of OTU abundance matrices.
28
28
 
29
29
  ## How do I use Lederhosen?
30
30
 
@@ -40,6 +40,8 @@ Trim (Illumina) reads using quality scores. Output will be a directory of fasta
40
40
 
41
41
  lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
42
42
 
43
+ The trimming process will reverse complement the "right" pair so that both reads are in the forward orientation.
44
+
43
45
  ### Create Database
44
46
 
45
47
  Create UDB database required by usearch from TaxCollector
@@ -58,4 +60,4 @@ Create an OTU abundance table where rows are samples and columns are clusters. T
58
60
 
59
61
  lederhosen otu_table --clusters=clusters_95.uc --output=genus.csv --level=genus
60
62
 
61
- Level can be Kingdom, Domain, Phylum, Class, Order, Family or Genus. To make tables at all levels do:
63
+ Level can be Kingdom, Domain, Phylum, Class, Order, Family or Genus.
data/spec/cli_spec.rb CHANGED
@@ -8,7 +8,7 @@ describe Lederhosen::CLI do
8
8
  end
9
9
 
10
10
  it 'should have a version command' do
11
- `./bin/lederhosen version `.strip.should == "lederhosen-#{Lederhosen::Version::STRING}"
11
+ `./bin/lederhosen version`
12
12
  $?.success?.should be_true
13
13
  end
14
14
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 19
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 0
9
- - 1
10
- version: 1.0.1
9
+ - 2
10
+ version: 1.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Austin G. Davis-Richardson
@@ -119,24 +119,15 @@ files:
119
119
  - LICENSE.txt
120
120
  - Rakefile
121
121
  - bin/lederhosen
122
- - examples/hierarchical_clustering.sh
123
- - examples/pipeline.sh
124
122
  - lederhosen.gemspec
125
123
  - lib/lederhosen.rb
126
- - lib/lederhosen/buffer.rb
127
124
  - lib/lederhosen/cli.rb
128
- - lib/lederhosen/helpers.rb
129
125
  - lib/lederhosen/tasks/cluster.rb
130
- - lib/lederhosen/tasks/k_filter.rb
131
126
  - lib/lederhosen/tasks/make_udb.rb
132
127
  - lib/lederhosen/tasks/otu_filter.rb
133
128
  - lib/lederhosen/tasks/otu_table.rb
134
- - lib/lederhosen/tasks/rep_reads.rb
135
- - lib/lederhosen/tasks/split.rb
136
129
  - lib/lederhosen/tasks/split_fasta.rb
137
130
  - lib/lederhosen/tasks/trim.rb
138
- - lib/lederhosen/tasks/uc_filter.rb
139
- - lib/lederhosen/tasks/uc_stats.rb
140
131
  - lib/lederhosen/tasks/version.rb
141
132
  - lib/lederhosen/version.rb
142
133
  - readme.md
@@ -146,7 +137,6 @@ files:
146
137
  - spec/data/ILT_L_9_B_002_1.txt.gz
147
138
  - spec/data/ILT_L_9_B_002_3.txt.gz
148
139
  - spec/data/test.uc
149
- - spec/helpers_spec.rb
150
140
  - spec/misc_spec.rb
151
141
  - spec/spec_helper.rb
152
142
  homepage: http://audy.github.com/lederhosen
@@ -1,51 +0,0 @@
1
- #!/bin/bash
2
-
3
- set -e
4
- set -x
5
-
6
- # Hierarchical OTU clustering
7
- # Austin G. Davis-Richardson
8
- # <harekrishna at gmail dot com>
9
- # http://github.com/audy/lederhosen
10
-
11
- reads='sorted.fasta'
12
- out='h_clustering'
13
-
14
- mkdir -p $out
15
-
16
- # initial clustering at 80%
17
- lederhosen cluster --input=$reads --output=$out/clusters_0.80.uc --identity=0.80
18
-
19
- # filter UC file
20
- lederhosen uc_filter --input=$out/clusters_0.80.uc --output=$out/clusters_0.80.uc.filtered --reads=1 --samples=1
21
-
22
- # get reads for each cluster
23
- mkdir -p $out/split_80
24
- lederhosen split --clusters=$out/clusters_0.80.uc.filtered --reads=$reads --out-dir=$out/split_80/
25
-
26
- # now cluster each of those at 90%
27
- for fasta in $out/split_80/*.fasta
28
- do
29
-
30
- # sort (awww, do I really have to do this again?)
31
- lederhosen sort --input=$fasta --output=$fasta.sorted
32
-
33
- # cluster
34
- lederhosen cluster --input=$fasta.sorted --output=$fasta.uc --identity=0.90
35
-
36
- # split
37
- split=$out/split_80.90_$(basename $fasta .fasta)
38
- lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split
39
- done
40
-
41
- # Do it again at 95%
42
- for fasta in $out/split_80/split_*_90.fasta/*.fasta
43
- do
44
- # cluster
45
- lederhosen cluster --input=$fasta --output=$fasta.uc --identity=90
46
-
47
- # split
48
- split=$outdir/80.90.$fasta.fasta
49
- mkdir -p $split
50
- lederhosen split --clusters=$fasta.uc --reads=$input --out-dir=$split
51
- done
data/examples/pipeline.sh DELETED
@@ -1,71 +0,0 @@
1
- #!/bash
2
-
3
- # An example OTU clustering pipeline
4
- # Austin G. Davis-Richardson
5
- # <harekrishna at gmail dot com>
6
- # http://github.com/audy/lederhosen
7
-
8
- set -e
9
-
10
- raw_reads='spec/data/*.txt'
11
- out_dir='pipeline'
12
- taxcollector='taxcollector.fa'
13
- min_reads=50
14
- min_samples=10
15
-
16
- # trim reads
17
- lederhosen trim \
18
- --reads-dir=$raw_reads \
19
- --out-dir=$out_dir/trimmed
20
-
21
- # join reads
22
- lederhosen join \
23
- --trimmed=$out_dir/trimmed/*.fasta \
24
- --output=$out_dir/joined.fasta
25
-
26
- # filter reads
27
- lederhosen k_filter \
28
- --input=$out_dir/joined.fasta \
29
- --output=$out_dir/filtered.fasta \
30
- -k=10 \
31
- --cutoff=50
32
-
33
- # sort
34
- lederhosen sort \
35
- --input=$out_dir/filtered.fasta \
36
- --output=$out_dir/sorted.fasta
37
-
38
- for i in 0.80 0.90 0.95
39
- do
40
- # cluster
41
- lederhosen cluster \
42
- --input=$out_dir/sorted.fasta \
43
- --output=$out_dir/clusters_"$i".uc \
44
- --identity=$i
45
-
46
- # filter uc file
47
- lederhosen uc_filter \
48
- --input=$out_dir/clusters_"$i".uc \
49
- --output=$out_dir/clusters_"$i".uc.filtered \
50
- --reads=$min_reads \
51
- --samples=$min_samples \
52
-
53
- # generate otu table
54
- lederhosen otu_table \
55
- --clusters=$out_dir/clusters_"$i".uc.filtered \
56
- --output=$out_dir/otus_"$i"
57
-
58
- # get representative reads
59
- lederhosen rep_reads \
60
- --clusters=$out_dir/clusters_"$i".uc.filtered \
61
- --joined=$out_dir/sorted.fasta \
62
- --output=$out_dir/representatives_"$i".fasta
63
-
64
- # blast representative reads
65
- lederhosen name \
66
- --reps=$out_dir/representatives_"$i".fasta \
67
- --output=$out_dir/taxonomies_"$i".txt \
68
- --database=$taxcollector
69
- done
70
-
71
- echo "complete!"
@@ -1,54 +0,0 @@
1
- module Lederhosen
2
-
3
- class Buffer
4
- # for when you need to write out to a shitload of files.
5
-
6
- #
7
- # Create a new buffer
8
- #
9
- def initialize(args={})
10
- @buffer = Hash.new { |h, k| h[k] = Array.new }
11
- @buffer_max = args[:buffer_max] || 100_000
12
- end
13
-
14
- #
15
- # Add an object to the buffer
16
- #
17
- def add_to bucket, obj
18
-
19
- @buffer[bucket] << obj.to_s
20
-
21
- if @buffer[bucket].length > @buffer_max
22
- # write out
23
- File.open(bucket, 'a+') do |out|
24
- @buffer[bucket].each do |v|
25
- out.puts v
26
- end
27
- end
28
-
29
- # clear that bucket
30
- @buffer[bucket].clear
31
- end
32
- end
33
-
34
- def [] k
35
- @buffer[k]
36
- end
37
-
38
- #
39
- # Writes out leftover objects
40
- #
41
- def finalize
42
- @buffer.each_key do |bucket|
43
- File.open(bucket, 'a+') do |out|
44
- @buffer[bucket].each do |v|
45
- out.puts v
46
- end
47
- end
48
- end
49
- @buffer = Hash.new { |h, k| h[k] = Array.new }
50
- end
51
-
52
- end
53
-
54
- end
@@ -1,166 +0,0 @@
1
- module Lederhosen
2
- class Helpers
3
- class << self
4
-
5
- # reverse complement a DNA sequence
6
- # assumes only GATCN nucleotides
7
- def reverse_complement(s)
8
- s.reverse.tr('GATCNgatcn','CTAGNctagn')
9
- end
10
-
11
- # Function for grouping qseq files produced by splitting illumina
12
- # reads by barcode
13
- #
14
- # Filenames should look like this:
15
- # IL5_L_1_B_007_1.txt
16
- def get_grouped_qseq_files(glob='raw_reads/*.txt')
17
- Dir.glob(glob).group_by { |x| File.basename(x).split('_')[0..4].join('_') }
18
- end
19
-
20
- # Trim a pair of QSEQ files. Saves to a single,
21
- # interleaved .fasta file
22
- def trim_pairs(left, right, out, args={})
23
- cutoff = args[:cutoff] || 20
24
- min_length = args[:min_length] || 70
25
-
26
- left_handle, right_handle =
27
- begin
28
- [ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
29
- rescue Zlib::GzipFile::Error
30
- [ File.open(left), File.open(right) ]
31
- end
32
-
33
- out_handle = File.open out, 'w'
34
-
35
- left_reads = Dna.new left_handle
36
- right_reads = Dna.new right_handle
37
-
38
- i = 0
39
- left_reads.zip(right_reads).each do |a, b|
40
- i += 1
41
- seqa = trim a
42
- seqb = trim b
43
- unless [seqa, seqb].include? nil
44
- if seqb.length >= min_length && seqa.length >= min_length
45
- seqb = reverse_complement(seqb)
46
- out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
47
- end
48
- end
49
- end
50
- left_handle.close
51
- right_handle.close
52
- out_handle.close
53
- end
54
-
55
- # Return longest subsequence with quality scores
56
- # greater than min. (Illumina PHRED)
57
- # Trim2 from Huang, et. al
58
- # returns just the sequence
59
- def trim(dna, args={})
60
-
61
- # trim primers off of sequence
62
- # (THIS IS EXPERIMENT-SPECIFIC)
63
- dna.sequence = dna.sequence[11..-1]
64
- dna.quality = dna.quality[11..-1]
65
-
66
- # throw away any read with an ambiguous primer
67
- return nil if dna.sequence =~ /N/
68
-
69
- min = args[:min] || 20
70
- offset = args[:cutoff] || 64
71
-
72
- _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
73
-
74
- dna.quality.each_byte.each_with_index do |b, a|
75
- _sum += (b - offset - min)
76
- if _sum > _max
77
- _max = _sum
78
- _end = a
79
- start = first
80
- elsif _sum < 0
81
- _sum = 0
82
- first = a
83
- end
84
- end
85
- dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
86
- end
87
-
88
- # Load uc file from uclust
89
- # returns hash with various data
90
- def load_uc_file(input)
91
- clusters = Hash.new
92
-
93
- # keep track of samples
94
- samples = Set.new
95
-
96
- # store a list of all the sample IDs
97
- clusters[:samples] = Set.new
98
-
99
- # data for each cluster
100
- # clstr_counts[:clstr][:sample] = number_of_reads
101
- clstr_counts = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
102
-
103
- # clstrnr_to_seed[seed_sequence_id] = clstr_nr
104
- seed_to_clstrnr = Hash.new
105
- bytes = File.size(input)
106
- pbar = ProgressBar.new 'loading uc file', bytes
107
- File.open(input) do |handle|
108
- handle.each do |line|
109
- pbar.set handle.pos
110
- next if line =~ /^#/ # skip comments
111
-
112
- line = line.strip.split
113
-
114
- # things we want to know
115
- type = line[0]
116
- clusternr = line[1].to_i
117
- querylabel = line[8]
118
- targetlabel = line[9]
119
- header = line[8]
120
-
121
- sample =
122
- begin
123
- # get the sample id via regexp match
124
- # this way more info can be stored in the header.
125
- line[8].match(/sample=(.*)/)[1]
126
- rescue NoMethodError # catch no method [] for NilClass
127
- # Need to maintain some backwards compatibility here
128
- # this is the old way of getting the same id.
129
- sample = line[8].split(':')[2]
130
- end
131
-
132
- # keep track of samples
133
- samples.add(sample)
134
-
135
- # keep track of all samples
136
- clusters[:samples].add sample
137
-
138
- # L=LibSeed
139
- # S=NewSeed
140
- # H=Hit
141
- # R=Reject
142
- # D=LibCluster
143
- # C=NewCluster
144
- # N=NoHit
145
-
146
- if type =~ /[LS]/ # = Seed Sequence
147
- clstr_counts[clusternr][sample] += 1
148
- seed_to_clstrnr[querylabel] = clusternr
149
- elsif type =~ /H/ # = Seed Member
150
- clstr_counts[clusternr][sample] += 1
151
- end
152
-
153
- end
154
- end
155
- pbar.finish
156
- return {
157
- :clstr_counts => clstr_counts,
158
- :seed_to_clstrnr => seed_to_clstrnr,
159
- :samples => samples
160
- }
161
- end
162
-
163
-
164
- end # class << self
165
- end # class Helpers
166
- end # Module
@@ -1,82 +0,0 @@
1
- ##
2
- # FILTER READS WITH LOW ABUNDANCE KMERS
3
- #
4
-
5
- module Lederhosen
6
- class CLI
7
-
8
- desc "k_filter",
9
- "filter novel reads likely to form small/singleton clusters (experimental)"
10
-
11
- method_option :input, :type => :string, :required => true
12
- method_option :output, :type => :string, :required => true
13
- method_option :k, :type => :numeric, :required => true
14
- method_option :cutoff, :type => :numeric, :required => true
15
-
16
- def k_filter
17
- input = options[:input]
18
- output = options[:output]
19
- k_len = options[:k].to_i
20
- cutoff = options[:cutoff]
21
-
22
- ohai "kmer filtering #{input} (k = #{k_len}, cutoff = #{cutoff})"
23
-
24
- counting_table = Hash.new { |h, k| h[k] = 0 }
25
- total_reads = 0
26
-
27
- File.open(input) do |handle|
28
- pbar = ProgressBar.new 'counting', File.size(input)
29
- records = Dna.new handle
30
- records.each do |r|
31
- pbar.set handle.pos
32
- total_reads += 1
33
- kmers = r.sequence.to_kmers(k_len)
34
- kmers.each { |x| counting_table[x] += 1 }
35
- end
36
- pbar.finish
37
- end
38
-
39
- sum_of_kmers = counting_table.values.inject(:+)
40
-
41
- ohai "total reads = #{total_reads}"
42
- ohai "sum of kmers = #{sum_of_kmers}"
43
-
44
- kept = 0
45
- total_reads = total_reads.to_f
46
-
47
- pbar = ProgressBar.new "saving", total_reads.to_i
48
- output = File.open(output, 'w')
49
- File.open(input) do |handle|
50
- records = Dna.new handle
51
- records.each do |r|
52
- kmers = r.sequence.to_kmers(k_len)
53
-
54
- # check if any of the kmers are rare
55
- keep = true
56
- coverage = 0
57
- kmers.each do |kmer|
58
- # if any of the kmers are rare, don't print the read
59
- c = counting_table[kmer]
60
- coverage += c
61
- if c < cutoff
62
- keep = false
63
- break
64
- end
65
- end
66
-
67
- if keep
68
- kept += 1
69
- output.puts r
70
- end
71
- pbar.inc
72
- end
73
- end
74
-
75
- pbar.finish
76
-
77
- ohai "survivors = #{kept} (#{kept/total_reads.to_f})"
78
- output.close
79
- end
80
- end
81
-
82
- end
@@ -1,45 +0,0 @@
1
- ##
2
- # GET REPRESENTATIVE READS
3
- #
4
-
5
- module Lederhosen
6
- class CLI
7
-
8
- desc "rep_reads",
9
- "output a fasta file containing representative reads for each cluster given a UCLUST output file and the joined reads file"
10
-
11
- method_option :clusters, :type => :string, :required => true
12
- method_option :output, :type => :string, :required => true
13
- method_option :joined, :type => :string, :required => true
14
-
15
- def rep_reads
16
- input = options[:clusters]
17
- output = options[:output]
18
- joined_reads = options[:joined]
19
-
20
- ohai "getting represntative reads for #{input} w/ reads #{joined_reads} and saving to #{output}"
21
-
22
- # Load cluster table!
23
- clstr_info = Helpers.load_uc_file input
24
- clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
25
- seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
26
- samples = clstr_info[:samples]
27
-
28
- out_handle = File.open("#{output}", 'w')
29
-
30
- File.open(joined_reads) do |handle|
31
- records = Dna.new handle
32
- records.each do |dna|
33
- clstrnr = seed_to_clstrnr[dna.name]
34
- unless clstrnr.nil?
35
- dna.name = "#{dna.name}:cluster-#{clstrnr}"
36
- out_handle.puts dna
37
- end
38
- end
39
- end
40
-
41
- out_handle.close
42
- end
43
-
44
- end
45
- end
@@ -1,84 +0,0 @@
1
- ##
2
- # Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
3
- #
4
-
5
- module Lederhosen
6
- class CLI
7
-
8
- desc "split",
9
- "create fasta files containing reads from each cluster"
10
-
11
- method_option :clusters, :type => :string, :required => true
12
- method_option :reads, :type => :string, :required => true
13
- method_option :out_dir, :type => :string, :required => true
14
- method_option :buffer_size, :type => :numeric, :default => 1000
15
- method_option :min_clst_size, :type => :numeric, :default => 1
16
-
17
- def split
18
- clusters = options[:clusters]
19
- reads = options[:reads]
20
- out_dir = options[:out_dir]
21
- buffer_size = options[:buffer_size]
22
- min_clst_size = options[:min_clst_size]
23
- finalize_every = 100_000
24
-
25
- ohai "spltting #{reads} by #{clusters} and saving to #{out_dir}"
26
- ohai "minimum cluster size = #{min_clst_size}"
27
-
28
- run "mkdir -p #{out_dir}/"
29
-
30
- ohai "loading #{clusters}"
31
-
32
- # Load read id -> cluster
33
- read_to_clusterid = Hash.new
34
-
35
- # keep track of cluster sizes
36
- cluster_counts = Hash.new { |h, k| h[k] = 0}
37
-
38
- File.open(clusters)do |handle|
39
- handle.each do |line|
40
- line = line.strip.split
41
- cluster_nr = line[1]
42
- if line[0] == 'S' || line[0] == 'H'
43
- read = line[8]
44
- else
45
- next
46
- end
47
- read_to_clusterid[read] = cluster_nr
48
- cluster_counts[cluster_nr] += 1
49
- end
50
- end
51
-
52
- read_to_clusterid.delete_if do |read, cluster_nr|
53
- cluster_counts[cluster_nr] < min_clst_size
54
- end
55
-
56
- total_reads = read_to_clusterid.length
57
- total_clusters = read_to_clusterid.values.uniq.length
58
- ohai "#{total_reads} reads in #{total_clusters} clusters"
59
-
60
- pbar = ProgressBar.new "saving", total_reads
61
-
62
- # Write reads to individual fasta files using Buffer
63
- buffer = Buffer.new :buffer_max => buffer_size
64
- File.open(reads) do |handle|
65
- records = Dna.new handle
66
- records.each_with_index do |record, i|
67
- cluster_id = read_to_clusterid[record.name]
68
- if cluster_id
69
- pbar.inc
70
- filename = File.join(out_dir, cluster_id + '.fasta')
71
- buffer[filename] << record
72
- buffer.finalize if (i%finalize_every == 0)
73
- end
74
- end
75
- end
76
-
77
- pbar.finish
78
- ohai "finalizing output"
79
- buffer.finalize # finish writing out
80
-
81
- puts "done"
82
- end
83
- end
84
- end
@@ -1,80 +0,0 @@
1
- ##
2
- # FILTER UC FILE BY MIN SAMPLES
3
- #
4
- require 'set'
5
-
6
- module Lederhosen
7
- class CLI
8
-
9
- desc "uc_filter",
10
- "filter UCLUST output to remove small, infrequent clusters"
11
-
12
- method_option :input, :type => :string, :required => true
13
- method_option :output, :type => :string, :required => true
14
- method_option :reads, :type => :numeric, :required => true
15
- method_option :samples, :type => :numeric, :required => true
16
-
17
- def uc_filter
18
- input = options[:input]
19
- output = options[:output]
20
- reads = options[:reads].to_i
21
- samples = options[:samples].to_i
22
-
23
- ohai "filtering #{input} to #{output}, reads = #{reads} & samples = #{samples}"
24
-
25
- # load UC file
26
- ohai "loading uc file"
27
- clstr_info = Helpers.load_uc_file input
28
- clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
29
-
30
- # filter
31
- ohai "filtering"
32
- survivors = clstr_counts.reject do |a, b|
33
- b.reject{ |i, j| j < reads }.length < samples
34
- end
35
-
36
- surviving_clusters = survivors.keys.to_set
37
-
38
- # print filtered uc file
39
- ohai "saving filtered table"
40
- out = File.open(output, 'w')
41
-
42
- lines = `wc -l #{input}`.split.first.to_i
43
-
44
- pbar = ProgressBar.new 'saving', lines
45
- kept, total = 1, 0
46
-
47
- # output lederhosen filtering information because I often
48
- # forget to write this down :)
49
- out.puts "# filtered: #{input}"
50
- out.puts "# #{reads} reads in at least #{samples} samples"
51
-
52
- File.open(input) do |handle|
53
- pbar = ProgressBar.new 'saving', File.size(input)
54
- handle.each do |line|
55
-
56
- pbar.set handle.pos
57
- if line =~ /^#/
58
- out.print line
59
- next
60
- end
61
- total += 1
62
-
63
- # check if cluster is in surviving clusters
64
- if surviving_clusters.include? line.split[1].to_i
65
- out.print line
66
- kept += 1
67
- end
68
-
69
- end
70
- pbar.finish
71
- end
72
-
73
- out.close
74
-
75
- ohai "clusters: #{surviving_clusters.length}/#{clstr_counts.keys.length} = #{100*surviving_clusters.length/clstr_counts.keys.length.to_f}%"
76
- ohai "reads: #{kept}/#{total} = #{100*kept/total.to_f}%"
77
- end
78
- end
79
-
80
- end
@@ -1,41 +0,0 @@
1
- ##
2
- # Get statistics about clusters in a UC file
3
- #
4
-
5
- module Lederhosen
6
- class CLI
7
- desc 'uc_stats',
8
- 'get statistics about clusters in a UC file. for now, this only calculates the size of each cluster'
9
-
10
- method_option :input, :type => :string, :required => true
11
-
12
- def uc_stats
13
- input = options[:input]
14
-
15
- ohai "calculating statistics for #{input}"
16
-
17
- # TODO add more stats
18
- cluster_stats = Hash.new { |h, k|
19
- h[k] = {
20
- :size => 0
21
- }
22
- }
23
-
24
- File.open(input) do |handle|
25
- handle.each do |line|
26
- line = line.strip.split
27
- type, clustr_nr = line[0], line[1]
28
- cluster_stats[clustr_nr][:size] += 1
29
- end
30
- end
31
-
32
- stat_types = cluster_stats.values.first.keys.sort
33
-
34
- puts "cluster,#{stat_types.join(',')}"
35
- cluster_stats.each do |cluster, stats|
36
- puts "#{cluster},#{stat_types.map { |x| stats[x] }.join(',')}"
37
- end
38
- end
39
-
40
- end
41
- end
data/spec/helpers_spec.rb DELETED
@@ -1,30 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe Lederhosen::Helpers do
4
-
5
- let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt.gz') }
6
-
7
- it 'should have a method for grouping QSEQ files' do
8
- groups.length.should == 2
9
- end
10
-
11
- it 'should have a method for reverse complementing a dna sequence' do
12
- Lederhosen::Helpers.reverse_complement("GATCCCGANNANTAGGACCAA").should == "TTGGTCCTANTNNTCGGGATC"
13
- end
14
-
15
- it 'should have a method for trimming sequences' do
16
- reads = groups.values.first.first
17
- record = Zlib::GzipReader.open(reads) do |handle|
18
- Dna.new(handle).first
19
- end
20
- # I should probably test with a bad read
21
- Lederhosen::Helpers.trim(record).length.should == 58
22
- end
23
-
24
- it 'should be able to trim pairs of qseq files, outputting fasta file' do
25
- reads = groups.values.first
26
- Lederhosen::Helpers.trim_pairs reads[0], reads[1], "#{$test_dir}/munchen_trim_test.fasta"
27
- # this test will break if trim parameters change
28
- File.readlines("#{$test_dir}/munchen_trim_test.fasta").grep(/^>/).length.should be_even
29
- end
30
- end