lederhosen 0.5.7 → 1.0.0

data/lederhosen.gemspec CHANGED
@@ -5,11 +5,11 @@
 
  Gem::Specification.new do |s|
  s.name = "lederhosen"
- s.version = "0.5.7"
+ s.version = "1.0.0"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Austin G. Davis-Richardson"]
- s.date = "2012-09-17"
+ s.date = "2012-10-30"
  s.description = "Various tools for OTU clustering"
  s.email = "harekrishna@gmail.com"
  s.executables = ["lederhosen"]
@@ -30,18 +30,14 @@ Gem::Specification.new do |s|
  "lib/lederhosen/buffer.rb",
  "lib/lederhosen/cli.rb",
  "lib/lederhosen/helpers.rb",
- "lib/lederhosen/tasks/add_names.rb",
  "lib/lederhosen/tasks/cluster.rb",
- "lib/lederhosen/tasks/join.rb",
  "lib/lederhosen/tasks/k_filter.rb",
- "lib/lederhosen/tasks/name.rb",
+ "lib/lederhosen/tasks/make_udb.rb",
  "lib/lederhosen/tasks/otu_filter.rb",
  "lib/lederhosen/tasks/otu_table.rb",
  "lib/lederhosen/tasks/rep_reads.rb",
- "lib/lederhosen/tasks/sort.rb",
  "lib/lederhosen/tasks/split.rb",
  "lib/lederhosen/tasks/split_fasta.rb",
- "lib/lederhosen/tasks/squish.rb",
  "lib/lederhosen/tasks/trim.rb",
  "lib/lederhosen/tasks/uc_filter.rb",
  "lib/lederhosen/tasks/uc_stats.rb",
@@ -53,8 +49,7 @@ Gem::Specification.new do |s|
  "spec/data/ILT_L_9_B_001_3.txt.gz",
  "spec/data/ILT_L_9_B_002_1.txt.gz",
  "spec/data/ILT_L_9_B_002_3.txt.gz",
- "spec/data/blat.txt",
- "spec/data/otus.csv",
+ "spec/data/test.uc",
  "spec/helpers_spec.rb",
  "spec/misc_spec.rb",
  "spec/spec_helper.rb"
data/lib/lederhosen/helpers.rb CHANGED
@@ -2,6 +2,12 @@ module Lederhosen
  class Helpers
  class << self
 
+ # reverse complement a DNA sequence
+ # assumes only GATCN nucleotides
+ def reverse_complement(s)
+ s.reverse.tr('GATCNgatcn','CTAGNctagn')
+ end
+
  # Function for grouping qseq files produced by splitting illumina
  # reads by barcode
  #
@@ -36,6 +42,7 @@ module Lederhosen
  seqb = trim b
  unless [seqa, seqb].include? nil
  if seqb.length >= min_length && seqa.length >= min_length
+ seqb = reverse_complement(seqb)
  out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
  end
  end
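A quick illustration of the new `reverse_complement` helper added above (illustrative only, not part of the diff; it assumes the gem's top-level `require 'lederhosen'` loads `Lederhosen::Helpers`):

    require 'lederhosen'

    # Reverses the sequence, then swaps complementary bases; the tr map
    # covers both upper- and lower-case GATCN characters.
    Lederhosen::Helpers.reverse_complement('GATTACAN') #=> "NTGTAATC"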
data/lib/lederhosen/tasks/cluster.rb CHANGED
@@ -1,58 +1,46 @@
- ##
- # FINALLY, CLUSTER!
- #
-
  module Lederhosen
+
  class CLI
 
- desc "cluster",
- "cluster a fasta file using UCLUST"
+ desc 'cluster', 'reference-based clustering using usearch'
 
- method_option :input, :type => :string, :required => true
- method_option :output, :type => :string, :required => true
- method_option :identity, :type => :numeric, :required => true
- method_option :stepwords, :type => :numeric, :default => 8
- method_option :wordlen, :type => :numeric, :default => 8
- method_option :maxaccepts, :type => :numeric, :default => 1
- method_option :maxrejects, :type => :numeric, :default => 8
- method_option :lib, :type => :string
- method_option :libonly, :type => :boolean, :default => false
+ method_option :input, :type => :string, :required => true
+ method_option :database, :type => :string, :required => true
+ method_option :threads, :type => :numeric, :default => 0
+ method_option :identity, :type => :numeric, :required => true
+ method_option :output, :type => :string, :required => true
+ method_option :strand, :type => :string, :default => 'plus'
 
  def cluster
- identity = options[:identity]
- output = options[:output]
- input = options[:input]
- stepwords = options[:stepwords]
- maxaccepts = options[:maxaccepts]
- maxrejects = options[:maxrejects]
- wordlen = options[:wordlen]
- lib = options[:lib]
- libonly = options[:libonly]
-
- ohai "clustering #{input}, saving to #{output}"
+ input = options[:input]
+ database = options[:database]
+ threads = options[:threads]
+ identity = options[:identity]
+ output = options[:output]
+ strand = options[:strand]
+
+ ohai "clustering #{input} to #{database} and saving to #{output}"
 
  options.each_pair do |key, value|
  ohai "#{key} = #{value}"
  end
 
- cmd = [
- 'uclust',
- "--input #{input}",
- "--uc #{output}",
+ cmd = ['usearch',
+ "--usearch_local #{input}",
  "--id #{identity}",
- "--stepwords #{stepwords}",
- "--maxaccepts #{maxaccepts}",
- "--maxrejects #{maxrejects}",
- "--w #{wordlen}"
+ "--uc #{output}",
+ "--db #{database}",
+ "--strand #{strand}"
  ]
 
- cmd << "--lib #{lib}" unless lib.nil?
- cmd << "--libonly" if libonly == true
+ # threads = 0 : use all threads (default)
+ if threads != 0
+ cmd << "--threads #{threads}"
+ end
 
  cmd = cmd.join(' ')
 
- @shell.mute { run cmd }
+ run cmd
  end
-
  end
  end
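For context, the rewritten task shells out to usearch instead of uclust. A minimal sketch of the command string it assembles, using placeholder file names rather than values from the diff:

    # Placeholder values; in the task these come from the Thor options.
    input, database, identity, output, strand =
      'trimmed.fasta', 'taxcollector.udb', 0.95, 'clusters_95.uc', 'plus'

    cmd = ['usearch',
           "--usearch_local #{input}",
           "--id #{identity}",
           "--uc #{output}",
           "--db #{database}",
           "--strand #{strand}"].join(' ')

    puts cmd
    #=> usearch --usearch_local trimmed.fasta --id 0.95 --uc clusters_95.uc --db taxcollector.udb --strand plus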
data/lib/lederhosen/tasks/make_udb.rb ADDED
@@ -0,0 +1,25 @@
+ module Lederhosen
+ class CLI
+
+ desc 'make_udb', 'format database for usearch'
+
+ method_option :input, :type => :string, :required => true
+ method_option :output, :type => :string, :required => true
+
+ def make_udb
+ input = options[:input]
+ output = options[:output]
+ word_length = options[:word_length]
+
+ ohai "making udb w/ #{input}, saving as #{output}."
+
+ cmd = ['usearch',
+ "-makeudb_usearch #{input}",
+ "-output #{output}"]
+
+ cmd = cmd.join(' ')
+
+ run cmd
+ end
+ end
+ end
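The new `make_udb` task is a thin wrapper around usearch's database builder. Using the file names from the readme example further down (illustrative only), the command it runs would look like this:

    # Illustrative only; the task interpolates its --input/--output options.
    input, output = 'taxcollector.fa', 'taxcollector.udb'
    puts ['usearch', "-makeudb_usearch #{input}", "-output #{output}"].join(' ')
    #=> usearch -makeudb_usearch taxcollector.fa -output taxcollector.udb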
data/lib/lederhosen/tasks/otu_table.rb CHANGED
@@ -2,50 +2,109 @@
  # MAKE TABLES
  #
 
- SEP = ','
+ require 'set'
 
  module Lederhosen
  class CLI
 
  desc "otu_table",
- "create an OTU abundance matrix from UCLUST output"
+ "create an OTU abundance matrix from USEARCH output"
 
- method_option :clusters, :type => :string, :required => true
- method_option :output, :type => :string, :required => true
+ method_option :files, :type => :string, :required => true
+ method_option :output, :type => :string, :required => true
+ method_option :level, :type => :string, :required => true, :banner => 'valid options: domain, kingdom, phylum, class, order, genus, or species'
 
  def otu_table
- input = options[:clusters]
- output = options[:output]
+ input = Dir[options[:files]]
+ output = options[:output]
+ level = options[:level].downcase
 
- ohai "generating otu table from #{input}, saving to #{output}"
+ ohai "generating #{level} table from #{input.size} file(s) and saving to #{output}."
 
- # Load cluster table
+ fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom}.include? level
+
+ sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
 
- clstr_info = Helpers.load_uc_file input
- clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
- clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
- samples = clstr_info[:samples]
+ all_names = Set.new
 
- # print OTU abundance matrix
- # clusters as columns
- # samples as rows
+ # Load cluster table
+ input.each do |input_file|
+ File.open(input_file) do |handle|
+ handle.each do |line|
+ dat = parse_usearch_line(line.strip)
+ next if dat.nil?
+ name = dat[level] rescue ohai(dat.inspect)
 
- File.open("#{output}", 'w') do |h|
- samples = samples.sort
- clusters = clstr_counts.keys
+ all_names << name
+ sample_cluster_count[input_file][name] += 1
+ end
+ end
+ end
 
- # print header (cluster names)
- h.puts '-' + SEP + clusters.map { |x| "cluster-#{x}" }.join(SEP)
+ ohai "found #{all_names.size} unique taxa at #{level} level"
+
+ # save to csv
+ File.open(output, 'w') do |handle|
+ header = all_names.to_a.compact.sort
+ handle.puts "#{level.capitalize},#{header.join(',')}"
+ samples = sample_cluster_count.keys.sort
 
  samples.each do |sample|
- h.print sample
- clusters.each do |cluster|
- h.print "#{SEP}#{clstr_counts[cluster][sample]}"
+ handle.print "#{sample}"
+ header.each do |name|
+ handle.print ",#{sample_cluster_count[sample][name]}"
  end
- h.print "\n"
+ handle.print "\n"
  end
  end
  end
 
- end
- end
+ no_tasks do
+ # parse a line of usearch output
+ # return a hash in the form:
+ # { :taxonomy => '', :identity => 0.00, ... }
+ # unless the line is not a "hit" in which case
+ # the function returns nil
+ def parse_usearch_line(str)
+
+ # skip non hits
+ return nil unless str =~ /^H/
+
+ str = str.split
+
+ taxonomic_description = str[9]
+ identity = str[3].to_f
+
+ # parse taxonomic_description
+ taxonomies = parse_taxonomy(taxonomic_description)
+
+ { :identity => identity }.merge(taxonomies)
+ end
+
+ # parse a taxonomic description using the
+ # taxcollector format returning name at each level (genus, etc...)
+ def parse_taxonomy(taxonomy)
+
+ levels = { 'domain' => 0,
+ 'kingdom' => 0,
+ 'phylum' => 1,
+ 'class' => 2,
+ 'order' => 3,
+ 'family' => 4,
+ 'genus' => 5,
+ 'species' => 6 }
+
+ names = Hash.new
+
+ levels.each_pair do |level, num|
+ name = taxonomy.match(/\[#{num}\](\w*)[;\[]/)[1] rescue nil
+ names[level] = name
+ end
+
+ names
+ end
+
+ end # no tasks
+
+ end # class CLI
+ end # module Lederhosen
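To make the new taxonomy parsing concrete, here is a standalone sketch of the `parse_taxonomy` logic applied to a made-up TaxCollector-style header (the input string and expected output are assumptions for illustration, not data from the gem):

    # A made-up TaxCollector-style header: bracketed level numbers followed by names.
    taxonomy = '[0]Bacteria;[1]Firmicutes;[2]Bacilli;[3]Lactobacillales;' \
               '[4]Lactobacillaceae;[5]Lactobacillus;[6]Lactobacillus_iners;'

    levels = { 'domain' => 0, 'kingdom' => 0, 'phylum' => 1, 'class' => 2,
               'order' => 3, 'family' => 4, 'genus' => 5, 'species' => 6 }

    # Same regex as in parse_taxonomy above: capture the word after "[n]".
    names = {}
    levels.each_pair do |level, num|
      names[level] = taxonomy.match(/\[#{num}\](\w*)[;\[]/)[1] rescue nil
    end

    puts names['genus']   #=> Lactobacillus
    puts names['species'] #=> Lactobacillus_iners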
data/lib/lederhosen/version.rb CHANGED
@@ -1,8 +1,8 @@
  module Lederhosen
  module Version
- MAJOR = 0
- MINOR = 5
- PATCH = 7
+ MAJOR = 1
+ MINOR = 0
+ PATCH = 0
 
  STRING = [MAJOR, MINOR, PATCH].join('.')
  end
data/readme.md CHANGED
@@ -8,8 +8,7 @@ Lederhosen is free and open source under the [MIT open source license](http://op
 
  ## How do I get Lederhosen?
 
- 0. Obtain & Install [UCLUST](http://www.drive5.com/)
- 1. Obtain & Install [BLAT](http://genome.ucsc.edu/FAQ/FAQblat.html#blat3)
+ 0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine)
  2. Get a copy of [TaxCollector](http://github.com/audy/taxcollector)
  3. Install Lederhosen by typing:
 
@@ -35,80 +34,28 @@ Lederhosen is just a convenient wrapper for UCLUST and BLAT with some scripts fo
 
  Lederhosen is invoked by typing `lederhosen [TASK]`
 
- ### trim
+ ### Trim Reads
 
  Trim (Illumina) reads using quality scores. Output will be a directory of fasta files. Reads can optionally be gzipped.
 
  lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
 
- ### join
+ ### Create Database
 
- Join paired reads from all samples end-to-end. This method enables the use of uclust with paired-end data. Output will be a single fasta file.
+ Create UDB database required by usearch from TaxCollector
 
- lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
+ lederhosen make_udb --input=taxcollector.fa --output=taxcollector.udb
 
- If your reads are not paired, then you do not need to do this step. Instead, concatenate all of the trimmed reads files.
+ ### Cluster Reads using USEARCH
 
- cat trimmed/*.fasta > joined.fasta
+ Cluster reads using USEARCH. Output is a uc file.
 
- ### sort
+ lederhosen cluster --input=trimmed/*.fasta --identity=0.95 --output=clusters_95.uc --database=taxcollector.udb
 
- Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithim.
-
- lederhosen sort --input=joined.fasta --output=sorted.fasta
-
- ### k_filter
-
- K-mer abundance noise filtering. This step is experimental and optional. It may reduce the time it takes to perform the clustering.
-
- lederhosen k_filter --input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50
-
- ### cluster
-
- Cluster reads using UCLUST. Output is a uc file.
-
- lederhosen cluster --input=sorted.fasta --identity=0.80 --output=clusters.uc
-
- ### uc_filter
-
- Filter UC file removing singleton clusters or clusters that are only present in a few samples. This greatly reduces the noise of the data without removing many of the reads.
-
- lederhosen uc_filter --input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10
-
- ### otu_table
+ ### Generate OTU tables
 
  Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
 
- lederhosen otu_table --clusters=clusters.uc --output=otu_prefix.csv
-
- ### rep_reads
-
- Get representative reads for each cluster. Output is a single fasta file.
-
- lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta
-
- ### split
-
- Get all reads belonging to each cluster. Output is a directory containing a fasta file for each cluster. The fasta file contains the joined reads.
-
- lederhosen split --clusters=clusters.uc --reads=joined.fasta --min-clst-size=100
-
- ### name
-
- Identify clusters in a database using the representative reads. This is a simple wrapper for BLAT. The output is a tab-delimited file similar to a BLAST output file. For this step you need to have BLAT installed and also a [TaxCollector](http://github.com/audy/taxcollector) database.
-
- lederhosen name --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt
-
- ### add_names
-
- Add phylogenetic classification of clusters to OTU abundance file.
-
- lederhosen add_names --blat=blat_output.txt --level=taxonomic_level --table=otu_file.csv --output=named_out_file.csv
-
- Where `taxonomic_level` can be: kingdom, domain, phylum, class, order, family, genus or species. This method only works with a TaxCollector database.
-
- ### squish
-
- Squish an OTU abundance file by column name (phylogenetic description)
+ lederhosen otu_table --clusters=clusters_95.uc --output=genus.csv --level=genus
 
- lederhosen squish --csv-file=named_out_file.csv --output=squished_named_out_file.csv
+ Level can be Kingdom, Domain, Phylum, Class, Order, Family or Genus. To make tables at all levels do:
data/spec/cli_spec.rb CHANGED
@@ -17,35 +17,20 @@ describe Lederhosen::CLI do
  $?.success?.should be_true
  end
 
- it 'should join reads' do
- `./bin/lederhosen join --trimmed=#{$test_dir}/trimmed/*.fasta --output=#{$test_dir}/joined.fasta`
+ it 'can create a usearch udb using usearch' do
+ `./bin/lederhosen make_udb --input #{$test_dir}/trimmed/ILT_L_9_B_001.fasta --output #{$test_dir}/test_db.udb`
  $?.success?.should be_true
  end
 
- it 'should support libonly clustering (w/ maxaccepts and maxrejects too)' do
- # clustering reads against themselves because there is no reference database
- # included in specs/data
- `./bin/lederhosen cluster --input=#{$test_dir}/joined.fasta --output=#{$test_dir}/joined.libonly.uc --lib=#{$test_dir}/joined.fasta --libonly --identity 0.95 --maxaccepts 500 --maxrejects 12`
+ it 'can cluster reads using usearch' do
+ `./bin/lederhosen cluster --input #{$test_dir}/trimmed/ILT_L_9_B_001.fasta --database #{$test_dir}/test_db.udb --identity 0.95 --output #{$test_dir}/clusters.uc`
  end
 
- it 'should sort reads' do
- `./bin/lederhosen sort --input=#{$test_dir}/joined.fasta --output=#{$test_dir}/sorted.fasta`
- $?.success?.should be_true
- end
-
- it 'should k_filter reads' do
- `./bin/lederhosen k_filter --input=#{$test_dir}/sorted.fasta --output=#{$test_dir}/filtered.fasta -k=15 --cutoff 1`
- $?.success?.should be_true
- end
-
- it 'should cluster reads' do
- `./bin/lederhosen cluster --identity=0.80 --input=#{$test_dir}/filtered.fasta --output=#{$test_dir}/clusters.uc`
- $?.success?.should be_true
- end
-
- it 'should build OTU abundance matrices' do
- `./bin/lederhosen otu_table --clusters=#{$test_dir}/clusters.uc --output=#{$test_dir}/otu_table.csv`
- $?.success?.should be_true
+ %w{domain phylum class ORDER Family genus species}.each do |level|
+ it "should build #{level} abundance matrix" do
+ `./bin/lederhosen otu_table --files=spec/data/test.uc --output=#{$test_dir}/otu_table.csv --level=#{level}`
+ $?.success?.should be_true
+ end
  end
 
  it 'should filter OTU abundance matrices' do
@@ -54,33 +39,9 @@ describe Lederhosen::CLI do
  end
 
  it 'should split a fasta file into smaller fasta files (optionally gzipped)' do
- `./bin/lederhosen split_fasta --input=#{$test_dir}/joined.fasta --out-dir=#{$test_dir}/split/ --gzip true -n 100`
- $?.success?.should be_true
- end
-
- it 'should split joined.fasta into reads for each cluster' do
- `./bin/lederhosen split --reads=#{$test_dir}/joined.fasta --clusters=#{$test_dir}/clusters.uc --out-dir=#{$test_dir}/split --min-clst-size=1`
- $?.success?.should be_true
- end
-
- it 'should create a fasta file containing representative reads for each cluster' do
- `./bin/lederhosen rep_reads --clusters=#{$test_dir}/clusters.uc --joined=#{$test_dir}/filtered.fasta --output=#{$test_dir}/representatives.fasta`
+ `./bin/lederhosen split_fasta --input=#{$test_dir}/trimmed/ILT_L_9_B_001.fasta --out-dir=#{$test_dir}/split/ --gzip true -n 100`
  $?.success?.should be_true
  end
 
- # Need a taxcollector database for this one.
- it 'should identify clusters given a taxcollector database'
-
- it 'should add names to otu abundance matrix given blat output' do
- levels = %w{kingdom domain phylum class order genus speces}
- # Ruby 1.9 vs Ruby 1.8
- level = levels.sample rescue levels.choice
- `./bin/lederhosen add_names --table=spec/data/otus.csv --blat=spec/data/blat.txt --level=#{level} --output=#{$test_dir}/named_otus.csv`
- $?.success?.should be_true
- end
-
- it 'should squish otu abundance matrix by same name' do
- `./bin/lederhosen squish --csv-file=#{$test_dir}/named_otus.csv --output=#{$test_dir}/squished.csv`
- $?.success?.should be_true
- end
+ it 'should create a fasta file containing representative reads for each cluster'
  end