lederhosen 0.5.7 → 1.0.0

data/lederhosen.gemspec CHANGED
@@ -5,11 +5,11 @@
 
  Gem::Specification.new do |s|
  s.name = "lederhosen"
- s.version = "0.5.7"
+ s.version = "1.0.0"
 
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Austin G. Davis-Richardson"]
- s.date = "2012-09-17"
+ s.date = "2012-10-30"
  s.description = "Various tools for OTU clustering"
  s.email = "harekrishna@gmail.com"
  s.executables = ["lederhosen"]
@@ -30,18 +30,14 @@ Gem::Specification.new do |s|
  "lib/lederhosen/buffer.rb",
  "lib/lederhosen/cli.rb",
  "lib/lederhosen/helpers.rb",
- "lib/lederhosen/tasks/add_names.rb",
  "lib/lederhosen/tasks/cluster.rb",
- "lib/lederhosen/tasks/join.rb",
  "lib/lederhosen/tasks/k_filter.rb",
- "lib/lederhosen/tasks/name.rb",
+ "lib/lederhosen/tasks/make_udb.rb",
  "lib/lederhosen/tasks/otu_filter.rb",
  "lib/lederhosen/tasks/otu_table.rb",
  "lib/lederhosen/tasks/rep_reads.rb",
- "lib/lederhosen/tasks/sort.rb",
  "lib/lederhosen/tasks/split.rb",
  "lib/lederhosen/tasks/split_fasta.rb",
- "lib/lederhosen/tasks/squish.rb",
  "lib/lederhosen/tasks/trim.rb",
  "lib/lederhosen/tasks/uc_filter.rb",
  "lib/lederhosen/tasks/uc_stats.rb",
@@ -53,8 +49,7 @@ Gem::Specification.new do |s|
  "spec/data/ILT_L_9_B_001_3.txt.gz",
  "spec/data/ILT_L_9_B_002_1.txt.gz",
  "spec/data/ILT_L_9_B_002_3.txt.gz",
- "spec/data/blat.txt",
- "spec/data/otus.csv",
+ "spec/data/test.uc",
  "spec/helpers_spec.rb",
  "spec/misc_spec.rb",
  "spec/spec_helper.rb"
data/lib/lederhosen/helpers.rb CHANGED
@@ -2,6 +2,12 @@ module Lederhosen
  class Helpers
  class << self
 
+ # reverse complement a DNA sequence
+ # assumes only GATCN nucleotides
+ def reverse_complement(s)
+ s.reverse.tr('GATCNgatcn','CTAGNctagn')
+ end
+
  # Function for grouping qseq files produced by splitting illumina
  # reads by barcode
  #
@@ -36,6 +42,7 @@ module Lederhosen
  seqb = trim b
  unless [seqa, seqb].include? nil
  if seqb.length >= min_length && seqa.length >= min_length
+ seqb = reverse_complement(seqb)
  out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
  end
  end
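A quick illustration of the new `reverse_complement` helper added above (illustrative only, not part of the diff; it assumes the gem's top-level `require 'lederhosen'` loads `Lederhosen::Helpers`):

    require 'lederhosen'

    # Reverses the sequence, then swaps complementary bases; the tr map
    # covers both upper- and lower-case GATCN characters.
    Lederhosen::Helpers.reverse_complement('GATTACAN') #=> "NTGTAATC"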
data/lib/lederhosen/tasks/cluster.rb CHANGED
@@ -1,58 +1,46 @@
- ##
- # FINALLY, CLUSTER!
- #
-
  module Lederhosen
+
  class CLI
 
- desc "cluster",
- "cluster a fasta file using UCLUST"
+ desc 'cluster', 'reference-based clustering using usearch'
 
- method_option :input, :type => :string, :required => true
- method_option :output, :type => :string, :required => true
- method_option :identity, :type => :numeric, :required => true
- method_option :stepwords, :type => :numeric, :default => 8
- method_option :wordlen, :type => :numeric, :default => 8
- method_option :maxaccepts, :type => :numeric, :default => 1
- method_option :maxrejects, :type => :numeric, :default => 8
- method_option :lib, :type => :string
- method_option :libonly, :type => :boolean, :default => false
+ method_option :input, :type => :string, :required => true
+ method_option :database, :type => :string, :required => true
+ method_option :threads, :type => :numeric, :default => 0
+ method_option :identity, :type => :numeric, :required => true
+ method_option :output, :type => :string, :required => true
+ method_option :strand, :type => :string, :default => 'plus'
 
  def cluster
- identity = options[:identity]
- output = options[:output]
- input = options[:input]
- stepwords = options[:stepwords]
- maxaccepts = options[:maxaccepts]
- maxrejects = options[:maxrejects]
- wordlen = options[:wordlen]
- lib = options[:lib]
- libonly = options[:libonly]
-
- ohai "clustering #{input}, saving to #{output}"
+ input = options[:input]
+ database = options[:database]
+ threads = options[:threads]
+ identity = options[:identity]
+ output = options[:output]
+ strand = options[:strand]
+
+ ohai "clustering #{input} to #{database} and saving to #{output}"
 
  options.each_pair do |key, value|
  ohai "#{key} = #{value}"
  end
 
- cmd = [
- 'uclust',
- "--input #{input}",
- "--uc #{output}",
+ cmd = ['usearch',
+ "--usearch_local #{input}",
  "--id #{identity}",
- "--stepwords #{stepwords}",
- "--maxaccepts #{maxaccepts}",
- "--maxrejects #{maxrejects}",
- "--w #{wordlen}"
+ "--uc #{output}",
+ "--db #{database}",
+ "--strand #{strand}"
  ]
 
- cmd << "--lib #{lib}" unless lib.nil?
- cmd << "--libonly" if libonly == true
+ # threads = 0 : use all threads (default)
+ if threads != 0
+ cmd << "--threads #{threads}"
+ end
 
  cmd = cmd.join(' ')
 
- @shell.mute { run cmd }
+ run cmd
  end
-
  end
  end
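For context, the rewritten task shells out to usearch instead of uclust. A minimal sketch of the command string it assembles, using placeholder file names rather than values from the diff:

    # Placeholder values; in the task these come from the Thor options.
    input, database, identity, output, strand =
      'trimmed.fasta', 'taxcollector.udb', 0.95, 'clusters_95.uc', 'plus'

    cmd = ['usearch',
           "--usearch_local #{input}",
           "--id #{identity}",
           "--uc #{output}",
           "--db #{database}",
           "--strand #{strand}"].join(' ')

    puts cmd
    #=> usearch --usearch_local trimmed.fasta --id 0.95 --uc clusters_95.uc --db taxcollector.udb --strand plus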
data/lib/lederhosen/tasks/make_udb.rb ADDED
@@ -0,0 +1,25 @@
+ module Lederhosen
+ class CLI
+
+ desc 'make_udb', 'format database for usearch'
+
+ method_option :input, :type => :string, :required => true
+ method_option :output, :type => :string, :required => true
+
+ def make_udb
+ input = options[:input]
+ output = options[:output]
+ word_length = options[:word_length]
+
+ ohai "making udb w/ #{input}, saving as #{output}."
+
+ cmd = ['usearch',
+ "-makeudb_usearch #{input}",
+ "-output #{output}"]
+
+ cmd = cmd.join(' ')
+
+ run cmd
+ end
+ end
+ end
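The new `make_udb` task is a thin wrapper around usearch's database builder. Using the file names from the readme example further down (illustrative only), the command it runs would look like this:

    # Illustrative only; the task interpolates its --input/--output options.
    input, output = 'taxcollector.fa', 'taxcollector.udb'
    puts ['usearch', "-makeudb_usearch #{input}", "-output #{output}"].join(' ')
    #=> usearch -makeudb_usearch taxcollector.fa -output taxcollector.udb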
data/lib/lederhosen/tasks/otu_table.rb CHANGED
@@ -2,50 +2,109 @@
  # MAKE TABLES
  #
 
- SEP = ','
+ require 'set'
 
  module Lederhosen
  class CLI
 
  desc "otu_table",
- "create an OTU abundance matrix from UCLUST output"
+ "create an OTU abundance matrix from USEARCH output"
 
- method_option :clusters, :type => :string, :required => true
- method_option :output, :type => :string, :required => true
+ method_option :files, :type => :string, :required => true
+ method_option :output, :type => :string, :required => true
+ method_option :level, :type => :string, :required => true, :banner => 'valid options: domain, kingdom, phylum, class, order, genus, or species'
 
  def otu_table
- input = options[:clusters]
- output = options[:output]
+ input = Dir[options[:files]]
+ output = options[:output]
+ level = options[:level].downcase
 
- ohai "generating otu table from #{input}, saving to #{output}"
+ ohai "generating #{level} table from #{input.size} file(s) and saving to #{output}."
 
- # Load cluster table
+ fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom}.include? level
+
+ sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
 
- clstr_info = Helpers.load_uc_file input
- clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
- clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
- samples = clstr_info[:samples]
+ all_names = Set.new
 
- # print OTU abundance matrix
- # clusters as columns
- # samples as rows
+ # Load cluster table
+ input.each do |input_file|
+ File.open(input_file) do |handle|
+ handle.each do |line|
+ dat = parse_usearch_line(line.strip)
+ next if dat.nil?
+ name = dat[level] rescue ohai(dat.inspect)
 
- File.open("#{output}", 'w') do |h|
- samples = samples.sort
- clusters = clstr_counts.keys
+ all_names << name
+ sample_cluster_count[input_file][name] += 1
+ end
+ end
+ end
 
- # print header (cluster names)
- h.puts '-' + SEP + clusters.map { |x| "cluster-#{x}" }.join(SEP)
+ ohai "found #{all_names.size} unique taxa at #{level} level"
+
+ # save to csv
+ File.open(output, 'w') do |handle|
+ header = all_names.to_a.compact.sort
+ handle.puts "#{level.capitalize},#{header.join(',')}"
+ samples = sample_cluster_count.keys.sort
 
  samples.each do |sample|
- h.print sample
- clusters.each do |cluster|
- h.print "#{SEP}#{clstr_counts[cluster][sample]}"
+ handle.print "#{sample}"
+ header.each do |name|
+ handle.print ",#{sample_cluster_count[sample][name]}"
  end
- h.print "\n"
+ handle.print "\n"
  end
  end
  end
 
- end
- end
+ no_tasks do
+ # parse a line of usearch output
+ # return a hash in the form:
+ # { :taxonomy => '', :identity => 0.00, ... }
+ # unless the line is not a "hit" in which case
+ # the function returns nil
+ def parse_usearch_line(str)
+
+ # skip non hits
+ return nil unless str =~ /^H/
+
+ str = str.split
+
+ taxonomic_description = str[9]
+ identity = str[3].to_f
+
+ # parse taxonomic_description
+ taxonomies = parse_taxonomy(taxonomic_description)
+
+ { :identity => identity }.merge(taxonomies)
+ end
+
+ # parse a taxonomic description using the
+ # taxcollector format returning name at each level (genus, etc...)
+ def parse_taxonomy(taxonomy)
+
+ levels = { 'domain' => 0,
+ 'kingdom' => 0,
+ 'phylum' => 1,
+ 'class' => 2,
+ 'order' => 3,
+ 'family' => 4,
+ 'genus' => 5,
+ 'species' => 6 }
+
+ names = Hash.new
+
+ levels.each_pair do |level, num|
+ name = taxonomy.match(/\[#{num}\](\w*)[;\[]/)[1] rescue nil
+ names[level] = name
+ end
+
+ names
+ end
+
+ end # no tasks
+
+ end # class CLI
+ end # module Lederhosen
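To make the new taxonomy parsing concrete, here is a standalone sketch of the `parse_taxonomy` logic applied to a made-up TaxCollector-style header (the input string and expected output are assumptions for illustration, not data from the gem):

    # A made-up TaxCollector-style header: bracketed level numbers followed by names.
    taxonomy = '[0]Bacteria;[1]Firmicutes;[2]Bacilli;[3]Lactobacillales;' \
               '[4]Lactobacillaceae;[5]Lactobacillus;[6]Lactobacillus_iners;'

    levels = { 'domain' => 0, 'kingdom' => 0, 'phylum' => 1, 'class' => 2,
               'order' => 3, 'family' => 4, 'genus' => 5, 'species' => 6 }

    # Same regex as in parse_taxonomy above: capture the word after "[n]".
    names = {}
    levels.each_pair do |level, num|
      names[level] = taxonomy.match(/\[#{num}\](\w*)[;\[]/)[1] rescue nil
    end

    puts names['genus']   #=> Lactobacillus
    puts names['species'] #=> Lactobacillus_iners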
data/lib/lederhosen/version.rb CHANGED
@@ -1,8 +1,8 @@
  module Lederhosen
  module Version
- MAJOR = 0
- MINOR = 5
- PATCH = 7
+ MAJOR = 1
+ MINOR = 0
+ PATCH = 0
 
  STRING = [MAJOR, MINOR, PATCH].join('.')
  end
data/readme.md CHANGED
@@ -8,8 +8,7 @@ Lederhosen is free and open source under the [MIT open source license](http://op
 
  ## How do I get Lederhosen?
 
- 0. Obtain & Install [UCLUST](http://www.drive5.com/)
- 1. Obtain & Install [BLAT](http://genome.ucsc.edu/FAQ/FAQblat.html#blat3)
+ 0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine)
  2. Get a copy of [TaxCollector](http://github.com/audy/taxcollector)
  3. Install Lederhosen by typing:
 
@@ -35,80 +34,28 @@ Lederhosen is just a convenient wrapper for UCLUST and BLAT with some scripts fo
 
  Lederhosen is invoked by typing `lederhosen [TASK]`
 
- ### trim
+ ### Trim Reads
 
  Trim (Illumina) reads using quality scores. Output will be a directory of fasta files. Reads can optionally be gzipped.
 
  lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
 
- ### join
+ ### Create Database
 
- Join paired reads from all samples end-to-end. This method enables the use of uclust with paired-end data. Output will be a single fasta file.
+ Create UDB database required by usearch from TaxCollector
 
- lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
+ lederhosen make_udb --input=taxcollector.fa --output=taxcollector.udb
 
- If your reads are not paired, then you do not need to do this step. Instead, concatenate all of the trimmed reads files.
+ ### Cluster Reads using USEARCH
 
- cat trimmed/*.fasta > joined.fasta
+ Cluster reads using USEARCH. Output is a uc file.
 
- ### sort
+ lederhosen cluster --input=trimmed/*.fasta --identity=0.95 --output=clusters_95.uc --database=taxcollector.udb
 
- Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithim.
-
- lederhosen sort --input=joined.fasta --output=sorted.fasta
-
- ### k_filter
-
- K-mer abundance noise filtering. This step is experimental and optional. It may reduce the time it takes to perform the clustering.
-
- lederhosen k_filter --input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50
-
- ### cluster
-
- Cluster reads using UCLUST. Output is a uc file.
-
- lederhosen cluster --input=sorted.fasta --identity=0.80 --output=clusters.uc
-
- ### uc_filter
-
- Filter UC file removing singleton clusters or clusters that are only present in a few samples. This greatly reduces the noise of the data without removing many of the reads.
-
- lederhosen uc_filter --input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10
-
- ### otu_table
+ ### Generate OTU tables
 
  Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
 
- lederhosen otu_table --clusters=clusters.uc --output=otu_prefix.csv
-
- ### rep_reads
-
- Get representative reads for each cluster. Output is a single fasta file.
-
- lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta
-
- ### split
-
- Get all reads belonging to each cluster. Output is a directory containing a fasta file for each cluster. The fasta file contains the joined reads.
-
- lederhosen split --clusters=clusters.uc --reads=joined.fasta --min-clst-size=100
-
- ### name
-
- Identify clusters in a database using the representative reads. This is a simple wrapper for BLAT. The output is a tab-delimited file similar to a BLAST output file. For this step you need to have BLAT installed and also a [TaxCollector](http://github.com/audy/taxcollector) database.
-
- lederhosen name --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt
-
- ### add_names
-
- Add phylogenetic classification of clusters to OTU abundance file.
-
- lederhosen add_names --blat=blat_output.txt --level=taxonomic_level --table=otu_file.csv --output=named_out_file.csv
-
- Where `taxonomic_level` can be: kingdom, domain, phylum, class, order, family, genus or species. This method only works with a TaxCollector database.
-
- ### squish
-
- Squish an OTU abundance file by column name (phylogenetic description)
+ lederhosen otu_table --clusters=clusters_95.uc --output=genus.csv --level=genus
 
- lederhosen squish --csv-file=named_out_file.csv --output=squished_named_out_file.csv
+ Level can be Kingdom, Domain, Phylum, Class, Order, Family or Genus. To make tables at all levels do:
data/spec/cli_spec.rb CHANGED
@@ -17,35 +17,20 @@ describe Lederhosen::CLI do
  $?.success?.should be_true
  end
 
- it 'should join reads' do
- `./bin/lederhosen join --trimmed=#{$test_dir}/trimmed/*.fasta --output=#{$test_dir}/joined.fasta`
+ it 'can create a usearch udb using usearch' do
+ `./bin/lederhosen make_udb --input #{$test_dir}/trimmed/ILT_L_9_B_001.fasta --output #{$test_dir}/test_db.udb`
  $?.success?.should be_true
  end
 
- it 'should support libonly clustering (w/ maxaccepts and maxrejects too)' do
- # clustering reads against themselves because there is no reference database
- # included in specs/data
- `./bin/lederhosen cluster --input=#{$test_dir}/joined.fasta --output=#{$test_dir}/joined.libonly.uc --lib=#{$test_dir}/joined.fasta --libonly --identity 0.95 --maxaccepts 500 --maxrejects 12`
+ it 'can cluster reads using usearch' do
+ `./bin/lederhosen cluster --input #{$test_dir}/trimmed/ILT_L_9_B_001.fasta --database #{$test_dir}/test_db.udb --identity 0.95 --output #{$test_dir}/clusters.uc`
  end
 
- it 'should sort reads' do
- `./bin/lederhosen sort --input=#{$test_dir}/joined.fasta --output=#{$test_dir}/sorted.fasta`
- $?.success?.should be_true
- end
-
- it 'should k_filter reads' do
- `./bin/lederhosen k_filter --input=#{$test_dir}/sorted.fasta --output=#{$test_dir}/filtered.fasta -k=15 --cutoff 1`
- $?.success?.should be_true
- end
-
- it 'should cluster reads' do
- `./bin/lederhosen cluster --identity=0.80 --input=#{$test_dir}/filtered.fasta --output=#{$test_dir}/clusters.uc`
- $?.success?.should be_true
- end
-
- it 'should build OTU abundance matrices' do
- `./bin/lederhosen otu_table --clusters=#{$test_dir}/clusters.uc --output=#{$test_dir}/otu_table.csv`
- $?.success?.should be_true
+ %w{domain phylum class ORDER Family genus species}.each do |level|
+ it "should build #{level} abundance matrix" do
+ `./bin/lederhosen otu_table --files=spec/data/test.uc --output=#{$test_dir}/otu_table.csv --level=#{level}`
+ $?.success?.should be_true
+ end
  end
 
  it 'should filter OTU abundance matrices' do
@@ -54,33 +39,9 @@ describe Lederhosen::CLI do
  end
 
  it 'should split a fasta file into smaller fasta files (optionally gzipped)' do
- `./bin/lederhosen split_fasta --input=#{$test_dir}/joined.fasta --out-dir=#{$test_dir}/split/ --gzip true -n 100`
- $?.success?.should be_true
- end
-
- it 'should split joined.fasta into reads for each cluster' do
- `./bin/lederhosen split --reads=#{$test_dir}/joined.fasta --clusters=#{$test_dir}/clusters.uc --out-dir=#{$test_dir}/split --min-clst-size=1`
- $?.success?.should be_true
- end
-
- it 'should create a fasta file containing representative reads for each cluster' do
- `./bin/lederhosen rep_reads --clusters=#{$test_dir}/clusters.uc --joined=#{$test_dir}/filtered.fasta --output=#{$test_dir}/representatives.fasta`
+ `./bin/lederhosen split_fasta --input=#{$test_dir}/trimmed/ILT_L_9_B_001.fasta --out-dir=#{$test_dir}/split/ --gzip true -n 100`
  $?.success?.should be_true
  end
 
- # Need a taxcollector database for this one.
- it 'should identify clusters given a taxcollector database'
-
- it 'should add names to otu abundance matrix given blat output' do
- levels = %w{kingdom domain phylum class order genus speces}
- # Ruby 1.9 vs Ruby 1.8
- level = levels.sample rescue levels.choice
- `./bin/lederhosen add_names --table=spec/data/otus.csv --blat=spec/data/blat.txt --level=#{level} --output=#{$test_dir}/named_otus.csv`
- $?.success?.should be_true
- end
-
- it 'should squish otu abundance matrix by same name' do
- `./bin/lederhosen squish --csv-file=#{$test_dir}/named_otus.csv --output=#{$test_dir}/squished.csv`
- $?.success?.should be_true
- end
+ it 'should create a fasta file containing representative reads for each cluster'
  end