lederhosen 0.5.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lederhosen.gemspec +4 -9
- data/lib/lederhosen/helpers.rb +7 -0
- data/lib/lederhosen/tasks/cluster.rb +26 -38
- data/lib/lederhosen/tasks/make_udb.rb +25 -0
- data/lib/lederhosen/tasks/otu_table.rb +85 -26
- data/lib/lederhosen/version.rb +3 -3
- data/readme.md +11 -64
- data/spec/cli_spec.rb +11 -50
- data/spec/data/test.uc +684 -0
- data/spec/helpers_spec.rb +4 -0
- metadata +7 -12
- data/lib/lederhosen/tasks/add_names.rb +0 -98
- data/lib/lederhosen/tasks/join.rb +0 -68
- data/lib/lederhosen/tasks/name.rb +0 -38
- data/lib/lederhosen/tasks/sort.rb +0 -25
- data/lib/lederhosen/tasks/squish.rb +0 -51
- data/spec/data/blat.txt +0 -86
- data/spec/data/otus.csv +0 -4
data/lederhosen.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = "lederhosen"
-  s.version = "0.5.7"
+  s.version = "1.0.0"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Austin G. Davis-Richardson"]
-  s.date = "2012-
+  s.date = "2012-10-30"
   s.description = "Various tools for OTU clustering"
   s.email = "harekrishna@gmail.com"
   s.executables = ["lederhosen"]
@@ -30,18 +30,14 @@ Gem::Specification.new do |s|
     "lib/lederhosen/buffer.rb",
     "lib/lederhosen/cli.rb",
     "lib/lederhosen/helpers.rb",
-    "lib/lederhosen/tasks/add_names.rb",
     "lib/lederhosen/tasks/cluster.rb",
-    "lib/lederhosen/tasks/join.rb",
     "lib/lederhosen/tasks/k_filter.rb",
-    "lib/lederhosen/tasks/name.rb",
+    "lib/lederhosen/tasks/make_udb.rb",
     "lib/lederhosen/tasks/otu_filter.rb",
     "lib/lederhosen/tasks/otu_table.rb",
     "lib/lederhosen/tasks/rep_reads.rb",
-    "lib/lederhosen/tasks/sort.rb",
     "lib/lederhosen/tasks/split.rb",
     "lib/lederhosen/tasks/split_fasta.rb",
-    "lib/lederhosen/tasks/squish.rb",
     "lib/lederhosen/tasks/trim.rb",
     "lib/lederhosen/tasks/uc_filter.rb",
     "lib/lederhosen/tasks/uc_stats.rb",
@@ -53,8 +49,7 @@ Gem::Specification.new do |s|
     "spec/data/ILT_L_9_B_001_3.txt.gz",
     "spec/data/ILT_L_9_B_002_1.txt.gz",
     "spec/data/ILT_L_9_B_002_3.txt.gz",
-    "spec/data/blat.txt",
-    "spec/data/otus.csv",
+    "spec/data/test.uc",
     "spec/helpers_spec.rb",
     "spec/misc_spec.rb",
     "spec/spec_helper.rb"
data/lib/lederhosen/helpers.rb
CHANGED
@@ -2,6 +2,12 @@ module Lederhosen
   class Helpers
     class << self
 
+      # reverse complement a DNA sequence
+      # assumes only GATCN nucleotides
+      def reverse_complement(s)
+        s.reverse.tr('GATCNgatcn','CTAGNctagn')
+      end
+
       # Function for grouping qseq files produced by splitting illumina
       # reads by barcode
       #
@@ -36,6 +42,7 @@ module Lederhosen
         seqb = trim b
         unless [seqa, seqb].include? nil
           if seqb.length >= min_length && seqa.length >= min_length
+            seqb = reverse_complement(seqb)
            out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
          end
        end
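Note: the new reverse_complement helper reverses the read and remaps each base with String#tr; because it is defined under class << self it is callable as a class method. A minimal usage sketch (not part of the diff, assuming GATCN-only input as the comment states):

    # reverse, then translate each base to its complement
    Lederhosen::Helpers.reverse_complement('GATTACA') # => "TGTAATC"
    Lederhosen::Helpers.reverse_complement('ACGTN')   # => "NACGT"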
data/lib/lederhosen/tasks/cluster.rb
CHANGED
@@ -1,58 +1,46 @@
-##
-# FINALLY, CLUSTER!
-#
-
 module Lederhosen
+
   class CLI
 
-    desc
-         "cluster a fasta file using UCLUST"
+    desc 'cluster', 'reference-based clustering using usearch'
 
-    method_option :input,
-    method_option :
-    method_option :
-    method_option :
-    method_option :
-    method_option :
-    method_option :maxrejects, :type => :numeric, :default => 8
-    method_option :lib, :type => :string
-    method_option :libonly, :type => :boolean, :default => false
+    method_option :input, :type => :string, :required => true
+    method_option :database, :type => :string, :required => true
+    method_option :threads, :type => :numeric, :default => 0
+    method_option :identity, :type => :numeric, :required => true
+    method_option :output, :type => :string, :required => true
+    method_option :strand, :type => :string, :default => 'plus'
 
     def cluster
-
-
-
-
-
-
-
-
-      libonly = options[:libonly]
-
-      ohai "clustering #{input}, saving to #{output}"
+      input = options[:input]
+      database = options[:database]
+      threads = options[:threads]
+      identity = options[:identity]
+      output = options[:output]
+      strand = options[:strand]
+
+      ohai "clustering #{input} to #{database} and saving to #{output}"
 
       options.each_pair do |key, value|
         ohai "#{key} = #{value}"
       end
 
-      cmd = [
-
-        "--input #{input}",
-        "--uc #{output}",
+      cmd = ['usearch',
+        "--usearch_local #{input}",
         "--id #{identity}",
-        "--
-        "--
-        "--
-        "--w #{wordlen}"
+        "--uc #{output}",
+        "--db #{database}",
+        "--strand #{strand}"
       ]
 
-
-
+      # threads = 0 : use all threads (default)
+      if threads != 0
+        cmd << "--threads #{threads}"
+      end
 
       cmd = cmd.join(' ')
 
-
+      run cmd
     end
-
   end
 end
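Note: the rewritten cluster task reduces to a single usearch invocation assembled from the Thor options. A rough sketch of the command it joins together, using illustrative option values taken from the readme examples further down (the input path here is hypothetical):

    # illustrative values; a real run takes these from the CLI options
    input, database, identity, output, strand, threads =
      'trimmed/reads.fasta', 'taxcollector.udb', 0.95, 'clusters_95.uc', 'plus', 0

    cmd = ['usearch',
           "--usearch_local #{input}",
           "--id #{identity}",
           "--uc #{output}",
           "--db #{database}",
           "--strand #{strand}"]
    cmd << "--threads #{threads}" unless threads == 0 # 0 = let usearch use all threads
    cmd.join(' ')
    # => "usearch --usearch_local trimmed/reads.fasta --id 0.95 --uc clusters_95.uc --db taxcollector.udb --strand plus"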
data/lib/lederhosen/tasks/make_udb.rb
ADDED
@@ -0,0 +1,25 @@
+module Lederhosen
+  class CLI
+
+    desc 'make_udb', 'format database for usearch'
+
+    method_option :input, :type => :string, :required => true
+    method_option :output, :type => :string, :required => true
+
+    def make_udb
+      input = options[:input]
+      output = options[:output]
+      word_length = options[:word_length]
+
+      ohai "making udb w/ #{input}, saving as #{output}."
+
+      cmd = ['usearch',
+        "-makeudb_usearch #{input}",
+        "-output #{output}"]
+
+      cmd = cmd.join(' ')
+
+      run cmd
+    end
+  end
+end
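Note: make_udb shells out in the same way; with the example filenames from the readme the joined command comes out as:

    cmd = ['usearch',
           "-makeudb_usearch taxcollector.fa",
           "-output taxcollector.udb"].join(' ')
    # => "usearch -makeudb_usearch taxcollector.fa -output taxcollector.udb"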
data/lib/lederhosen/tasks/otu_table.rb
CHANGED
@@ -2,50 +2,109 @@
 # MAKE TABLES
 #
 
-
+require 'set'
 
 module Lederhosen
   class CLI
 
     desc "otu_table",
-         "create an OTU abundance matrix from
+         "create an OTU abundance matrix from USEARCH output"
 
-    method_option :
-    method_option :output,
+    method_option :files, :type => :string, :required => true
+    method_option :output, :type => :string, :required => true
+    method_option :level, :type => :string, :required => true, :banner => 'valid options: domain, kingdom, phylum, class, order, genus, or species'
 
     def otu_table
-      input
-      output
+      input = Dir[options[:files]]
+      output = options[:output]
+      level = options[:level].downcase
 
-      ohai "generating
+      ohai "generating #{level} table from #{input.size} file(s) and saving to #{output}."
 
-      #
+      fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom}.include? level
+
+      sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
 
-
-      clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
-      clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
-      samples = clstr_info[:samples]
+      all_names = Set.new
 
-      #
-
-
+      # Load cluster table
+      input.each do |input_file|
+        File.open(input_file) do |handle|
+          handle.each do |line|
+            dat = parse_usearch_line(line.strip)
+            next if dat.nil?
+            name = dat[level] rescue ohai(dat.inspect)
 
-
-
-
+            all_names << name
+            sample_cluster_count[input_file][name] += 1
+          end
+        end
+      end
 
-
-
+      ohai "found #{all_names.size} unique taxa at #{level} level"
+
+      # save to csv
+      File.open(output, 'w') do |handle|
+        header = all_names.to_a.compact.sort
+        handle.puts "#{level.capitalize},#{header.join(',')}"
+        samples = sample_cluster_count.keys.sort
 
         samples.each do |sample|
-
-
-
+          handle.print "#{sample}"
+          header.each do |name|
+            handle.print ",#{sample_cluster_count[sample][name]}"
          end
-
+          handle.print "\n"
        end
      end
    end
 
-
-
+    no_tasks do
+      # parse a line of usearch output
+      # return a hash in the form:
+      # { :taxonomy => '', :identity => 0.00, ... }
+      # unless the line is not a "hit" in which case
+      # the function returns nil
+      def parse_usearch_line(str)
+
+        # skip non hits
+        return nil unless str =~ /^H/
+
+        str = str.split
+
+        taxonomic_description = str[9]
+        identity = str[3].to_f
+
+        # parse taxonomic_description
+        taxonomies = parse_taxonomy(taxonomic_description)
+
+        { :identity => identity }.merge(taxonomies)
+      end
+
+      # parse a taxonomic description using the
+      # taxcollector format returning name at each level (genus, etc...)
+      def parse_taxonomy(taxonomy)
+
+        levels = { 'domain' => 0,
+                   'kingdom' => 0,
+                   'phylum' => 1,
+                   'class' => 2,
+                   'order' => 3,
+                   'family' => 4,
+                   'genus' => 5,
+                   'species' => 6 }
+
+        names = Hash.new
+
+        levels.each_pair do |level, num|
+          name = taxonomy.match(/\[#{num}\](\w*)[;\[]/)[1] rescue nil
+          names[level] = name
+        end
+
+        names
+      end
+
+    end # no tasks
+
+  end # class CLI
+end # module Lederhosen
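Note: parse_usearch_line keys off the "H" (hit) records of the .uc file, reading the target label from column 10 and the percent identity from column 4; parse_taxonomy then pulls one name per rank out of a TaxCollector-style label via the /\[#{num}\](\w*)[;\[]/ regex. A standalone sketch of that parsing against a made-up TaxCollector header (the header text is illustrative only, not taken from the diff):

    LEVELS = { 'domain' => 0, 'kingdom' => 0, 'phylum' => 1, 'class' => 2,
               'order' => 3, 'family' => 4, 'genus' => 5, 'species' => 6 }

    def parse_taxonomy(taxonomy)
      LEVELS.each_with_object({}) do |(level, num), names|
        names[level] = taxonomy.match(/\[#{num}\](\w*)[;\[]/)[1] rescue nil
      end
    end

    header = '[0]Bacteria;[1]Firmicutes;[2]Clostridia;[3]Clostridiales;[4]Lachnospiraceae;[5]Blautia;[6]Blautia_producta;'
    parse_taxonomy(header)['genus']   # => "Blautia"
    parse_taxonomy(header)['species'] # => "Blautia_producta"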
data/lib/lederhosen/version.rb
CHANGED
data/readme.md
CHANGED
@@ -8,8 +8,7 @@ Lederhosen is free and open source under the [MIT open source license](http://op
 
 ## How do I get Lederhosen?
 
-0. Obtain & Install [
-1. Obtain & Install [BLAT](http://genome.ucsc.edu/FAQ/FAQblat.html#blat3)
+0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine)
 2. Get a copy of [TaxCollector](http://github.com/audy/taxcollector)
 3. Install Lederhosen by typing:
 
@@ -35,80 +34,28 @@ Lederhosen is just a convenient wrapper for UCLUST and BLAT with some scripts fo
 
 Lederhosen is invoked by typing `lederhosen [TASK]`
 
-###
+### Trim Reads
 
 Trim (Illumina) reads using quality scores. Output will be a directory of fasta files. Reads can optionally be gzipped.
 
     lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
 
-###
+### Create Database
 
-
+Create UDB database required by usearch from TaxCollector
 
-    lederhosen
+    lederhosen make_udb --input=taxcollector.fa --output=taxcollector.udb
 
-
+### Cluster Reads using USEARCH
 
-
+Cluster reads using USEARCH. Output is a uc file.
 
-
+    lederhosen cluster --input=trimmed/*.fasta --identity=0.95 --output=clusters_95.uc --database=taxcollector.udb
 
-
-
-    lederhosen sort --input=joined.fasta --output=sorted.fasta
-
-### k_filter
-
-K-mer abundance noise filtering. This step is experimental and optional. It may reduce the time it takes to perform the clustering.
-
-    lederhosen k_filter --input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50
-
-### cluster
-
-Cluster reads using UCLUST. Output is a uc file.
-
-    lederhosen cluster --input=sorted.fasta --identity=0.80 --output=clusters.uc
-
-### uc_filter
-
-Filter UC file removing singleton clusters or clusters that are only present in a few samples. This greatly reduces the noise of the data without removing many of the reads.
-
-    lederhosen uc_filter --input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10
-
-### otu_table
+### Generate OTU tables
 
 Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
 
-    lederhosen otu_table --clusters=
-
-### rep_reads
-
-Get representative reads for each cluster. Output is a single fasta file.
-
-    lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta
-
-### split
-
-Get all reads belonging to each cluster. Output is a directory containing a fasta file for each cluster. The fasta file contains the joined reads.
-
-    lederhosen split --clusters=clusters.uc --reads=joined.fasta --min-clst-size=100
-
-### name
-
-Identify clusters in a database using the representative reads. This is a simple wrapper for BLAT. The output is a tab-delimited file similar to a BLAST output file. For this step you need to have BLAT installed and also a [TaxCollector](http://github.com/audy/taxcollector) database.
-
-    lederhosen name --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt
-
-### add_names
-
-Add phylogenetic classification of clusters to OTU abundance file.
-
-    lederhosen add_names --blat=blat_output.txt --level=taxonomic_level --table=otu_file.csv --output=named_out_file.csv
-
-Where `taxonomic_level` can be: kingdom, domain, phylum, class, order, family, genus or species. This method only works with a TaxCollector database.
-
-### squish
-
-Squish an OTU abundance file by column name (phylogenetic description)
+    lederhosen otu_table --clusters=clusters_95.uc --output=genus.csv --level=genus
 
-
+Level can be Kingdom, Domain, Phylum, Class, Order, Family or Genus. To make tables at all levels do:
data/spec/cli_spec.rb
CHANGED
@@ -17,35 +17,20 @@ describe Lederhosen::CLI do
     $?.success?.should be_true
   end
 
-  it '
-    `./bin/lederhosen
+  it 'can create a usearch udb using usearch' do
+    `./bin/lederhosen make_udb --input #{$test_dir}/trimmed/ILT_L_9_B_001.fasta --output #{$test_dir}/test_db.udb`
     $?.success?.should be_true
   end
 
-  it '
-
-    # included in specs/data
-    `./bin/lederhosen cluster --input=#{$test_dir}/joined.fasta --output=#{$test_dir}/joined.libonly.uc --lib=#{$test_dir}/joined.fasta --libonly --identity 0.95 --maxaccepts 500 --maxrejects 12`
+  it 'can cluster reads using usearch' do
+    `./bin/lederhosen cluster --input #{$test_dir}/trimmed/ILT_L_9_B_001.fasta --database #{$test_dir}/test_db.udb --identity 0.95 --output #{$test_dir}/clusters.uc`
   end
 
-
-
-
-
-
-  it 'should k_filter reads' do
-    `./bin/lederhosen k_filter --input=#{$test_dir}/sorted.fasta --output=#{$test_dir}/filtered.fasta -k=15 --cutoff 1`
-    $?.success?.should be_true
-  end
-
-  it 'should cluster reads' do
-    `./bin/lederhosen cluster --identity=0.80 --input=#{$test_dir}/filtered.fasta --output=#{$test_dir}/clusters.uc`
-    $?.success?.should be_true
-  end
-
-  it 'should build OTU abundance matrices' do
-    `./bin/lederhosen otu_table --clusters=#{$test_dir}/clusters.uc --output=#{$test_dir}/otu_table.csv`
-    $?.success?.should be_true
+  %w{domain phylum class ORDER Family genus species}.each do |level|
+    it "should build #{level} abundance matrix" do
+      `./bin/lederhosen otu_table --files=spec/data/test.uc --output=#{$test_dir}/otu_table.csv --level=#{level}`
+      $?.success?.should be_true
+    end
   end
 
   it 'should filter OTU abundance matrices' do
@@ -54,33 +39,9 @@ describe Lederhosen::CLI do
   end
 
   it 'should split a fasta file into smaller fasta files (optionally gzipped)' do
-    `./bin/lederhosen split_fasta --input=#{$test_dir}/
-    $?.success?.should be_true
-  end
-
-  it 'should split joined.fasta into reads for each cluster' do
-    `./bin/lederhosen split --reads=#{$test_dir}/joined.fasta --clusters=#{$test_dir}/clusters.uc --out-dir=#{$test_dir}/split --min-clst-size=1`
-    $?.success?.should be_true
-  end
-
-  it 'should create a fasta file containing representative reads for each cluster' do
-    `./bin/lederhosen rep_reads --clusters=#{$test_dir}/clusters.uc --joined=#{$test_dir}/filtered.fasta --output=#{$test_dir}/representatives.fasta`
+    `./bin/lederhosen split_fasta --input=#{$test_dir}/trimmed/ILT_L_9_B_001.fasta --out-dir=#{$test_dir}/split/ --gzip true -n 100`
     $?.success?.should be_true
   end
 
-
-  it 'should identify clusters given a taxcollector database'
-
-  it 'should add names to otu abundance matrix given blat output' do
-    levels = %w{kingdom domain phylum class order genus speces}
-    # Ruby 1.9 vs Ruby 1.8
-    level = levels.sample rescue levels.choice
-    `./bin/lederhosen add_names --table=spec/data/otus.csv --blat=spec/data/blat.txt --level=#{level} --output=#{$test_dir}/named_otus.csv`
-    $?.success?.should be_true
-  end
-
-  it 'should squish otu abundance matrix by same name' do
-    `./bin/lederhosen squish --csv-file=#{$test_dir}/named_otus.csv --output=#{$test_dir}/squished.csv`
-    $?.success?.should be_true
-  end
+  it 'should create a fasta file containing representative reads for each cluster'
 end