RubyGems - lederhosen - Versions diffs - 1.8.2 → 2.0.0 - Mend

lederhosen 1.8.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

data/Gemfile +1 -1
data/lederhosen.gemspec +7 -3
data/lib/lederhosen/no_tasks.rb +18 -18
data/lib/lederhosen/tasks/count_taxonomies.rb +83 -0
data/lib/lederhosen/tasks/get_reps.rb +3 -4
data/lib/lederhosen/tasks/make_udb.rb +2 -2
data/lib/lederhosen/tasks/otu_filter.rb +8 -1
data/lib/lederhosen/tasks/otu_table.rb +33 -70
data/lib/lederhosen/tasks/separate_unclassified.rb +65 -0
data/lib/lederhosen/uc_parser.rb +88 -0
data/lib/lederhosen/version.rb +4 -4
data/readme.md +107 -11
data/spec/cli_spec.rb +62 -10
data/spec/data/test.uc +9 -684
data/spec/data/trimmed/ILT_L_9_B_001.fasta +100 -1596
data/spec/no_tasks_spec.rb +1 -1
data/spec/uc_parser_spec.rb +0 -0
metadata +7 -3

data/Gemfile CHANGED Viewed

@@ -8,7 +8,7 @@ group :test do
   gem 'rspec', '2.12.0'
   gem 'rspec-prof', '0.0.3'
   gem 'pry'
-  gem 'plymouth'
+#  gem 'plymouth'
 end
 group :development do

data/lederhosen.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "lederhosen"
-  s.version = "1.8.2"
+  s.version = "2.0.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Austin G. Davis-Richardson"]
-  s.date = "2013-01-17"
+  s.date = "2013-01-24"
   s.description = "Various tools for OTU clustering"
   s.email = "harekrishna@gmail.com"
   s.executables = ["lederhosen"]
@@ -27,13 +27,16 @@ Gem::Specification.new do |s|
     "lib/lederhosen/cli.rb",
     "lib/lederhosen/no_tasks.rb",
     "lib/lederhosen/tasks/cluster.rb",
+    "lib/lederhosen/tasks/count_taxonomies.rb",
     "lib/lederhosen/tasks/get_reps.rb",
     "lib/lederhosen/tasks/join_otu_tables.rb",
     "lib/lederhosen/tasks/make_udb.rb",
     "lib/lederhosen/tasks/otu_filter.rb",
     "lib/lederhosen/tasks/otu_table.rb",
+    "lib/lederhosen/tasks/separate_unclassified.rb",
     "lib/lederhosen/tasks/split_fasta.rb",
     "lib/lederhosen/tasks/version.rb",
+    "lib/lederhosen/uc_parser.rb",
     "lib/lederhosen/version.rb",
     "readme.md",
     "scripts/illumina_pipeline/.gitignore",
@@ -46,7 +49,8 @@ Gem::Specification.new do |s|
     "spec/data/test.uc",
     "spec/data/trimmed/ILT_L_9_B_001.fasta",
     "spec/no_tasks_spec.rb",
-    "spec/spec_helper.rb"
+    "spec/spec_helper.rb",
+    "spec/uc_parser_spec.rb"
   ]
   s.homepage = "http://audy.github.com/lederhosen"
   s.licenses = ["MIT"]

data/lib/lederhosen/no_tasks.rb CHANGED Viewed

@@ -6,26 +6,24 @@ module Lederhosen
     no_tasks do
-      # parse a line of usearch prefix
-      # return a hash in the form:
-      # { :taxonomy => '', :identity => '0.00', ... }
-      # unless the line is not a "hit" in which case
-      # the function returns nil
-      def parse_usearch_line(str)
-        # skip non hits
-        return nil unless str =~ /^H/
-        str = str.split
-        taxonomic_description = str[9]
-        identity = str[3]
+      # get a taxonomic description from a line of usearch (uc) output
+      # return 'unclassified_reads' if the result was not a hit
+      # if the result was neither a hit nor a miss (for example, a seed)
+      # return nil
+      # this will probably break for different versions of uc file
+      # as produced by uclust or older versions of usearch
+      def get_tax(s)
+        dat = parse_usearch_line(s.strip)
+        if dat[:type] == 'H'
+          dat[:taxonomic_description].tr(',', '_')
+        elsif dat[:type] == 'N'
+          'unclassified_reads'
+        else
+          nil
+        end
+      end
-        # parse taxonomic_description
-        taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
-        { :identity => identity }.merge(taxonomies)
-      end
       # detect whether the taxonomy is one of the following
       # possible formats:
@@ -40,6 +38,8 @@ module Lederhosen
           :taxcollector
         elsif taxonomy =~ /^\d/
           :greengenes
+        elsif taxonomy.nil?
+          raise "nil ain't no taxonomy I ever heard of!"
         else
           :qiime
         end

data/lib/lederhosen/tasks/count_taxonomies.rb ADDED Viewed

@@ -0,0 +1,83 @@
+module Lederhosen
+  class CLI
+    desc 'count_taxonomies', 'count taxonomies from a uc file, generating a csv file with: <taxonomy>,<reads>'
+    method_option :input, :type => :string, :required => true
+    method_option :output, :type => :string, :required => true
+    method_option :strict, :type => :string, :default => false,
+                  :banner => '<level> only count reads where both taxonomies are in agreement at <level>'
+    def count_taxonomies
+      input  = options[:input]
+      output = options[:output]
+      strict = options[:strict]
+      ohai "generating #{output} from #{input}"
+      handle = File.open(input)
+      uc = UCParser.new(handle)
+      taxonomy_count =
+        if not strict
+          get_taxonomy_count(uc)
+        elsif strict
+          get_strict_taxonomy_count(uc, strict)
+        end
+      handle.close
+      out = File.open(output, 'w')
+      out.puts '# taxonomy, number_of_reads'
+      taxonomy_count.each_pair do |taxonomy, count|
+        out.puts "#{taxonomy.tr(',','_')},#{count}"
+      end
+      out.close
+    end
+    no_tasks do
+      # returns Hash of taxonomy => number_of_reads
+      def get_taxonomy_count(uc)
+        taxonomy_count = Hash.new { |h, k| h[k] = 0 }
+        uc.each do |result|
+          if result.hit?
+            taxonomy_count[result.target] += 1
+          else
+            taxonomy_count['unclassified_reads'] += 1
+          end
+        end
+        taxonomy_count
+      end
+      # returns Hash of taxonomy => number_of_reads
+      # if a pair of reads do not agree at a taxonomic level,
+      # or if at least one is unclassified, both reads are counted
+      # as unclassified_reads
+      def get_strict_taxonomy_count(uc, level)
+        taxonomy_count = Hash.new { |h, k| h[k] = 0 }
+        # TODO: I'm making a block for results because I don't know how to
+        # make results return an Enumerator when not given a block
+        uc.each_slice(2) do |left, right|
+          if left.miss? or right.miss? # at least one is a miss
+            taxonomy_count['unclassified_reads'] += 2
+          # both are hits, check taxonomies
+          else
+            ta = parse_taxonomy(left.target)
+            tb = parse_taxonomy(right.target)
+            # they match up, count both separately
+            if ta[level] == tb[level]
+              taxonomy_count[left.target] += 1
+              taxonomy_count[right.target] += 1
+            # they don't match up, count as unclassified
+            else
+              taxonomy_count['unclassified_reads'] += 2
+            end
+          end
+        end # results.each_slice
+        taxonomy_count
+      end
+    end
+  end
+end

data/lib/lederhosen/tasks/get_reps.rb CHANGED Viewed

@@ -23,10 +23,9 @@ module Lederhosen
       inputs.each do |input|
         File.open(input) do |handle|
-          pbar.inc
-          handle.each do |line|
-            header = parse_usearch_line(line.strip)
-            taxa << header['original'] rescue nil
+          results = UCParser.new(handle)
+          results.each do |result|
+            taxa << result.target if result.hit?
           end
         end
       end

data/lib/lederhosen/tasks/make_udb.rb CHANGED Viewed

@@ -3,8 +3,8 @@ module Lederhosen
     desc 'make_udb', 'format database for usearch'
-    method_option :input,       :type => :string,  :required => true
-    method_option :output,      :type => :string,  :required => true
+    method_option :input,  :type => :string, :required => true
+    method_option :output, :type => :string, :required => true
     def make_udb
       input       = options[:input]

data/lib/lederhosen/tasks/otu_filter.rb CHANGED Viewed

@@ -39,20 +39,27 @@ module Lederhosen
       ohai "filtering"
       # filter sample_cluster_count
+      # todo: move filtered reads to 'unclassified_reads' classification
       filtered = cluster_sample_count.reject { |k, v| v.reject { |k, v| v < reads }.size < min_samples }
+      # use functional programming they said
+      # it will make your better they said
+      noise = cluster_sample_count.keys - filtered.keys
       ohai "saving to #{output}"
       # save the table
       out = File.open(output, 'w')
       samples = filtered.values.map(&:keys).flatten.uniq
       clusters = filtered.keys
-      out.puts "-,#{clusters.join(',')}"
+      out.puts "-,#{clusters.join(',')},noise"
       samples.each do |sample|
         out.print "#{sample}"
         clusters.each do |cluster|
           out.print ",#{filtered[cluster][sample]}"
         end
+        noise_sum = noise.map { |n| cluster_sample_count[n][sample]}.inject(:+)
+        out.print ",#{noise_sum}"
         out.print "\n"
       end
       out.close

data/lib/lederhosen/tasks/otu_table.rb CHANGED Viewed

@@ -6,96 +6,59 @@ module Lederhosen
   class CLI
     desc "otu_table",
-         "create an OTU abundance matrix from USEARCH prefix"
+         "create an OTU abundance matrix from taxonomy count files"
     method_option :files,  :type => :string, :required => true
-    method_option :prefix, :type => :string, :required => true,
-                  :banner => 'prefix prefix'
-    method_option :levels, :type => :array, :required => true,
-                  :banner => 'valid options: domain, kingdom, phylum, class, order, genus, species, original (or all of them at once)'
+    method_option :level,  :type => :string, :required => true
+    method_option :output, :type => :string, :required => true
     def otu_table
-      input  = Dir[options[:files]]
-      prefix = options[:prefix]
-      levels = options[:levels].map(&:downcase)
+      inputs = Dir[options[:files]]
+      level  = options[:level].downcase
+      output = options[:output]
-      ohai "generating #{levels.join(', ')} table(s) from #{input.size} file(s) and saving to prefix #{prefix}."
+      ohai "Generating OTU matrix from #{inputs.size} inputs at #{level} level and saving to #{output}."
       # sanity check
-      levels.each do |level|
-        fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
-      end
+      fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
+      fail 'no inputs matched your glob' if inputs.size == 0
-      # there has to be a more efficient way of doing this
-      level_sample_cluster_count =
-        Hash.new do |h, k|
-          h[k] = Hash.new do |h, k|
-            h[k] = Hash.new(0)
-          end
-        end
+      sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
       # create a progress bar sized by the number of
       # input files we're slurping up
-      pbar = ProgressBar.new "loading", input.size
-      # Load cluster table
-      input.each do |input_file|
-        pbar.inc
-        File.open(input_file) do |handle|
-          handle.each do |line|
-            dat = parse_usearch_line(line.strip)
-            levels.each do |level|
-              name =
-                if dat.nil?
-                  'unclassified_reads'
-                else
-                  dat[level] || 'unparsed_name'
-                end
-              # remove commas from name
-              name = name.tr(',', '_')
-              # the next two lines are what is slow
-              level_sample_cluster_count[level][input_file][name] += 1
+      pbar = ProgressBar.new "loading", inputs.size
+      inputs.each do |input_file|
+        File.open(input_file).each do |line|
+          next if line =~ /^#/ # skip header(s)
+          line = line.strip.split(',')
+          taxonomy, count = line
+          count = count.to_i
+          tax =
+            if taxonomy == 'unclassified_reads'
+              'unclassified_reads'
+            else
+              parse_taxonomy(taxonomy)[level]
             end
-          end
+          sample_cluster_count[input_file][tax] += count
         end
       end
-      pbar.finish
+      all_clusters = sample_cluster_count.values.map(&:keys).flatten.uniq.sort
-      # get all taxonomic names at each level
-      all_names = Hash.new.tap do |bar|
-        level_sample_cluster_count.each_pair.map do |k, v|
-          names = v.each_value.map(&:keys).flatten.uniq
-          bar[k] = names
+      out = File.open(output, 'w')
+      out.puts all_clusters.join(',')
+      inputs.sort.each do |input|
+        out.print "#{input}"
+        all_clusters.each do |c|
+          out.print ",#{sample_cluster_count[input][c]}"
         end
+        out.print "\n"
       end
-      # save to csv(s)
-      levels.each do |level|
-        ohai "saving #{level} table"
-        File.open("#{prefix}.#{level}.csv", 'w') do |handle|
-          header = all_names[level].to_a.compact.sort
-          handle.puts "#{level.capitalize},#{header.join(',')}"
-          input.each do |sample|
-            handle.print "#{sample}"
-            header.each do |name|
-              handle.print ",#{level_sample_cluster_count[level][sample][name]}"
-            end
-            handle.print "\n"
-          end
-        end
-      end
     end
   end # class CLI
 end # module Lederhosen

data/lib/lederhosen/tasks/separate_unclassified.rb ADDED Viewed

@@ -0,0 +1,65 @@
+require 'set'
+module Lederhosen
+  class CLI
+    desc 'separate_unclassified',
+         'separate unclassified reads (with or without strict pairing)'
+    method_option :uc_file, :type => :string, :required => true
+    method_option :reads,   :type => :string, :required => true
+    method_option :output,  :type => :string, :required => true
+    method_option :strict,  :type => :string, :default => false
+    def separate_unclassified
+      uc_file = options[:uc_file]
+      reads   = options[:reads]
+      output  = options[:output]
+      strict  = options[:strict]
+      unclassifieds = Set.new
+      handle = File.open(uc_file)
+      uc = UCParser.new(handle)
+      if not strict
+        uc.each do |result|
+          unclassifieds << result.query if result.miss?
+        end
+      elsif strict
+        uc.each_slice(2) do |left, right|
+          if left.miss? || right.miss? # at least one is a miss
+            unclassifieds << left.query
+            unclassifieds << right.query
+          # both are hits, check taxonomies
+          else
+            ta = parse_taxonomy(right.target)
+            tb = parse_taxonomy(left.target)
+            # both are hits here: unclassified only if the assignments disagree
+            if (ta[strict] != tb[strict])
+              unclassifieds << left.query
+              unclassifieds << right.query
+            end
+          end
+        end
+      end
+      ohai "found #{unclassifieds.size} unclassified #{'(strict pairing)' if strict} reads."
+      handle.close
+      # open fasta file, output unclassified reads
+      out = File.open(output, 'w')
+      Dna.new(File.open(reads)).each do |record|
+        if unclassifieds.include? record.name
+          out.puts record
+        end
+      end
+      out.close
+    end
+  end
+end

data/lib/lederhosen/uc_parser.rb ADDED Viewed

@@ -0,0 +1,88 @@
+require 'ostruct'
+module Lederhosen
+  # represents a usearch result
+  class UResult
+    def initialize(hash)
+      @source = OpenStruct.new(hash)
+    end
+    def method_missing(method, *args, &block)
+      @source.send(method, *args, &block)
+    end
+    def hit?
+      @source.type == 'H'
+    end
+    def miss?
+      @source.type == 'N'
+    end
+  end
+  # class for parsing UC files, generates UResult objects
+  class UCParser
+    include Enumerable
+    def initialize(handle)
+      @handle = handle
+    end
+    def each(&block)
+      @handle.each do |line|
+        next if line =~ /^[#C]/ # skip comments and cluster summaries
+        dat = parse_usearch_line(line.strip)
+        result = UResult.new(dat)
+        block.call(result)
+      end
+    end
+    private
+    # parse a line of usearch (uc) output
+    # return a hash in the form:
+    # { :type => 'H', :query => '', :target => '', :identity => '0.00', ... }
+    # S, N and H records also get :length, :identity and :strand;
+    # C records get :cluster_size; any other record type raises
+    def parse_usearch_line(str)
+      # from http://drive5.com/usearch/manual/ucout.html
+      # 1   Record type S, H, C or N (see table below).
+      # 2   Cluster number (0-based).
+      # 3   Sequence length (S, N and H) or cluster size (C).
+      # 4   For H records, percent identity with target.
+      # 5   For H records, the strand: + or - for nucleotides, . for proteins.
+      # 6   Not used, parsers should ignore this field. Included for backwards compatibility.
+      # 7   Not used, parsers should ignore this field. Included for backwards compatibility.
+      # 8   Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 10).
+      # 9   Label of query sequence (always present).
+      # 10    Label of target sequence (H records only).
+      str = str.split("\t")
+      dat = {
+        :type => str[0],
+        :cluster_no => str[1],
+        :alignment => str[7],
+        :query => str[8],
+        :target => str[9],
+      }
+      r =
+        if dat[:type] =~ /[SNH]/ # hits
+          { :length => str[2].to_i,
+            :identity => str[3],
+            :strand => str[4],
+          }
+      elsif dat[:type] == 'C' # clusters
+        { :cluster_size => str[2].to_i }
+      else
+        raise Exception, "Do not understand record type #{str[0]}!"
+      end
+      dat.merge(r)
+    end
+  end
+end

data/lib/lederhosen/version.rb CHANGED Viewed

@@ -1,9 +1,9 @@
 module Lederhosen
   module Version
-    MAJOR = 1
-    MINOR = 8
-    CODENAME = 'Karottensaft' # changes for minor versions
-    PATCH = 2
+    MAJOR = 2
+    MINOR = 0
+    CODENAME = 'Schnittlauchbrot' # changes for minor versions
+    PATCH = 0
     STRING = [MAJOR, MINOR, PATCH].join('.')
   end