RubyGems - lederhosen - Versions diffs - 1.3.8 → 1.3.10 - Mend

lederhosen 1.3.8 → 1.3.10

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each version as it appears in its public registry.

Files changed (7)

data/lederhosen.gemspec +2 -2
data/lib/lederhosen/no_tasks.rb +10 -6
data/lib/lederhosen/tasks/otu_table.rb +16 -7
data/lib/lederhosen/tasks/trim.rb +16 -5
data/lib/lederhosen/version.rb +1 -1
data/lib/lederhosen.rb +3 -1
metadata +3 -3

data/lederhosen.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "lederhosen"
-  s.version = "1.3.8"
+  s.version = "1.3.10"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Austin G. Davis-Richardson"]
-  s.date = "2012-12-07"
+  s.date = "2012-12-13"
   s.description = "Various tools for OTU clustering"
   s.email = "harekrishna@gmail.com"
   s.executables = ["lederhosen"]

data/lib/lederhosen/no_tasks.rb CHANGED Viewed

@@ -1,3 +1,4 @@
 module Lederhosen
   class CLI
@@ -7,7 +8,7 @@ module Lederhosen
       # parse a line of usearch prefix
       # return a hash in the form:
-      # { :taxonomy => '', :identity => 0.00, ... }
+      # { :taxonomy => '', :identity => '0.00', ... }
       # unless the line is not a "hit" in which case
       # the function returns nil
       def parse_usearch_line(str)
@@ -18,7 +19,7 @@ module Lederhosen
         str = str.split
         taxonomic_description = str[9]
-        identity = str[3].to_f
+        identity = str[3]
         # parse taxonomic_description
         taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
@@ -59,9 +60,13 @@ module Lederhosen
         end
       end
+      RE_TAXCOLLECTOR = /^\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/
+      RE_GREENGENES = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?(.*);/
+      RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
       def parse_taxonomy_qiime(taxonomy)
         levels = %w{kingdom phylum class order family genus species}
-        match_data = taxonomy.match(/k__(\w*);p__(\w*);c__(\w*);o__(\w*);f__(\w*);g__(\w*);s__(\w*)/)
+        match_data = taxonomy.match(RE_QIIME)
         match_data = match_data[1..-1]
         names = Hash.new
@@ -74,7 +79,7 @@ module Lederhosen
       def parse_taxonomy_greengenes(taxonomy)
         levels = %w{kingdom phylum class order family genus species}
-        match_data = taxonomy.match(/k__(\w*); ?p__(\w*); ?c__(\w*); ?o__(\w*); ?f__(\w*); ?g__(\w*); ?(\w*);/)
+        match_data = taxonomy.match(RE_GREENGENES)
         match_data = match_data[1..-1]
         names = Hash.new
@@ -100,9 +105,8 @@ module Lederhosen
         match_data =
           begin
-            taxonomy.match(/\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/)[1..-1]
+            taxonomy.match(RE_TAXCOLLECTOR)[1..-1]
           rescue
-            $stderr.puts taxonomy.inspect
             return nil
           end

data/lib/lederhosen/tasks/otu_table.rb CHANGED Viewed

@@ -2,8 +2,6 @@
 # MAKE TABLES
 #
-require 'set'
 module Lederhosen
   class CLI
@@ -30,16 +28,19 @@ module Lederhosen
         fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
       end
-      level_sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } } }
-      all_names = Hash.new { |h, k| h[k] = Set.new }
+      # there has to be a more efficient way of doing this
+      level_sample_cluster_count =
+        Hash.new do |h, k|
+          h[k] = Hash.new do |h, k|
+            h[k] = Hash.new(0)
+          end
+        end
       # create a progress bar with the total number of bytes of
       # the files we're slurping up
       pbar = ProgressBar.new "loading", input.size
       # Load cluster table
       input.each do |input_file|
         pbar.inc
         File.open(input_file) do |handle|
@@ -54,8 +55,8 @@ module Lederhosen
                   dat[level] || 'unparsed_name'
                 end
+              # the next two lines are what is slow
               level_sample_cluster_count[level][input_file][name] += 1
-              all_names[level] << name
             end
           end
@@ -64,6 +65,14 @@ module Lederhosen
       pbar.finish
+      # get all taxonomic names at each level
+      all_names = Hash.new.tap do |bar|
+        level_sample_cluster_count.each_pair.map do |k, v|
+          names = v.each_value.map(&:keys).flatten.uniq
+          bar[k] = names
+        end
+      end
       # save to csv(s)
       levels.each do |level|

data/lib/lederhosen/tasks/trim.rb CHANGED Viewed

@@ -2,6 +2,8 @@
 # QUALITY TRIMMING
 #
+# This should probably be broken into its own module or command-line utility.
 module Lederhosen
   class CLI
@@ -10,10 +12,12 @@ module Lederhosen
     method_option :reads_dir, :type => :string, :required => true
     method_option :out_dir,   :type => :string, :required => true
+    method_option :pretrim,   :type => :numeric, :default => 11
     def trim
       raw_reads = options[:reads_dir]
       out_dir   = options[:out_dir]
+      pretrim   = options[:pretrim]
       ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"
@@ -92,16 +96,21 @@ module Lederhosen
       # returns just the sequence
       def trim_seq(dna, args={})
+        pretrim = args[:pretrim] || false
         # trim primers off of sequence
-        # (THIS IS EXPERIMENT-SPECIFIC)
-        dna.sequence = dna.sequence[11..-1]
-        dna.quality  = dna.quality[11..-1]
+        # XXX this is experiment-specific and needs to be made
+        # into a parameter
+        if pretrim
+          dna.sequence = dna.sequence[pretrim..-1]
+          dna.quality  = dna.quality[pretrim..-1]
+        end
         # throw away any read with an ambiguous primer
         return nil if dna.sequence =~ /N/
-        min    = args[:min]    || 20
-        offset = args[:cutoff] || 64
+        min    = args[:min]    || 20 # what is this constant?
+        offset = args[:cutoff] || 64 # XXX depends on sequencing tech.
         _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
@@ -116,6 +125,8 @@ module Lederhosen
             first = a
           end
         end
+        # XXX why is this rescue statement here?
         dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
       end
     end

data/lib/lederhosen/version.rb CHANGED Viewed

@@ -3,7 +3,7 @@ module Lederhosen
     MAJOR = 1
     MINOR = 3
     CODENAME = 'Dirndl' # changes for minor versions
-    PATCH = 8
+    PATCH = 10
     STRING = [MAJOR, MINOR, PATCH].join('.')
   end

data/lib/lederhosen.rb CHANGED Viewed

@@ -1,7 +1,9 @@
 require 'rubygems'
 require 'bundler'
-Bundler.require :default
 require 'set'
+require 'dna'
+require 'progressbar'
+require 'thor'
 Dir.glob(File.join(File.dirname(__FILE__), 'lederhosen', '*.rb')).each { |f| require f }

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: lederhosen
 version: !ruby/object:Gem::Version
-  version: 1.3.8
+  version: 1.3.10
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-07 00:00:00.000000000 Z
+date: 2012-12-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dna
@@ -176,7 +176,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 1569227273029021963
+      hash: 2999187278262571353
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements: