RubyGems - lederhosen - Versions diffs - 2.0.6 → 2.0.7 - Mend

lederhosen 2.0.6 → 2.0.7

This diff shows the differences in content between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (4)

data/lederhosen.gemspec +2 -2
data/lib/lederhosen/tasks/otu_filter.rb +48 -34
data/lib/lederhosen/version.rb +1 -1
metadata +3 -3

data/lederhosen.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "lederhosen"
-  s.version = "2.0.6"
+  s.version = "2.0.7"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Austin G. Davis-Richardson"]
-  s.date = "2013-02-27"
+  s.date = "2013-03-11"
   s.description = "Various tools for OTU clustering"
   s.email = "harekrishna@gmail.com"
   s.executables = ["lederhosen"]

data/lib/lederhosen/tasks/otu_filter.rb CHANGED Viewed

@@ -18,57 +18,71 @@ module Lederhosen
       ohai "filtering otu file #{input} (reads = #{reads}, samples = #{min_samples})"
-      cluster_sample_count = Hash.new { |h, k| h[k] = Hash.new }
+      # make one pass finding which OTUs to keep
+      # create mask that maps which columns correspond to good OTUs
+      # pass over table again printing only those columns
-      ohai "loading csv file #{input}"
+      seen = Hash.new { |h, k| h[k] = 0 }
-      # slurp up CSV file
-      File.open input do |handle|
+      otu_order = []
+      pbar = ProgressBar.new 'counting', File.size(input)
+      total_reads = 0
+      File.open(input) do |handle|
         header = handle.gets.strip.split(',')
-        cluster_ids = header[0..-1]
+        header.each { |x| otu_order << x }
         handle.each do |line|
+          pbar.set handle.pos
           line = line.strip.split(',')
-          sample_id = line[0].to_sym
-          counts = line[1..-1].map(&:to_i)
-          cluster_ids.zip(counts).each do |cluster, count|
-            cluster_sample_count[cluster][sample_id] = count
+          sample_name = line[0]
+          abunds = line[1..-1].map &:to_i
+          otu_order.zip(abunds) do |o, a|
+            total_reads += a
+            seen[o] += 1 if a >= reads
           end
         end
       end
-      ohai "filtering"
+      pbar.finish
-      # filter sample_cluster_count
-      # todo: move filtered reads to 'unclassified_reads' classification
-      filtered = cluster_sample_count.reject { |k, v| v.reject { |k, v| v < reads }.size < min_samples }
+      mask = otu_order.map { |x| seen[x] >= min_samples }
-      # use functional programming they said
-      # it will make your better they said
-      noise = cluster_sample_count.keys - filtered.keys
+      ohai "found #{otu_order.size} otus, keeping #{mask.count(true)}"
-      ohai "saving to #{output}"
+      output = File.open(output, 'w')
-      # save the table
-      out = File.open(output, 'w')
-      samples = filtered.values.map(&:keys).flatten.uniq
-      clusters = filtered.keys
-      out.puts "-,#{clusters.join(',')},noise"
+      pbar = ProgressBar.new 'writing', File.size(input)
+      filtered_reads = 0
+      File.open(input) do |handle|
+        header = handle.gets.strip.split(',')
+        header = header.zip(mask).map { |k, m| k if m }.compact
+        output.print header.join(',')
+        output.print ",noise\n" # need a "noise" column
+        handle.each do |line|
+          pbar.set handle.pos
+          line = line.strip.split(',')
+          sample_name = line[0]
+          counts = line[1..-1].map &:to_i
+          kept_counts = counts.zip(mask).map { |c, m| c if m }.compact
+          noise = counts.zip(mask).map { |c, m| c unless m }.compact.inject(:+)
+          filtered_reads += noise
+          output.puts "#{sample_name},#{kept_counts.join(',')},#{noise}"
-      samples.each do |sample|
-        out.print "#{sample}"
-        clusters.each do |cluster|
-          out.print ",#{filtered[cluster][sample]}"
         end
-        noise_sum = noise.map { |n| cluster_sample_count[n][sample] }.inject(:+)
-        out.print ",#{noise_sum || 0}"
-        out.print "\n"
       end
-      out.close
-      ohai "kept #{filtered.keys.size} clusters (#{filtered.keys.size/cluster_sample_count.size.to_f})."
-      kept_reads = filtered.values.map { |x| x.values.inject(:+) }.inject(:+)
-      total_reads = cluster_sample_count.values.map { |x| x.values.inject(:+) }.inject(:+)
-      ohai "kept #{kept_reads}/#{total_reads} reads (#{kept_reads/total_reads.to_f})."
+      pbar.finish
+      ohai "kept #{total_reads - filtered_reads}/#{total_reads} reads (#{100*(total_reads - filtered_reads)/total_reads.to_f}%)"
+      output.close
     end
   end

data/lib/lederhosen/version.rb CHANGED Viewed

@@ -3,7 +3,7 @@ module Lederhosen
     MAJOR = 2
     MINOR = 0
     CODENAME = 'Schnittlauchbrot' # changes for minor versions
-    PATCH = 6
+    PATCH = 7
     STRING = [MAJOR, MINOR, PATCH].join('.')
   end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: lederhosen
 version: !ruby/object:Gem::Version
-  version: 2.0.6
+  version: 2.0.7
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-02-27 00:00:00.000000000 Z
+date: 2013-03-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dna
@@ -149,7 +149,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -902984509893215788
+      hash: -4462113867490056177
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements: