RubyGems - lederhosen - Versions diffs - 0.0.8 → 0.0.9 - Mend

lederhosen 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

data/lib/lederhosen/helpers.rb +23 -25
data/lib/lederhosen/tasks/cluster.rb +3 -3
data/lib/lederhosen/tasks/join.rb +2 -2
data/lib/lederhosen/tasks/name.rb +29 -0
data/lib/lederhosen/tasks/otu_table.rb +52 -34
data/lib/lederhosen/tasks/rep_reads.rb +44 -0
data/lib/lederhosen/tasks/sort.rb +2 -3
data/lib/lederhosen/tasks/split.rb +3 -3
data/lib/lederhosen/tasks/trim.rb +1 -1
data/lib/version.rb +1 -1
data/readme.md +32 -11
metadata +7 -5

data/lib/lederhosen/helpers.rb CHANGED Viewed

@@ -76,56 +76,54 @@ module Lederhosen
     def load_uc_file(input)
       clusters = Hash.new
-      # store a list of samples
+      # keep track of samples
+      samples = Set.new
+      # store a list of all the sample IDs
       clusters[:samples] = Set.new
       # data for each cluster
-      # - total size
-      # - size by sample
-      # - seed sequence
-      clusters[:count_data] = Hash.new
+      # clstr_counts[:clstr][:sample] = number_of_reads
+      clstr_counts = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
+      # seed_to_clstrnr[seed_sequence_id] = clstr_nr
+      seed_to_clstrnr = Hash.new
       File.open(input) do |handle|
         handle.each do |line|
-          # skip comments
-          next if line =~ /^#/
+          next if line =~ /^#/ # skip comments
           line = line.strip.split
           # things we want to know
           type        = line[0]
-          clusternr   = line[1]
+          clusternr   = line[1].to_i
           querylabel  = line[8]
           targetlabel = line[9]
           sample      = line[8].split(':')[2]
+          # keep track of samples
+          samples.add sample
           # keep track of all samples
-          clusters[:samples] << sample
+          clusters[:samples].add sample
           if type == 'S' # = Seed Sequence
-            clusters[:count_data][clusternr] = { :seed => querylabel, :total => 1, :counts => Hash.new{ |h, k| h[k] = 0 } }
+            clstr_counts[clusternr][sample] += 1
+            seed_to_clstrnr[querylabel] = clusternr
           elsif type == 'H' # = Seed Member
-            clusters[:count_data][clusternr][:total] += 1
-            clusters[:count_data][clusternr][:counts][sample] += 1
+            clstr_counts[clusternr][sample] += 1
           end
         end
       end
-      clusters
+      return {
+        :clstr_counts    => clstr_counts,
+        :seed_to_clstrnr => seed_to_clstrnr,
+        :samples         => samples
+      }
     end
-    def cluster_data_as_csv(data)
-      samples = data[:samples].to_a
-      counts = data[:count_data]
-      sep = ","
-      csv = []
-      csv << ['-'] + samples
-      counts.keys.each do |cluster|
-        csv << ["cluster-#{cluster}"] + samples.collect { |x| "#{counts[cluster][:counts][x]}" }
-      end
-      csv.collect { |x| x.join("\t")}.join("\n")
-    end
     end # class << self
   end # class Helpers

data/lib/lederhosen/tasks/cluster.rb CHANGED Viewed

@@ -8,9 +8,9 @@ module Lederhosen
     desc "cluster fasta file",
          "--input=sorted.fasta --identity=0.80 --output=clusters.uc"
-    method_option :input,    :type => :string,  :default => 'sorted.fasta'
-    method_option :output,   :type => :string,  :default => 'clusters.uc'
-    method_option :identity, :type => :numeric, :default => 0.8
+    method_option :input,    :type => :string,  :required => true
+    method_option :output,   :type => :string,  :required => true
+    method_option :identity, :type => :numeric, :required => true
     def cluster
       identity = options[:identity]

data/lib/lederhosen/tasks/join.rb CHANGED Viewed

@@ -7,8 +7,8 @@ module Lederhosen
     desc "join reads end-to-end",
          "--trimmed=trimmed/*.fasta --output=joined.fasta"
-    method_option :trimmed, :type => :string, :default => 'trimmed/*.fasta'
-    method_option :output,  :type => :string, :default => 'joined.fasta'
+    method_option :trimmed, :type => :string, :required => true
+    method_option :output,  :type => :string, :required => true
     def join

data/lib/lederhosen/tasks/name.rb ADDED Viewed

@@ -0,0 +1,29 @@
+##
+# IDENTIFY CLUSTERS IN A TAXCOLLECTOR DATABASE
+#
+module Lederhosen
+  class CLI
+    desc "name identify clusters in a taxcollector database",
+         "--reps representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
+    method_option :reps,     :type => :string, :required => true
+    method_option :database, :type => :string, :required => true
+    method_option :output,   :type => :string, :required => true
+    def name
+      reps     = options[:reps]
+      database = options[:database]
+      output   = options[:output]
+      # run blat/blast
+      cmd = [
+      'blat',
+      ]
+    end
+  end
+end

data/lib/lederhosen/tasks/otu_table.rb CHANGED Viewed

@@ -2,58 +2,76 @@
 # MAKE TABLES
 #
+SEP = ','
 module Lederhosen
   class CLI
-    desc "otu_tables generates otu tables & representative reads",
-         "--clusters=clusters.uc --output=otu_prefix --joined=joined.fasta"
+    desc "otu_tables generates otu tables",
+         "--clusters=clusters.uc --output=otu_prefix"
-    method_option :clusters, :type => :string, :default => 'clusters.uc'
-    method_option :output,   :type => :string, :default => 'otus'
-    method_option :joined,   :type => :string, :default => 'joined.fasta'
+    method_option :clusters, :type => :string, :required => true
+    method_option :output,   :type => :string, :required => true
     def otu_table
-      input = options[:clusters]
-      output = options[:output]
+      input        = options[:clusters]
+      output       = options[:output]
       joined_reads = options[:joined]
-      clusters = Hash.new
       # Load cluster table!
-      clusters = Helpers.load_uc_file(input)
+      clstr_info      = Helpers.load_uc_file input
+      clstr_counts    = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
+      clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
+      samples         = clstr_info[:samples]
-      clusters_total = clusters[:count_data].values.collect{ |x| x[:total] }.inject(:+)
-      # Get representative sequences!
-      reads_total = 0
-      representatives = {}
-      clusters[:count_data].each{ |k, x| representatives[x[:seed]] = k }
+      # print OTU abundancy matrix
+      File.open("#{output}.csv", 'w') do |h|
+        samples  = samples.sort
+        clusters = clstr_counts.keys
-      out_handle = File.open("#{output}.fasta", 'w')
+        # print header
+        head = samples.join(SEP)
+        h.puts "-" + SEP + head
-      File.open(joined_reads) do |handle|
-        records = Dna.new handle
-        records.each do |dna|
-          reads_total += 1
-          if !representatives[dna.name].nil?
-            dna.name = "#{dna.name}:cluster_#{representatives[dna.name]}"
-            out_handle.puts dna
+        # start printing clusters
+        clusters.each do |cluster|
+          h.print "cluster-#{cluster}"
+          samples.each do |sample|
+            h.print "#{SEP}#{clstr_counts[cluster][sample]}"
           end
+          h.print "\n"
         end
       end
-      out_handle.close
+      # # Get representative sequences!
+      # reads_total = 0
+      # representatives = {}
+      # clusters[:count_data].each{ |k, x| representatives[x[:seed]] = k }
+      #
+      # out_handle = File.open("#{output}.fasta", 'w')
+      #
+      # File.open(joined_reads) do |handle|
+      #   records = Dna.new handle
+      #   records.each do |dna|
+      #     reads_total += 1
+      #     if !representatives[dna.name].nil?
+      #       dna.name = "#{dna.name}:cluster_#{representatives[dna.name]}"
+      #       out_handle.puts dna
+      #     end
+      #   end
+      # end
+      #
+      # out_handle.close
+      #
+      # # Print some statistics
+      # ohai "reads in clusters:  #{clusters_total}"
+      # ohai "number of reads:    #{reads_total}"
+      # ohai "unique clusters:    #{clusters.keys.length}"
-      # Print some statistics
-      ohai "reads in clusters:  #{clusters_total}"
-      ohai "number of reads:    #{reads_total}"
-      ohai "unique clusters:    #{clusters.keys.length}"
-      # print OTU abundancy matrix
-      csv = Helpers.cluster_data_as_csv(clusters)
-      File.open("#{output}.csv", 'w') do |h|
-        h.puts csv
-      end
     end

data/lib/lederhosen/tasks/rep_reads.rb ADDED Viewed

@@ -0,0 +1,44 @@
+##
+# GET REPRESENTATIVE READS
+#
+module Lederhosen
+  class CLI
+    desc "rep_reads extract representative reads for each cluster to a fasta file",
+         "--clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta"
+    method_option :clusters, :type => :string, :required => true
+    method_option :output,   :type => :string, :required => true
+    method_option :joined,   :type => :string, :required => true
+    def rep_reads
+      input        = options[:clusters]
+      output       = options[:output]
+      joined_reads = options[:joined]
+      # Load cluster table!
+      clstr_info      = Helpers.load_uc_file input
+      clstr_counts    = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
+      seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
+      samples         = clstr_info[:samples]
+      out_handle = File.open("#{output}", 'w')
+      File.open(joined_reads) do |handle|
+        records = Dna.new handle
+        records.each do |dna|
+          clstrnr = seed_to_clstrnr[dna.name]
+          unless clstrnr.nil?
+            dna.name = "#{dna.name}:cluster-#{clstrnr}"
+            out_handle.puts dna
+          end
+        end
+      end
+      out_handle.close
+    end
+  end
+end

data/lib/lederhosen/tasks/sort.rb CHANGED Viewed

@@ -8,9 +8,8 @@ module Lederhosen
     desc "sort fasta file by length",
          "--input=joined.fasta --output=sorted.fasta"
-    method_options :input => :string, :output => :string
-    method_option :input,  :type => :string, :default => 'joined.fasta'
-    method_option :output, :type => :string, :default => 'sorted.fasta'
+    method_option :input,  :type => :string, :required => true
+    method_option :output, :type => :string, :required => true
     def sort
       input = options[:input]

data/lib/lederhosen/tasks/split.rb CHANGED Viewed

@@ -8,9 +8,9 @@ module Lederhosen
     desc "output separate fasta file containing sequences belonging to each cluster",
          "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"
-    method_option :clusters,      :type => :string,  :default => 'clusters.uc'
-    method_option :reads,         :type => :string,  :default => 'joined.fasta'
-    method_option :out_dir,       :type => :string,  :default => 'clusters_split'
+    method_option :clusters,      :type => :string, :required => true
+    method_option :reads,         :type => :string, :required => true
+    method_option :out_dir,       :type => :string, :required => true
     method_option :buffer_size,   :type => :numeric, :default => 1000
     method_option :min_clst_size, :type => :numeric, :default => 100

data/lib/lederhosen/tasks/trim.rb CHANGED Viewed

@@ -9,7 +9,7 @@ module Lederhosen
          "--reads_dir=reads/* --out_dir=trimmed.fasta"
     method_option :reads_dir, :type => :string, :required => true
-    method_option :out_dir,   :type => :string, :default => 'trimmed/'
+    method_option :out_dir,   :type => :string, :required => true
     def trim

data/lib/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Lederhosen
-  VERSION = '0.0.8'
+  VERSION = '0.0.9'
 end

data/readme.md CHANGED Viewed

@@ -1,11 +1,16 @@
 # Lederhosen
-Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
+Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own risk.
 ## How do I get Lederhosen?
-0. Obtain & Install uclust (64-bit)
-1. `sudo gem install lederhosen`
+0. Obtain & Install [UCLUST](http://www.drive5.com/) (64-bit)
+1. Obtain & Install [BLAT](http://genome.ucsc.edu/FAQ/FAQblat.html#blat3)
+2. Get a copy of [TaxCollector](http://github.com/audy/taxcollector)
+3. Install Lederhosen by typing:
+    `sudo gem install lederhosen`
+4. Check installation by typing `lederhosen`. You should see some help text.
 ## How do I use Lederhosen?
@@ -13,26 +18,42 @@ Type `lederhosen help` for complete instructions
 ### 1. Trim raw reads
-`$ lederhosen trim --reads-dir=reads-dir/*.txt`
+`$ lederhosen trim --reads-dir=reads-dir/*.txt --out-dir=trimmed`
 ### 2. Join trimmed reads
-`$ lederhosen join`
+`$ lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta`
 ### 3. Sort trimmed reads
-`$ lederhosen sort`
+`$ lederhosen sort --input=joined.fasta --output=sorted.fasta`
 ### 4. Cluster sorted reads
-`$ lederhosen cluster --idenity=0.975`
+`$ lederhosen cluster --identity=0.975 --input=sorted.fasta --output=clusters.uc`
+### 5. Make OTU tables
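+`% lederhosen otu_table --clusters=clusters.uc --output=clusters_975`
+This will output a csv file (`clusters_975.csv`; the `.csv` extension is appended to the value of `--output`). The `otu_table` task no longer writes a fasta file of representative reads; use `rep_reads` to get a fasta file that can be used to identify clusters in a 16S rRNA database using BLAST or a similar tool.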
+### 6. Get representative reads from each cluster
-### 5. Make tables & Get representative sequences
+`% lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representatives.fasta`
-`% lederhosen otu_table --clusters=clusters.uc --output=clusters9.75.txt`
+### 7. Get a fasta file containing all reads for each cluster
-### 6. Get fasta files with reads for each cluster
+(time consuming and probably not necessary)
 `% lederhosen split --clusters=clusters_97.5.txt --reads=joined.fasta --min-clst-size=100`
-`--min-clst-size` is the minimum reads a cluster must have in order to for a fasta file containing its reads to be created. The reason for needing this because it is computationally prohibitive to randomly write millions of files or store all reads in memory, sort, and output non-randomly.
+`--min-clst-size` is the minimum number of reads a cluster must have in order for a fasta file containing its reads to be created. This is needed because it is computationally prohibitive to randomly write millions of files or store all reads in memory, sort, and output non-randomly.
+### 8. Identifying Clusters
+(Still under development)
+You need BLAT (in your `$PATH`) & TaxCollector.
+`$ lederhosen name --reps=representatives.fasta --database=taxcollector.fa --output=output_prefix`

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: lederhosen
 version: !ruby/object:Gem::Version
-  hash: 15
+  hash: 13
   prerelease:
   segments:
   - 0
   - 0
-  - 8
-  version: 0.0.8
+  - 9
+  version: 0.0.9
 platform: ruby
 authors:
 - Austin G. Davis-Richardson
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-13 00:00:00 Z
+date: 2012-05-01 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dna
@@ -122,7 +122,9 @@ files:
 - lib/lederhosen/helpers.rb
 - lib/lederhosen/tasks/cluster.rb
 - lib/lederhosen/tasks/join.rb
+- lib/lederhosen/tasks/name.rb
 - lib/lederhosen/tasks/otu_table.rb
+- lib/lederhosen/tasks/rep_reads.rb
 - lib/lederhosen/tasks/sort.rb
 - lib/lederhosen/tasks/split.rb
 - lib/lederhosen/tasks/trim.rb
@@ -164,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: lederhosen
-rubygems_version: 1.8.21
+rubygems_version: 1.8.24
 signing_key:
 specification_version: 3
 summary: 16S rRNA clustering for paired-end Illumina