lederhosen 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- data/lederhosen.gemspec +1 -11
- data/lib/lederhosen/tasks/otu_table.rb +4 -0
- data/lib/lederhosen/tasks/trim.rb +89 -2
- data/lib/lederhosen/version.rb +1 -1
- data/readme.md +13 -11
- data/spec/cli_spec.rb +1 -1
- metadata +3 -13
- data/examples/hierarchical_clustering.sh +0 -51
- data/examples/pipeline.sh +0 -71
- data/lib/lederhosen/buffer.rb +0 -54
- data/lib/lederhosen/helpers.rb +0 -166
- data/lib/lederhosen/tasks/k_filter.rb +0 -82
- data/lib/lederhosen/tasks/rep_reads.rb +0 -45
- data/lib/lederhosen/tasks/split.rb +0 -84
- data/lib/lederhosen/tasks/uc_filter.rb +0 -80
- data/lib/lederhosen/tasks/uc_stats.rb +0 -41
- data/spec/helpers_spec.rb +0 -30
data/lederhosen.gemspec
CHANGED
@@ -5,7 +5,7 @@
 
 Gem::Specification.new do |s|
   s.name = "lederhosen"
-  s.version = "1.0.1"
+  s.version = "1.0.2"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Austin G. Davis-Richardson"]
@@ -23,24 +23,15 @@ Gem::Specification.new do |s|
     "LICENSE.txt",
     "Rakefile",
     "bin/lederhosen",
-    "examples/hierarchical_clustering.sh",
-    "examples/pipeline.sh",
     "lederhosen.gemspec",
     "lib/lederhosen.rb",
-    "lib/lederhosen/buffer.rb",
     "lib/lederhosen/cli.rb",
-    "lib/lederhosen/helpers.rb",
     "lib/lederhosen/tasks/cluster.rb",
-    "lib/lederhosen/tasks/k_filter.rb",
     "lib/lederhosen/tasks/make_udb.rb",
     "lib/lederhosen/tasks/otu_filter.rb",
     "lib/lederhosen/tasks/otu_table.rb",
-    "lib/lederhosen/tasks/rep_reads.rb",
-    "lib/lederhosen/tasks/split.rb",
     "lib/lederhosen/tasks/split_fasta.rb",
     "lib/lederhosen/tasks/trim.rb",
-    "lib/lederhosen/tasks/uc_filter.rb",
-    "lib/lederhosen/tasks/uc_stats.rb",
     "lib/lederhosen/tasks/version.rb",
    "lib/lederhosen/version.rb",
     "readme.md",
@@ -50,7 +41,6 @@ Gem::Specification.new do |s|
     "spec/data/ILT_L_9_B_002_1.txt.gz",
     "spec/data/ILT_L_9_B_002_3.txt.gz",
     "spec/data/test.uc",
-    "spec/helpers_spec.rb",
     "spec/misc_spec.rb",
     "spec/spec_helper.rb"
   ]
data/lib/lederhosen/tasks/otu_table.rb
CHANGED
@@ -26,9 +26,11 @@ module Lederhosen
       sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
 
       all_names = Set.new
+      pbar = ProgressBar.new "loading", input.size
 
       # Load cluster table
       input.each do |input_file|
+        pbar.inc
         File.open(input_file) do |handle|
           handle.each do |line|
             dat = parse_usearch_line(line.strip)
@@ -41,6 +43,8 @@ module Lederhosen
         end
       end
 
+      pbar.finish
+
       ohai "found #{all_names.size} unique taxa at #{level} level"
 
       # save to csv
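The four added lines wrap the loading loop in a coarse progress bar, one tick per input file. A minimal sketch of the ProgressBar API these calls imply (this appears to be the classic `progressbar` gem's 0.x interface; the file list here is illustrative, not lederhosen's):

    require 'progressbar'

    files = Dir.glob('clusters/*.uc')              # hypothetical inputs
    pbar  = ProgressBar.new('loading', files.size) # title, total units

    files.each do |f|
      pbar.inc   # advance the bar by one unit per file
      # ... parse f ...
    end

    pbar.finish  # draw the bar at 100% and move to a new line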
data/lib/lederhosen/tasks/trim.rb
CHANGED
@@ -19,7 +19,7 @@ module Lederhosen
 
       run "mkdir -p #{out_dir}"
 
-      raw_reads = Helpers.get_grouped_qseq_files raw_reads
+      raw_reads = get_grouped_qseq_files raw_reads
 
       ohai "found #{raw_reads.length} pairs of reads"
 
@@ -28,10 +28,97 @@ module Lederhosen
        pbar.inc
        out = File.join(out_dir, "#{File.basename(a[0])}.fasta")
        # TODO get total and trimmed
-       total, trimmed = Helpers.trim_pairs a[1][0], a[1][1], out, :min_length => 70
+       total, trimmed = trim_pairs a[1][0], a[1][1], out, :min_length => 70
      end
      pbar.finish
 
    end
+
+    no_tasks do
+
+      # reverse complement a DNA sequence
+      # assumes only GATCN nucleotides
+      def reverse_complement(s)
+        s.reverse.tr('GATCNgatcn','CTAGNctagn')
+      end
+
+      # Function for grouping qseq files produced by splitting illumina
+      # reads by barcode
+      #
+      # Filenames should look like this:
+      # IL5_L_1_B_007_1.txt
+      def get_grouped_qseq_files(glob='raw_reads/*.txt')
+        Dir.glob(glob).group_by { |x| File.basename(x).split('_')[0..4].join('_') }
+      end
+
+      # Trim a pair of QSEQ files. Saves to a single,
+      # interleaved .fasta file
+      def trim_pairs(left, right, out, args={})
+        cutoff = args[:cutoff] || 20
+        min_length = args[:min_length] || 70
+
+        left_handle, right_handle =
+          begin
+            [ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
+          rescue Zlib::GzipFile::Error
+            [ File.open(left), File.open(right) ]
+          end
+
+        out_handle = File.open out, 'w'
+
+        left_reads = Dna.new left_handle
+        right_reads = Dna.new right_handle
+
+        i = 0
+        left_reads.zip(right_reads).each do |a, b|
+          i += 1
+          seqa = trim_seq a
+          seqb = trim_seq b
+          unless [seqa, seqb].include? nil
+            if seqb.length >= min_length && seqa.length >= min_length
+              seqb = reverse_complement(seqb)
+              out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
+            end
+          end
+        end
+        left_handle.close
+        right_handle.close
+        out_handle.close
+      end
+
+      # Return longest subsequence with quality scores
+      # greater than min. (Illumina PHRED)
+      # Trim2 from Huang, et. al
+      # returns just the sequence
+      def trim_seq(dna, args={})
+
+        # trim primers off of sequence
+        # (THIS IS EXPERIMENT-SPECIFIC)
+        dna.sequence = dna.sequence[11..-1]
+        dna.quality = dna.quality[11..-1]
+
+        # throw away any read with an ambiguous primer
+        return nil if dna.sequence =~ /N/
+
+        min = args[:min] || 20
+        offset = args[:cutoff] || 64
+
+        _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
+
+        dna.quality.each_byte.each_with_index do |b, a|
+          _sum += (b - offset - min)
+          if _sum > _max
+            _max = _sum
+            _end = a
+            start = first
+          elsif _sum < 0
+            _sum = 0
+            first = a
+          end
+        end
+        dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
+      end
+    end
+
 end
 end
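The bulk of this addition inlines the former `Helpers` methods into the task class as Thor `no_tasks` instance methods. The core of `trim_seq` is the Trim2 scan from Huang et al.: a Kadane-style pass over per-base scores of `phred - min`, keeping the maximal-scoring window and resetting whenever the running sum dips below zero. A standalone sketch of that scan (the method name and sample string below are illustrative, not part of lederhosen's API):

    # score each base as (phred - min); track the best-scoring run,
    # resetting the window start whenever the running sum goes negative
    def best_window(quality, offset = 64, min = 20)
      sum = max = first = start = stop = 0
      quality.each_byte.each_with_index do |q, i|
        sum += (q - offset - min)
        if sum > max
          max, stop, start = sum, i, first
        elsif sum < 0
          sum, first = 0, i
        end
      end
      start..stop
    end

    # 'h' scores +20 per base (PHRED+64 quality 40), 'B' scores -18
    # (quality 2), so the high-quality prefix is kept:
    best_window('hhhhhhhhBBBBBB')  # => 0..7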
data/lib/lederhosen/version.rb
CHANGED
data/readme.md
CHANGED
@@ -2,13 +2,18 @@
 
 Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
 
-
+### About
 
-Lederhosen is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/).
+- Lederhosen is a project born out of the Triplett Lab at the University of Florida.
+- Lederhosen is designed to be a fast and simple method of clustering 16S rRNA amplicons sequenced
+using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq).
+- Lederhosen uses Semantic Versioning.
+- Lederhosen is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/).
+- Except for USEARCH which requires a license, Lederhosen is available for commercial use.
 
 ## How do I get Lederhosen?
 
-0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine)
+0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine for non-commercial use)
 2. Get a copy of [TaxCollector](http://github.com/audy/taxcollector)
 3. Install Lederhosen by typing:
 
@@ -18,13 +23,8 @@ Lederhosen is free and open source under the [MIT open source license](http://op
 ## Features
 
 - Sequence trimming (paired-end Illumina).
--
--
-- UCLUST output filtering.
-- Separation of representative reads.
-- Separation of all reads belonging to each cluster.
-- Identification of clusters using TaxCollector.
-- Generation of OTU abundancy matrices.
+- Parallel, referenced-based clustering to TaxCollector using USEARCH
+- Generation and filtering of OTU abundancy matrices.
 
 ## How do I use Lederhosen?
 
@@ -40,6 +40,8 @@ Trim (Illumina) reads using quality scores. Output will be a directory of fasta
 
     lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
 
+The trimming process will reverse complement the "right" pair so that both reads are in the forward orientation.
+
 ### Create Database
 
 Create UDB database required by usearch from TaxCollector
@@ -58,4 +60,4 @@ Create an OTU abundance table where rows are samples and columns are clusters. T
 
     lederhosen otu_table --clusters=clusters_95.uc --output=genus.csv --level=genus
 
-Level can be Kingdom, Domain, Phylum, Class, Order, Family or Genus.
+Level can be Kingdom, Domain, Phylum, Class, Order, Family or Genus.
data/spec/cli_spec.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: lederhosen
 version: !ruby/object:Gem::Version
-  hash: 21
+  hash: 19
   prerelease:
   segments:
   - 1
   - 0
-  - 1
-  version: 1.0.1
+  - 2
+  version: 1.0.2
 platform: ruby
 authors:
 - Austin G. Davis-Richardson
@@ -119,24 +119,15 @@ files:
 - LICENSE.txt
 - Rakefile
 - bin/lederhosen
-- examples/hierarchical_clustering.sh
-- examples/pipeline.sh
 - lederhosen.gemspec
 - lib/lederhosen.rb
-- lib/lederhosen/buffer.rb
 - lib/lederhosen/cli.rb
-- lib/lederhosen/helpers.rb
 - lib/lederhosen/tasks/cluster.rb
-- lib/lederhosen/tasks/k_filter.rb
 - lib/lederhosen/tasks/make_udb.rb
 - lib/lederhosen/tasks/otu_filter.rb
 - lib/lederhosen/tasks/otu_table.rb
-- lib/lederhosen/tasks/rep_reads.rb
-- lib/lederhosen/tasks/split.rb
 - lib/lederhosen/tasks/split_fasta.rb
 - lib/lederhosen/tasks/trim.rb
-- lib/lederhosen/tasks/uc_filter.rb
-- lib/lederhosen/tasks/uc_stats.rb
 - lib/lederhosen/tasks/version.rb
 - lib/lederhosen/version.rb
 - readme.md
@@ -146,7 +137,6 @@ files:
 - spec/data/ILT_L_9_B_002_1.txt.gz
 - spec/data/ILT_L_9_B_002_3.txt.gz
 - spec/data/test.uc
-- spec/helpers_spec.rb
 - spec/misc_spec.rb
 - spec/spec_helper.rb
 homepage: http://audy.github.com/lederhosen
data/examples/hierarchical_clustering.sh
DELETED
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-set -e
-set -x
-
-# Hierarchical OTU clustering
-# Austin G. Davis-Richardson
-# <harekrishna at gmail dot com>
-# http://github.com/audy/lederhosen
-
-reads='sorted.fasta'
-out='h_clustering'
-
-mkdir -p $out
-
-# initial clustering at 80%
-lederhosen cluster --input=$reads --output=$out/clusters_0.80.uc --identity=0.80
-
-# filter UC file
-lederhosen uc_filter --input=$out/clusters_0.80.uc --output=$out/clusters_0.80.uc.filtered --reads=1 --samples=1
-
-# get reads for each cluster
-mkdir -p $out/split_80
-lederhosen split --clusters=$out/clusters_0.80.uc.filtered --reads=$reads --out-dir=$out/split_80/
-
-# now cluster each of those at 90%
-for fasta in $out/split_80/*.fasta
-do
-
-    # sort (awww, do I really have to do this again?)
-    lederhosen sort --input=$fasta --output=$fasta.sorted
-
-    # cluster
-    lederhosen cluster --input=$fasta.sorted --output=$fasta.uc --identity=0.90
-
-    # split
-    split=$out/split_80.90_$(basename $fasta .fasta)
-    lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split
-done
-
-# Do it again at 95%
-for fasta in $out/split_80/split_*_90.fasta/*.fasta
-do
-    # cluster
-    lederhosen cluster --input=$fasta --output=$fasta.uc --identity=90
-
-    # split
-    split=$outdir/80.90.$fasta.fasta
-    mkdir -p $split
-    lederhosen split --clusters=$fasta.uc --reads=$input --out-dir=$split
-done
data/examples/pipeline.sh
DELETED
@@ -1,71 +0,0 @@
-#!/bash
-
-# An example OTU clustering pipeline
-# Austin G. Davis-Richardson
-# <harekrishna at gmail dot com>
-# http://github.com/audy/lederhosen
-
-set -e
-
-raw_reads='spec/data/*.txt'
-out_dir='pipeline'
-taxcollector='taxcollector.fa'
-min_reads=50
-min_samples=10
-
-# trim reads
-lederhosen trim \
-    --reads-dir=$raw_reads \
-    --out-dir=$out_dir/trimmed
-
-# join reads
-lederhosen join \
-    --trimmed=$out_dir/trimmed/*.fasta \
-    --output=$out_dir/joined.fasta
-
-# filter reads
-lederhosen k_filter \
-    --input=$out_dir/joined.fasta \
-    --output=$out_dir/filtered.fasta \
-    -k=10 \
-    --cutoff=50
-
-# sort
-lederhosen sort \
-    --input=$out_dir/filtered.fasta \
-    --output=$out_dir/sorted.fasta
-
-for i in 0.80 0.90 0.95
-do
-    # cluster
-    lederhosen cluster \
-        --input=$out_dir/sorted.fasta \
-        --output=$out_dir/clusters_"$i".uc \
-        --identity=$i
-
-    # filter uc file
-    lederhosen uc_filter \
-        --input=$out_dir/clusters_"$i".uc \
-        --output=$out_dir/clusters_"$i".uc.filtered \
-        --reads=$min_reads \
-        --samples=$min_samples \
-
-    # generate otu table
-    lederhosen otu_table \
-        --clusters=$out_dir/clusters_"$i".uc.filtered \
-        --output=$out_dir/otus_"$i"
-
-    # get representative reads
-    lederhosen rep_reads \
-        --clusters=$out_dir/clusters_"$i".uc.filtered \
-        --joined=$out_dir/sorted.fasta \
-        --output=$out_dir/representatives_"$i".fasta
-
-    # blast representative reads
-    lederhosen name \
-        --reps=$out_dir/representatives_"$i".fasta \
-        --output=$out_dir/taxonomies_"$i".txt \
-        --database=$taxcollector
-done
-
-echo "complete!"
data/lib/lederhosen/buffer.rb
DELETED
@@ -1,54 +0,0 @@
-module Lederhosen
-
-  class Buffer
-    # for when you need to write out to a shitload of files.
-
-    #
-    # Create a new buffer
-    #
-    def initialize(args={})
-      @buffer = Hash.new { |h, k| h[k] = Array.new }
-      @buffer_max = args[:buffer_max] || 100_000
-    end
-
-    #
-    # Add an object to the buffer
-    #
-    def add_to bucket, obj
-
-      @buffer[bucket] << obj.to_s
-
-      if @buffer[bucket].length > @buffer_max
-        # write out
-        File.open(bucket, 'a+') do |out|
-          @buffer[bucket].each do |v|
-            out.puts v
-          end
-        end
-
-        # clear that bucket
-        @buffer[bucket].clear
-      end
-    end
-
-    def [] k
-      @buffer[k]
-    end
-
-    #
-    # Writes out leftover objects
-    #
-    def finalize
-      @buffer.each_key do |bucket|
-        File.open(bucket, 'a+') do |out|
-          @buffer[bucket].each do |v|
-            out.puts v
-          end
-        end
-      end
-      @buffer = Hash.new { |h, k| h[k] = Array.new }
-    end
-
-  end
-
-end
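buffer.rb goes away along with the `split` task that consumed it. What it did: batch appends across many output files, so a read-splitting pass does not pay one open/close per record. A usage sketch against the API shown above (the `reads` collection and `cluster_id` field are hypothetical record details, not lederhosen's):

    buffer = Lederhosen::Buffer.new :buffer_max => 10_000

    reads.each do |read|
      # add_to appends to an in-memory array keyed by output path and
      # writes the whole batch out (append mode) once it grows past
      # :buffer_max lines
      buffer.add_to "clusters/#{read.cluster_id}.fasta", read
    end

    buffer.finalize  # flush whatever is still buffered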
data/lib/lederhosen/helpers.rb
DELETED
@@ -1,166 +0,0 @@
-module Lederhosen
-  class Helpers
-    class << self
-
-      # reverse complement a DNA sequence
-      # assumes only GATCN nucleotides
-      def reverse_complement(s)
-        s.reverse.tr('GATCNgatcn','CTAGNctagn')
-      end
-
-      # Function for grouping qseq files produced by splitting illumina
-      # reads by barcode
-      #
-      # Filenames should look like this:
-      # IL5_L_1_B_007_1.txt
-      def get_grouped_qseq_files(glob='raw_reads/*.txt')
-        Dir.glob(glob).group_by { |x| File.basename(x).split('_')[0..4].join('_') }
-      end
-
-      # Trim a pair of QSEQ files. Saves to a single,
-      # interleaved .fasta file
-      def trim_pairs(left, right, out, args={})
-        cutoff = args[:cutoff] || 20
-        min_length = args[:min_length] || 70
-
-        left_handle, right_handle =
-          begin
-            [ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
-          rescue Zlib::GzipFile::Error
-            [ File.open(left), File.open(right) ]
-          end
-
-        out_handle = File.open out, 'w'
-
-        left_reads = Dna.new left_handle
-        right_reads = Dna.new right_handle
-
-        i = 0
-        left_reads.zip(right_reads).each do |a, b|
-          i += 1
-          seqa = trim a
-          seqb = trim b
-          unless [seqa, seqb].include? nil
-            if seqb.length >= min_length && seqa.length >= min_length
-              seqb = reverse_complement(seqb)
-              out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
-            end
-          end
-        end
-        left_handle.close
-        right_handle.close
-        out_handle.close
-      end
-
-      # Return longest subsequence with quality scores
-      # greater than min. (Illumina PHRED)
-      # Trim2 from Huang, et. al
-      # returns just the sequence
-      def trim(dna, args={})
-
-        # trim primers off of sequence
-        # (THIS IS EXPERIMENT-SPECIFIC)
-        dna.sequence = dna.sequence[11..-1]
-        dna.quality = dna.quality[11..-1]
-
-        # throw away any read with an ambiguous primer
-        return nil if dna.sequence =~ /N/
-
-        min = args[:min] || 20
-        offset = args[:cutoff] || 64
-
-        _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
-
-        dna.quality.each_byte.each_with_index do |b, a|
-          _sum += (b - offset - min)
-          if _sum > _max
-            _max = _sum
-            _end = a
-            start = first
-          elsif _sum < 0
-            _sum = 0
-            first = a
-          end
-        end
-        dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
-      end
-
-      # Load uc file from uclust
-      # returns hash with various data
-      def load_uc_file(input)
-        clusters = Hash.new
-
-        # keep track of samples
-        samples = Set.new
-
-        # store a list of all the sample IDs
-        clusters[:samples] = Set.new
-
-        # data for each cluster
-        # clstr_counts[:clstr][:sample] = number_of_reads
-        clstr_counts = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
-
-        # clstrnr_to_seed[seed_sequence_id] = clstr_nr
-        seed_to_clstrnr = Hash.new
-        bytes = File.size(input)
-        pbar = ProgressBar.new 'loading uc file', bytes
-        File.open(input) do |handle|
-          handle.each do |line|
-            pbar.set handle.pos
-            next if line =~ /^#/ # skip comments
-
-            line = line.strip.split
-
-            # things we want to know
-            type = line[0]
-            clusternr = line[1].to_i
-            querylabel = line[8]
-            targetlabel = line[9]
-            header = line[8]
-
-            sample =
-              begin
-                # get the sample id via regexp match
-                # this way more info can be stored in the header.
-                line[8].match(/sample=(.*)/)[1]
-              rescue NoMethodError # catch no method [] for NilClass
-                # Need to maintain some backwards compatibility here
-                # this is the old way of getting the same id.
-                sample = line[8].split(':')[2]
-              end
-
-            # keep track of samples
-            samples.add(sample)
-
-            # keep track of all samples
-            clusters[:samples].add sample
-
-            # L=LibSeed
-            # S=NewSeed
-            # H=Hit
-            # R=Reject
-            # D=LibCluster
-            # C=NewCluster
-            # N=NoHit
-
-            if type =~ /[LS]/ # = Seed Sequence
-              clstr_counts[clusternr][sample] += 1
-              seed_to_clstrnr[querylabel] = clusternr
-            elsif type =~ /H/ # = Seed Member
-              clstr_counts[clusternr][sample] += 1
-            end
-
-          end
-        end
-        pbar.finish
-        return {
-          :clstr_counts => clstr_counts,
-          :seed_to_clstrnr => seed_to_clstrnr,
-          :samples => samples
-        }
-      end
-
-
-    end # class << self
-  end # class Helpers
-end # Module
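The removed `load_uc_file` is the UC-format reader the filtering tasks relied on. A sketch of its per-line bookkeeping on a single fabricated UC record (tab-separated; field 0 is the hit type, field 1 the cluster number, field 8 the query label):

    line = "H\t42\t140\t97.1\t+\t0\t0\t288M\tread_1:4:sampleA sample=A1\tseed_7"
    fields = line.split("\t")
    type, clusternr, label = fields[0], fields[1].to_i, fields[8]

    # the sample id rides along in the read header, either as 'sample=<id>'
    # or, in older headers, as the third ':'-separated field
    sample = label[/sample=(.*)/, 1] || label.split(':')[2]

    clstr_counts = Hash.new { |h, k| h[k] = Hash.new(0) }
    clstr_counts[clusternr][sample] += 1 if type =~ /[LSH]/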
data/lib/lederhosen/tasks/k_filter.rb
DELETED
@@ -1,82 +0,0 @@
-##
-# FILTER READS WITH LOW ABUNDANCE KMERS
-#
-
-module Lederhosen
-  class CLI
-
-    desc "k_filter",
-         "filter novel reads likely to form small/singleton clusters (experimental)"
-
-    method_option :input, :type => :string, :required => true
-    method_option :output, :type => :string, :required => true
-    method_option :k, :type => :numeric, :required => true
-    method_option :cutoff, :type => :numeric, :required => true
-
-    def k_filter
-      input = options[:input]
-      output = options[:output]
-      k_len = options[:k].to_i
-      cutoff = options[:cutoff]
-
-      ohai "kmer filtering #{input} (k = #{k_len}, cutoff = #{cutoff})"
-
-      counting_table = Hash.new { |h, k| h[k] = 0 }
-      total_reads = 0
-
-      File.open(input) do |handle|
-        pbar = ProgressBar.new 'counting', File.size(input)
-        records = Dna.new handle
-        records.each do |r|
-          pbar.set handle.pos
-          total_reads += 1
-          kmers = r.sequence.to_kmers(k_len)
-          kmers.each { |x| counting_table[x] += 1 }
-        end
-        pbar.finish
-      end
-
-      sum_of_kmers = counting_table.values.inject(:+)
-
-      ohai "total reads = #{total_reads}"
-      ohai "sum of kmers = #{sum_of_kmers}"
-
-      kept = 0
-      total_reads = total_reads.to_f
-
-      pbar = ProgressBar.new "saving", total_reads.to_i
-      output = File.open(output, 'w')
-      File.open(input) do |handle|
-        records = Dna.new handle
-        records.each do |r|
-          kmers = r.sequence.to_kmers(k_len)
-
-          # check if any of the kmers are rare
-          keep = true
-          coverage = 0
-          kmers.each do |kmer|
-            # if any of the kmers are rare, don't print the read
-            c = counting_table[kmer]
-            coverage += c
-            if c < cutoff
-              keep = false
-              break
-            end
-          end
-
-          if keep
-            kept += 1
-            output.puts r
-          end
-          pbar.inc
-        end
-      end
-
-      pbar.finish
-
-      ohai "survivors = #{kept} (#{kept/total_reads.to_f})"
-      output.close
-    end
-  end
-
-end
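k_filter counts every k-mer in the input, then drops any read containing a k-mer below the cutoff. `to_kmers` itself is defined elsewhere in lederhosen; a minimal equivalent for illustration (a sliding window of length k, not the gem's actual implementation):

    class String
      def to_kmers(k)
        # consecutive windows of k characters, as consumed by the
        # counting pass above
        each_char.each_cons(k).map(&:join)
      end
    end

    'GATTACA'.to_kmers(4)  # => ["GATT", "ATTA", "TTAC", "TACA"]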
data/lib/lederhosen/tasks/rep_reads.rb
DELETED
@@ -1,45 +0,0 @@
-##
-# GET REPRESENTATIVE READS
-#
-
-module Lederhosen
-  class CLI
-
-    desc "rep_reads",
-         "output a fasta file containing representative reads for each cluster given a UCLUST output file and the joined reads file"
-
-    method_option :clusters, :type => :string, :required => true
-    method_option :output, :type => :string, :required => true
-    method_option :joined, :type => :string, :required => true
-
-    def rep_reads
-      input = options[:clusters]
-      output = options[:output]
-      joined_reads = options[:joined]
-
-      ohai "getting represntative reads for #{input} w/ reads #{joined_reads} and saving to #{output}"
-
-      # Load cluster table!
-      clstr_info = Helpers.load_uc_file input
-      clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
-      seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
-      samples = clstr_info[:samples]
-
-      out_handle = File.open("#{output}", 'w')
-
-      File.open(joined_reads) do |handle|
-        records = Dna.new handle
-        records.each do |dna|
-          clstrnr = seed_to_clstrnr[dna.name]
-          unless clstrnr.nil?
-            dna.name = "#{dna.name}:cluster-#{clstrnr}"
-            out_handle.puts dna
-          end
-        end
-      end
-
-      out_handle.close
-    end
-
-  end
-end
data/lib/lederhosen/tasks/split.rb
DELETED
@@ -1,84 +0,0 @@
-##
-# Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
-#
-
-module Lederhosen
-  class CLI
-
-    desc "split",
-         "create fasta files containing reads from each cluster"
-
-    method_option :clusters, :type => :string, :required => true
-    method_option :reads, :type => :string, :required => true
-    method_option :out_dir, :type => :string, :required => true
-    method_option :buffer_size, :type => :numeric, :default => 1000
-    method_option :min_clst_size, :type => :numeric, :default => 1
-
-    def split
-      clusters = options[:clusters]
-      reads = options[:reads]
-      out_dir = options[:out_dir]
-      buffer_size = options[:buffer_size]
-      min_clst_size = options[:min_clst_size]
-      finalize_every = 100_000
-
-      ohai "spltting #{reads} by #{clusters} and saving to #{out_dir}"
-      ohai "minimum cluster size = #{min_clst_size}"
-
-      run "mkdir -p #{out_dir}/"
-
-      ohai "loading #{clusters}"
-
-      # Load read id -> cluster
-      read_to_clusterid = Hash.new
-
-      # keep track of cluster sizes
-      cluster_counts = Hash.new { |h, k| h[k] = 0}
-
-      File.open(clusters)do |handle|
-        handle.each do |line|
-          line = line.strip.split
-          cluster_nr = line[1]
-          if line[0] == 'S' || line[0] == 'H'
-            read = line[8]
-          else
-            next
-          end
-          read_to_clusterid[read] = cluster_nr
-          cluster_counts[cluster_nr] += 1
-        end
-      end
-
-      read_to_clusterid.delete_if do |read, cluster_nr|
-        cluster_counts[cluster_nr] < min_clst_size
-      end
-
-      total_reads = read_to_clusterid.length
-      total_clusters = read_to_clusterid.values.uniq.length
-      ohai "#{total_reads} reads in #{total_clusters} clusters"
-
-      pbar = ProgressBar.new "saving", total_reads
-
-      # Write reads to individual fasta files using Buffer
-      buffer = Buffer.new :buffer_max => buffer_size
-      File.open(reads) do |handle|
-        records = Dna.new handle
-        records.each_with_index do |record, i|
-          cluster_id = read_to_clusterid[record.name]
-          if cluster_id
-            pbar.inc
-            filename = File.join(out_dir, cluster_id + '.fasta')
-            buffer[filename] << record
-            buffer.finalize if (i%finalize_every == 0)
-          end
-        end
-      end
-
-      pbar.finish
-      ohai "finalizing output"
-      buffer.finalize # finish writing out
-
-      puts "done"
-    end
-  end
-end
data/lib/lederhosen/tasks/uc_filter.rb
DELETED
@@ -1,80 +0,0 @@
-##
-# FILTER UC FILE BY MIN SAMPLES
-#
-require 'set'
-
-module Lederhosen
-  class CLI
-
-    desc "uc_filter",
-         "filter UCLUST output to remove small, infrequent clusters"
-
-    method_option :input, :type => :string, :required => true
-    method_option :output, :type => :string, :required => true
-    method_option :reads, :type => :numeric, :required => true
-    method_option :samples, :type => :numeric, :required => true
-
-    def uc_filter
-      input = options[:input]
-      output = options[:output]
-      reads = options[:reads].to_i
-      samples = options[:samples].to_i
-
-      ohai "filtering #{input} to #{output}, reads = #{reads} & samples = #{samples}"
-
-      # load UC file
-      ohai "loading uc file"
-      clstr_info = Helpers.load_uc_file input
-      clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
-
-      # filter
-      ohai "filtering"
-      survivors = clstr_counts.reject do |a, b|
-        b.reject{ |i, j| j < reads }.length < samples
-      end
-
-      surviving_clusters = survivors.keys.to_set
-
-      # print filtered uc file
-      ohai "saving filtered table"
-      out = File.open(output, 'w')
-
-      lines = `wc -l #{input}`.split.first.to_i
-
-      pbar = ProgressBar.new 'saving', lines
-      kept, total = 1, 0
-
-      # output lederhosen filtering information because I often
-      # forget to write this down :)
-      out.puts "# filtered: #{input}"
-      out.puts "# #{reads} reads in at least #{samples} samples"
-
-      File.open(input) do |handle|
-        pbar = ProgressBar.new 'saving', File.size(input)
-        handle.each do |line|
-
-          pbar.set handle.pos
-          if line =~ /^#/
-            out.print line
-            next
-          end
-          total += 1
-
-          # check if cluster is in surviving clusters
-          if surviving_clusters.include? line.split[1].to_i
-            out.print line
-            kept += 1
-          end
-
-        end
-        pbar.finish
-      end
-
-      out.close
-
-      ohai "clusters: #{surviving_clusters.length}/#{clstr_counts.keys.length} = #{100*surviving_clusters.length/clstr_counts.keys.length.to_f}%"
-      ohai "reads: #{kept}/#{total} = #{100*kept/total.to_f}%"
-    end
-  end
-
-end
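The survivor test in `uc_filter` reads inside-out: a cluster survives when at least `samples` samples each contribute at least `reads` reads. The same double `reject` on a toy counts hash (reads = 5, samples = 2; the data is made up for illustration):

    clstr_counts = {
      1 => { 'A' => 9, 'B' => 6, 'C' => 1 },  # two samples with >= 5 reads
      2 => { 'A' => 9 },                      # only one qualifying sample
    }

    survivors = clstr_counts.reject do |_clstr, by_sample|
      by_sample.reject { |_sample, n| n < 5 }.length < 2
    end

    survivors.keys  # => [1]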
data/lib/lederhosen/tasks/uc_stats.rb
DELETED
@@ -1,41 +0,0 @@
-##
-# Get statistics about clusters in a UC file
-#
-
-module Lederhosen
-  class CLI
-    desc 'uc_stats',
-         'get statistics about clusters in a UC file. for now, this only calculates the size of each cluster'
-
-    method_option :input, :type => :string, :required => true
-
-    def uc_stats
-      input = options[:input]
-
-      ohai "calculating statistics for #{input}"
-
-      # TODO add more stats
-      cluster_stats = Hash.new { |h, k|
-        h[k] = {
-          :size => 0
-        }
-      }
-
-      File.open(input) do |handle|
-        handle.each do |line|
-          line = line.strip.split
-          type, clustr_nr = line[0], line[1]
-          cluster_stats[clustr_nr][:size] += 1
-        end
-      end
-
-      stat_types = cluster_stats.values.first.keys.sort
-
-      puts "cluster,#{stat_types.join(',')}"
-      cluster_stats.each do |cluster, stats|
-        puts "#{cluster},#{stat_types.map { |x| stats[x] }.join(',')}"
-      end
-    end
-
-  end
-end
data/spec/helpers_spec.rb
DELETED
@@ -1,30 +0,0 @@
-require 'spec_helper'
-
-describe Lederhosen::Helpers do
-
-  let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt.gz') }
-
-  it 'should have a method for grouping QSEQ files' do
-    groups.length.should == 2
-  end
-
-  it 'should have a method for reverse complementing a dna sequence' do
-    Lederhosen::Helpers.reverse_complement("GATCCCGANNANTAGGACCAA").should == "TTGGTCCTANTNNTCGGGATC"
-  end
-
-  it 'should have a method for trimming sequences' do
-    reads = groups.values.first.first
-    record = Zlib::GzipReader.open(reads) do |handle|
-      Dna.new(handle).first
-    end
-    # I should probably test with a bad read
-    Lederhosen::Helpers.trim(record).length.should == 58
-  end
-
-  it 'should be able to trim pairs of qseq files, outputting fasta file' do
-    reads = groups.values.first
-    Lederhosen::Helpers.trim_pairs reads[0], reads[1], "#{$test_dir}/munchen_trim_test.fasta"
-    # this test will break if trim parameters change
-    File.readlines("#{$test_dir}/munchen_trim_test.fasta").grep(/^>/).length.should be_even
-  end
-end