lederhosen 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -1
- data/bin/lederhosen +1 -1
- data/examples/hierarchical_clustering.sh +51 -0
- data/examples/pipeline.sh +1 -1
- data/lederhosen.gemspec +1 -1
- data/lib/lederhosen/buffer.rb +2 -2
- data/lib/lederhosen/helpers.rb +10 -5
- data/lib/lederhosen/tasks/add_names.rb +80 -80
- data/lib/lederhosen/tasks/cluster.rb +1 -1
- data/lib/lederhosen/tasks/name.rb +2 -2
- data/lib/lederhosen/tasks/otu_filter.rb +43 -43
- data/lib/lederhosen/tasks/otu_table.rb +13 -13
- data/lib/lederhosen/tasks/rep_reads.rb +2 -2
- data/lib/lederhosen/tasks/squish.rb +42 -42
- data/lib/lederhosen.rb +2 -1
- data/lib/version.rb +1 -1
- data/readme.md +7 -3
- data/spec/data/ILT_L_9_B_001_1.txt.gz +0 -0
- data/spec/data/ILT_L_9_B_001_3.txt.gz +0 -0
- data/spec/data/ILT_L_9_B_002_1.txt.gz +0 -0
- data/spec/data/ILT_L_9_B_002_3.txt.gz +0 -0
- data/spec/helpers_spec.rb +7 -7
- data/spec/misc_spec.rb +1 -1
- data/spec/pipeline_spec.rb +18 -22
- data/spec/spec_helper.rb +5 -1
- metadata +13 -16
- data/spec/data/ILT_L_9_B_001_1.txt +0 -400
- data/spec/data/ILT_L_9_B_001_3.txt +0 -400
- data/spec/data/ILT_L_9_B_002_1.txt +0 -400
- data/spec/data/ILT_L_9_B_002_3.txt +0 -400
- data/spec/data/blast_out.txt +0 -10
- data/spec/data/blat.txt +0 -86
data/Gemfile
CHANGED
data/bin/lederhosen
CHANGED
#!/bin/bash

# Hierarchical OTU clustering
# Austin G. Davis-Richardson
# <harekrishna at gmail dot com>
# http://github.com/audy/lederhosen

set -e
set -x

reads='sorted.fasta'
out='h_clustering'

mkdir -p $out

# initial clustering at 80%
lederhosen cluster --input=$reads --output=$out/clusters_0.80.uc --identity=0.80

# filter UC file
lederhosen uc_filter --input=$out/clusters_0.80.uc --output=$out/clusters_0.80.uc.filtered --reads=1 --samples=1

# get reads for each cluster
mkdir -p $out/split_80
lederhosen split --clusters=$out/clusters_0.80.uc.filtered --reads=$reads --out-dir=$out/split_80/

# now cluster each of those at 90%
for fasta in $out/split_80/*.fasta
do

    # sort (awww, do I really have to do this again?)
    lederhosen sort --input=$fasta --output=$fasta.sorted

    # cluster
    lederhosen cluster --input=$fasta.sorted --output=$fasta.uc --identity=0.90

    # split
    split=$out/split_80.90_$(basename $fasta .fasta)
    lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split
done

# Do it again at 95%
# fixed: glob now matches the split_80.90_* directories created above
# (the old pattern split_80/split_*_90.fasta/*.fasta matched nothing)
for fasta in $out/split_80.90_*/*.fasta
do
    # sort (uclust requires length-sorted input before clustering)
    lederhosen sort --input=$fasta --output=$fasta.sorted

    # cluster
    # fixed: identity was "90" (wrong scale) — identities are fractions,
    # and this pass is the 95% level per the comment above
    lederhosen cluster --input=$fasta.sorted --output=$fasta.uc --identity=0.95

    # split
    # fixed: used undefined $outdir and $input; use $out and $fasta
    split=$out/split_80.90.95_$(basename $fasta .fasta)
    mkdir -p $split
    lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split
done
data/examples/pipeline.sh
CHANGED
data/lederhosen.gemspec
CHANGED
data/lib/lederhosen/buffer.rb
CHANGED
data/lib/lederhosen/helpers.rb
CHANGED
@@ -17,8 +17,13 @@ module Lederhosen
|
|
17
17
|
cutoff = args[:cutoff] || 20
|
18
18
|
min_length = args[:min_length] || 70
|
19
19
|
|
20
|
-
left_handle
|
21
|
-
|
20
|
+
left_handle, right_handle =
|
21
|
+
begin
|
22
|
+
[ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
|
23
|
+
rescue Zlib::GzipFile::Error
|
24
|
+
[ File.open(left), File.open(right) ]
|
25
|
+
end
|
26
|
+
|
22
27
|
out_handle = File.open out, 'w'
|
23
28
|
|
24
29
|
left_reads = Dna.new left_handle
|
@@ -57,9 +62,9 @@ module Lederhosen
|
|
57
62
|
min = args[:min] || 20
|
58
63
|
offset = args[:cutoff] || 64
|
59
64
|
|
60
|
-
|
61
|
-
|
62
|
-
|
65
|
+
_sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
|
66
|
+
|
67
|
+
dna.quality.each_byte.each_with_index do |b, a|
|
63
68
|
_sum += (b - offset - min)
|
64
69
|
if _sum > _max
|
65
70
|
_max = _sum
|
module Lederhosen

  class CLI

    desc "add_names",
         "add names to otu abundance matrix using blat output"

    method_option :blat,   :type => :string, :required => true
    method_option :table,  :type => :string, :required => true
    method_option :level,  :type => :string, :required => true
    method_option :output, :type => :string, :required => false

    # Replace cluster IDs in the header of an OTU abundance table with
    # taxonomic names at the requested level, looked up from BLAT output.
    # Writes to --output if given, otherwise to stdout.
    def add_names
      blat   = options[:blat]
      table  = options[:table]
      level  = options[:level]
      output = options[:output] || $stdout

      # Corresponds with the numbers used in the TaxCollector database
      # taxonomic descriptions.
      levels = { 'kingdom' => 0,
                 'domain'  => 0,
                 'phylum'  => 1,
                 'class'   => 2,
                 'order'   => 3,
                 'family'  => 4,
                 'genus'   => 5,
                 'species' => 6 }

      fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level

      level_no = levels[level]

      # map cluster_id to taxonomic description
      # default is the cluster_id itself in case
      # the cluster was not classified.
      clusterid_to_name = Hash.new { |h, k| h[k] = k }

      # map clusterid to name using blat output
      ohai "loading BLAT output from #{blat}"
      File.open(blat) do |handle|
        handle.each do |line|
          line = line.strip.split

          # Only get first match
          # TODO something smarter here
          cluster_id = line[0].split(':')[3]
          next if clusterid_to_name.include? cluster_id

          taxonomic_description = line[1]

          # match by level_no
          # Example:
          # [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
          # I want to match Actinobacteria given level_no = 2
          level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1] rescue next

          clusterid_to_name[cluster_id] = level_name
        end
      end

      # load table, replace cluster names with taxonomic descriptions
      output = File.open(output, 'w') unless output == $stdout
      ohai "replacing names in #{table}"
      File.open(table) do |handle|

        # read in header, replace clusterids to names
        header = handle.gets.strip.split(',')
        header[1..-1] = header[1..-1].map { |x| clusterid_to_name[x] }

        # print new header
        output.puts header.join(',')

        # print rest of table
        handle.each { |l| output.print l }
      end

      # fixed: close the output file when one was opened (previously the
      # handle leaked and buffered output could be lost on exit)
      output.close unless output == $stdout

      # print status message
      ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
    end

  end
end
|
module Lederhosen
  class CLI

    desc 'otu_filter', 'works like uc_filter but uses an OTU table as input'

    method_option :input,   :type => :string,  :required => true
    method_option :output,  :type => :string,  :required => true
    method_option :reads,   :type => :numeric, :required => true
    method_option :samples, :type => :numeric, :required => true

    # Filter an OTU table, keeping only clusters that have at least
    # --reads reads in more than --samples samples.
    def otu_filter
      input   = options[:input]
      output  = options[:output]
      reads   = options[:reads]
      samples = options[:samples]

      ##
      # Iterate over otu table line by line.
      # Only print if cluster meets criteria
      #
      kept = 0
      File.open(input) do |handle|
        header = handle.gets.strip
        header = header.split(',')
        # fixed: this used to reassign `samples`, clobbering the numeric
        # --samples threshold with the header's sample-name array and
        # breaking the Integer comparison below
        sample_names = header[1..-1]

        puts header.join(',')

        handle.each do |line|
          line = line.strip.split(',')
          cluster_no = line[0]
          counts = line[1..-1].collect { |x| x.to_i }

          # should be the same as uc_filter
          if counts.reject { |x| x < reads }.length > samples
            puts line.join(',')
            kept += 1
          end
        end
      end
      ohai "kept #{kept} clusters."
    end

  end
end
|
@@ -17,32 +17,32 @@ module Lederhosen
|
|
17
17
|
input = options[:clusters]
|
18
18
|
output = options[:output]
|
19
19
|
joined_reads = options[:joined]
|
20
|
-
|
20
|
+
|
21
21
|
# Load cluster table
|
22
22
|
|
23
|
-
clstr_info = Helpers.load_uc_file input
|
23
|
+
clstr_info = Helpers.load_uc_file input
|
24
24
|
clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
|
25
25
|
clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
|
26
26
|
samples = clstr_info[:samples]
|
27
27
|
|
28
28
|
# print OTU abundance matrix
|
29
|
-
|
30
|
-
|
29
|
+
# clusters as columns
|
30
|
+
# samples as rows
|
31
31
|
|
32
32
|
File.open("#{output}.csv", 'w') do |h|
|
33
33
|
samples = samples.sort
|
34
34
|
clusters = clstr_counts.keys
|
35
35
|
|
36
36
|
# print header (cluster names)
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
37
|
+
h.puts '-' + SEP + clusters.map { |x| "cluster-#{x}" }.join(SEP)
|
38
|
+
|
39
|
+
samples.each do |sample|
|
40
|
+
h.print sample
|
41
|
+
clusters.each do |cluster|
|
42
|
+
h.print "#{SEP}#{clstr_counts[cluster][sample]}"
|
43
|
+
end
|
44
|
+
h.print "\n"
|
45
|
+
end
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
@@ -19,7 +19,7 @@ module Lederhosen
|
|
19
19
|
|
20
20
|
|
21
21
|
# Load cluster table!
|
22
|
-
clstr_info = Helpers.load_uc_file input
|
22
|
+
clstr_info = Helpers.load_uc_file input
|
23
23
|
clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
|
24
24
|
seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
|
25
25
|
samples = clstr_info[:samples]
|
@@ -36,7 +36,7 @@ module Lederhosen
|
|
36
36
|
end
|
37
37
|
end
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
40
|
out_handle.close
|
41
41
|
end
|
42
42
|
|
module Lederhosen
  class CLI

    desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'

    method_option :csv_file, :type => :string, :required => true
    method_option :output,   :type => :string, :required => false

    # Collapse columns of a CSV abundance table that share the same
    # column name, summing their read counts per sample. Writes to
    # --output if given, otherwise to stdout.
    def squish
      csv_file = options[:csv_file]
      output   = options[:output] || $stdout

      # sample_name -> column name -> total number of reads
      total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = 0 } }
      column_names = '' # scope
      # Load CSV file, merge counts in columns with the same name
      File.open(csv_file) do |handle|
        column_names = handle.gets.strip.split(',')[1..-1]
        handle.each do |line|
          line = line.strip.split(',')
          sample = line[0]
          line[1..-1].zip(column_names) do |reads, column_name|
            total_by_sample_by_column[sample][column_name] += reads.to_i
          end
        end
      end

      # fixed: was `File.open(output, 'w') rescue $stdout`, which silently
      # swallowed real open errors; only open when a filename was given
      output = File.open(output, 'w') unless output == $stdout

      # print the new, squished csv file
      # fixed: `column_names.uniq!.sort!` raises NoMethodError when the
      # header has no duplicate names, because uniq! returns nil then
      column_names = column_names.uniq.sort
      output.puts "-,#{column_names.join(',')}"
      total_by_sample_by_column.each_pair do |sample_id, row|
        output.print "#{sample_id}"
        column_names.each do |column_name|
          output.print ",#{row[column_name]}"
        end
        output.print "\n"
      end

      # fixed: don't close $stdout when no --output file was given
      output.close unless output == $stdout
    end
  end
end
|
data/lib/lederhosen.rb
CHANGED
@@ -4,6 +4,7 @@ require 'dna'
|
|
4
4
|
require 'set'
|
5
5
|
require 'progressbar'
|
6
6
|
require 'awesome_print'
|
7
|
+
require 'zlib'
|
7
8
|
|
8
9
|
Dir.glob(File.join(File.dirname(__FILE__), 'lederhosen', '*.rb')).each { |f| require f }
|
9
10
|
|
@@ -13,4 +14,4 @@ class String
|
|
13
14
|
k -= 1
|
14
15
|
(0..(self.length-k-1)).collect { |i| self[i..i+k] }
|
15
16
|
end
|
16
|
-
end
|
17
|
+
end
|
data/lib/version.rb
CHANGED
data/readme.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Lederhosen
|
2
2
|
|
3
|
-
Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
|
3
|
+
Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
|
4
4
|
|
5
5
|
## How do I get Lederhosen?
|
6
6
|
|
@@ -33,9 +33,9 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
|
|
33
33
|
|
34
34
|
### trim
|
35
35
|
|
36
|
-
Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
|
36
|
+
Trim (Illumina) reads using quality scores. Output will be a directory of fasta files. Reads can optionally be gzipped.
|
37
37
|
|
38
|
-
lederhosen trim --reads_dir=reads
|
38
|
+
lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
|
39
39
|
|
40
40
|
### join
|
41
41
|
|
@@ -43,6 +43,10 @@ Join paired reads from all samples end-to-end. This method enables the use of uc
|
|
43
43
|
|
44
44
|
lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
|
45
45
|
|
46
|
+
If your reads are not paired, then you do not need to do this step. Instead, concatenate all of the trimmed reads files.
|
47
|
+
|
48
|
+
cat trimmed/*.fasta > joined.fasta
|
49
|
+
|
46
50
|
### sort
|
47
51
|
|
48
52
|
Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithim.
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/spec/helpers_spec.rb
CHANGED
@@ -1,26 +1,26 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Lederhosen::Helpers do
|
4
|
-
|
5
|
-
let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt') }
|
6
|
-
|
4
|
+
|
5
|
+
let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt.gz') }
|
6
|
+
|
7
7
|
it 'should have a method for grouping QSEQ files' do
|
8
8
|
groups.length.should == 2
|
9
9
|
end
|
10
10
|
|
11
11
|
it 'should have a method for trimming sequences' do
|
12
12
|
reads = groups.values.first.first
|
13
|
-
record =
|
13
|
+
record = Zlib::GzipReader.open(reads) do |handle|
|
14
14
|
Dna.new(handle).first
|
15
15
|
end
|
16
16
|
# I should probably test with a bad read
|
17
|
-
Lederhosen::Helpers.trim(record).length.should ==
|
17
|
+
Lederhosen::Helpers.trim(record).length.should == 58
|
18
18
|
end
|
19
19
|
|
20
20
|
it 'should be able to trim pairs of qseq files, outputting fasta file' do
|
21
21
|
reads = groups.values.first
|
22
|
-
Lederhosen::Helpers.trim_pairs reads[0], reads[1],
|
22
|
+
Lederhosen::Helpers.trim_pairs reads[0], reads[1], "#{$test_dir}/munchen_trim_test.fasta"
|
23
23
|
# this test will break if trim parameters change
|
24
|
-
File.read(
|
24
|
+
File.read("#{$test_dir}/munchen_trim_test.fasta").grep(/^>/).length.should be_even
|
25
25
|
end
|
26
26
|
end
|
data/spec/misc_spec.rb
CHANGED