lederhosen 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/{pipeline.sh → examples/pipeline.sh} +10 -10
- data/lib/lederhosen/tasks/add_names.rb +86 -0
- data/lib/lederhosen/tasks/cluster.rb +1 -1
- data/lib/lederhosen/tasks/hierarchical.rb +40 -0
- data/lib/lederhosen/tasks/join.rb +1 -1
- data/lib/lederhosen/tasks/k_filter.rb +1 -1
- data/lib/lederhosen/tasks/name.rb +2 -2
- data/lib/lederhosen/tasks/otu_table.rb +1 -1
- data/lib/lederhosen/tasks/rep_reads.rb +1 -1
- data/lib/lederhosen/tasks/sort.rb +1 -1
- data/lib/lederhosen/tasks/split.rb +3 -3
- data/lib/lederhosen/tasks/squish.rb +48 -0
- data/lib/lederhosen/tasks/trim.rb +1 -1
- data/lib/lederhosen/tasks/uc_filter.rb +1 -1
- data/lib/version.rb +1 -1
- data/readme.md +83 -7
- metadata +8 -5
data/{pipeline.sh → examples/pipeline.sh} CHANGED

@@ -1,4 +1,4 @@
-#!/
+#!/bash
 
 # An example OTU clustering pipeline
 # Austin G. Davis-Richardson

@@ -14,55 +14,55 @@ min_reads=50
 min_samples=10
 
 # trim reads
-
+lederhosen trim \
 --reads-dir=$raw_reads \
 --out-dir=$out_dir/trimmed
 
 # join reads
-
+lederhosen join \
 --trimmed=$out_dir/trimmed/*.fasta \
 --output=$out_dir/joined.fasta
 
 # filter reads
-
+lederhosen k_filter \
 --input=$out_dir/joined.fasta \
 --output=$out_dir/filtered.fasta \
 -k=10 \
 --cutoff=50
 
 # sort
-
+lederhosen sort \
 --input=$out_dir/filtered.fasta \
 --output=$out_dir/sorted.fasta
 
 for i in 0.80 0.90 0.95
 do
 # cluster
-
+lederhosen cluster \
 --input=$out_dir/sorted.fasta \
 --output=$out_dir/clusters_"$i".uc \
 --identity=$i
 
 # filter uc file
-
+lederhosen uc_filter \
 --input=$out_dir/clusters_"$i".uc \
 --output=$out_dir/clusters_"$i".uc.filtered \
 --reads=$min_reads \
 --samples=$min_samples \
 
 # generate otu table
-
+lederhosen otu_table \
 --clusters=$out_dir/clusters_"$i".uc.filtered \
 --output=$out_dir/otus_"$i"
 
 # get representative reads
-
+lederhosen rep_reads \
 --clusters=$out_dir/clusters_"$i".uc.filtered \
 --joined=$out_dir/sorted.fasta \
 --output=$out_dir/representatives_"$i".fasta
 
 # blast representative reads
-
+lederhosen name \
 --reps=$out_dir/representatives_"$i".fasta \
 --output=$out_dir/taxonomies_"$i".txt \
 --database=$taxcollector
data/lib/lederhosen/tasks/add_names.rb ADDED

@@ -0,0 +1,86 @@
+##
+# ADD TAXONOMIC DESCRIPTIONS TO OTU TABLE
+#
+
+module Lederhosen
+  class CLI
+
+    desc "add_names",
+         "--blat=blat_output.txt --table=cluster_table.csv --level=taxonomic level (i.e 6 genus)"
+
+    method_option :blat, :type => :string, :required => true
+    method_option :table, :type => :string, :required => true
+    method_option :level, :type => :string, :required => true
+    method_option :output, :type => :string, :required => false
+
+    def add_names
+      blat = options[:blat]
+      table = options[:table]
+      level = options[:level]
+      output = options[:output] || $stdout
+
+      levels = { 'kingdom' => 0,
+                 'domain' => 0,
+                 'phylum' => 1,
+                 'class' => 2,
+                 'order' => 3,
+                 'family' => 4,
+                 'genus' => 5,
+                 'species' => 6 }
+
+      fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level
+
+      # Corresponds with the numbers used in the TaxCollector database
+      # taxonomic descriptions
+      level_no = levels[level]
+
+      # map cluster_id to taxonomic description
+      # default is the cluster_id itself in case
+      # the cluster was not classified.
+      clusterid_to_name = Hash.new { |h, k| h[k] = k }
+
+      # map clusterid to name using blat output
+      ohai "loading BLAT output from #{blat}"
+      File.open(blat) do |handle|
+        handle.each do |line|
+          line = line.strip.split
+
+          # Only get first match
+          # TODO something smarter here
+          cluster_id = line[0].split(':')[3]
+          next if clusterid_to_name.include? cluster_id
+
+          taxonomic_description = line[1]
+
+          # match by level_no
+          # Example:
+          # [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
+          # I want to match Actinobacteria given level_no = 2
+          level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1] rescue next
+
+          clusterid_to_name[cluster_id] = level_name
+        end
+      end
+
+      # load table, replace cluster names with taxonomic descriptions
+      output = File.open(output, 'w') unless output == $stdout
+      ohai "replacing names in #{table}"
+      File.open(table) do |handle|
+
+        # read in header, replace clusterids to names
+        header = handle.gets.strip.split(',')
+        header[1..-1] = header[1..-1].map { |x| clusterid_to_name[x] }
+
+        # print new header
+        output.puts header.join(',')
+
+        # print rest of table
+        handle.each { |l| output.print l }
+      end
+
+      # print status message
+      ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
+    end
+
+  end
+end
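The lookup above hinges on two pieces: the level regex applied to a TaxCollector header, and the default-to-key hash that leaves unclassified clusters named after themselves. A minimal, self-contained Ruby sketch of just those two pieces, using illustrative values (the header string comes from the comment in the task; the cluster IDs are made up):

    # Unclassified clusters fall back to their own ID.
    clusterid_to_name = Hash.new { |h, k| h[k] = k }

    # A TaxCollector description, as in the comment above.
    taxonomic_description = '[0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;' \
                            '[3]Acidimicrobiales;[4]Acidimicrobiaceae;' \
                            '[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;'

    level_no = 2 # 'class'

    # Grab the name between "[2]" and the next ";" or "[".
    level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1]
    puts level_name                      # => "Actinobacteria"

    clusterid_to_name['cluster-1'] = level_name
    puts clusterid_to_name['cluster-1'] # => "Actinobacteria"
    puts clusterid_to_name['cluster-2'] # => "cluster-2" (never classified)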
data/lib/lederhosen/tasks/hierarchical.rb ADDED

@@ -0,0 +1,40 @@
+##
+# HIERARCHICAL CLUSTERING FTW
+#
+
+module Lederhosen
+  class CLI
+
+    desc "h_cluster",
+         "--input=sorted.fasta --identity=0.80 --output=clusters.uc --identities=0.80 0.90 0.95"
+
+    method_option :input, :type => :string, :required => true
+    method_option :out_dir, :type => :string, :required => true
+    method_option :identities, :type => :array, :required => true
+
+    def h_cluster
+      out_dir = options[:out_dir]
+      input = options[:input]
+      identities = options[:identities].map(&:to_f).sort
+
+      `mkdir -p #{out_dir}`
+
+      # initial clustering
+      i = identities.shift
+      clusters = File.join(out_dir, "clusters_#{i}.uc")
+      clusters_filtered = File.join(out_dir, "clusters_#{i}.uc.filtered")
+
+      # cluster
+      invoke :cluster, [], { :input => input, :output => clusters, :identity => i }
+
+      # filter
+      invoke :uc_filter, [], { :input => clusters, :output => clusters_filtered }
+
+      # get reads for each cluster
+      invoke :split, [], { :clusters => clusters_filtered, :reads => input }
+
+      [t1, t2, t3].map(&:call)
+    end
+
+  end
+end
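The h_cluster task chains other tasks with what appears to be Thor's command DSL (`desc`, `method_option`, `invoke`); note that the released `[t1, t2, t3].map(&:call)` line refers to names that never appear in the file. For readers unfamiliar with the pattern, here is a hypothetical, minimal Thor CLI (not lederhosen's code; it assumes the thor gem) showing the `invoke(task, args, options)` call used above:

    require 'thor'

    class Demo < Thor
      desc 'cluster', 'pretend to cluster reads'
      method_option :identity, :type => :numeric, :default => 0.8
      def cluster
        puts "clustering at identity #{options[:identity]}"
      end

      desc 'uc_filter', 'pretend to filter a .uc file'
      method_option :input, :type => :string, :default => 'clusters.uc'
      def uc_filter
        puts "filtering #{options[:input]}"
      end

      desc 'pipeline', 'chain the tasks, as h_cluster chains cluster, uc_filter and split'
      def pipeline
        # Thor runs each invoked task at most once per command invocation,
        # so this is a linear chain rather than something to call in a loop.
        invoke :cluster, [], { :identity => 0.95 }
        invoke :uc_filter, [], { :input => 'clusters_0.95.uc' }
      end
    end

    Demo.start(ARGV)

Running `ruby demo.rb pipeline` would print both messages in order, assuming the thor gem is installed.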
data/lib/lederhosen/tasks/name.rb CHANGED

@@ -5,8 +5,8 @@
 module Lederhosen
   class CLI
 
-    desc "name
-         "--reps representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
+    desc "name",
+         "--reps --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
 
     method_option :reps, :type => :string, :required => true
     method_option :database, :type => :string, :required => true
data/lib/lederhosen/tasks/rep_reads.rb CHANGED

@@ -5,7 +5,7 @@
 module Lederhosen
   class CLI
 
-    desc "rep_reads
+    desc "rep_reads",
         "--clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta"
 
     method_option :clusters, :type => :string, :required => true
data/lib/lederhosen/tasks/split.rb CHANGED

@@ -5,14 +5,14 @@
 module Lederhosen
   class CLI
 
-    desc "
-         "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=
+    desc "split",
+         "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=1 --out-dir=output_directory"
 
     method_option :clusters, :type => :string, :required => true
     method_option :reads, :type => :string, :required => true
     method_option :out_dir, :type => :string, :required => true
     method_option :buffer_size, :type => :numeric, :default => 1000
-    method_option :min_clst_size, :type => :numeric, :default =>
+    method_option :min_clst_size, :type => :numeric, :default => 1
 
     def split
       clusters = options[:clusters]
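The name, rep_reads, and split hunks above are variants of the same small fix: the command name and its usage string now reach `desc` as two separate arguments. A tiny, hypothetical fragment (Thor gem assumed; names are illustrative) showing the two-argument form that 0.1.3 settles on:

    require 'thor'

    class Example < Thor
      # Two arguments: the command name/usage, then its description string.
      desc 'split', '--clusters=clusters.uc --reads=joined.fasta --min-clst-size=1'
      method_option :min_clst_size, :type => :numeric, :default => 1
      def split
        puts "min cluster size: #{options[:min_clst_size]}"
      end
    end

    Example.start(ARGV)

The comma is what keeps the two strings as separate arguments to `desc`, so the command gets both a name and a description.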
data/lib/lederhosen/tasks/squish.rb ADDED

@@ -0,0 +1,48 @@
+##
+# SQUISH A CSV FILE BY COLUMN NAME
+#
+
+module Lederhosen
+  class CLI
+
+    desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'
+
+    method_option :csv_file, :type => :string, :required => true
+    method_option :output, :type => :string, :required => false
+
+    def squish
+      csv_file = options[:csv_file]
+      output = options[:output] || $stdout
+
+      # sample_name -> column name -> total number of reads
+      total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
+      column_names = '' # scope
+      # Load CSV file, merge counts in columns with the same name
+      File.open(csv_file) do |handle|
+        column_names = handle.gets.strip.split(',')[1..-1]
+        handle.each do |line|
+          line = line.strip.split(',')
+          sample = line[0]
+          line[1..-1].zip(column_names) do |reads, column_name|
+            total_by_sample_by_column[sample][column_name] += reads.to_i
+          end
+        end
+      end
+
+      output = File.open(output) rescue $stdout
+
+      # print the new, squished csv file
+      column_names.uniq!.sort!
+      puts "-,#{column_names.join(',')}"
+      total_by_sample_by_column.each_pair do |sample_id, row|
+        print "#{sample_id}"
+        column_names.each do |column_name|
+          print ",#{row[column_name]}"
+        end
+        print "\n"
+      end
+
+      output.close
+    end
+  end
+end
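The core of squish is the nested counting hash plus `zip`, which folds together columns that share a name, for example two clusters that add_names both renamed to the same genus. A standalone Ruby sketch of that merging step, on made-up data:

    # Illustrative data only: an OTU table whose header has duplicate column names.
    header = %w[Acidimicrobium Acidimicrobium Clostridium]
    rows   = { 'sample_A' => [3, 4, 10],
               'sample_B' => [0, 2, 5] }

    # column name -> 0 by default, so duplicate columns simply accumulate.
    totals = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = 0 } }

    rows.each do |sample, counts|
      counts.zip(header) do |reads, column_name|
        totals[sample][column_name] += reads.to_i
      end
    end

    p totals
    # => {"sample_A"=>{"Acidimicrobium"=>7, "Clostridium"=>10},
    #     "sample_B"=>{"Acidimicrobium"=>2, "Clostridium"=>5}}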
data/lib/version.rb CHANGED
data/readme.md CHANGED

@@ -12,12 +12,6 @@ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own ri
 `sudo gem install lederhosen`
 4. Check installation by typing `lederhosen`. You should see some help text.
 
-## How do I use Lederhosen?
-
-Type `lederhosen help` for complete instructions
-
-See pipeline.sh for example usage.
-
 ## Features
 
 - Sequence trimming (paired-end Illumina).

@@ -27,4 +21,86 @@ See pipeline.sh for example usage.
 - Separation of representative reads.
 - Separation of all reads belonging to each cluster.
 - Identification of clusters using TaxCollector.
-- Generation of OTU abundancy matrices.
+- Generation of OTU abundancy matrices.
+
+## How do I use Lederhosen?
+
+Lederhosen is just a convenient wrapper for UCLUST and BLAT with some scripts for quality filtering, de-noising of data as well as creation of nice tables. It is similar to QIIME but meant for paired-end Illumina data rather than single-end 454. The basic lederhosen pipeline consists of: trimming, joining, sorting, filtering, clustering, more filtering, and output generation (OTU tables, representative reads, reads by cluster, and taxonomic descriptions for clusters). See the example pipeline in `pipeline.sh`.
+
+## Tasks
+
+Lederhosen is invoked by typing `lederhosen [TASK]`
+
+### trim
+
+Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
+
+    lederhosen trim --reads_dir=reads/* --out_dir=trimmed/
+
+### join
+
+Join paired reads from all samples end-to-end. This method enables the use of uclust with paired-end data. Output will be a single fasta file.
+
+    lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
+
+### sort
+
+Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithim.
+
+    lederhosen sort --input=joined.fasta --output=sorted.fasta
+
+### k_filter
+
+K-mer abundance noise filtering. This step is experimental and optional. It may reduce the time it takes to perform the clustering.
+
+    lederhosen k_filter --input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50
+
+### cluster
+
+Cluster reads using UCLUST. Output is a uc file.
+
+    lederhosen cluster --input=sorted.fasta --identity=0.80 --output=clusters.uc
+
+### uc_filter
+
+Filter UC file removing singleton clusters or clusters that are only present in a few samples. This greatly reduces the noise of the data without removing many of the reads.
+
+    lederhosen uc_filter --input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10
+
+### otu_table
+
+Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
+
+    lederhosen otu_table --clusters=clusters.uc --output=otu_prefix.csv
+
+### rep_reads
+
+Get representative reads for each cluster. Output is a single fasta file.
+
+    lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta
+
+### split
+
+Get all reads belonging to each cluster. Output is a directory containing a fasta file for each cluster. The fasta file contains the joined reads.
+
+    lederhosen split --clusters=clusters.uc --reads=joined.fasta --min-clst-size=100
+
+### name
+
+Identify clusters in a database using the representative reads. This is a simple wrapper for BLAT. The output is a tab-delimited file similar to a BLAST output file. For this step you need to have BLAT installed and also a [TaxCollector](http://github.com/audy/taxcollector) database.
+
+    lederhosen name --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt
+
+### add_names
+
+Add phylogenetic classification of clusters to OTU abundance file.
+
+    lederhosen add_names --blat=blat_output.txt --level=taxonomic_level --table=otu_file.csv --output=named_out_file.csv
+
+Where `taxonomic_level` can be: kingdom, domain, phylum, class, order, family, genus or species. This method only works with a TaxCollector database.
+
+### squish
+
+Squish an OTU abundance file by column name (phylogenetic description)
+
+    lederhosen squish --csv-file=named_out_file.csv --output=squished_named_out_file.csv
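The new readme sections end with the classification steps that 0.1.3 adds on top of the existing clustering tasks. A hedged Ruby driver sketch of that tail end of the pipeline, chaining the commands exactly as documented above (the file names are placeholders, not outputs guaranteed by the tasks, and `genus` is only one of the documented `--level` values):

    def run(cmd)
      puts cmd
      system(cmd) or abort "failed: #{cmd}"
    end

    # 1. BLAT the representative reads against a TaxCollector database.
    run 'lederhosen name --reps=representative_reads.fasta --database=taxcollector.fa --output=blat_output.txt'

    # 2. Swap cluster IDs in the OTU table for genus names (new in 0.1.3).
    run 'lederhosen add_names --blat=blat_output.txt --level=genus --table=otu_file.csv --output=named_otus.csv'

    # 3. Merge columns that ended up with the same genus name (new in 0.1.3).
    run 'lederhosen squish --csv-file=named_otus.csv --output=squished_otus.csv'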
metadata CHANGED

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: lederhosen
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 29
   prerelease:
   segments:
   - 0
   - 1
-  -
-  version: 0.1.
+  - 3
+  version: 0.1.3
 platform: ruby
 authors:
 - Austin G. Davis-Richardson

@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2012-
+date: 2012-07-13 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dna

@@ -129,12 +129,15 @@ files:
 - .rvmrc
 - Gemfile
 - bin/lederhosen
+- examples/pipeline.sh
 - lederhosen.gemspec
 - lib/lederhosen.rb
 - lib/lederhosen/buffer.rb
 - lib/lederhosen/cli.rb
 - lib/lederhosen/helpers.rb
+- lib/lederhosen/tasks/add_names.rb
 - lib/lederhosen/tasks/cluster.rb
+- lib/lederhosen/tasks/hierarchical.rb
 - lib/lederhosen/tasks/join.rb
 - lib/lederhosen/tasks/k_filter.rb
 - lib/lederhosen/tasks/name.rb

@@ -142,10 +145,10 @@ files:
 - lib/lederhosen/tasks/rep_reads.rb
 - lib/lederhosen/tasks/sort.rb
 - lib/lederhosen/tasks/split.rb
+- lib/lederhosen/tasks/squish.rb
 - lib/lederhosen/tasks/trim.rb
 - lib/lederhosen/tasks/uc_filter.rb
 - lib/version.rb
-- pipeline.sh
 - readme.md
 - spec/data/ILT_L_9_B_001_1.txt
 - spec/data/ILT_L_9_B_001_3.txt