RubyGems - lederhosen - Versions diffs - 1.7.0 → 1.8.0 - Mend

lederhosen 1.7.0 → 1.8.0

Files changed (16) hide show

data/.rspec +1 -1
data/Gemfile +3 -1
data/lederhosen.gemspec +8 -3
data/lib/lederhosen/no_tasks.rb +3 -3
data/lib/lederhosen/version.rb +2 -2
data/lib/lederhosen.rb +0 -2
data/readme.md +16 -26
data/scripts/illumina_pipeline/.gitignore +1 -0
data/scripts/illumina_pipeline/Makefile +14 -0
data/scripts/illumina_pipeline/pipeline.sh +3 -0
data/scripts/illumina_pipeline/readme.md +3 -0
data/scripts/otu_ref_picking/readme.md +9 -0
data/scripts/readme.md +3 -0
data/spec/no_tasks_spec.rb +10 -10
metadata +9 -4
data/lib/lederhosen/tasks/trim.rb +0 -88

data/.rspec CHANGED Viewed

	@@ -1 +1 @@
1	- -c --fail-fast -f d
1	+ -c -f d

data/Gemfile CHANGED Viewed

@@ -7,10 +7,12 @@ gem 'thor', '0.16.0'
 group :test do
   gem 'rspec', '2.12.0'
   gem 'rspec-prof', '0.0.3'
+  gem 'pry'
+  gem 'plymouth'
 end
 group :development do
   gem 'rdoc', '~> 3.12'
   gem 'jeweler', '1.8.4'
   gem 'ruby-prof', '0.11.2'
-end
+end

data/lederhosen.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "lederhosen"
-  s.version = "1.7.0"
+  s.version = "1.8.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Austin G. Davis-Richardson"]
-  s.date = "2012-12-19"
+  s.date = "2013-01-17"
   s.description = "Various tools for OTU clustering"
   s.email = "harekrishna@gmail.com"
   s.executables = ["lederhosen"]
@@ -33,11 +33,16 @@ Gem::Specification.new do |s|
     "lib/lederhosen/tasks/otu_filter.rb",
     "lib/lederhosen/tasks/otu_table.rb",
     "lib/lederhosen/tasks/split_fasta.rb",
-    "lib/lederhosen/tasks/trim.rb",
     "lib/lederhosen/tasks/version.rb",
     "lib/lederhosen/trimmer.rb",
     "lib/lederhosen/version.rb",
     "readme.md",
+    "scripts/illumina_pipeline/.gitignore",
+    "scripts/illumina_pipeline/Makefile",
+    "scripts/illumina_pipeline/pipeline.sh",
+    "scripts/illumina_pipeline/readme.md",
+    "scripts/otu_ref_picking/readme.md",
+    "scripts/readme.md",
     "spec/cli_spec.rb",
     "spec/data/ILT_L_9_B_001_1.txt.gz",
     "spec/data/ILT_L_9_B_001_3.txt.gz",

data/lib/lederhosen/no_tasks.rb CHANGED Viewed

@@ -65,7 +65,7 @@ module Lederhosen
       RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
       def parse_taxonomy_qiime(taxonomy)
-        levels = %w{kingdom phylum class order family genus species}
+        levels = %w{domain phylum class order family genus species}
         match_data = taxonomy.match(RE_QIIME)
         match_data = match_data[1..-1]
@@ -78,7 +78,7 @@ module Lederhosen
       end
       def parse_taxonomy_greengenes(taxonomy)
-        levels = %w{kingdom phylum class order family genus species}
+        levels = %w{domain phylum class order family genus species}
         match_data = taxonomy.match(RE_GREENGENES)
         match_data = match_data[1..-1]
@@ -101,7 +101,7 @@ module Lederhosen
       #
       def parse_taxonomy_taxcollector(taxonomy)
-        levels = %w{kingdom phylum class order family genus species strain}
+        levels = %w{domain phylum class order family genus species strain}
         match_data =
           begin

data/lib/lederhosen/version.rb CHANGED Viewed

@@ -1,8 +1,8 @@
 module Lederhosen
   module Version
     MAJOR = 1
-    MINOR = 7
-    CODENAME = 'Franziskaner' # changes for minor versions
+    MINOR = 8
+    CODENAME = 'Karottensaft' # changes for minor versions
     PATCH = 0
     STRING = [MAJOR, MINOR, PATCH].join('.')

data/lib/lederhosen.rb CHANGED Viewed

@@ -1,6 +1,4 @@
 require 'rubygems'
-require 'bundler'
-require 'set'
 require 'dna'
 require 'progressbar'
 require 'thor'

data/readme.md CHANGED Viewed

@@ -4,32 +4,32 @@
 Lederhosen is a set of tools for OTU clustering rRNA amplicons using Robert Edgar's USEARCH.
-It handles quality control of raw sequence data, running USEARCH, and creating and filtering tables.
+It's used to run USEARCH and to create and filter tables. Unlike most software in bioinformatics,
+it is meant to be UNIX-y: do one thing and do it well.
+Do you want to run Lederhosen on a cluster? Use `--dry-run` and feed it to your cluster's queue management system.
 Lederhosen is not a pipeline but rather a set of tools broken up into tasks. Tasks are invoked by running `lederhosen TASK ...`.
 Lederhosen is designed with the following "pipeline" in mind:
-1. Quality control of sequence data.
-2. Clustering sequences to centroid or reference sequences (read: database)
-3. Generating tables from USEARCH output.
-4. Filtering tables to remove small or insignificant OTUs.
+1. Clustering sequences to centroid or reference sequences (read: database)
+2. Generating tables from USEARCH output.
+3. Filtering tables to remove small or insignificant OTUs.
 ### About
 - Lederhosen is a project born out of the Triplett Lab at the University of Florida.
-- Lederhosen is designed to be a fast and simple method of clustering 16S rRNA amplicons sequenced
-using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq).
-- Lederhosen uses [Semantic Versioning](http://semver.org/).
-- Lederhosen is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/).
+- Lederhosen is designed to be a fast and **simple** tool to aid in clustering 16S rRNA amplicons sequenced
+using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq), Ion Torrent, or Roche-454.
+- Lederhosen uses [Semantic Versioning](http://semver.org/), is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/), and has **UNIT TESTS** (omg!).
 - Except for USEARCH which requires a license, Lederhosen is available for commercial use.
 ### Features
-- Sequence trimming (paired-end Illumina).
-- Parallel, referenced-based clustering to TaxCollector using USEARCH.
-- Queue-agnostic support for running jobs on clusters.
-- Support for RDP, TaxCollector or GreenGenes databases.
+- Closed/Open/Mixed OTU clustering to TaxCollector or GreenGenes via USEARCH.
+- Parallel support (pipe commands into [parallel](http://savannah.gnu.org/projects/parallel/), or use your cluster's queue).
+- Support for RDP, TaxCollector or GreenGenes 16S rRNA databases.
 - Generation and filtering of OTU abundancy matrices.
 ### Installation
@@ -50,19 +50,7 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
 ### Trim Reads
-Trim (Illumina, QSEQ format) reads using quality scores. Output will be a directory of fasta files. Reads can optionally be gzipped.
-    lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
-The trimming process will reverse complement the "right" pair so that both reads are in the forward orientation.
-You can also trim interleaved, paired-end FASTQ files:
-    lederhosen trim --reads_dir=reads/*.fastq --out_dir=trimmed/ read-type='fastq'
-Lederhosen will also trim off adapter sequences from the 5' end of the "left" read with the `--left-trim` option.
-    lederhosen trim --reads_dir=reads/*.fastq --out_dir=trimed/ --read-type='fastq' --left-trim=11
+Trimming removed. I think you should use [Sickle](https://github.com/najoshi/sickle).
 ### Create Database
@@ -74,6 +62,8 @@ lederhosen make_udb \
   --output=taxcollector.udb
 ```
+(This step is not strictly required, but it will make batch searching a lot faster.)
 ### Cluster Reads using USEARCH
 Cluster reads using USEARCH. Output is a uc file.

data/scripts/illumina_pipeline/.gitignore ADDED Viewed

	@@ -0,0 +1 @@
1	+ data/

data/scripts/illumina_pipeline/Makefile ADDED Viewed

@@ -0,0 +1,14 @@
+#!/bin/bash
+# for now, we use the Caporaso reference OTUs
+# In the future, I would like to be able to generate a fresh
+# OTU reference database from scratch.
+REF_DB='http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Reference_OTUs_for_Pipelines/Caporaso_Reference_OTUs/gg_otus_4feb2011.tgz'
+default: reference_otus
+reference_otus:
+	mkdir -p data
+	curl -L ${REF_DB} > data/ref_otus.tar.gz
+	tar -zxvf data/ref_otus.tar.gz # this will end up in some other directory

data/scripts/illumina_pipeline/pipeline.sh ADDED Viewed

	@@ -0,0 +1,3 @@
1	+ #!/bin/bash
2	+
3	+

data/scripts/illumina_pipeline/readme.md ADDED Viewed

@@ -0,0 +1,3 @@
+# Illumina Pipeline
+This is the pipeline for closed or closed + open reference OTU clustering from paired-end 16S rRNA amplicons.

data/scripts/otu_ref_picking/readme.md ADDED Viewed

@@ -0,0 +1,9 @@
+# OTU Ref Picking
+This script will pick reference OTUs to use as centroids for OTU clustering from amplicons.
+It will also generate multiple sequence alignments and trees from the reference OTUs.
+It is intended to be used in combination with the Illumina pipeline in order to generate
+datasets that are suitable for analysis using PhyloSeq.

data/scripts/readme.md ADDED Viewed

@@ -0,0 +1,3 @@
+# Lederhosen Scripts
+This directory will contain scripts that can be used with Lederhosen such as pipelines and what-not.

data/spec/no_tasks_spec.rb CHANGED Viewed

@@ -4,7 +4,7 @@ describe 'no_tasks' do
   let(:greengenes_taxonomies) { ['124 U55236.1 Methanobrevibacter thaueri str. CW k__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobrevibacter; Unclassified; otu_127']}
   let(:qiime_taxonomies) { [ 'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Rahnella;s__' ]}
-  let(:taxcollector_taxonomies) { ['[0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]null;[4]null;[5]null;[6]bacterium_TH3;[7]bacterium_TH3;[8]bacterium_TH3|M79434|8'] }
+  let(:taxcollector_taxonomies) { ['[0]domain;[1]phylum;[2]class;[3]order;[4]family;[5]genus;[6]species;[7]strain;[8]Genus_species_strain_id'] }
   let(:lederhosen) { Lederhosen::CLI.new }
   it '#parse_usearch_line should parse a line of usearch output'
@@ -25,18 +25,18 @@ describe 'no_tasks' do
     lederhosen.detect_taxonomy_format('this is not a taxonomic description').should raise_error
   end
-  it '#parse_taxonomy_taxcollector should parse taxcollector taxonomy' do
-    taxcollector_taxonomies.each do |taxcollector_taxonomy|
-      taxonomy = lederhosen.parse_taxonomy_taxcollector(taxcollector_taxonomy)
-      taxonomy['original'].should == taxcollector_taxonomy
-      levels = %w{domain phylum class order family genus species kingdom original strain}
-      taxonomy.keys.each do |v|
-        levels.should include v
+  %w{domain phylum class order family genus species strain}.each do |level|
+    it "#parse_taxonomy_taxcollector should parse taxcollector taxonomy (#{level})" do
+      taxcollector_taxonomies.each do |taxonomy|
+        taxonomy = lederhosen.parse_taxonomy_taxcollector(taxonomy)
+        taxonomy[level].should == level
       end
     end
   end
+  it '#parse_taxonomy_taxcollector should return original taxonomy' do
+    lederhosen.parse_taxonomy_taxcollector(taxcollector_taxonomies[0])['original'].should == taxcollector_taxonomies[0]
+  end
   it '#parse_taxonomy_greengenes should parse greengenes taxonomy' do
     greengenes_taxonomies.each do |greengenes_taxonomy|

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: lederhosen
 version: !ruby/object:Gem::Version
-  version: 1.7.0
+  version: 1.8.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-19 00:00:00.000000000 Z
+date: 2013-01-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dna
@@ -131,11 +131,16 @@ files:
 - lib/lederhosen/tasks/otu_filter.rb
 - lib/lederhosen/tasks/otu_table.rb
 - lib/lederhosen/tasks/split_fasta.rb
-- lib/lederhosen/tasks/trim.rb
 - lib/lederhosen/tasks/version.rb
 - lib/lederhosen/trimmer.rb
 - lib/lederhosen/version.rb
 - readme.md
+- scripts/illumina_pipeline/.gitignore
+- scripts/illumina_pipeline/Makefile
+- scripts/illumina_pipeline/pipeline.sh
+- scripts/illumina_pipeline/readme.md
+- scripts/otu_ref_picking/readme.md
+- scripts/readme.md
 - spec/cli_spec.rb
 - spec/data/ILT_L_9_B_001_1.txt.gz
 - spec/data/ILT_L_9_B_001_3.txt.gz
@@ -162,7 +167,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -1116066410733680786
+      hash: -1539752797284012594
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:

data/lib/lederhosen/tasks/trim.rb DELETED Viewed

@@ -1,88 +0,0 @@
-##
-# QUALITY TRIMMING
-#
-# This should probably be broken into its own module or command-line utility.
-module Lederhosen
-  class CLI
-    desc "trim",
-         "trim reads based on quality scores"
-    method_option :reads_dir,  :type => :string, :required => true
-    method_option :out_dir,    :type => :string, :required => true
-    method_option :left_trim,  :type => :numeric, :default => 0
-    method_option :read_type,  :type => :string, :default => 'qseq'
-    method_option :min_length, :type => :numeric, :default => 75
-    def trim
-      raw_reads  = options[:reads_dir]
-      out_dir    = options[:out_dir]
-      left_trim  = options[:left_trim]
-      read_type  = options[:read_type]
-      min_length = options[:min_length]
-      ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"
-      run "mkdir -p #{out_dir}"
-      raw_reads =
-        if read_type == 'qseq'
-          get_grouped_qseq_files(raw_reads)
-        elsif read_type == 'fastq'
-          r = Dir[raw_reads].map do |x|
-            [ File.basename(x, '.fastq'), x ]
-          end
-          Hash[r]
-        end
-      if raw_reads.size == 0
-        ohno 'glob matches no reads'
-      end
-      pbar = ProgressBar.new 'trimming', raw_reads.size
-      raw_reads.each do |prefix, files|
-        # get an output handle
-        out = File.join(out_dir, "#{File.basename(prefix)}.fasta")
-        # create the trimmed sequence generator
-        trim_args = { :left_trim => left_trim, :min_length => min_length }
-        trimmer =
-          if read_type == 'qseq'
-            Trimmer::QSEQTrimmer.new(*files, trim_args)
-          elsif read_type == 'fastq'
-            Trimmer::InterleavedTrimmer.new(files, trim_args)
-          end
-        # trim and write
-        File.open(out, 'w') do |o|
-          trimmer.each do |trimmed_record|
-            o.puts trimmed_record
-          end
-        end # File.open
-        pbar.inc
-      end
-      pbar.finish
-    end
-    no_tasks do
-      # Function for grouping qseq files produced by splitting illumina
-      # reads by barcode
-      #
-      # Filenames should look like this:
-      # IL5_L_1_B_007_1.txt
-      def get_grouped_qseq_files(glob='raw_reads/*.txt')
-        Dir.glob(glob).group_by { |x| File.basename(x).split('_')[0..4].join('_') }
-      end
-    end # no_tasks
-  end
-end