lederhosen 1.7.0 → 1.8.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -1
- data/Gemfile +3 -1
- data/lederhosen.gemspec +8 -3
- data/lib/lederhosen/no_tasks.rb +3 -3
- data/lib/lederhosen/version.rb +2 -2
- data/lib/lederhosen.rb +0 -2
- data/readme.md +16 -26
- data/scripts/illumina_pipeline/.gitignore +1 -0
- data/scripts/illumina_pipeline/Makefile +14 -0
- data/scripts/illumina_pipeline/pipeline.sh +3 -0
- data/scripts/illumina_pipeline/readme.md +3 -0
- data/scripts/otu_ref_picking/readme.md +9 -0
- data/scripts/readme.md +3 -0
- data/spec/no_tasks_spec.rb +10 -10
- metadata +9 -4
- data/lib/lederhosen/tasks/trim.rb +0 -88
data/.rspec
CHANGED
@@ -1 +1 @@
|
|
1
|
-
-c
|
1
|
+
-c -f d
|
data/Gemfile
CHANGED
data/lederhosen.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "lederhosen"
|
8
|
-
s.version = "1.
|
8
|
+
s.version = "1.8.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
12
|
-
s.date = "
|
12
|
+
s.date = "2013-01-17"
|
13
13
|
s.description = "Various tools for OTU clustering"
|
14
14
|
s.email = "harekrishna@gmail.com"
|
15
15
|
s.executables = ["lederhosen"]
|
@@ -33,11 +33,16 @@ Gem::Specification.new do |s|
|
|
33
33
|
"lib/lederhosen/tasks/otu_filter.rb",
|
34
34
|
"lib/lederhosen/tasks/otu_table.rb",
|
35
35
|
"lib/lederhosen/tasks/split_fasta.rb",
|
36
|
-
"lib/lederhosen/tasks/trim.rb",
|
37
36
|
"lib/lederhosen/tasks/version.rb",
|
38
37
|
"lib/lederhosen/trimmer.rb",
|
39
38
|
"lib/lederhosen/version.rb",
|
40
39
|
"readme.md",
|
40
|
+
"scripts/illumina_pipeline/.gitignore",
|
41
|
+
"scripts/illumina_pipeline/Makefile",
|
42
|
+
"scripts/illumina_pipeline/pipeline.sh",
|
43
|
+
"scripts/illumina_pipeline/readme.md",
|
44
|
+
"scripts/otu_ref_picking/readme.md",
|
45
|
+
"scripts/readme.md",
|
41
46
|
"spec/cli_spec.rb",
|
42
47
|
"spec/data/ILT_L_9_B_001_1.txt.gz",
|
43
48
|
"spec/data/ILT_L_9_B_001_3.txt.gz",
|
data/lib/lederhosen/no_tasks.rb
CHANGED
@@ -65,7 +65,7 @@ module Lederhosen
|
|
65
65
|
RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
|
66
66
|
|
67
67
|
def parse_taxonomy_qiime(taxonomy)
|
68
|
-
levels = %w{
|
68
|
+
levels = %w{domain phylum class order family genus species}
|
69
69
|
match_data = taxonomy.match(RE_QIIME)
|
70
70
|
match_data = match_data[1..-1]
|
71
71
|
|
@@ -78,7 +78,7 @@ module Lederhosen
|
|
78
78
|
end
|
79
79
|
|
80
80
|
def parse_taxonomy_greengenes(taxonomy)
|
81
|
-
levels = %w{
|
81
|
+
levels = %w{domain phylum class order family genus species}
|
82
82
|
match_data = taxonomy.match(RE_GREENGENES)
|
83
83
|
match_data = match_data[1..-1]
|
84
84
|
|
@@ -101,7 +101,7 @@ module Lederhosen
|
|
101
101
|
#
|
102
102
|
def parse_taxonomy_taxcollector(taxonomy)
|
103
103
|
|
104
|
-
levels = %w{
|
104
|
+
levels = %w{domain phylum class order family genus species strain}
|
105
105
|
|
106
106
|
match_data =
|
107
107
|
begin
|
data/lib/lederhosen/version.rb
CHANGED
data/lib/lederhosen.rb
CHANGED
data/readme.md
CHANGED
@@ -4,32 +4,32 @@
|
|
4
4
|
|
5
5
|
Lederhosen is a set of tools for OTU clustering rRNA amplicons using Robert Edgar's USEARCH.
|
6
6
|
|
7
|
-
It
|
7
|
+
It's used to run USEARCH and create and filter tables. Unlike most of the software in Bioinformatics,
|
8
|
+
it is meant to be UNIX-y: do one thing and do it well.
|
9
|
+
|
10
|
+
Do you want to run Lederhosen on a cluster? Use `--dry-run` and feed it to your cluster's queue management system.
|
8
11
|
|
9
12
|
Lederhosen is not a pipeline but rather a set of tools broken up into tasks. Tasks are invoked by running `lederhosen TASK ...`.
|
10
13
|
|
11
14
|
Lederhosen is designed with the following "pipeline" in mind:
|
12
15
|
|
13
|
-
1.
|
14
|
-
2.
|
15
|
-
3.
|
16
|
-
4. Filtering tables to remove small or insignificant OTUs.
|
16
|
+
1. Clustering sequences to centroid or reference sequences (read: database)
|
17
|
+
2. Generating tables from USEARCH output.
|
18
|
+
3. Filtering tables to remove small or insignificant OTUs.
|
17
19
|
|
18
20
|
### About
|
19
21
|
|
20
22
|
- Lederhosen is a project born out of the Triplett Lab at the University of Florida.
|
21
|
-
- Lederhosen is designed to be a fast and simple
|
22
|
-
using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq).
|
23
|
-
- Lederhosen uses [Semantic Versioning](http://semver.org/).
|
24
|
-
- Lederhosen is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/).
|
23
|
+
- Lederhosen is designed to be a fast and **simple** tool to aid in clustering 16S rRNA amplicons sequenced
|
24
|
+
using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq), Ion Torrent, or Roche-454.
|
25
|
+
- Lederhosen uses [Semantic Versioning](http://semver.org/), is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/), and has **UNIT TESTS** (omg!).
|
25
26
|
- Except for USEARCH which requires a license, Lederhosen is available for commercial use.
|
26
27
|
|
27
28
|
### Features
|
28
29
|
|
29
|
-
-
|
30
|
-
- Parallel,
|
31
|
-
-
|
32
|
-
- Support for RDP, TaxCollector or GreenGenes databases.
|
30
|
+
- Closed/Open/Mixed OTU clustering to TaxCollector or GreenGenes via USEARCH.
|
31
|
+
- Parallel support (pipe commands into [parallel](http://savannah.gnu.org/projects/parallel/), or use your cluster's queue).
|
32
|
+
- Support for RDP, TaxCollector or GreenGenes 16S rRNA databases.
|
33
33
|
- Generation and filtering of OTU abundance matrices.
|
34
34
|
|
35
35
|
### Installation
|
@@ -50,19 +50,7 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
|
|
50
50
|
|
51
51
|
### Trim Reads
|
52
52
|
|
53
|
-
|
54
|
-
|
55
|
-
lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
|
56
|
-
|
57
|
-
The trimming process will reverse complement the "right" pair so that both reads are in the forward orientation.
|
58
|
-
|
59
|
-
You can also trim interleaved, paired-end FASTQ files:
|
60
|
-
|
61
|
-
lederhosen trim --reads_dir=reads/*.fastq --out_dir=trimmed/ --read-type='fastq'
|
62
|
-
|
63
|
-
Lederhosen will also trim off adapter sequences from the 5' end of the "left" read with the `--left-trim` option.
|
64
|
-
|
65
|
-
lederhosen trim --reads_dir=reads/*.fastq --out_dir=trimmed/ --read-type='fastq' --left-trim=11
|
53
|
+
Trimming removed. I think you should use [Sickle](https://github.com/najoshi/sickle).
|
66
54
|
|
67
55
|
### Create Database
|
68
56
|
|
@@ -74,6 +62,8 @@ lederhosen make_udb \
|
|
74
62
|
--output=taxcollector.udb
|
75
63
|
```
|
76
64
|
|
65
|
+
(not actually required but will make batch searching a lot faster)
|
66
|
+
|
77
67
|
### Cluster Reads using USEARCH
|
78
68
|
|
79
69
|
Cluster reads using USEARCH. Output is a uc file.
|
@@ -0,0 +1 @@
|
|
1
|
+
data/
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/bin/bash
|
2
|
+
|
3
|
+
# for now, we use the Caporaso reference OTUs
|
4
|
+
# In the future, I would like to be able to generate a fresh
|
5
|
+
# OTU reference database from scratch.
|
6
|
+
|
7
|
+
REF_DB='http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Reference_OTUs_for_Pipelines/Caporaso_Reference_OTUs/gg_otus_4feb2011.tgz'
|
8
|
+
|
9
|
+
default: reference_otus
|
10
|
+
|
11
|
+
reference_otus:
|
12
|
+
mkdir -p data
|
13
|
+
curl -L ${REF_DB} > data/ref_otus.tar.gz
|
14
|
+
tar -zxvf data/ref_otus.tar.gz # this will end up in some other directory
|
@@ -0,0 +1,9 @@
|
|
1
|
+
# OTU Ref Picking
|
2
|
+
|
3
|
+
This script will pick reference OTUs to use as centroids for OTU clustering from amplicons.
|
4
|
+
|
5
|
+
It will also generate multiple sequence alignments and trees from the reference OTUs.
|
6
|
+
|
7
|
+
It is intended to be used in combination with the Illumina pipeline in order to generate
|
8
|
+
datasets that are suitable for analysis using PhyloSeq.
|
9
|
+
|
data/scripts/readme.md
ADDED
data/spec/no_tasks_spec.rb
CHANGED
@@ -4,7 +4,7 @@ describe 'no_tasks' do
|
|
4
4
|
|
5
5
|
let(:greengenes_taxonomies) { ['124 U55236.1 Methanobrevibacter thaueri str. CW k__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobrevibacter; Unclassified; otu_127']}
|
6
6
|
let(:qiime_taxonomies) { [ 'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Rahnella;s__' ]}
|
7
|
-
let(:taxcollector_taxonomies) { ['[0]
|
7
|
+
let(:taxcollector_taxonomies) { ['[0]domain;[1]phylum;[2]class;[3]order;[4]family;[5]genus;[6]species;[7]strain;[8]Genus_species_strain_id'] }
|
8
8
|
let(:lederhosen) { Lederhosen::CLI.new }
|
9
9
|
|
10
10
|
it '#parse_usearch_line should parse a line of usearch output'
|
@@ -25,18 +25,18 @@ describe 'no_tasks' do
|
|
25
25
|
lederhosen.detect_taxonomy_format('this is not a taxonomic description').should raise_error
|
26
26
|
end
|
27
27
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
levels = %w{domain phylum class order family genus species kingdom original strain}
|
34
|
-
|
35
|
-
taxonomy.keys.each do |v|
|
36
|
-
levels.should include v
|
28
|
+
%w{domain phylum class order family genus species strain}.each do |level|
|
29
|
+
it "#parse_taxonomy_taxcollector should parse taxcollector taxonomy (#{level})" do
|
30
|
+
taxcollector_taxonomies.each do |taxonomy|
|
31
|
+
taxonomy = lederhosen.parse_taxonomy_taxcollector(taxonomy)
|
32
|
+
taxonomy[level].should == level
|
37
33
|
end
|
38
34
|
end
|
39
35
|
end
|
36
|
+
|
37
|
+
it '#parse_taxonomy_taxcollector should return original taxonomy' do
|
38
|
+
lederhosen.parse_taxonomy_taxcollector(taxcollector_taxonomies[0])['original'].should == taxcollector_taxonomies[0]
|
39
|
+
end
|
40
40
|
|
41
41
|
it '#parse_taxonomy_greengenes should parse greengenes taxonomy' do
|
42
42
|
greengenes_taxonomies.each do |greengenes_taxonomy|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.8.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-01-17 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: dna
|
@@ -131,11 +131,16 @@ files:
|
|
131
131
|
- lib/lederhosen/tasks/otu_filter.rb
|
132
132
|
- lib/lederhosen/tasks/otu_table.rb
|
133
133
|
- lib/lederhosen/tasks/split_fasta.rb
|
134
|
-
- lib/lederhosen/tasks/trim.rb
|
135
134
|
- lib/lederhosen/tasks/version.rb
|
136
135
|
- lib/lederhosen/trimmer.rb
|
137
136
|
- lib/lederhosen/version.rb
|
138
137
|
- readme.md
|
138
|
+
- scripts/illumina_pipeline/.gitignore
|
139
|
+
- scripts/illumina_pipeline/Makefile
|
140
|
+
- scripts/illumina_pipeline/pipeline.sh
|
141
|
+
- scripts/illumina_pipeline/readme.md
|
142
|
+
- scripts/otu_ref_picking/readme.md
|
143
|
+
- scripts/readme.md
|
139
144
|
- spec/cli_spec.rb
|
140
145
|
- spec/data/ILT_L_9_B_001_1.txt.gz
|
141
146
|
- spec/data/ILT_L_9_B_001_3.txt.gz
|
@@ -162,7 +167,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
162
167
|
version: '0'
|
163
168
|
segments:
|
164
169
|
- 0
|
165
|
-
hash: -
|
170
|
+
hash: -1539752797284012594
|
166
171
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
167
172
|
none: false
|
168
173
|
requirements:
|
@@ -1,88 +0,0 @@
|
|
1
|
-
##
|
2
|
-
# QUALITY TRIMMING
|
3
|
-
#
|
4
|
-
|
5
|
-
# This should probably be broken into its own module or command-line utility.
|
6
|
-
|
7
|
-
module Lederhosen
|
8
|
-
class CLI
|
9
|
-
|
10
|
-
desc "trim",
|
11
|
-
"trim reads based on quality scores"
|
12
|
-
|
13
|
-
method_option :reads_dir, :type => :string, :required => true
|
14
|
-
method_option :out_dir, :type => :string, :required => true
|
15
|
-
method_option :left_trim, :type => :numeric, :default => 0
|
16
|
-
method_option :read_type, :type => :string, :default => 'qseq'
|
17
|
-
method_option :min_length, :type => :numeric, :default => 75
|
18
|
-
|
19
|
-
def trim
|
20
|
-
raw_reads = options[:reads_dir]
|
21
|
-
out_dir = options[:out_dir]
|
22
|
-
left_trim = options[:left_trim]
|
23
|
-
read_type = options[:read_type]
|
24
|
-
min_length = options[:min_length]
|
25
|
-
|
26
|
-
ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"
|
27
|
-
run "mkdir -p #{out_dir}"
|
28
|
-
|
29
|
-
raw_reads =
|
30
|
-
if read_type == 'qseq'
|
31
|
-
get_grouped_qseq_files(raw_reads)
|
32
|
-
elsif read_type == 'fastq'
|
33
|
-
r = Dir[raw_reads].map do |x|
|
34
|
-
[ File.basename(x, '.fastq'), x ]
|
35
|
-
end
|
36
|
-
Hash[r]
|
37
|
-
end
|
38
|
-
|
39
|
-
if raw_reads.size == 0
|
40
|
-
ohno 'glob matches no reads'
|
41
|
-
end
|
42
|
-
|
43
|
-
pbar = ProgressBar.new 'trimming', raw_reads.size
|
44
|
-
|
45
|
-
raw_reads.each do |prefix, files|
|
46
|
-
|
47
|
-
# get an output handle
|
48
|
-
out = File.join(out_dir, "#{File.basename(prefix)}.fasta")
|
49
|
-
|
50
|
-
# create the trimmed sequence generator
|
51
|
-
trim_args = { :left_trim => left_trim, :min_length => min_length }
|
52
|
-
|
53
|
-
trimmer =
|
54
|
-
if read_type == 'qseq'
|
55
|
-
Trimmer::QSEQTrimmer.new(*files, trim_args)
|
56
|
-
elsif read_type == 'fastq'
|
57
|
-
Trimmer::InterleavedTrimmer.new(files, trim_args)
|
58
|
-
end
|
59
|
-
|
60
|
-
# trim and write
|
61
|
-
File.open(out, 'w') do |o|
|
62
|
-
trimmer.each do |trimmed_record|
|
63
|
-
o.puts trimmed_record
|
64
|
-
end
|
65
|
-
end # File.open
|
66
|
-
|
67
|
-
pbar.inc
|
68
|
-
end
|
69
|
-
|
70
|
-
pbar.finish
|
71
|
-
|
72
|
-
end
|
73
|
-
|
74
|
-
no_tasks do
|
75
|
-
|
76
|
-
# Function for grouping qseq files produced by splitting illumina
|
77
|
-
# reads by barcode
|
78
|
-
#
|
79
|
-
# Filenames should look like this:
|
80
|
-
# IL5_L_1_B_007_1.txt
|
81
|
-
def get_grouped_qseq_files(glob='raw_reads/*.txt')
|
82
|
-
Dir.glob(glob).group_by { |x| File.basename(x).split('_')[0..4].join('_') }
|
83
|
-
end
|
84
|
-
|
85
|
-
end # no_tasks
|
86
|
-
|
87
|
-
end
|
88
|
-
end
|