lederhosen 1.7.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -1
- data/Gemfile +3 -1
- data/lederhosen.gemspec +8 -3
- data/lib/lederhosen/no_tasks.rb +3 -3
- data/lib/lederhosen/version.rb +2 -2
- data/lib/lederhosen.rb +0 -2
- data/readme.md +16 -26
- data/scripts/illumina_pipeline/.gitignore +1 -0
- data/scripts/illumina_pipeline/Makefile +14 -0
- data/scripts/illumina_pipeline/pipeline.sh +3 -0
- data/scripts/illumina_pipeline/readme.md +3 -0
- data/scripts/otu_ref_picking/readme.md +9 -0
- data/scripts/readme.md +3 -0
- data/spec/no_tasks_spec.rb +10 -10
- metadata +9 -4
- data/lib/lederhosen/tasks/trim.rb +0 -88
data/.rspec
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
-c
|
|
1
|
+
-c -f d
|
data/Gemfile
CHANGED
data/lederhosen.gemspec
CHANGED
|
@@ -5,11 +5,11 @@
|
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |s|
|
|
7
7
|
s.name = "lederhosen"
|
|
8
|
-
s.version = "1.7.0"
|
|
8
|
+
s.version = "1.8.0"
|
|
9
9
|
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
|
12
|
-
s.date = "
|
|
12
|
+
s.date = "2013-01-17"
|
|
13
13
|
s.description = "Various tools for OTU clustering"
|
|
14
14
|
s.email = "harekrishna@gmail.com"
|
|
15
15
|
s.executables = ["lederhosen"]
|
|
@@ -33,11 +33,16 @@ Gem::Specification.new do |s|
|
|
|
33
33
|
"lib/lederhosen/tasks/otu_filter.rb",
|
|
34
34
|
"lib/lederhosen/tasks/otu_table.rb",
|
|
35
35
|
"lib/lederhosen/tasks/split_fasta.rb",
|
|
36
|
-
"lib/lederhosen/tasks/trim.rb",
|
|
37
36
|
"lib/lederhosen/tasks/version.rb",
|
|
38
37
|
"lib/lederhosen/trimmer.rb",
|
|
39
38
|
"lib/lederhosen/version.rb",
|
|
40
39
|
"readme.md",
|
|
40
|
+
"scripts/illumina_pipeline/.gitignore",
|
|
41
|
+
"scripts/illumina_pipeline/Makefile",
|
|
42
|
+
"scripts/illumina_pipeline/pipeline.sh",
|
|
43
|
+
"scripts/illumina_pipeline/readme.md",
|
|
44
|
+
"scripts/otu_ref_picking/readme.md",
|
|
45
|
+
"scripts/readme.md",
|
|
41
46
|
"spec/cli_spec.rb",
|
|
42
47
|
"spec/data/ILT_L_9_B_001_1.txt.gz",
|
|
43
48
|
"spec/data/ILT_L_9_B_001_3.txt.gz",
|
data/lib/lederhosen/no_tasks.rb
CHANGED
|
@@ -65,7 +65,7 @@ module Lederhosen
|
|
|
65
65
|
RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
|
|
66
66
|
|
|
67
67
|
def parse_taxonomy_qiime(taxonomy)
|
|
68
|
-
levels = %w{
|
|
68
|
+
levels = %w{domain phylum class order family genus species}
|
|
69
69
|
match_data = taxonomy.match(RE_QIIME)
|
|
70
70
|
match_data = match_data[1..-1]
|
|
71
71
|
|
|
@@ -78,7 +78,7 @@ module Lederhosen
|
|
|
78
78
|
end
|
|
79
79
|
|
|
80
80
|
def parse_taxonomy_greengenes(taxonomy)
|
|
81
|
-
levels = %w{
|
|
81
|
+
levels = %w{domain phylum class order family genus species}
|
|
82
82
|
match_data = taxonomy.match(RE_GREENGENES)
|
|
83
83
|
match_data = match_data[1..-1]
|
|
84
84
|
|
|
@@ -101,7 +101,7 @@ module Lederhosen
|
|
|
101
101
|
#
|
|
102
102
|
def parse_taxonomy_taxcollector(taxonomy)
|
|
103
103
|
|
|
104
|
-
levels = %w{
|
|
104
|
+
levels = %w{domain phylum class order family genus species strain}
|
|
105
105
|
|
|
106
106
|
match_data =
|
|
107
107
|
begin
|
data/lib/lederhosen/version.rb
CHANGED
data/lib/lederhosen.rb
CHANGED
data/readme.md
CHANGED
|
@@ -4,32 +4,32 @@
|
|
|
4
4
|
|
|
5
5
|
Lederhosen is a set of tools for OTU clustering rRNA amplicons using Robert Edgar's USEARCH.
|
|
6
6
|
|
|
7
|
-
It
|
|
7
|
+
It's used to run USEARCH and create and filter tables. Unlike most of the software in Bioinformatics,
|
|
8
|
+
it is meant to be UNIX-y: do one thing and do it well.
|
|
9
|
+
|
|
10
|
+
Do you want to run Lederhosen on a cluster? Use `--dry-run` and feed it to your cluster's queue management system.
|
|
8
11
|
|
|
9
12
|
Lederhosen is not a pipeline but rather a set of tools broken up into tasks. Tasks are invoked by running `lederhosen TASK ...`.
|
|
10
13
|
|
|
11
14
|
Lederhosen is designed with the following "pipeline" in mind:
|
|
12
15
|
|
|
13
|
-
1.
|
|
14
|
-
2.
|
|
15
|
-
3.
|
|
16
|
-
4. Filtering tables to remove small or insignificant OTUs.
|
|
16
|
+
1. Clustering sequences to centroid or reference sequences (read: database)
|
|
17
|
+
2. Generating tables from USEARCH output.
|
|
18
|
+
3. Filtering tables to remove small or insignificant OTUs.
|
|
17
19
|
|
|
18
20
|
### About
|
|
19
21
|
|
|
20
22
|
- Lederhosen is a project born out of the Triplett Lab at the University of Florida.
|
|
21
|
-
- Lederhosen is designed to be a fast and simple
|
|
22
|
-
using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq).
|
|
23
|
-
- Lederhosen uses [Semantic Versioning](http://semver.org/).
|
|
24
|
-
- Lederhosen is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/).
|
|
23
|
+
- Lederhosen is designed to be a fast and **simple** tool to aid in clustering 16S rRNA amplicons sequenced
|
|
24
|
+
using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq), Ion Torrent, or Roche-454.
|
|
25
|
+
- Lederhosen uses [Semantic Versioning](http://semver.org/), is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/), and has **UNIT TESTS** (omg!).
|
|
25
26
|
- Except for USEARCH which requires a license, Lederhosen is available for commercial use.
|
|
26
27
|
|
|
27
28
|
### Features
|
|
28
29
|
|
|
29
|
-
-
|
|
30
|
-
- Parallel,
|
|
31
|
-
-
|
|
32
|
-
- Support for RDP, TaxCollector or GreenGenes databases.
|
|
30
|
+
- Closed/Open/Mixed OTU clustering to TaxCollector or GreenGenes via USEARCH.
|
|
31
|
+
- Parallel support (pipe commands into [parallel](http://savannah.gnu.org/projects/parallel/), or use your cluster's queue).
|
|
32
|
+
- Support for RDP, TaxCollector or GreenGenes 16S rRNA databases.
|
|
33
33
|
- Generation and filtering of OTU abundancy matrices.
|
|
34
34
|
|
|
35
35
|
### Installation
|
|
@@ -50,19 +50,7 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
|
|
|
50
50
|
|
|
51
51
|
### Trim Reads
|
|
52
52
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
|
|
56
|
-
|
|
57
|
-
The trimming process will reverse complement the "right" pair so that both reads are in the forward orientation.
|
|
58
|
-
|
|
59
|
-
You can also trim interleaved, paired-end FASTQ files:
|
|
60
|
-
|
|
61
|
-
lederhosen trim --reads_dir=reads/*.fastq --out_dir=trimmed/ read-type='fastq'
|
|
62
|
-
|
|
63
|
-
Lederhosen will also trim off adapter sequences from the 5' end of the "left" read with the `--left-trim` option.
|
|
64
|
-
|
|
65
|
-
lederhosen trim --reads_dir=reads/*.fastq --out_dir=trimed/ --read-type='fastq' --left-trim=11
|
|
53
|
+
Trimming removed. I think you should use [Sickle](https://github.com/najoshi/sickle).
|
|
66
54
|
|
|
67
55
|
### Create Database
|
|
68
56
|
|
|
@@ -74,6 +62,8 @@ lederhosen make_udb \
|
|
|
74
62
|
--output=taxcollector.udb
|
|
75
63
|
```
|
|
76
64
|
|
|
65
|
+
(not actually required but will make batch searching a lot faster)
|
|
66
|
+
|
|
77
67
|
### Cluster Reads using USEARCH
|
|
78
68
|
|
|
79
69
|
Cluster reads using USEARCH. Output is a uc file.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
data/
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
|
|
3
|
+
# for now, we use the Caporaso reference OTUs
|
|
4
|
+
# In the future, I would like to be able to generate a fresh
|
|
5
|
+
# OTU reference database from scratch.
|
|
6
|
+
|
|
7
|
+
REF_DB='http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Reference_OTUs_for_Pipelines/Caporaso_Reference_OTUs/gg_otus_4feb2011.tgz'
|
|
8
|
+
|
|
9
|
+
default: reference_otus
|
|
10
|
+
|
|
11
|
+
reference_otus:
|
|
12
|
+
mkdir -p data
|
|
13
|
+
curl -L ${REF_DB} > data/ref_otus.tar.gz
|
|
14
|
+
tar -zxvf data/ref_otus.tar.gz # this will end up in some other directory
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# OTU Ref Picking
|
|
2
|
+
|
|
3
|
+
This script will pick reference OTUs to use as centroids for OTU clustering from amplicons.
|
|
4
|
+
|
|
5
|
+
It will also generate multiple sequence alignments and trees from the reference OTUs.
|
|
6
|
+
|
|
7
|
+
It is intended to be used in combination with the Illumina pipeline in order to generate
|
|
8
|
+
datasets that are suitable for analysis using PhyloSeq.
|
|
9
|
+
|
data/scripts/readme.md
ADDED
data/spec/no_tasks_spec.rb
CHANGED
|
@@ -4,7 +4,7 @@ describe 'no_tasks' do
|
|
|
4
4
|
|
|
5
5
|
let(:greengenes_taxonomies) { ['124 U55236.1 Methanobrevibacter thaueri str. CW k__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobrevibacter; Unclassified; otu_127']}
|
|
6
6
|
let(:qiime_taxonomies) { [ 'k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Rahnella;s__' ]}
|
|
7
|
-
let(:taxcollector_taxonomies) { ['[0]
|
|
7
|
+
let(:taxcollector_taxonomies) { ['[0]domain;[1]phylum;[2]class;[3]order;[4]family;[5]genus;[6]species;[7]strain;[8]Genus_species_strain_id'] }
|
|
8
8
|
let(:lederhosen) { Lederhosen::CLI.new }
|
|
9
9
|
|
|
10
10
|
it '#parse_usearch_line should parse a line of usearch output'
|
|
@@ -25,18 +25,18 @@ describe 'no_tasks' do
|
|
|
25
25
|
lederhosen.detect_taxonomy_format('this is not a taxonomic description').should raise_error
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
levels = %w{domain phylum class order family genus species kingdom original strain}
|
|
34
|
-
|
|
35
|
-
taxonomy.keys.each do |v|
|
|
36
|
-
levels.should include v
|
|
28
|
+
%w{domain phylum class order family genus species strain}.each do |level|
|
|
29
|
+
it "#parse_taxonomy_taxcollector should parse taxcollector taxonomy (#{level})" do
|
|
30
|
+
taxcollector_taxonomies.each do |taxonomy|
|
|
31
|
+
taxonomy = lederhosen.parse_taxonomy_taxcollector(taxonomy)
|
|
32
|
+
taxonomy[level].should == level
|
|
37
33
|
end
|
|
38
34
|
end
|
|
39
35
|
end
|
|
36
|
+
|
|
37
|
+
it '#parse_taxonomy_taxcollector should return original taxonomy' do
|
|
38
|
+
lederhosen.parse_taxonomy_taxcollector(taxcollector_taxonomies[0])['original'].should == taxcollector_taxonomies[0]
|
|
39
|
+
end
|
|
40
40
|
|
|
41
41
|
it '#parse_taxonomy_greengenes should parse greengenes taxonomy' do
|
|
42
42
|
greengenes_taxonomies.each do |greengenes_taxonomy|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lederhosen
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.7.0
|
|
4
|
+
version: 1.8.0
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date:
|
|
12
|
+
date: 2013-01-17 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: dna
|
|
@@ -131,11 +131,16 @@ files:
|
|
|
131
131
|
- lib/lederhosen/tasks/otu_filter.rb
|
|
132
132
|
- lib/lederhosen/tasks/otu_table.rb
|
|
133
133
|
- lib/lederhosen/tasks/split_fasta.rb
|
|
134
|
-
- lib/lederhosen/tasks/trim.rb
|
|
135
134
|
- lib/lederhosen/tasks/version.rb
|
|
136
135
|
- lib/lederhosen/trimmer.rb
|
|
137
136
|
- lib/lederhosen/version.rb
|
|
138
137
|
- readme.md
|
|
138
|
+
- scripts/illumina_pipeline/.gitignore
|
|
139
|
+
- scripts/illumina_pipeline/Makefile
|
|
140
|
+
- scripts/illumina_pipeline/pipeline.sh
|
|
141
|
+
- scripts/illumina_pipeline/readme.md
|
|
142
|
+
- scripts/otu_ref_picking/readme.md
|
|
143
|
+
- scripts/readme.md
|
|
139
144
|
- spec/cli_spec.rb
|
|
140
145
|
- spec/data/ILT_L_9_B_001_1.txt.gz
|
|
141
146
|
- spec/data/ILT_L_9_B_001_3.txt.gz
|
|
@@ -162,7 +167,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
162
167
|
version: '0'
|
|
163
168
|
segments:
|
|
164
169
|
- 0
|
|
165
|
-
hash: -
|
|
170
|
+
hash: -1539752797284012594
|
|
166
171
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
167
172
|
none: false
|
|
168
173
|
requirements:
|
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
##
|
|
2
|
-
# QUALITY TRIMMING
|
|
3
|
-
#
|
|
4
|
-
|
|
5
|
-
# This should probably be broken into its own module or command-line utility.
|
|
6
|
-
|
|
7
|
-
module Lederhosen
|
|
8
|
-
class CLI
|
|
9
|
-
|
|
10
|
-
desc "trim",
|
|
11
|
-
"trim reads based on quality scores"
|
|
12
|
-
|
|
13
|
-
method_option :reads_dir, :type => :string, :required => true
|
|
14
|
-
method_option :out_dir, :type => :string, :required => true
|
|
15
|
-
method_option :left_trim, :type => :numeric, :default => 0
|
|
16
|
-
method_option :read_type, :type => :string, :default => 'qseq'
|
|
17
|
-
method_option :min_length, :type => :numeric, :default => 75
|
|
18
|
-
|
|
19
|
-
def trim
|
|
20
|
-
raw_reads = options[:reads_dir]
|
|
21
|
-
out_dir = options[:out_dir]
|
|
22
|
-
left_trim = options[:left_trim]
|
|
23
|
-
read_type = options[:read_type]
|
|
24
|
-
min_length = options[:min_length]
|
|
25
|
-
|
|
26
|
-
ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"
|
|
27
|
-
run "mkdir -p #{out_dir}"
|
|
28
|
-
|
|
29
|
-
raw_reads =
|
|
30
|
-
if read_type == 'qseq'
|
|
31
|
-
get_grouped_qseq_files(raw_reads)
|
|
32
|
-
elsif read_type == 'fastq'
|
|
33
|
-
r = Dir[raw_reads].map do |x|
|
|
34
|
-
[ File.basename(x, '.fastq'), x ]
|
|
35
|
-
end
|
|
36
|
-
Hash[r]
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
if raw_reads.size == 0
|
|
40
|
-
ohno 'glob matches no reads'
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
pbar = ProgressBar.new 'trimming', raw_reads.size
|
|
44
|
-
|
|
45
|
-
raw_reads.each do |prefix, files|
|
|
46
|
-
|
|
47
|
-
# get an output handle
|
|
48
|
-
out = File.join(out_dir, "#{File.basename(prefix)}.fasta")
|
|
49
|
-
|
|
50
|
-
# create the trimmed sequence generator
|
|
51
|
-
trim_args = { :left_trim => left_trim, :min_length => min_length }
|
|
52
|
-
|
|
53
|
-
trimmer =
|
|
54
|
-
if read_type == 'qseq'
|
|
55
|
-
Trimmer::QSEQTrimmer.new(*files, trim_args)
|
|
56
|
-
elsif read_type == 'fastq'
|
|
57
|
-
Trimmer::InterleavedTrimmer.new(files, trim_args)
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
# trim and write
|
|
61
|
-
File.open(out, 'w') do |o|
|
|
62
|
-
trimmer.each do |trimmed_record|
|
|
63
|
-
o.puts trimmed_record
|
|
64
|
-
end
|
|
65
|
-
end # File.open
|
|
66
|
-
|
|
67
|
-
pbar.inc
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
pbar.finish
|
|
71
|
-
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
no_tasks do
|
|
75
|
-
|
|
76
|
-
# Function for grouping qseq files produced by splitting illumina
|
|
77
|
-
# reads by barcode
|
|
78
|
-
#
|
|
79
|
-
# Filenames should look like this:
|
|
80
|
-
# IL5_L_1_B_007_1.txt
|
|
81
|
-
def get_grouped_qseq_files(glob='raw_reads/*.txt')
|
|
82
|
-
Dir.glob(glob).group_by { |x| File.basename(x).split('_')[0..4].join('_') }
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
end # no_tasks
|
|
86
|
-
|
|
87
|
-
end
|
|
88
|
-
end
|