lederhosen 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/lederhosen/tasks/name.rb +9 -2
- data/lib/version.rb +1 -1
- data/pipeline.sh +50 -16
- data/readme.md +12 -41
- metadata +4 -4
data/lib/version.rb
CHANGED
data/pipeline.sh
CHANGED
@@ -1,37 +1,71 @@
|
|
1
1
|
#!/bin/bash
|
2
2
|
|
3
|
-
set +e
|
4
|
-
|
5
3
|
# An example OTU clustering pipeline
|
6
4
|
# Austin G. Davis-Richardson
|
7
5
|
# <harekrishna at gmail dot com>
|
6
|
+
# http://github.com/audy/lederhosen
|
7
|
+
|
8
|
+
set -e
|
8
9
|
|
9
|
-
raw_reads='
|
10
|
-
identities='0.975'
|
10
|
+
raw_reads='spec/data/*.txt'
|
11
11
|
out_dir='pipeline'
|
12
|
+
taxcollector='taxcollector.fa'
|
13
|
+
min_reads=50
|
14
|
+
min_samples=10
|
12
15
|
|
13
16
|
# trim reads
|
14
|
-
bin/lederhosen trim
|
17
|
+
bin/lederhosen trim \
|
18
|
+
--reads-dir=$raw_reads \
|
19
|
+
--out-dir=$out_dir/trimmed
|
15
20
|
|
16
21
|
# join reads
|
17
|
-
bin/lederhosen join
|
22
|
+
bin/lederhosen join \
|
23
|
+
--trimmed=$out_dir/trimmed/*.fasta \
|
24
|
+
--output=$out_dir/joined.fasta
|
18
25
|
|
19
26
|
# filter reads
|
20
|
-
bin/lederhosen
|
27
|
+
bin/lederhosen k_filter \
|
28
|
+
--input=$out_dir/joined.fasta \
|
29
|
+
--output=$out_dir/filtered.fasta \
|
30
|
+
-k=10 \
|
31
|
+
--cutoff=50
|
21
32
|
|
22
33
|
# sort
|
23
|
-
bin/lederhosen sort
|
34
|
+
bin/lederhosen sort \
|
35
|
+
--input=$out_dir/filtered.fasta \
|
36
|
+
--output=$out_dir/sorted.fasta
|
24
37
|
|
25
|
-
|
26
|
-
for i in $identities
|
38
|
+
for i in 0.80 0.90 0.95
|
27
39
|
do
|
28
|
-
|
29
|
-
|
40
|
+
# cluster
|
41
|
+
bin/lederhosen cluster \
|
42
|
+
--input=$out_dir/sorted.fasta \
|
43
|
+
--output=$out_dir/clusters_"$i".uc \
|
44
|
+
--identity=$i
|
30
45
|
|
31
|
-
#
|
32
|
-
|
33
|
-
|
34
|
-
|
46
|
+
# filter uc file
|
47
|
+
bin/lederhosen uc_filter \
|
48
|
+
--input=$out_dir/clusters_"$i".uc \
|
49
|
+
--output=$out_dir/clusters_"$i".uc.filtered \
|
50
|
+
--reads=$min_reads \
|
51
|
+
--samples=$min_samples \
|
52
|
+
|
53
|
+
# generate otu table
|
54
|
+
bin/lederhosen otu_table \
|
55
|
+
--clusters=$out_dir/clusters_"$i".uc.filtered \
|
56
|
+
--output=$out_dir/otus_"$i"
|
57
|
+
|
58
|
+
# get representative reads
|
59
|
+
bin/lederhosen rep_reads \
|
60
|
+
--clusters=$out_dir/clusters_"$i".uc.filtered \
|
61
|
+
--joined=$out_dir/sorted.fasta \
|
62
|
+
--output=$out_dir/representatives_"$i".fasta
|
63
|
+
|
64
|
+
# blast representative reads
|
65
|
+
bin/lederhosen name \
|
66
|
+
--reps=$out_dir/representatives_"$i".fasta \
|
67
|
+
--output=$out_dir/taxonomies_"$i".txt \
|
68
|
+
--database=$taxcollector
|
35
69
|
done
|
36
70
|
|
37
71
|
echo "complete!"
|
data/readme.md
CHANGED
@@ -16,44 +16,15 @@ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own ri
|
|
16
16
|
|
17
17
|
Type `lederhosen help` for complete instructions
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
### 4. Cluster sorted reads
|
32
|
-
|
33
|
-
`$ lederhosen cluster --identity=0.975 --input=sorted.fasta --output=clusters`
|
34
|
-
|
35
|
-
### 5. Make OTU tables
|
36
|
-
|
37
|
-
`% lederhosen otu_table --clusters=clusters.uc --output=clusters_975.csv`
|
38
|
-
|
39
|
-
This will output a csv (`clusters.975.csv`) and a fasta (`clusters.975.fasta`) file. The fasta file can be used to identify clusters in a 16S rRNA database using BLAST or something.
|
40
|
-
|
41
|
-
### 6. Get representative reads from each cluster
|
42
|
-
|
43
|
-
`% lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representatives.fasta`
|
44
|
-
|
45
|
-
### 6. Get a fasta file containing all reads for each cluster
|
46
|
-
|
47
|
-
(time consuming and probably not necessary)
|
48
|
-
|
49
|
-
`% lederhosen split --clusters=clusters_97.5.txt --reads=joined.fasta --min-clst-size=100`
|
50
|
-
|
51
|
-
`--min-clst-size` is the minimum number of reads a cluster must have in order for a fasta file containing its reads to be created. This is needed because it is computationally prohibitive to randomly write millions of files or store all reads in memory, sort, and output non-randomly.
|
52
|
-
|
53
|
-
### 7. Identifying Clusters
|
54
|
-
|
55
|
-
(Still under development)
|
56
|
-
|
57
|
-
You need BLAT (in your `$PATH`) & TaxCollector.
|
58
|
-
|
59
|
-
`$ lederhosen name --reps=representatives.fasta --db=taxcollector.fa --output=output_prefix`
|
19
|
+
See pipeline.sh for example usage.
|
20
|
+
|
21
|
+
## Features
|
22
|
+
|
23
|
+
- Sequence trimming (paired-end Illumina).
|
24
|
+
- K-mer filtering.
|
25
|
+
- Clustering w/ UCLUST.
|
26
|
+
- UCLUST output filtering.
|
27
|
+
- Separation of representative reads.
|
28
|
+
- Separation of all reads belonging to each cluster.
|
29
|
+
- Identification of clusters using TaxCollector.
|
30
|
+
- Generation of OTU abundance matrices.
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 1
|
9
|
-
-
|
10
|
-
version: 0.1.
|
9
|
+
- 2
|
10
|
+
version: 0.1.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Austin G. Davis-Richardson
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-05-
|
18
|
+
date: 2012-05-23 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: dna
|