lederhosen 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- data/lederhosen.gemspec +1 -11
- data/lib/lederhosen/tasks/otu_table.rb +4 -0
- data/lib/lederhosen/tasks/trim.rb +89 -2
- data/lib/lederhosen/version.rb +1 -1
- data/readme.md +13 -11
- data/spec/cli_spec.rb +1 -1
- metadata +3 -13
- data/examples/hierarchical_clustering.sh +0 -51
- data/examples/pipeline.sh +0 -71
- data/lib/lederhosen/buffer.rb +0 -54
- data/lib/lederhosen/helpers.rb +0 -166
- data/lib/lederhosen/tasks/k_filter.rb +0 -82
- data/lib/lederhosen/tasks/rep_reads.rb +0 -45
- data/lib/lederhosen/tasks/split.rb +0 -84
- data/lib/lederhosen/tasks/uc_filter.rb +0 -80
- data/lib/lederhosen/tasks/uc_stats.rb +0 -41
- data/spec/helpers_spec.rb +0 -30
data/lederhosen.gemspec
CHANGED
@@ -5,7 +5,7 @@
 
 Gem::Specification.new do |s|
   s.name = "lederhosen"
-  s.version = "1.0.1"
+  s.version = "1.0.2"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Austin G. Davis-Richardson"]
@@ -23,24 +23,15 @@ Gem::Specification.new do |s|
     "LICENSE.txt",
     "Rakefile",
     "bin/lederhosen",
-    "examples/hierarchical_clustering.sh",
-    "examples/pipeline.sh",
     "lederhosen.gemspec",
     "lib/lederhosen.rb",
-    "lib/lederhosen/buffer.rb",
     "lib/lederhosen/cli.rb",
-    "lib/lederhosen/helpers.rb",
     "lib/lederhosen/tasks/cluster.rb",
-    "lib/lederhosen/tasks/k_filter.rb",
     "lib/lederhosen/tasks/make_udb.rb",
     "lib/lederhosen/tasks/otu_filter.rb",
     "lib/lederhosen/tasks/otu_table.rb",
-    "lib/lederhosen/tasks/rep_reads.rb",
-    "lib/lederhosen/tasks/split.rb",
     "lib/lederhosen/tasks/split_fasta.rb",
     "lib/lederhosen/tasks/trim.rb",
-    "lib/lederhosen/tasks/uc_filter.rb",
-    "lib/lederhosen/tasks/uc_stats.rb",
     "lib/lederhosen/tasks/version.rb",
    "lib/lederhosen/version.rb",
     "readme.md",
@@ -50,7 +41,6 @@ Gem::Specification.new do |s|
     "spec/data/ILT_L_9_B_002_1.txt.gz",
     "spec/data/ILT_L_9_B_002_3.txt.gz",
     "spec/data/test.uc",
-    "spec/helpers_spec.rb",
     "spec/misc_spec.rb",
     "spec/spec_helper.rb"
   ]
data/lib/lederhosen/tasks/otu_table.rb
CHANGED
@@ -26,9 +26,11 @@ module Lederhosen
       sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
 
       all_names = Set.new
+      pbar = ProgressBar.new "loading", input.size
 
       # Load cluster table
       input.each do |input_file|
+        pbar.inc
         File.open(input_file) do |handle|
           handle.each do |line|
             dat = parse_usearch_line(line.strip)
@@ -41,6 +43,8 @@ module Lederhosen
         end
       end
 
+      pbar.finish
+
       ohai "found #{all_names.size} unique taxa at #{level} level"
 
       # save to csv
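The four added lines wrap the loading loop in a coarse progress bar, one tick per input file. A minimal sketch of the ProgressBar API these calls imply (this appears to be the classic `progressbar` gem's 0.x interface; the file list here is illustrative, not lederhosen's):

    require 'progressbar'

    files = Dir.glob('clusters/*.uc')              # hypothetical inputs
    pbar  = ProgressBar.new('loading', files.size) # title, total units

    files.each do |f|
      pbar.inc   # advance the bar by one unit per file
      # ... parse f ...
    end

    pbar.finish  # draw the bar at 100% and move to a new line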
data/lib/lederhosen/tasks/trim.rb
CHANGED
@@ -19,7 +19,7 @@ module Lederhosen
 
       run "mkdir -p #{out_dir}"
 
-      raw_reads = Helpers.get_grouped_qseq_files raw_reads
+      raw_reads = get_grouped_qseq_files raw_reads
 
       ohai "found #{raw_reads.length} pairs of reads"
 
@@ -28,10 +28,97 @@ module Lederhosen
        pbar.inc
        out = File.join(out_dir, "#{File.basename(a[0])}.fasta")
        # TODO get total and trimmed
-       total, trimmed = Helpers.trim_pairs a[1][0], a[1][1], out, :min_length => 70
+       total, trimmed = trim_pairs a[1][0], a[1][1], out, :min_length => 70
      end
      pbar.finish
 
    end
+
+    no_tasks do
+
+      # reverse complement a DNA sequence
+      # assumes only GATCN nucleotides
+      def reverse_complement(s)
+        s.reverse.tr('GATCNgatcn','CTAGNctagn')
+      end
+
+      # Function for grouping qseq files produced by splitting illumina
+      # reads by barcode
+      #
+      # Filenames should look like this:
+      # IL5_L_1_B_007_1.txt
+      def get_grouped_qseq_files(glob='raw_reads/*.txt')
+        Dir.glob(glob).group_by { |x| File.basename(x).split('_')[0..4].join('_') }
+      end
+
+      # Trim a pair of QSEQ files. Saves to a single,
+      # interleaved .fasta file
+      def trim_pairs(left, right, out, args={})
+        cutoff = args[:cutoff] || 20
+        min_length = args[:min_length] || 70
+
+        left_handle, right_handle =
+          begin
+            [ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
+          rescue Zlib::GzipFile::Error
+            [ File.open(left), File.open(right) ]
+          end
+
+        out_handle = File.open out, 'w'
+
+        left_reads = Dna.new left_handle
+        right_reads = Dna.new right_handle
+
+        i = 0
+        left_reads.zip(right_reads).each do |a, b|
+          i += 1
+          seqa = trim_seq a
+          seqb = trim_seq b
+          unless [seqa, seqb].include? nil
+            if seqb.length >= min_length && seqa.length >= min_length
+              seqb = reverse_complement(seqb)
+              out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
+            end
+          end
+        end
+        left_handle.close
+        right_handle.close
+        out_handle.close
+      end
+
+      # Return longest subsequence with quality scores
+      # greater than min. (Illumina PHRED)
+      # Trim2 from Huang, et. al
+      # returns just the sequence
+      def trim_seq(dna, args={})
+
+        # trim primers off of sequence
+        # (THIS IS EXPERIMENT-SPECIFIC)
+        dna.sequence = dna.sequence[11..-1]
+        dna.quality = dna.quality[11..-1]
+
+        # throw away any read with an ambiguous primer
+        return nil if dna.sequence =~ /N/
+
+        min = args[:min] || 20
+        offset = args[:cutoff] || 64
+
+        _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
+
+        dna.quality.each_byte.each_with_index do |b, a|
+          _sum += (b - offset - min)
+          if _sum > _max
+            _max = _sum
+            _end = a
+            start = first
+          elsif _sum < 0
+            _sum = 0
+            first = a
+          end
+        end
+        dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
+      end
+    end
+
 end
 end
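The bulk of this addition inlines the former `Helpers` methods into the task class as Thor `no_tasks` instance methods. The core of `trim_seq` is the Trim2 scan from Huang et al.: a Kadane-style pass over per-base scores of `phred - min`, keeping the maximal-scoring window and resetting whenever the running sum dips below zero. A standalone sketch of that scan (the method name and sample string below are illustrative, not part of lederhosen's API):

    # score each base as (phred - min); track the best-scoring run,
    # resetting the window start whenever the running sum goes negative
    def best_window(quality, offset = 64, min = 20)
      sum = max = first = start = stop = 0
      quality.each_byte.each_with_index do |q, i|
        sum += (q - offset - min)
        if sum > max
          max, stop, start = sum, i, first
        elsif sum < 0
          sum, first = 0, i
        end
      end
      start..stop
    end

    # 'h' scores +20 per base (PHRED+64 quality 40), 'B' scores -18
    # (quality 2), so the high-quality prefix is kept:
    best_window('hhhhhhhhBBBBBB')  # => 0..7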
data/lib/lederhosen/version.rb
CHANGED
data/readme.md
CHANGED
@@ -2,13 +2,18 @@
 
 Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
 
-
+### About
 
-Lederhosen is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/).
+- Lederhosen is a project born out of the Triplett Lab at the University of Florida.
+- Lederhosen is designed to be a fast and simple method of clustering 16S rRNA amplicons sequenced
+using paired and non-paired end short reads such as those produced by Illumina (GAIIx, HiSeq and MiSeq).
+- Lederhosen uses Semantic Versioning.
+- Lederhosen is free and open source under the [MIT open source license](http://opensource.org/licenses/mit-license.php/).
+- Except for USEARCH which requires a license, Lederhosen is available for commercial use.
 
 ## How do I get Lederhosen?
 
-0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine)
+0. Obtain & Install [USEARCH](http://www.drive5.com/) (32bit is fine for non-commercial use)
 2. Get a copy of [TaxCollector](http://github.com/audy/taxcollector)
 3. Install Lederhosen by typing:
 
@@ -18,13 +23,8 @@ Lederhosen is free and open source under the [MIT open source license](http://op
 ## Features
 
 - Sequence trimming (paired-end Illumina).
--
--
-- UCLUST output filtering.
-- Separation of representative reads.
-- Separation of all reads belonging to each cluster.
-- Identification of clusters using TaxCollector.
-- Generation of OTU abundancy matrices.
+- Parallel, referenced-based clustering to TaxCollector using USEARCH
+- Generation and filtering of OTU abundancy matrices.
 
 ## How do I use Lederhosen?
 
@@ -40,6 +40,8 @@ Trim (Illumina) reads using quality scores. Output will be a directory of fasta
 
     lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
 
+The trimming process will reverse complement the "right" pair so that both reads are in the forward orientation.
+
 ### Create Database
 
 Create UDB database required by usearch from TaxCollector
@@ -58,4 +60,4 @@ Create an OTU abundance table where rows are samples and columns are clusters. T
 
     lederhosen otu_table --clusters=clusters_95.uc --output=genus.csv --level=genus
 
-Level can be Kingdom, Domain, Phylum, Class, Order, Family or Genus.
+Level can be Kingdom, Domain, Phylum, Class, Order, Family or Genus.
data/spec/cli_spec.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: lederhosen
 version: !ruby/object:Gem::Version
-  hash: 21
+  hash: 19
   prerelease:
   segments:
   - 1
   - 0
-  - 1
-  version: 1.0.1
+  - 2
+  version: 1.0.2
 platform: ruby
 authors:
 - Austin G. Davis-Richardson
@@ -119,24 +119,15 @@ files:
 - LICENSE.txt
 - Rakefile
 - bin/lederhosen
-- examples/hierarchical_clustering.sh
-- examples/pipeline.sh
 - lederhosen.gemspec
 - lib/lederhosen.rb
-- lib/lederhosen/buffer.rb
 - lib/lederhosen/cli.rb
-- lib/lederhosen/helpers.rb
 - lib/lederhosen/tasks/cluster.rb
-- lib/lederhosen/tasks/k_filter.rb
 - lib/lederhosen/tasks/make_udb.rb
 - lib/lederhosen/tasks/otu_filter.rb
 - lib/lederhosen/tasks/otu_table.rb
-- lib/lederhosen/tasks/rep_reads.rb
-- lib/lederhosen/tasks/split.rb
 - lib/lederhosen/tasks/split_fasta.rb
 - lib/lederhosen/tasks/trim.rb
-- lib/lederhosen/tasks/uc_filter.rb
-- lib/lederhosen/tasks/uc_stats.rb
 - lib/lederhosen/tasks/version.rb
 - lib/lederhosen/version.rb
 - readme.md
@@ -146,7 +137,6 @@ files:
 - spec/data/ILT_L_9_B_002_1.txt.gz
 - spec/data/ILT_L_9_B_002_3.txt.gz
 - spec/data/test.uc
-- spec/helpers_spec.rb
 - spec/misc_spec.rb
 - spec/spec_helper.rb
 homepage: http://audy.github.com/lederhosen
data/examples/hierarchical_clustering.sh
DELETED
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-set -e
-set -x
-
-# Hierarchical OTU clustering
-# Austin G. Davis-Richardson
-# <harekrishna at gmail dot com>
-# http://github.com/audy/lederhosen
-
-reads='sorted.fasta'
-out='h_clustering'
-
-mkdir -p $out
-
-# initial clustering at 80%
-lederhosen cluster --input=$reads --output=$out/clusters_0.80.uc --identity=0.80
-
-# filter UC file
-lederhosen uc_filter --input=$out/clusters_0.80.uc --output=$out/clusters_0.80.uc.filtered --reads=1 --samples=1
-
-# get reads for each cluster
-mkdir -p $out/split_80
-lederhosen split --clusters=$out/clusters_0.80.uc.filtered --reads=$reads --out-dir=$out/split_80/
-
-# now cluster each of those at 90%
-for fasta in $out/split_80/*.fasta
-do
-
-    # sort (awww, do I really have to do this again?)
-    lederhosen sort --input=$fasta --output=$fasta.sorted
-
-    # cluster
-    lederhosen cluster --input=$fasta.sorted --output=$fasta.uc --identity=0.90
-
-    # split
-    split=$out/split_80.90_$(basename $fasta .fasta)
-    lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split
-done
-
-# Do it again at 95%
-for fasta in $out/split_80/split_*_90.fasta/*.fasta
-do
-    # cluster
-    lederhosen cluster --input=$fasta --output=$fasta.uc --identity=90
-
-    # split
-    split=$outdir/80.90.$fasta.fasta
-    mkdir -p $split
-    lederhosen split --clusters=$fasta.uc --reads=$input --out-dir=$split
-done
data/examples/pipeline.sh
DELETED
@@ -1,71 +0,0 @@
-#!/bash
-
-# An example OTU clustering pipeline
-# Austin G. Davis-Richardson
-# <harekrishna at gmail dot com>
-# http://github.com/audy/lederhosen
-
-set -e
-
-raw_reads='spec/data/*.txt'
-out_dir='pipeline'
-taxcollector='taxcollector.fa'
-min_reads=50
-min_samples=10
-
-# trim reads
-lederhosen trim \
-    --reads-dir=$raw_reads \
-    --out-dir=$out_dir/trimmed
-
-# join reads
-lederhosen join \
-    --trimmed=$out_dir/trimmed/*.fasta \
-    --output=$out_dir/joined.fasta
-
-# filter reads
-lederhosen k_filter \
-    --input=$out_dir/joined.fasta \
-    --output=$out_dir/filtered.fasta \
-    -k=10 \
-    --cutoff=50
-
-# sort
-lederhosen sort \
-    --input=$out_dir/filtered.fasta \
-    --output=$out_dir/sorted.fasta
-
-for i in 0.80 0.90 0.95
-do
-    # cluster
-    lederhosen cluster \
-        --input=$out_dir/sorted.fasta \
-        --output=$out_dir/clusters_"$i".uc \
-        --identity=$i
-
-    # filter uc file
-    lederhosen uc_filter \
-        --input=$out_dir/clusters_"$i".uc \
-        --output=$out_dir/clusters_"$i".uc.filtered \
-        --reads=$min_reads \
-        --samples=$min_samples \
-
-    # generate otu table
-    lederhosen otu_table \
-        --clusters=$out_dir/clusters_"$i".uc.filtered \
-        --output=$out_dir/otus_"$i"
-
-    # get representative reads
-    lederhosen rep_reads \
-        --clusters=$out_dir/clusters_"$i".uc.filtered \
-        --joined=$out_dir/sorted.fasta \
-        --output=$out_dir/representatives_"$i".fasta
-
-    # blast representative reads
-    lederhosen name \
-        --reps=$out_dir/representatives_"$i".fasta \
-        --output=$out_dir/taxonomies_"$i".txt \
-        --database=$taxcollector
-done
-
-echo "complete!"
data/lib/lederhosen/buffer.rb
DELETED
@@ -1,54 +0,0 @@
-module Lederhosen
-
-  class Buffer
-    # for when you need to write out to a shitload of files.
-
-    #
-    # Create a new buffer
-    #
-    def initialize(args={})
-      @buffer = Hash.new { |h, k| h[k] = Array.new }
-      @buffer_max = args[:buffer_max] || 100_000
-    end
-
-    #
-    # Add an object to the buffer
-    #
-    def add_to bucket, obj
-
-      @buffer[bucket] << obj.to_s
-
-      if @buffer[bucket].length > @buffer_max
-        # write out
-        File.open(bucket, 'a+') do |out|
-          @buffer[bucket].each do |v|
-            out.puts v
-          end
-        end
-
-        # clear that bucket
-        @buffer[bucket].clear
-      end
-    end
-
-    def [] k
-      @buffer[k]
-    end
-
-    #
-    # Writes out leftover objects
-    #
-    def finalize
-      @buffer.each_key do |bucket|
-        File.open(bucket, 'a+') do |out|
-          @buffer[bucket].each do |v|
-            out.puts v
-          end
-        end
-      end
-      @buffer = Hash.new { |h, k| h[k] = Array.new }
-    end
-
-  end
-
-end
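buffer.rb goes away along with the `split` task that consumed it. What it did: batch appends across many output files, so a read-splitting pass does not pay one open/close per record. A usage sketch against the API shown above (the `reads` collection and `cluster_id` field are hypothetical record details, not lederhosen's):

    buffer = Lederhosen::Buffer.new :buffer_max => 10_000

    reads.each do |read|
      # add_to appends to an in-memory array keyed by output path and
      # writes the whole batch out (append mode) once it grows past
      # :buffer_max lines
      buffer.add_to "clusters/#{read.cluster_id}.fasta", read
    end

    buffer.finalize  # flush whatever is still buffered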
data/lib/lederhosen/helpers.rb
DELETED
@@ -1,166 +0,0 @@
-module Lederhosen
-  class Helpers
-    class << self
-
-      # reverse complement a DNA sequence
-      # assumes only GATCN nucleotides
-      def reverse_complement(s)
-        s.reverse.tr('GATCNgatcn','CTAGNctagn')
-      end
-
-      # Function for grouping qseq files produced by splitting illumina
-      # reads by barcode
-      #
-      # Filenames should look like this:
-      # IL5_L_1_B_007_1.txt
-      def get_grouped_qseq_files(glob='raw_reads/*.txt')
-        Dir.glob(glob).group_by { |x| File.basename(x).split('_')[0..4].join('_') }
-      end
-
-      # Trim a pair of QSEQ files. Saves to a single,
-      # interleaved .fasta file
-      def trim_pairs(left, right, out, args={})
-        cutoff = args[:cutoff] || 20
-        min_length = args[:min_length] || 70
-
-        left_handle, right_handle =
-          begin
-            [ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
-          rescue Zlib::GzipFile::Error
-            [ File.open(left), File.open(right) ]
-          end
-
-        out_handle = File.open out, 'w'
-
-        left_reads = Dna.new left_handle
-        right_reads = Dna.new right_handle
-
-        i = 0
-        left_reads.zip(right_reads).each do |a, b|
-          i += 1
-          seqa = trim a
-          seqb = trim b
-          unless [seqa, seqb].include? nil
-            if seqb.length >= min_length && seqa.length >= min_length
-              seqb = reverse_complement(seqb)
-              out_handle.puts ">#{i}:0\n#{seqa}\n>#{i}:1\n#{seqb}"
-            end
-          end
-        end
-        left_handle.close
-        right_handle.close
-        out_handle.close
-      end
-
-      # Return longest subsequence with quality scores
-      # greater than min. (Illumina PHRED)
-      # Trim2 from Huang, et. al
-      # returns just the sequence
-      def trim(dna, args={})
-
-        # trim primers off of sequence
-        # (THIS IS EXPERIMENT-SPECIFIC)
-        dna.sequence = dna.sequence[11..-1]
-        dna.quality = dna.quality[11..-1]
-
-        # throw away any read with an ambiguous primer
-        return nil if dna.sequence =~ /N/
-
-        min = args[:min] || 20
-        offset = args[:cutoff] || 64
-
-        _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
-
-        dna.quality.each_byte.each_with_index do |b, a|
-          _sum += (b - offset - min)
-          if _sum > _max
-            _max = _sum
-            _end = a
-            start = first
-          elsif _sum < 0
-            _sum = 0
-            first = a
-          end
-        end
-        dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
-      end
-
-      # Load uc file from uclust
-      # returns hash with various data
-      def load_uc_file(input)
-        clusters = Hash.new
-
-        # keep track of samples
-        samples = Set.new
-
-        # store a list of all the sample IDs
-        clusters[:samples] = Set.new
-
-        # data for each cluster
-        # clstr_counts[:clstr][:sample] = number_of_reads
-        clstr_counts = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
-
-        # clstrnr_to_seed[seed_sequence_id] = clstr_nr
-        seed_to_clstrnr = Hash.new
-        bytes = File.size(input)
-        pbar = ProgressBar.new 'loading uc file', bytes
-        File.open(input) do |handle|
-          handle.each do |line|
-            pbar.set handle.pos
-            next if line =~ /^#/ # skip comments
-
-            line = line.strip.split
-
-            # things we want to know
-            type = line[0]
-            clusternr = line[1].to_i
-            querylabel = line[8]
-            targetlabel = line[9]
-            header = line[8]
-
-            sample =
-              begin
-                # get the sample id via regexp match
-                # this way more info can be stored in the header.
-                line[8].match(/sample=(.*)/)[1]
-              rescue NoMethodError # catch no method [] for NilClass
-                # Need to maintain some backwards compatibility here
-                # this is the old way of getting the same id.
-                sample = line[8].split(':')[2]
-              end
-
-            # keep track of samples
-            samples.add(sample)
-
-            # keep track of all samples
-            clusters[:samples].add sample
-
-            # L=LibSeed
-            # S=NewSeed
-            # H=Hit
-            # R=Reject
-            # D=LibCluster
-            # C=NewCluster
-            # N=NoHit
-
-            if type =~ /[LS]/ # = Seed Sequence
-              clstr_counts[clusternr][sample] += 1
-              seed_to_clstrnr[querylabel] = clusternr
-            elsif type =~ /H/ # = Seed Member
-              clstr_counts[clusternr][sample] += 1
-            end
-
-          end
-        end
-        pbar.finish
-        return {
-          :clstr_counts => clstr_counts,
-          :seed_to_clstrnr => seed_to_clstrnr,
-          :samples => samples
-        }
-      end
-
-
-    end # class << self
-  end # class Helpers
-end # Module
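The removed `load_uc_file` is the UC-format reader the filtering tasks relied on. A sketch of its per-line bookkeeping on a single fabricated UC record (tab-separated; field 0 is the hit type, field 1 the cluster number, field 8 the query label):

    line = "H\t42\t140\t97.1\t+\t0\t0\t288M\tread_1:4:sampleA sample=A1\tseed_7"
    fields = line.split("\t")
    type, clusternr, label = fields[0], fields[1].to_i, fields[8]

    # the sample id rides along in the read header, either as 'sample=<id>'
    # or, in older headers, as the third ':'-separated field
    sample = label[/sample=(.*)/, 1] || label.split(':')[2]

    clstr_counts = Hash.new { |h, k| h[k] = Hash.new(0) }
    clstr_counts[clusternr][sample] += 1 if type =~ /[LSH]/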
data/lib/lederhosen/tasks/k_filter.rb
DELETED
@@ -1,82 +0,0 @@
-##
-# FILTER READS WITH LOW ABUNDANCE KMERS
-#
-
-module Lederhosen
-  class CLI
-
-    desc "k_filter",
-         "filter novel reads likely to form small/singleton clusters (experimental)"
-
-    method_option :input, :type => :string, :required => true
-    method_option :output, :type => :string, :required => true
-    method_option :k, :type => :numeric, :required => true
-    method_option :cutoff, :type => :numeric, :required => true
-
-    def k_filter
-      input = options[:input]
-      output = options[:output]
-      k_len = options[:k].to_i
-      cutoff = options[:cutoff]
-
-      ohai "kmer filtering #{input} (k = #{k_len}, cutoff = #{cutoff})"
-
-      counting_table = Hash.new { |h, k| h[k] = 0 }
-      total_reads = 0
-
-      File.open(input) do |handle|
-        pbar = ProgressBar.new 'counting', File.size(input)
-        records = Dna.new handle
-        records.each do |r|
-          pbar.set handle.pos
-          total_reads += 1
-          kmers = r.sequence.to_kmers(k_len)
-          kmers.each { |x| counting_table[x] += 1 }
-        end
-        pbar.finish
-      end
-
-      sum_of_kmers = counting_table.values.inject(:+)
-
-      ohai "total reads = #{total_reads}"
-      ohai "sum of kmers = #{sum_of_kmers}"
-
-      kept = 0
-      total_reads = total_reads.to_f
-
-      pbar = ProgressBar.new "saving", total_reads.to_i
-      output = File.open(output, 'w')
-      File.open(input) do |handle|
-        records = Dna.new handle
-        records.each do |r|
-          kmers = r.sequence.to_kmers(k_len)
-
-          # check if any of the kmers are rare
-          keep = true
-          coverage = 0
-          kmers.each do |kmer|
-            # if any of the kmers are rare, don't print the read
-            c = counting_table[kmer]
-            coverage += c
-            if c < cutoff
-              keep = false
-              break
-            end
-          end
-
-          if keep
-            kept += 1
-            output.puts r
-          end
-          pbar.inc
-        end
-      end
-
-      pbar.finish
-
-      ohai "survivors = #{kept} (#{kept/total_reads.to_f})"
-      output.close
-    end
-  end
-
-end
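k_filter counts every k-mer in the input, then drops any read containing a k-mer below the cutoff. `to_kmers` itself is defined elsewhere in lederhosen; a minimal equivalent for illustration (a sliding window of length k, not the gem's actual implementation):

    class String
      def to_kmers(k)
        # consecutive windows of k characters, as consumed by the
        # counting pass above
        each_char.each_cons(k).map(&:join)
      end
    end

    'GATTACA'.to_kmers(4)  # => ["GATT", "ATTA", "TTAC", "TACA"]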
data/lib/lederhosen/tasks/rep_reads.rb
DELETED
@@ -1,45 +0,0 @@
-##
-# GET REPRESENTATIVE READS
-#
-
-module Lederhosen
-  class CLI
-
-    desc "rep_reads",
-         "output a fasta file containing representative reads for each cluster given a UCLUST output file and the joined reads file"
-
-    method_option :clusters, :type => :string, :required => true
-    method_option :output, :type => :string, :required => true
-    method_option :joined, :type => :string, :required => true
-
-    def rep_reads
-      input = options[:clusters]
-      output = options[:output]
-      joined_reads = options[:joined]
-
-      ohai "getting represntative reads for #{input} w/ reads #{joined_reads} and saving to #{output}"
-
-      # Load cluster table!
-      clstr_info = Helpers.load_uc_file input
-      clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
-      seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
-      samples = clstr_info[:samples]
-
-      out_handle = File.open("#{output}", 'w')
-
-      File.open(joined_reads) do |handle|
-        records = Dna.new handle
-        records.each do |dna|
-          clstrnr = seed_to_clstrnr[dna.name]
-          unless clstrnr.nil?
-            dna.name = "#{dna.name}:cluster-#{clstrnr}"
-            out_handle.puts dna
-          end
-        end
-      end
-
-      out_handle.close
-    end
-
-  end
-end
data/lib/lederhosen/tasks/split.rb
DELETED
@@ -1,84 +0,0 @@
-##
-# Create a fasta file with nucleotide sequences for each cluster larger than a cutoff
-#
-
-module Lederhosen
-  class CLI
-
-    desc "split",
-         "create fasta files containing reads from each cluster"
-
-    method_option :clusters, :type => :string, :required => true
-    method_option :reads, :type => :string, :required => true
-    method_option :out_dir, :type => :string, :required => true
-    method_option :buffer_size, :type => :numeric, :default => 1000
-    method_option :min_clst_size, :type => :numeric, :default => 1
-
-    def split
-      clusters = options[:clusters]
-      reads = options[:reads]
-      out_dir = options[:out_dir]
-      buffer_size = options[:buffer_size]
-      min_clst_size = options[:min_clst_size]
-      finalize_every = 100_000
-
-      ohai "spltting #{reads} by #{clusters} and saving to #{out_dir}"
-      ohai "minimum cluster size = #{min_clst_size}"
-
-      run "mkdir -p #{out_dir}/"
-
-      ohai "loading #{clusters}"
-
-      # Load read id -> cluster
-      read_to_clusterid = Hash.new
-
-      # keep track of cluster sizes
-      cluster_counts = Hash.new { |h, k| h[k] = 0}
-
-      File.open(clusters)do |handle|
-        handle.each do |line|
-          line = line.strip.split
-          cluster_nr = line[1]
-          if line[0] == 'S' || line[0] == 'H'
-            read = line[8]
-          else
-            next
-          end
-          read_to_clusterid[read] = cluster_nr
-          cluster_counts[cluster_nr] += 1
-        end
-      end
-
-      read_to_clusterid.delete_if do |read, cluster_nr|
-        cluster_counts[cluster_nr] < min_clst_size
-      end
-
-      total_reads = read_to_clusterid.length
-      total_clusters = read_to_clusterid.values.uniq.length
-      ohai "#{total_reads} reads in #{total_clusters} clusters"
-
-      pbar = ProgressBar.new "saving", total_reads
-
-      # Write reads to individual fasta files using Buffer
-      buffer = Buffer.new :buffer_max => buffer_size
-      File.open(reads) do |handle|
-        records = Dna.new handle
-        records.each_with_index do |record, i|
-          cluster_id = read_to_clusterid[record.name]
-          if cluster_id
-            pbar.inc
-            filename = File.join(out_dir, cluster_id + '.fasta')
-            buffer[filename] << record
-            buffer.finalize if (i%finalize_every == 0)
-          end
-        end
-      end
-
-      pbar.finish
-      ohai "finalizing output"
-      buffer.finalize # finish writing out
-
-      puts "done"
-    end
-  end
-end
data/lib/lederhosen/tasks/uc_filter.rb
DELETED
@@ -1,80 +0,0 @@
-##
-# FILTER UC FILE BY MIN SAMPLES
-#
-require 'set'
-
-module Lederhosen
-  class CLI
-
-    desc "uc_filter",
-         "filter UCLUST output to remove small, infrequent clusters"
-
-    method_option :input, :type => :string, :required => true
-    method_option :output, :type => :string, :required => true
-    method_option :reads, :type => :numeric, :required => true
-    method_option :samples, :type => :numeric, :required => true
-
-    def uc_filter
-      input = options[:input]
-      output = options[:output]
-      reads = options[:reads].to_i
-      samples = options[:samples].to_i
-
-      ohai "filtering #{input} to #{output}, reads = #{reads} & samples = #{samples}"
-
-      # load UC file
-      ohai "loading uc file"
-      clstr_info = Helpers.load_uc_file input
-      clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
-
-      # filter
-      ohai "filtering"
-      survivors = clstr_counts.reject do |a, b|
-        b.reject{ |i, j| j < reads }.length < samples
-      end
-
-      surviving_clusters = survivors.keys.to_set
-
-      # print filtered uc file
-      ohai "saving filtered table"
-      out = File.open(output, 'w')
-
-      lines = `wc -l #{input}`.split.first.to_i
-
-      pbar = ProgressBar.new 'saving', lines
-      kept, total = 1, 0
-
-      # output lederhosen filtering information because I often
-      # forget to write this down :)
-      out.puts "# filtered: #{input}"
-      out.puts "# #{reads} reads in at least #{samples} samples"
-
-      File.open(input) do |handle|
-        pbar = ProgressBar.new 'saving', File.size(input)
-        handle.each do |line|
-
-          pbar.set handle.pos
-          if line =~ /^#/
-            out.print line
-            next
-          end
-          total += 1
-
-          # check if cluster is in surviving clusters
-          if surviving_clusters.include? line.split[1].to_i
-            out.print line
-            kept += 1
-          end
-
-        end
-        pbar.finish
-      end
-
-      out.close
-
-      ohai "clusters: #{surviving_clusters.length}/#{clstr_counts.keys.length} = #{100*surviving_clusters.length/clstr_counts.keys.length.to_f}%"
-      ohai "reads: #{kept}/#{total} = #{100*kept/total.to_f}%"
-    end
-  end
-
-end
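The survivor test in `uc_filter` reads inside-out: a cluster survives when at least `samples` samples each contribute at least `reads` reads. The same double `reject` on a toy counts hash (reads = 5, samples = 2; the data is made up for illustration):

    clstr_counts = {
      1 => { 'A' => 9, 'B' => 6, 'C' => 1 },  # two samples with >= 5 reads
      2 => { 'A' => 9 },                      # only one qualifying sample
    }

    survivors = clstr_counts.reject do |_clstr, by_sample|
      by_sample.reject { |_sample, n| n < 5 }.length < 2
    end

    survivors.keys  # => [1]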
data/lib/lederhosen/tasks/uc_stats.rb
DELETED
@@ -1,41 +0,0 @@
-##
-# Get statistics about clusters in a UC file
-#
-
-module Lederhosen
-  class CLI
-    desc 'uc_stats',
-         'get statistics about clusters in a UC file. for now, this only calculates the size of each cluster'
-
-    method_option :input, :type => :string, :required => true
-
-    def uc_stats
-      input = options[:input]
-
-      ohai "calculating statistics for #{input}"
-
-      # TODO add more stats
-      cluster_stats = Hash.new { |h, k|
-        h[k] = {
-          :size => 0
-        }
-      }
-
-      File.open(input) do |handle|
-        handle.each do |line|
-          line = line.strip.split
-          type, clustr_nr = line[0], line[1]
-          cluster_stats[clustr_nr][:size] += 1
-        end
-      end
-
-      stat_types = cluster_stats.values.first.keys.sort
-
-      puts "cluster,#{stat_types.join(',')}"
-      cluster_stats.each do |cluster, stats|
-        puts "#{cluster},#{stat_types.map { |x| stats[x] }.join(',')}"
-      end
-    end
-
-  end
-end
data/spec/helpers_spec.rb
DELETED
@@ -1,30 +0,0 @@
-require 'spec_helper'
-
-describe Lederhosen::Helpers do
-
-  let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt.gz') }
-
-  it 'should have a method for grouping QSEQ files' do
-    groups.length.should == 2
-  end
-
-  it 'should have a method for reverse complementing a dna sequence' do
-    Lederhosen::Helpers.reverse_complement("GATCCCGANNANTAGGACCAA").should == "TTGGTCCTANTNNTCGGGATC"
-  end
-
-  it 'should have a method for trimming sequences' do
-    reads = groups.values.first.first
-    record = Zlib::GzipReader.open(reads) do |handle|
-      Dna.new(handle).first
-    end
-    # I should probably test with a bad read
-    Lederhosen::Helpers.trim(record).length.should == 58
-  end
-
-  it 'should be able to trim pairs of qseq files, outputting fasta file' do
-    reads = groups.values.first
-    Lederhosen::Helpers.trim_pairs reads[0], reads[1], "#{$test_dir}/munchen_trim_test.fasta"
-    # this test will break if trim parameters change
-    File.readlines("#{$test_dir}/munchen_trim_test.fasta").grep(/^>/).length.should be_even
-  end
-end