lederhosen 1.8.0 → 1.8.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lederhosen.gemspec +1 -2
- data/lib/lederhosen/version.rb +1 -1
- metadata +2 -3
- data/lib/lederhosen/trimmer.rb +0 -225
data/lederhosen.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "lederhosen"
|
8
|
-
s.version = "1.8.0"
|
8
|
+
s.version = "1.8.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
@@ -34,7 +34,6 @@ Gem::Specification.new do |s|
|
|
34
34
|
"lib/lederhosen/tasks/otu_table.rb",
|
35
35
|
"lib/lederhosen/tasks/split_fasta.rb",
|
36
36
|
"lib/lederhosen/tasks/version.rb",
|
37
|
-
"lib/lederhosen/trimmer.rb",
|
38
37
|
"lib/lederhosen/version.rb",
|
39
38
|
"readme.md",
|
40
39
|
"scripts/illumina_pipeline/.gitignore",
|
data/lib/lederhosen/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.8.0
|
4
|
+
version: 1.8.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -132,7 +132,6 @@ files:
|
|
132
132
|
- lib/lederhosen/tasks/otu_table.rb
|
133
133
|
- lib/lederhosen/tasks/split_fasta.rb
|
134
134
|
- lib/lederhosen/tasks/version.rb
|
135
|
-
- lib/lederhosen/trimmer.rb
|
136
135
|
- lib/lederhosen/version.rb
|
137
136
|
- readme.md
|
138
137
|
- scripts/illumina_pipeline/.gitignore
|
@@ -167,7 +166,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
167
166
|
version: '0'
|
168
167
|
segments:
|
169
168
|
- 0
|
170
|
-
hash:
|
169
|
+
hash: 4470842345198425739
|
171
170
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
172
171
|
none: false
|
173
172
|
requirements:
|
data/lib/lederhosen/trimmer.rb
DELETED
@@ -1,225 +0,0 @@
|
|
1
|
-
module Lederhosen
|
2
|
-
module Trimmer
|
3
|
-
|
4
|
-
##
|
5
|
-
# Code used for sequence trimming
|
6
|
-
#
|
7
|
-
# - PairedTrimmer
|
8
|
-
# - HuangTrimmer
|
9
|
-
# - ProbabilityTrimmer
|
10
|
-
# - QSEQTrimmer
|
11
|
-
#
|
12
|
-
# Some major refactoring needs to get done here
|
13
|
-
#
|
14
|
-
|
15
|
-
# HaungTrimmer
|
16
|
-
#
|
17
|
-
# class that has the trim function. Used in mixins
|
18
|
-
# this trim function is based on the function documented
|
19
|
-
# in the paper:
|
20
|
-
# Huang X, Wang J, Aluru S, Yang SP, Hillier L. (2003). PCAP:
|
21
|
-
# a whole-genome assembly program. Genome Res 13:
|
22
|
-
# 2164–2170.
|
23
|
-
#
|
24
|
-
# The implementation is a direct copy from the perl implementation
|
25
|
-
# implemented in Pangea 1.0:
|
26
|
-
# PANGEA: pipeline for analysis of next generation amplicons
|
27
|
-
# A Giongo, DB Crabb, AG Davis-Richardson - ISME , 2010
|
28
|
-
#
|
29
|
-
class HuangTrimmer
|
30
|
-
|
31
|
-
def initialize(args={})
|
32
|
-
@min = args[:min]
|
33
|
-
@offset = args[:offset]
|
34
|
-
end
|
35
|
-
|
36
|
-
def trim_seq(dna)
|
37
|
-
|
38
|
-
_sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
|
39
|
-
|
40
|
-
dna.quality.each_byte.each_with_index do |b, a|
|
41
|
-
_sum += (b - @offset - @min)
|
42
|
-
if _sum > _max
|
43
|
-
_max = _sum
|
44
|
-
_end = a
|
45
|
-
start = first
|
46
|
-
elsif _sum < 0
|
47
|
-
_sum = 0
|
48
|
-
first = a
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
begin
|
53
|
-
dna.sequence[start, _end - start].gsub('.', 'N')
|
54
|
-
rescue
|
55
|
-
nil
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
#
|
61
|
-
# return the longest string starting from the left side
|
62
|
-
# where the PROBABILITY OF ERROR as computed from the PHRED
|
63
|
-
# scores does not go above a certain cutoff
|
64
|
-
# (default is 0.005)
|
65
|
-
#
|
66
|
-
class ProbabilityTrimmer
|
67
|
-
|
68
|
-
def initialize(args = {})
|
69
|
-
@cutoff = args[:cutoff] || 0.005
|
70
|
-
@min = args[:min]
|
71
|
-
@seqtech = args[:seq_tech] || fail
|
72
|
-
# must be illumina, sanger or solexa
|
73
|
-
end
|
74
|
-
|
75
|
-
def trim_seq(dna)
|
76
|
-
trim_coord = dna.sequence.size
|
77
|
-
probabilities = dna.send(:"#{@seqtech}_probabilities")
|
78
|
-
probabilities.each_with_index do |q, i|
|
79
|
-
if q > @cutoff
|
80
|
-
trim_coord = i
|
81
|
-
break
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
begin
|
86
|
-
dna.sequence[0..trim_coord].gsub('.', 'N')
|
87
|
-
rescue
|
88
|
-
nil
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
#
|
94
|
-
# Base class for trimming paired-end reads
|
95
|
-
#
|
96
|
-
class PairedTrimmer < Enumerator
|
97
|
-
|
98
|
-
def initialize(args = {})
|
99
|
-
@pretrim = args[:pretrim]
|
100
|
-
# TODO
|
101
|
-
# need to be able to trim from left, right of pairs
|
102
|
-
# thinking about specifying a "trimming language"
|
103
|
-
#
|
104
|
-
# Something like:
|
105
|
-
#
|
106
|
-
# --trim="5L0 0L3"
|
107
|
-
# --trim="0L4 2L6"
|
108
|
-
#
|
109
|
-
# also thinking about breaking all of this trimming stuff
|
110
|
-
# out into its own package. (to be more unixy and stuff ;)
|
111
|
-
#
|
112
|
-
@min_length = args[:min_length] || 70
|
113
|
-
@min = args[:min] || 20
|
114
|
-
@offset = args[:cutoff] || 64 # XXX should both be called 'cutoff'
|
115
|
-
@left_trim = args[:left_trim] || 0 # trim adapter sequence
|
116
|
-
@skip_ambig = args[:skip_ambiguous] || false
|
117
|
-
@trimmer = args[:trimmer] || ProbabilityTrimmer.new(:min => @min,
|
118
|
-
:offset => @offset,
|
119
|
-
:seq_tech =>
|
120
|
-
:illumina)
|
121
|
-
end
|
122
|
-
|
123
|
-
def each(&block)
|
124
|
-
|
125
|
-
skipped_because_singleton = 0
|
126
|
-
skipped_because_length = 0
|
127
|
-
skipped_because_ambig = 0
|
128
|
-
|
129
|
-
@paired_iterator.each_with_index do |a, i|
|
130
|
-
seqa = @trimmer.trim_seq(a[0])[@left_trim..-1] rescue nil # trim adapter sequence
|
131
|
-
seqb = @trimmer.trim_seq a[1]
|
132
|
-
|
133
|
-
# make sure sequences are good
|
134
|
-
# (both pairs survived and both are at least min_length long)
|
135
|
-
# optionally skip reads that contain ambiguous nucleotides (N)
|
136
|
-
if [seqa, seqb].include? nil
|
137
|
-
skipped_because_singleton += 1
|
138
|
-
elsif !(seqb.length >= @min_length && seqa.length >= @min_length)
|
139
|
-
skipped_because_length += 1
|
140
|
-
elsif @skip_ambig and (seqb =~ /N/ or seqa =~ /N/)
|
141
|
-
skipped_because_ambig
|
142
|
-
else # reads are good
|
143
|
-
#
|
144
|
-
# TODO
|
145
|
-
# this is experiment specific. I save memory down the road
|
146
|
-
# by having both of the reads in the forward orientation
|
147
|
-
# but depending on the sequencing technology/pipeline
|
148
|
-
# this may change.
|
149
|
-
#
|
150
|
-
# I'm planning on removing the trimming steps from lederhosen
|
151
|
-
# for their own gem. With that, this will go too.
|
152
|
-
#
|
153
|
-
seqb = reverse_complement(seqb)
|
154
|
-
|
155
|
-
# Create and yield new fasta objects
|
156
|
-
# Perhaps this is slow?
|
157
|
-
a = Fasta.new :name => "#{i}:0", :sequence => seqa
|
158
|
-
b = Fasta.new :name => "#{i}:1", :sequence => seqb
|
159
|
-
block.yield a
|
160
|
-
block.yield b
|
161
|
-
end
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
|
-
# reverse complement a DNA sequence
|
166
|
-
# assumes only GATCN nucleotides
|
167
|
-
def reverse_complement(s)
|
168
|
-
s.reverse.tr('GATCNgatcn','CTAGNctagn')
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
172
|
-
#
|
173
|
-
# Yields trimmed fasta records given an input
|
174
|
-
# interleaved, paired-end fastq file
|
175
|
-
#
|
176
|
-
class InterleavedTrimmer < PairedTrimmer
|
177
|
-
|
178
|
-
def initialize(interleaved_file, args = {})
|
179
|
-
# create an iterator that yields paired records
|
180
|
-
# as an array
|
181
|
-
|
182
|
-
handle =
|
183
|
-
begin
|
184
|
-
Zlib::GzipReader.open(interleaved_file)
|
185
|
-
rescue Zlib::GzipFile::Error
|
186
|
-
File.open(interleaved_file)
|
187
|
-
end
|
188
|
-
|
189
|
-
reads = Dna.new handle
|
190
|
-
@paired_iterator = reads.each_slice(2)
|
191
|
-
|
192
|
-
super(args)
|
193
|
-
end
|
194
|
-
end
|
195
|
-
|
196
|
-
#
|
197
|
-
# Yield trimmed fasta records given an two separate
|
198
|
-
# paired QSEQ files
|
199
|
-
#
|
200
|
-
class QSEQTrimmer < PairedTrimmer
|
201
|
-
def initialize(left_file, right_file, args = {})
|
202
|
-
# create an iterator that yields paired records
|
203
|
-
# as an array
|
204
|
-
|
205
|
-
left_handle, right_handle =
|
206
|
-
begin
|
207
|
-
[ Zlib::GzipReader.open(left_file), Zlib::GzipReader.open(right_file)]
|
208
|
-
rescue Zlib::GzipFile::Error
|
209
|
-
[ File.open(left_file), File.open(right_file) ]
|
210
|
-
end
|
211
|
-
|
212
|
-
left_file_reads = Dna.new left_handle
|
213
|
-
right_reads = Dna.new right_handle
|
214
|
-
|
215
|
-
@paired_iterator = left_file_reads.zip(right_reads)
|
216
|
-
|
217
|
-
super(args)
|
218
|
-
|
219
|
-
left_handle.close
|
220
|
-
right_handle.close
|
221
|
-
end
|
222
|
-
end
|
223
|
-
|
224
|
-
end # module Trimmer
|
225
|
-
end # module Lederhosen
|