lederhosen 1.8.0 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lederhosen.gemspec +1 -2
- data/lib/lederhosen/version.rb +1 -1
- metadata +2 -3
- data/lib/lederhosen/trimmer.rb +0 -225
data/lederhosen.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "lederhosen"
|
8
|
-
s.version = "1.8.0"
|
8
|
+
s.version = "1.8.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
@@ -34,7 +34,6 @@ Gem::Specification.new do |s|
|
|
34
34
|
"lib/lederhosen/tasks/otu_table.rb",
|
35
35
|
"lib/lederhosen/tasks/split_fasta.rb",
|
36
36
|
"lib/lederhosen/tasks/version.rb",
|
37
|
-
"lib/lederhosen/trimmer.rb",
|
38
37
|
"lib/lederhosen/version.rb",
|
39
38
|
"readme.md",
|
40
39
|
"scripts/illumina_pipeline/.gitignore",
|
data/lib/lederhosen/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.8.0
|
4
|
+
version: 1.8.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -132,7 +132,6 @@ files:
|
|
132
132
|
- lib/lederhosen/tasks/otu_table.rb
|
133
133
|
- lib/lederhosen/tasks/split_fasta.rb
|
134
134
|
- lib/lederhosen/tasks/version.rb
|
135
|
-
- lib/lederhosen/trimmer.rb
|
136
135
|
- lib/lederhosen/version.rb
|
137
136
|
- readme.md
|
138
137
|
- scripts/illumina_pipeline/.gitignore
|
@@ -167,7 +166,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
167
166
|
version: '0'
|
168
167
|
segments:
|
169
168
|
- 0
|
170
|
-
hash:
|
169
|
+
hash: 4470842345198425739
|
171
170
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
172
171
|
none: false
|
173
172
|
requirements:
|
data/lib/lederhosen/trimmer.rb
DELETED
@@ -1,225 +0,0 @@
|
|
1
|
-
module Lederhosen
|
2
|
-
module Trimmer
|
3
|
-
|
4
|
-
##
|
5
|
-
# Code used for sequence trimming
|
6
|
-
#
|
7
|
-
# - PairedTrimmer
|
8
|
-
# - HuangTrimmer
|
9
|
-
# - ProbabilityTrimmer
|
10
|
-
# - QSEQTrimmer
|
11
|
-
#
|
12
|
-
# Some major refactoring needs to get done here
|
13
|
-
#
|
14
|
-
|
15
|
-
# HaungTrimmer
|
16
|
-
#
|
17
|
-
# class that has the trim function. Used in mixins
|
18
|
-
# this trim function is based on the function documented
|
19
|
-
# in the paper:
|
20
|
-
# Huang X, Wang J, Aluru S, Yang SP, Hillier L. (2003). PCAP:
|
21
|
-
# a whole-genome assembly program. Genome Res 13:
|
22
|
-
# 2164–2170.
|
23
|
-
#
|
24
|
-
# The implementation is a direct copy from the perl implementation
|
25
|
-
# implemented in Pangea 1.0:
|
26
|
-
# PANGEA: pipeline for analysis of next generation amplicons
|
27
|
-
# A Giongo, DB Crabb, AG Davis-Richardson - ISME , 2010
|
28
|
-
#
|
29
|
-
class HuangTrimmer
|
30
|
-
|
31
|
-
def initialize(args={})
|
32
|
-
@min = args[:min]
|
33
|
-
@offset = args[:offset]
|
34
|
-
end
|
35
|
-
|
36
|
-
def trim_seq(dna)
|
37
|
-
|
38
|
-
_sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
|
39
|
-
|
40
|
-
dna.quality.each_byte.each_with_index do |b, a|
|
41
|
-
_sum += (b - @offset - @min)
|
42
|
-
if _sum > _max
|
43
|
-
_max = _sum
|
44
|
-
_end = a
|
45
|
-
start = first
|
46
|
-
elsif _sum < 0
|
47
|
-
_sum = 0
|
48
|
-
first = a
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
begin
|
53
|
-
dna.sequence[start, _end - start].gsub('.', 'N')
|
54
|
-
rescue
|
55
|
-
nil
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
#
|
61
|
-
# return the longest string starting from the left side
|
62
|
-
# where the PROBABILITY OF ERROR as computed from the PHRED
|
63
|
-
# scores does not go above a certain cutoff
|
64
|
-
# (default is 0.005)
|
65
|
-
#
|
66
|
-
class ProbabilityTrimmer
|
67
|
-
|
68
|
-
def initialize(args = {})
|
69
|
-
@cutoff = args[:cutoff] || 0.005
|
70
|
-
@min = args[:min]
|
71
|
-
@seqtech = args[:seq_tech] || fail
|
72
|
-
# must be illumina, sanger or solexa
|
73
|
-
end
|
74
|
-
|
75
|
-
def trim_seq(dna)
|
76
|
-
trim_coord = dna.sequence.size
|
77
|
-
probabilities = dna.send(:"#{@seqtech}_probabilities")
|
78
|
-
probabilities.each_with_index do |q, i|
|
79
|
-
if q > @cutoff
|
80
|
-
trim_coord = i
|
81
|
-
break
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
begin
|
86
|
-
dna.sequence[0..trim_coord].gsub('.', 'N')
|
87
|
-
rescue
|
88
|
-
nil
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
#
|
94
|
-
# Base class for trimming paired-end reads
|
95
|
-
#
|
96
|
-
class PairedTrimmer < Enumerator
|
97
|
-
|
98
|
-
def initialize(args = {})
|
99
|
-
@pretrim = args[:pretrim]
|
100
|
-
# TODO
|
101
|
-
# need to be able to trim from left, right of pairs
|
102
|
-
# thinking about specifying a "trimming language"
|
103
|
-
#
|
104
|
-
# Something like:
|
105
|
-
#
|
106
|
-
# --trim="5L0 0L3"
|
107
|
-
# --trim="0L4 2L6"
|
108
|
-
#
|
109
|
-
# also thinking about breaking all of this trimming stuff
|
110
|
-
# out into its own package. (to be more unixy and stuff ;)
|
111
|
-
#
|
112
|
-
@min_length = args[:min_length] || 70
|
113
|
-
@min = args[:min] || 20
|
114
|
-
@offset = args[:cutoff] || 64 # XXX should both be called 'cutoff'
|
115
|
-
@left_trim = args[:left_trim] || 0 # trim adapter sequence
|
116
|
-
@skip_ambig = args[:skip_ambiguous] || false
|
117
|
-
@trimmer = args[:trimmer] || ProbabilityTrimmer.new(:min => @min,
|
118
|
-
:offset => @offset,
|
119
|
-
:seq_tech =>
|
120
|
-
:illumina)
|
121
|
-
end
|
122
|
-
|
123
|
-
def each(&block)
|
124
|
-
|
125
|
-
skipped_because_singleton = 0
|
126
|
-
skipped_because_length = 0
|
127
|
-
skipped_because_ambig = 0
|
128
|
-
|
129
|
-
@paired_iterator.each_with_index do |a, i|
|
130
|
-
seqa = @trimmer.trim_seq(a[0])[@left_trim..-1] rescue nil # trim adapter sequence
|
131
|
-
seqb = @trimmer.trim_seq a[1]
|
132
|
-
|
133
|
-
# make sure sequences are good
|
134
|
-
# (both pairs survived and both are at least min_length long)
|
135
|
-
# optionally skip reads that contain ambiguous nucleotides (N)
|
136
|
-
if [seqa, seqb].include? nil
|
137
|
-
skipped_because_singleton += 1
|
138
|
-
elsif !(seqb.length >= @min_length && seqa.length >= @min_length)
|
139
|
-
skipped_because_length += 1
|
140
|
-
elsif @skip_ambig and (seqb =~ /N/ or seqa =~ /N/)
|
141
|
-
skipped_because_ambig
|
142
|
-
else # reads are good
|
143
|
-
#
|
144
|
-
# TODO
|
145
|
-
# this is experiment specific. I save memory down the road
|
146
|
-
# by having both of the reads in the forward orientation
|
147
|
-
# but depending on the sequencing technology/pipeline
|
148
|
-
# this may change.
|
149
|
-
#
|
150
|
-
# I'm planning on removing the trimming steps from lederhosen
|
151
|
-
# for their own gem. With that, this will go too.
|
152
|
-
#
|
153
|
-
seqb = reverse_complement(seqb)
|
154
|
-
|
155
|
-
# Create and yield new fasta objects
|
156
|
-
# Perhaps this is slow?
|
157
|
-
a = Fasta.new :name => "#{i}:0", :sequence => seqa
|
158
|
-
b = Fasta.new :name => "#{i}:1", :sequence => seqb
|
159
|
-
block.yield a
|
160
|
-
block.yield b
|
161
|
-
end
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
|
-
# reverse complement a DNA sequence
|
166
|
-
# assumes only GATCN nucleotides
|
167
|
-
def reverse_complement(s)
|
168
|
-
s.reverse.tr('GATCNgatcn','CTAGNctagn')
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
172
|
-
#
|
173
|
-
# Yields trimmed fasta records given an input
|
174
|
-
# interleaved, paired-end fastq file
|
175
|
-
#
|
176
|
-
class InterleavedTrimmer < PairedTrimmer
|
177
|
-
|
178
|
-
def initialize(interleaved_file, args = {})
|
179
|
-
# create an iterator that yields paired records
|
180
|
-
# as an array
|
181
|
-
|
182
|
-
handle =
|
183
|
-
begin
|
184
|
-
Zlib::GzipReader.open(interleaved_file)
|
185
|
-
rescue Zlib::GzipFile::Error
|
186
|
-
File.open(interleaved_file)
|
187
|
-
end
|
188
|
-
|
189
|
-
reads = Dna.new handle
|
190
|
-
@paired_iterator = reads.each_slice(2)
|
191
|
-
|
192
|
-
super(args)
|
193
|
-
end
|
194
|
-
end
|
195
|
-
|
196
|
-
#
|
197
|
-
# Yield trimmed fasta records given an two separate
|
198
|
-
# paired QSEQ files
|
199
|
-
#
|
200
|
-
class QSEQTrimmer < PairedTrimmer
|
201
|
-
def initialize(left_file, right_file, args = {})
|
202
|
-
# create an iterator that yields paired records
|
203
|
-
# as an array
|
204
|
-
|
205
|
-
left_handle, right_handle =
|
206
|
-
begin
|
207
|
-
[ Zlib::GzipReader.open(left_file), Zlib::GzipReader.open(right_file)]
|
208
|
-
rescue Zlib::GzipFile::Error
|
209
|
-
[ File.open(left_file), File.open(right_file) ]
|
210
|
-
end
|
211
|
-
|
212
|
-
left_file_reads = Dna.new left_handle
|
213
|
-
right_reads = Dna.new right_handle
|
214
|
-
|
215
|
-
@paired_iterator = left_file_reads.zip(right_reads)
|
216
|
-
|
217
|
-
super(args)
|
218
|
-
|
219
|
-
left_handle.close
|
220
|
-
right_handle.close
|
221
|
-
end
|
222
|
-
end
|
223
|
-
|
224
|
-
end # module Trimmer
|
225
|
-
end # module Lederhosen
|