lederhosen 1.8.0 → 1.8.1

Sign up to get free protection for your applications and to get access to all the features.
data/lederhosen.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "lederhosen"
8
- s.version = "1.8.0"
8
+ s.version = "1.8.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Austin G. Davis-Richardson"]
@@ -34,7 +34,6 @@ Gem::Specification.new do |s|
34
34
  "lib/lederhosen/tasks/otu_table.rb",
35
35
  "lib/lederhosen/tasks/split_fasta.rb",
36
36
  "lib/lederhosen/tasks/version.rb",
37
- "lib/lederhosen/trimmer.rb",
38
37
  "lib/lederhosen/version.rb",
39
38
  "readme.md",
40
39
  "scripts/illumina_pipeline/.gitignore",
@@ -3,7 +3,7 @@ module Lederhosen
3
3
  MAJOR = 1
4
4
  MINOR = 8
5
5
  CODENAME = 'Karottensaft' # changes for minor versions
6
- PATCH = 0
6
+ PATCH = 1
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH].join('.')
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.0
4
+ version: 1.8.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -132,7 +132,6 @@ files:
132
132
  - lib/lederhosen/tasks/otu_table.rb
133
133
  - lib/lederhosen/tasks/split_fasta.rb
134
134
  - lib/lederhosen/tasks/version.rb
135
- - lib/lederhosen/trimmer.rb
136
135
  - lib/lederhosen/version.rb
137
136
  - readme.md
138
137
  - scripts/illumina_pipeline/.gitignore
@@ -167,7 +166,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
167
166
  version: '0'
168
167
  segments:
169
168
  - 0
170
- hash: -1539752797284012594
169
+ hash: 4470842345198425739
171
170
  required_rubygems_version: !ruby/object:Gem::Requirement
172
171
  none: false
173
172
  requirements:
@@ -1,225 +0,0 @@
1
- module Lederhosen
2
- module Trimmer
3
-
4
- ##
5
- # Code used for sequence trimming
6
- #
7
- # - PairedTrimmer
8
- # - HuangTrimmer
9
- # - ProbabilityTrimmer
10
- # - QSEQTrimmer
11
- #
12
- # Some major refactoring needs to get done here
13
- #
14
-
15
- # HuangTrimmer
16
- #
17
- # class that has the trim function. Used in mixins
18
- # this trim function is based on the function documented
19
- # in the paper:
20
- # Huang X, Wang J, Aluru S, Yang SP, Hillier L. (2003). PCAP:
21
- # a whole-genome assembly program. Genome Res 13:
22
- # 2164–2170.
23
- #
24
- # The implementation is a direct copy from the perl implementation
25
- # implemented in Pangea 1.0:
26
- # PANGEA: pipeline for analysis of next generation amplicons
27
- # A Giongo, DB Crabb, AG Davis-Richardson - ISME , 2010
28
- #
29
- class HuangTrimmer
30
-
31
- def initialize(args={})
32
- @min = args[:min]
33
- @offset = args[:offset]
34
- end
35
-
36
- def trim_seq(dna)
37
-
38
- _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
39
-
40
- dna.quality.each_byte.each_with_index do |b, a|
41
- _sum += (b - @offset - @min)
42
- if _sum > _max
43
- _max = _sum
44
- _end = a
45
- start = first
46
- elsif _sum < 0
47
- _sum = 0
48
- first = a
49
- end
50
- end
51
-
52
- begin
53
- dna.sequence[start, _end - start].gsub('.', 'N')
54
- rescue
55
- nil
56
- end
57
- end
58
- end
59
-
60
- #
61
- # return the longest string starting from the left side
62
- # where the PROBABILITY OF ERROR as computed from the PHRED
63
- # scores does not go above a certain cutoff
64
- # (default is 0.005)
65
- #
66
- class ProbabilityTrimmer
67
-
68
- def initialize(args = {})
69
- @cutoff = args[:cutoff] || 0.005
70
- @min = args[:min]
71
- @seqtech = args[:seq_tech] || fail
72
- # must be illumina, sanger or solexa
73
- end
74
-
75
- def trim_seq(dna)
76
- trim_coord = dna.sequence.size
77
- probabilities = dna.send(:"#{@seqtech}_probabilities")
78
- probabilities.each_with_index do |q, i|
79
- if q > @cutoff
80
- trim_coord = i
81
- break
82
- end
83
- end
84
-
85
- begin
86
- dna.sequence[0..trim_coord].gsub('.', 'N')
87
- rescue
88
- nil
89
- end
90
- end
91
- end
92
-
93
- #
94
- # Base class for trimming paired-end reads
95
- #
96
- class PairedTrimmer < Enumerator
97
-
98
- def initialize(args = {})
99
- @pretrim = args[:pretrim]
100
- # TODO
101
- # need to be able to trim from left, right of pairs
102
- # thinking about specifying a "trimming language"
103
- #
104
- # Something like:
105
- #
106
- # --trim="5L0 0L3"
107
- # --trim="0L4 2L6"
108
- #
109
- # also thinking about breaking all of this trimming stuff
110
- # out into its own package. (to be more unixy and stuff ;)
111
- #
112
- @min_length = args[:min_length] || 70
113
- @min = args[:min] || 20
114
- @offset = args[:cutoff] || 64 # XXX should both be called 'cutoff'
115
- @left_trim = args[:left_trim] || 0 # trim adapter sequence
116
- @skip_ambig = args[:skip_ambiguous] || false
117
- @trimmer = args[:trimmer] || ProbabilityTrimmer.new(:min => @min,
118
- :offset => @offset,
119
- :seq_tech =>
120
- :illumina)
121
- end
122
-
123
- def each(&block)
124
-
125
- skipped_because_singleton = 0
126
- skipped_because_length = 0
127
- skipped_because_ambig = 0
128
-
129
- @paired_iterator.each_with_index do |a, i|
130
- seqa = @trimmer.trim_seq(a[0])[@left_trim..-1] rescue nil # trim adapter sequence
131
- seqb = @trimmer.trim_seq a[1]
132
-
133
- # make sure sequences are good
134
- # (both pairs survived and both are at least min_length long)
135
- # optionally skip reads that contain ambiguous nucleotides (N)
136
- if [seqa, seqb].include? nil
137
- skipped_because_singleton += 1
138
- elsif !(seqb.length >= @min_length && seqa.length >= @min_length)
139
- skipped_because_length += 1
140
- elsif @skip_ambig and (seqb =~ /N/ or seqa =~ /N/)
141
- skipped_because_ambig
142
- else # reads are good
143
- #
144
- # TODO
145
- # this is experiment specific. I save memory down the road
146
- # by having both of the reads in the forward orientation
147
- # but depending on the sequencing technology/pipeline
148
- # this may change.
149
- #
150
- # I'm planning on removing the trimming steps from lederhosen
151
- # for their own gem. With that, this will go too.
152
- #
153
- seqb = reverse_complement(seqb)
154
-
155
- # Create and yield new fasta objects
156
- # Perhaps this is slow?
157
- a = Fasta.new :name => "#{i}:0", :sequence => seqa
158
- b = Fasta.new :name => "#{i}:1", :sequence => seqb
159
- block.yield a
160
- block.yield b
161
- end
162
- end
163
- end
164
-
165
- # reverse complement a DNA sequence
166
- # assumes only GATCN nucleotides
167
- def reverse_complement(s)
168
- s.reverse.tr('GATCNgatcn','CTAGNctagn')
169
- end
170
- end
171
-
172
- #
173
- # Yields trimmed fasta records given an input
174
- # interleaved, paired-end fastq file
175
- #
176
- class InterleavedTrimmer < PairedTrimmer
177
-
178
- def initialize(interleaved_file, args = {})
179
- # create an iterator that yields paired records
180
- # as an array
181
-
182
- handle =
183
- begin
184
- Zlib::GzipReader.open(interleaved_file)
185
- rescue Zlib::GzipFile::Error
186
- File.open(interleaved_file)
187
- end
188
-
189
- reads = Dna.new handle
190
- @paired_iterator = reads.each_slice(2)
191
-
192
- super(args)
193
- end
194
- end
195
-
196
- #
197
- # Yield trimmed fasta records given two separate
198
- # paired QSEQ files
199
- #
200
- class QSEQTrimmer < PairedTrimmer
201
- def initialize(left_file, right_file, args = {})
202
- # create an iterator that yields paired records
203
- # as an array
204
-
205
- left_handle, right_handle =
206
- begin
207
- [ Zlib::GzipReader.open(left_file), Zlib::GzipReader.open(right_file)]
208
- rescue Zlib::GzipFile::Error
209
- [ File.open(left_file), File.open(right_file) ]
210
- end
211
-
212
- left_file_reads = Dna.new left_handle
213
- right_reads = Dna.new right_handle
214
-
215
- @paired_iterator = left_file_reads.zip(right_reads)
216
-
217
- super(args)
218
-
219
- left_handle.close
220
- right_handle.close
221
- end
222
- end
223
-
224
- end # module Trimmer
225
- end # module Lederhosen