lederhosen 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lederhosen.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "lederhosen"
8
- s.version = "1.8.0"
8
+ s.version = "1.8.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Austin G. Davis-Richardson"]
@@ -34,7 +34,6 @@ Gem::Specification.new do |s|
34
34
  "lib/lederhosen/tasks/otu_table.rb",
35
35
  "lib/lederhosen/tasks/split_fasta.rb",
36
36
  "lib/lederhosen/tasks/version.rb",
37
- "lib/lederhosen/trimmer.rb",
38
37
  "lib/lederhosen/version.rb",
39
38
  "readme.md",
40
39
  "scripts/illumina_pipeline/.gitignore",
data/lib/lederhosen/version.rb CHANGED
@@ -3,7 +3,7 @@ module Lederhosen
3
3
  MAJOR = 1
4
4
  MINOR = 8
5
5
  CODENAME = 'Karottensaft' # changes for minor versions
6
- PATCH = 0
6
+ PATCH = 1
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH].join('.')
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.0
4
+ version: 1.8.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -132,7 +132,6 @@ files:
132
132
  - lib/lederhosen/tasks/otu_table.rb
133
133
  - lib/lederhosen/tasks/split_fasta.rb
134
134
  - lib/lederhosen/tasks/version.rb
135
- - lib/lederhosen/trimmer.rb
136
135
  - lib/lederhosen/version.rb
137
136
  - readme.md
138
137
  - scripts/illumina_pipeline/.gitignore
@@ -167,7 +166,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
167
166
  version: '0'
168
167
  segments:
169
168
  - 0
170
- hash: -1539752797284012594
169
+ hash: 4470842345198425739
171
170
  required_rubygems_version: !ruby/object:Gem::Requirement
172
171
  none: false
173
172
  requirements:
data/lib/lederhosen/trimmer.rb DELETED
@@ -1,225 +0,0 @@
1
- module Lederhosen
2
- module Trimmer
3
-
4
- ##
5
- # Code used for sequence trimming
6
- #
7
- # - PairedTrimmer
8
- # - HuangTrimmer
9
- # - ProbabilityTrimmer
10
- # - QSEQTrimmer
11
- #
12
- # Some major refactoring needs to get done here
13
- #
14
-
15
- # HaungTrimmer
16
- #
17
- # class that has the trim function. Used in mixins
18
- # this trim function is based on the function documented
19
- # in the paper:
20
- # Huang X, Wang J, Aluru S, Yang SP, Hillier L. (2003). PCAP:
21
- # a whole-genome assembly program. Genome Res 13:
22
- # 2164–2170.
23
- #
24
- # The implementation is a direct copy from the perl implementation
25
- # implemented in Pangea 1.0:
26
- # PANGEA: pipeline for analysis of next generation amplicons
27
- # A Giongo, DB Crabb, AG Davis-Richardson - ISME , 2010
28
- #
29
- class HuangTrimmer
30
-
31
- def initialize(args={})
32
- @min = args[:min]
33
- @offset = args[:offset]
34
- end
35
-
36
- def trim_seq(dna)
37
-
38
- _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
39
-
40
- dna.quality.each_byte.each_with_index do |b, a|
41
- _sum += (b - @offset - @min)
42
- if _sum > _max
43
- _max = _sum
44
- _end = a
45
- start = first
46
- elsif _sum < 0
47
- _sum = 0
48
- first = a
49
- end
50
- end
51
-
52
- begin
53
- dna.sequence[start, _end - start].gsub('.', 'N')
54
- rescue
55
- nil
56
- end
57
- end
58
- end
59
-
60
- #
61
- # return the longest string starting from the left side
62
- # where the PROBABILITY OF ERROR as computed from the PHRED
63
- # scores does not go above a certain cutoff
64
- # (default is 0.005)
65
- #
66
- class ProbabilityTrimmer
67
-
68
- def initialize(args = {})
69
- @cutoff = args[:cutoff] || 0.005
70
- @min = args[:min]
71
- @seqtech = args[:seq_tech] || fail
72
- # must be illumina, sanger or solexa
73
- end
74
-
75
- def trim_seq(dna)
76
- trim_coord = dna.sequence.size
77
- probabilities = dna.send(:"#{@seqtech}_probabilities")
78
- probabilities.each_with_index do |q, i|
79
- if q > @cutoff
80
- trim_coord = i
81
- break
82
- end
83
- end
84
-
85
- begin
86
- dna.sequence[0..trim_coord].gsub('.', 'N')
87
- rescue
88
- nil
89
- end
90
- end
91
- end
92
-
93
- #
94
- # Base class for trimming paired-end reads
95
- #
96
- class PairedTrimmer < Enumerator
97
-
98
- def initialize(args = {})
99
- @pretrim = args[:pretrim]
100
- # TODO
101
- # need to be able to trim from left, right of pairs
102
- # thinking about specifying a "trimming language"
103
- #
104
- # Something like:
105
- #
106
- # --trim="5L0 0L3"
107
- # --trim="0L4 2L6"
108
- #
109
- # also thinking about breaking all of this trimming stuff
110
- # out into its own package. (to be more unixy and stuff ;)
111
- #
112
- @min_length = args[:min_length] || 70
113
- @min = args[:min] || 20
114
- @offset = args[:cutoff] || 64 # XXX should both be called 'cutoff'
115
- @left_trim = args[:left_trim] || 0 # trim adapter sequence
116
- @skip_ambig = args[:skip_ambiguous] || false
117
- @trimmer = args[:trimmer] || ProbabilityTrimmer.new(:min => @min,
118
- :offset => @offset,
119
- :seq_tech =>
120
- :illumina)
121
- end
122
-
123
- def each(&block)
124
-
125
- skipped_because_singleton = 0
126
- skipped_because_length = 0
127
- skipped_because_ambig = 0
128
-
129
- @paired_iterator.each_with_index do |a, i|
130
- seqa = @trimmer.trim_seq(a[0])[@left_trim..-1] rescue nil # trim adapter sequence
131
- seqb = @trimmer.trim_seq a[1]
132
-
133
- # make sure sequences are good
134
- # (both pairs survived and both are at least min_length long)
135
- # optionally skip reads that contain ambiguous nucleotides (N)
136
- if [seqa, seqb].include? nil
137
- skipped_because_singleton += 1
138
- elsif !(seqb.length >= @min_length && seqa.length >= @min_length)
139
- skipped_because_length += 1
140
- elsif @skip_ambig and (seqb =~ /N/ or seqa =~ /N/)
141
- skipped_because_ambig
142
- else # reads are good
143
- #
144
- # TODO
145
- # this is experiment specific. I save memory down the road
146
- # by having both of the reads in the forward orientation
147
- # but depending on the sequencing technology/pipeline
148
- # this may change.
149
- #
150
- # I'm planning on removing the trimming steps from lederhosen
151
- # for their own gem. With that, this will go too.
152
- #
153
- seqb = reverse_complement(seqb)
154
-
155
- # Create and yield new fasta objects
156
- # Perhaps this is slow?
157
- a = Fasta.new :name => "#{i}:0", :sequence => seqa
158
- b = Fasta.new :name => "#{i}:1", :sequence => seqb
159
- block.yield a
160
- block.yield b
161
- end
162
- end
163
- end
164
-
165
- # reverse complement a DNA sequence
166
- # assumes only GATCN nucleotides
167
- def reverse_complement(s)
168
- s.reverse.tr('GATCNgatcn','CTAGNctagn')
169
- end
170
- end
171
-
172
- #
173
- # Yields trimmed fasta records given an input
174
- # interleaved, paired-end fastq file
175
- #
176
- class InterleavedTrimmer < PairedTrimmer
177
-
178
- def initialize(interleaved_file, args = {})
179
- # create an iterator that yields paired records
180
- # as an array
181
-
182
- handle =
183
- begin
184
- Zlib::GzipReader.open(interleaved_file)
185
- rescue Zlib::GzipFile::Error
186
- File.open(interleaved_file)
187
- end
188
-
189
- reads = Dna.new handle
190
- @paired_iterator = reads.each_slice(2)
191
-
192
- super(args)
193
- end
194
- end
195
-
196
- #
197
- # Yield trimmed fasta records given an two separate
198
- # paired QSEQ files
199
- #
200
- class QSEQTrimmer < PairedTrimmer
201
- def initialize(left_file, right_file, args = {})
202
- # create an iterator that yields paired records
203
- # as an array
204
-
205
- left_handle, right_handle =
206
- begin
207
- [ Zlib::GzipReader.open(left_file), Zlib::GzipReader.open(right_file)]
208
- rescue Zlib::GzipFile::Error
209
- [ File.open(left_file), File.open(right_file) ]
210
- end
211
-
212
- left_file_reads = Dna.new left_handle
213
- right_reads = Dna.new right_handle
214
-
215
- @paired_iterator = left_file_reads.zip(right_reads)
216
-
217
- super(args)
218
-
219
- left_handle.close
220
- right_handle.close
221
- end
222
- end
223
-
224
- end # module Trimmer
225
- end # module Lederhosen