parse_fasta 1.8.2 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MDE1YThkNzUyNzI0MTMwZDMyYzBiNzFiODMzZGQzNzQ5ODU3ZTk1MA==
4
+ YWQ1YWNkZmM2Zjc3ZTJkMGE3ZDllMDY4ZDE1MWM2MzAxNjM1MDNlOA==
5
5
  data.tar.gz: !binary |-
6
- NjBmZGUxZTdkM2UyZTQ4YWY1MDliMTI0OTJlYjA5ZDFmMzg4OWRlZQ==
6
+ ZTkxYWEzZmRhZGM0NzJmYTM5YTZmMjM5OWIzMmU0YTI2ZjA1YjdkOA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MjIxNGNlODdkNTk3ZWE1ZDk1Zjg4ZDY0ZWE3NzE0ZWI0ODQ4MDZjZTk1MDY1
10
- OGNiOGViOWYwZDU5ODY0YjNmZWY1ODYwOGVlN2E5MTVmYzZlZmIwNzE4MjNi
11
- YWFjMjgzNGQ4YmMzODdjYjZjNTBmYTM4MWFiYTcyYjlmZWFhYWM=
9
+ Y2VkNDcyMTU3NTVjODAyNTNkNzJlMTkyOTY1ZGE5N2FhMWU0ZDgyOGI0YmJh
10
+ NGYxM2ExNzI2ZGQyYjAwZjNjNDhlYTRjMWRkNWY1Yzk5ZTlhM2I3NDg4ODJm
11
+ YWVhOTBiZGEyODI4M2M0NDJlOTEwN2VlNjNjZDMxZmU4ZWQxODM=
12
12
  data.tar.gz: !binary |-
13
- YTNiMTYzNmJhODkzMjEyMjBlOTgxOGIyMjFmMTFlOTE0NTEyOWZjNTgxMTRj
14
- N2Y4YjE3NWUxMjYyNTRjNTYzZGE3MjBhNjJjZTNmNjRkYzY5ZGI2MGY0MjQz
15
- N2RhMDUxN2E1MjY0NDZkOWQyMjEzYTU2ZDE4M2FlZDg3YzA0N2M=
13
+ YzUxNTViM2NhNTQ2NjMyOWMwZmJjMGRiYjBjYzNkZmY1Y2Y1ZmQyNWUwMzkw
14
+ OGIwNzY5NTE0OTY4YTIxMDhiZjNiMjdiZDdiNGIxNmZhNjMxZTIzOTBhNzAz
15
+ NzcwMGRiZDBjNWMzOTVmOGUzMThhMTRkYzcwM2YzNjAwMTVmZmE=
data/README.md CHANGED
@@ -66,6 +66,15 @@ Read fasta file into a hash.
66
66
 
67
67
  ## Versions ##
68
68
 
69
+ ### 1.9.0 ###
70
+
71
+ Added "fast" versions of `each_record` methods
72
+ (`each_record_fast`). Basically, they return sequences and quality
73
+ strings as Ruby `String` objects instead of as `Sequence` or `Quality`
74
+ objects. Also, if the sequence or quality string has spaces, they will
75
+ be retained. If this is a problem, use the original `each_record`
76
+ methods.
77
+
69
78
  ### 1.8.2 ###
70
79
 
71
80
  Speed up `FastqFile#each_record`.
@@ -137,6 +137,42 @@ class FastaFile < File
137
137
  return f
138
138
  end
139
139
 
140
+ # Fast version of #each_record
141
+ #
142
+ # Yields the sequence as a String, not Sequence. No separate lines
143
+ # option.
144
+ #
145
+ # @note If the fastA file has spaces in the sequence, they will be
146
+ # retained. If this is a problem, use #each_record instead.
147
+ #
148
+ # @yield The header and sequence for each record in the fasta
149
+ # file to the block
150
+ #
151
+ # @yieldparam header [String] The header of the fasta record without
152
+ # the leading '>'
153
+ #
154
+ # @yieldparam sequence [String] The sequence of the fasta record
155
+ #
156
+ # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
157
+ def each_record_fast
158
+ begin
159
+ f = Zlib::GzipReader.open(self)
160
+ rescue Zlib::GzipFile::Error => e
161
+ f = self
162
+ end
163
+
164
+ f.each("\n>") do |line|
165
+ header, sequence = parse_line(line)
166
+
167
+ raise ParseFasta::SequenceFormatError if sequence.include? ">"
168
+
169
+ yield(header.strip, sequence)
170
+ end
171
+
172
+ f.close if f.instance_of?(Zlib::GzipReader)
173
+ return f
174
+ end
175
+
140
176
  private
141
177
 
142
178
  def parse_line(line)
@@ -96,4 +96,63 @@ class FastqFile < File
96
96
  f.close if f.instance_of?(Zlib::GzipReader)
97
97
  return f
98
98
  end
99
+
100
+ # Fast version of #each_record
101
+ #
102
+ # @note If the fastQ file has spaces in the sequence, they will be
103
+ # retained. If this is a problem, use #each_record instead.
104
+ #
105
+ # @example Parsing a fastq file
106
+ # FastqFile.open('reads.fq').each_record_fast do |head, seq, desc, qual|
107
+ # # do some fun stuff here!
108
+ # end
109
+ # @example Use the same syntax for gzipped files!
110
+ # FastqFile.open('reads.fq.gz').each_record_fast do |head, seq, desc, qual|
111
+ # # do some fun stuff here!
112
+ # end
113
+ #
114
+ # @yield The header, sequence, description and quality string for
115
+ # each record in the fastq file to the block
116
+ #
117
+ # @yieldparam header [String] The header of the fastq record without
118
+ # the leading '@'
119
+ # @yieldparam sequence [String] The sequence of the fastq record
120
+ # @yieldparam description [String] The description line of the fastq
121
+ # record without the leading '+'
122
+ # @yieldparam quality_string [String] The quality string of the
123
+ # fastq record
124
+ def each_record_fast
125
+ count = 0
126
+ header = ''
127
+ sequence = ''
128
+ description = ''
129
+ quality = ''
130
+
131
+ begin
132
+ f = Zlib::GzipReader.open(self)
133
+ rescue Zlib::GzipFile::Error => e
134
+ f = self
135
+ end
136
+
137
+ f.each_line do |line|
138
+ line.chomp!
139
+
140
+ case count % 4
141
+ when 0
142
+ header = line[1..-1]
143
+ when 1
144
+ sequence = line
145
+ when 2
146
+ description = line[1..-1]
147
+ when 3
148
+ quality = line
149
+ yield(header, sequence, description, quality)
150
+ end
151
+
152
+ count += 1
153
+ end
154
+
155
+ f.close if f.instance_of?(Zlib::GzipReader)
156
+ return f
157
+ end
99
158
  end
@@ -95,6 +95,44 @@ class SeqFile < File
95
95
  end
96
96
  end
97
97
 
98
+ # Fast version of #each_record
99
+ #
100
+ # @note If the sequence file has spaces in the sequence, they will
101
+ # be retained. If this is a problem, use #each_record instead.
102
+ #
103
+ # @example Parse a gzipped fastA file
104
+ # SeqFile.open('reads.fa.gz').each_record_fast do |head, seq|
105
+ # puts [head, seq.length].join "\t"
106
+ # end
107
+ #
108
+ # @example Parse an uncompressed fastQ file
109
+ # SeqFile.open('reads.fq').each_record_fast do |head, seq|
110
+ # puts [head, seq.length].join "\t"
111
+ # end
112
+ #
113
+ # @yieldparam header [String] The header of the record without the
114
+ # leading '>' or '@'
115
+ #
116
+ # @yieldparam sequence [String] The sequence of the record.
117
+ #
118
+ # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
119
+ # and file is a fastA file
120
+ def each_record_fast
121
+ first_char = get_first_char(self)
122
+
123
+ if first_char == '>'
124
+ FastaFile.open(self).each_record_fast do |header, sequence|
125
+ yield(header, sequence)
126
+ end
127
+ elsif first_char == '@'
128
+ FastqFile.open(self).each_record_fast do |head, seq, desc, qual|
129
+ yield(head, seq)
130
+ end
131
+ else
132
+ raise ArgumentError, "Input does not look like FASTA or FASTQ"
133
+ end
134
+ end
135
+
98
136
  private
99
137
 
100
138
  def get_first_char(f)
@@ -17,5 +17,5 @@
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
19
  module ParseFasta
20
- VERSION = "1.8.2"
20
+ VERSION = "1.9.0"
21
21
  end
@@ -148,4 +148,65 @@ describe FastaFile do
148
148
  end
149
149
  end
150
150
  end
151
+
152
+ describe "#each_record_fast" do
153
+ let(:records) { Helpers::RECORDS_FAST }
154
+
155
+ let(:f_handle) { FastaFile.open(@fname).each_record_fast { |s| } }
156
+
157
+ context "with badly catted fasta" do
158
+ it "raises ParseFasta::SequenceFormatError" do
159
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
160
+
161
+ expect { FastaFile.open(fname).each_record_fast {} }.
162
+ to raise_error ParseFasta::SequenceFormatError
163
+ end
164
+ end
165
+
166
+ shared_examples_for "any FastaFile" do
167
+ it "yields proper header and sequence for each record" do
168
+ expect { |b|
169
+ FastaFile.open(@fname).each_record_fast(&b)
170
+ }.to yield_successive_args(*records)
171
+ end
172
+
173
+ it "yields the sequence as a String class" do
174
+ FastaFile.open(@fname).each_record_fast do |_, seq|
175
+ expect(seq).to be_an_instance_of String
176
+ end
177
+ end
178
+ end
179
+
180
+ context "with a gzipped file" do
181
+ before(:each) do
182
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
183
+ end
184
+
185
+ it_behaves_like "any FastaFile"
186
+
187
+ it "closes the GzipReader" do
188
+ expect(f_handle).to be_closed
189
+ end
190
+
191
+ it "returns GzipReader object" do
192
+ expect(f_handle).to be_an_instance_of Zlib::GzipReader
193
+ end
194
+ end
195
+
196
+ context "with a non-gzipped file" do
197
+ before(:each) do
198
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
199
+ end
200
+
201
+ it_behaves_like "any FastaFile"
202
+
203
+ it "doesn't close the FastqFile (approx regular file behavior)" do
204
+ expect(f_handle).not_to be_closed
205
+ end
206
+
207
+ it "returns FastaFile object" do
208
+ expect(f_handle).to be_an_instance_of FastaFile
209
+ end
210
+ end
211
+ end
151
212
  end
@@ -21,7 +21,14 @@ require 'spec_helper'
21
21
  describe FastqFile do
22
22
  let(:records) {
23
23
  [["seq1", "AACCTTGG", "", ")#3gTqN8"],
24
- ["seq2 apples", "ACTG", "seq2 apples", "*ujM"]] }
24
+ ["seq2 apples", "ACTG", "seq2 apples", "*ujM"]]
25
+ }
26
+
27
+ let(:records_fast) {
28
+ [["seq1", "AA CC TT GG", "", ")# 3g Tq N8"],
29
+ ["seq2 apples", "ACTG", "seq2 apples", "*ujM"]]
30
+ }
31
+
25
32
  let(:f_handle) { FastqFile.open(@fname).each_record { |s| } }
26
33
 
27
34
 
@@ -45,6 +52,27 @@ describe FastqFile do
45
52
  end
46
53
  end
47
54
 
55
+ describe "#each_record_fast" do
56
+ before(:each) do
57
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fq.gz"
58
+ end
59
+
60
+ it "yields proper header, sequence, description, and quality" do
61
+ expect { |b|
62
+ FastqFile.open(@fname).each_record_fast(&b)
63
+ }.to yield_successive_args(records_fast[0], records_fast[1])
64
+ end
65
+
66
+ it "yields all params as String" do
67
+ FastqFile.open(@fname).each_record_fast do |h, s, d, q|
68
+ expect(h).to be_an_instance_of String
69
+ expect(s).to be_an_instance_of String
70
+ expect(d).to be_an_instance_of String
71
+ expect(q).to be_an_instance_of String
72
+ end
73
+ end
74
+ end
75
+
48
76
  describe "#to_hash" do
49
77
  let(:records) {
50
78
  { "seq1" => { head: "seq1",
@@ -79,9 +79,8 @@ describe SeqFile do
79
79
  end
80
80
  end
81
81
 
82
- describe "#each_record" do
83
-
84
- context "when input is a fasta file" do
82
+ context "when input is a fasta file" do
83
+ describe "#each_record" do
85
84
  let(:records) { Helpers::RECORDS }
86
85
 
87
86
  let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
@@ -200,8 +199,10 @@ describe SeqFile do
200
199
  end
201
200
  end
202
201
  end
202
+ end
203
203
 
204
- context "when input is bogus" do
204
+ context "when input is bogus" do
205
+ describe "#each_record" do
205
206
  it "raises an ArgumentError with message" do
206
207
  fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
207
208
  err_msg = "Input does not look like FASTA or FASTQ"
@@ -213,4 +214,144 @@ describe SeqFile do
213
214
  end
214
215
  end
215
216
  end
217
+
218
+ #####
219
+
220
+ context "when input is a fasta file" do
221
+ describe "#each_record_fast" do
222
+ let(:records) { Helpers::RECORDS_FAST }
223
+
224
+ let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } }
225
+
226
+ context "with badly catted fasta" do
227
+ it "raises ParseFasta::SequenceFormatError" do
228
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
229
+
230
+ expect { FastaFile.open(fname).to_hash }.
231
+ to raise_error ParseFasta::SequenceFormatError
232
+ end
233
+ end
234
+
235
+ shared_examples_for "parsing a fasta file" do
236
+ it "yields proper header and sequence for each record" do
237
+ expect { |b|
238
+ SeqFile.open(@fname).each_record_fast(&b)
239
+ }.to yield_successive_args(*records)
240
+ end
241
+
242
+ it "yields the sequence as a String class" do
243
+ SeqFile.open(@fname).each_record_fast do |_, seq|
244
+ expect(seq).to be_an_instance_of String
245
+ end
246
+ end
247
+ end
248
+
249
+ context "with a gzipped file" do
250
+ before(:each) do
251
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
252
+ end
253
+
254
+ it_behaves_like "parsing a fasta file"
255
+
256
+ it "closes the GzipReader" do
257
+ expect(f_handle).to be_closed
258
+ end
259
+
260
+ it "returns GzipReader object" do
261
+ expect(f_handle).to be_an_instance_of Zlib::GzipReader
262
+ end
263
+ end
264
+
265
+ context "with a non-gzipped file" do
266
+ before(:each) do
267
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
268
+ end
269
+
270
+ it_behaves_like "parsing a fasta file"
271
+
272
+ it "doesn't close the File (approx regular file behavior)" do
273
+ expect(f_handle).not_to be_closed
274
+ end
275
+
276
+ it "returns FastaFile object" do
277
+ expect(f_handle).to be_a FastaFile
278
+ end
279
+ end
280
+ end
281
+ end
282
+
283
+ context "when input is a fastq file" do
284
+ let(:records) {
285
+ [["seq1", "AA CC TT GG"],
286
+ ["seq2 apples", "ACTG"]] }
287
+ let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } }
288
+
289
+ shared_examples_for "parsing a fastq file" do
290
+ it "yields only header & sequence" do
291
+ expect { |b|
292
+ SeqFile.open(@fname).each_record_fast(&b)
293
+ }.to yield_successive_args(records[0], records[1])
294
+ end
295
+
296
+ it "yields the sequence as a String class" do
297
+ SeqFile.open(@fname).each_record_fast do |_, seq, _, _|
298
+ expect(seq).to be_an_instance_of String
299
+ end
300
+ end
301
+ end
302
+
303
+ context "with a 4 line per record fastq file" do
304
+ describe "#each_record_fast" do
305
+ context "with a gzipped file" do
306
+ before(:each) do
307
+ @fname =
308
+ "#{File.dirname(__FILE__)}/../../test_files/test.fq.gz"
309
+ end
310
+
311
+ it_behaves_like "parsing a fastq file"
312
+
313
+ it "closes the GzipReader" do
314
+ expect(f_handle).to be_closed
315
+ end
316
+
317
+ it "returns GzipReader object" do
318
+ expect(f_handle).to be_an_instance_of Zlib::GzipReader
319
+ end
320
+ end
321
+
322
+ context "with a non-gzipped file" do
323
+ before(:each) do
324
+ @fname =
325
+ "#{File.dirname(__FILE__)}/../../test_files/test.fq"
326
+ end
327
+
328
+ it_behaves_like "parsing a fastq file"
329
+
330
+ it "doesn't close the SeqFile (approx reg file behav)" do
331
+ expect(f_handle).not_to be_closed
332
+ end
333
+
334
+ it "returns FastqFile object" do
335
+ expect(f_handle).to be_a FastqFile
336
+ end
337
+ end
338
+ end
339
+ end
340
+ end
341
+
342
+ context "when input is bogus" do
343
+ describe "#each_record_fast" do
344
+ it "raises an ArgumentError with message" do
345
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
346
+ err_msg = "Input does not look like FASTA or FASTQ"
347
+
348
+ expect { SeqFile.open(fname).each_record_fast do |h, s|
349
+ puts [h, s].join ' '
350
+ end
351
+ }.to raise_error(ArgumentError, err_msg)
352
+ end
353
+ end
354
+ end
355
+
356
+
216
357
  end
data/spec/spec_helper.rb CHANGED
@@ -32,6 +32,15 @@ module Helpers
32
32
  ["seq 4 > has many '>' in header", "ACTGactg"],
33
33
  ["empty seq at end", ""]]
34
34
 
35
+ RECORDS_FAST = [["empty seq at beginning", ""],
36
+ ["seq1 is fun", "AAC TGG NN N"],
37
+ ["seq2", "AATCCTGNNN"],
38
+ ["empty seq 1", ""],
39
+ ["empty seq 2", ""],
40
+ ["seq3", "yyyyyyyyyyyyyyyNNN"],
41
+ ["seq 4 > has many '>' in header", "ACTGactg"],
42
+ ["empty seq at end", ""]]
43
+
35
44
  RECORDS_MAP = {
36
45
  "empty seq at beginning" => "",
37
46
  "seq1 is fun" => "AACTGGNNN",
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parse_fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.2
4
+ version: 1.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore