parse_fasta 1.8.2 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MDE1YThkNzUyNzI0MTMwZDMyYzBiNzFiODMzZGQzNzQ5ODU3ZTk1MA==
4
+ YWQ1YWNkZmM2Zjc3ZTJkMGE3ZDllMDY4ZDE1MWM2MzAxNjM1MDNlOA==
5
5
  data.tar.gz: !binary |-
6
- NjBmZGUxZTdkM2UyZTQ4YWY1MDliMTI0OTJlYjA5ZDFmMzg4OWRlZQ==
6
+ ZTkxYWEzZmRhZGM0NzJmYTM5YTZmMjM5OWIzMmU0YTI2ZjA1YjdkOA==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MjIxNGNlODdkNTk3ZWE1ZDk1Zjg4ZDY0ZWE3NzE0ZWI0ODQ4MDZjZTk1MDY1
10
- OGNiOGViOWYwZDU5ODY0YjNmZWY1ODYwOGVlN2E5MTVmYzZlZmIwNzE4MjNi
11
- YWFjMjgzNGQ4YmMzODdjYjZjNTBmYTM4MWFiYTcyYjlmZWFhYWM=
9
+ Y2VkNDcyMTU3NTVjODAyNTNkNzJlMTkyOTY1ZGE5N2FhMWU0ZDgyOGI0YmJh
10
+ NGYxM2ExNzI2ZGQyYjAwZjNjNDhlYTRjMWRkNWY1Yzk5ZTlhM2I3NDg4ODJm
11
+ YWVhOTBiZGEyODI4M2M0NDJlOTEwN2VlNjNjZDMxZmU4ZWQxODM=
12
12
  data.tar.gz: !binary |-
13
- YTNiMTYzNmJhODkzMjEyMjBlOTgxOGIyMjFmMTFlOTE0NTEyOWZjNTgxMTRj
14
- N2Y4YjE3NWUxMjYyNTRjNTYzZGE3MjBhNjJjZTNmNjRkYzY5ZGI2MGY0MjQz
15
- N2RhMDUxN2E1MjY0NDZkOWQyMjEzYTU2ZDE4M2FlZDg3YzA0N2M=
13
+ YzUxNTViM2NhNTQ2NjMyOWMwZmJjMGRiYjBjYzNkZmY1Y2Y1ZmQyNWUwMzkw
14
+ OGIwNzY5NTE0OTY4YTIxMDhiZjNiMjdiZDdiNGIxNmZhNjMxZTIzOTBhNzAz
15
+ NzcwMGRiZDBjNWMzOTVmOGUzMThhMTRkYzcwM2YzNjAwMTVmZmE=
data/README.md CHANGED
@@ -66,6 +66,15 @@ Read fasta file into a hash.
66
66
 
67
67
  ## Versions ##
68
68
 
69
+ ### 1.9.0 ###
70
+
71
+ Added "fast" versions of `each_record` methods
72
+ (`each_record_fast`). Basically, they return sequences and quality
73
+ strings as Ruby `String` objects instead of a `Sequence` or `Quality`
74
+ objects. Also, if the sequence or quality string has spaces, they will
75
+ be retained. If this is a problem, use the original `each_record`
76
+ methods.
77
+
69
78
  ### 1.8.2 ###
70
79
 
71
80
  Speed up `FastqFile#each_record`.
@@ -137,6 +137,42 @@ class FastaFile < File
137
137
  return f
138
138
  end
139
139
 
140
+ # Fast version of #each_record
141
+ #
142
+ # Yields the sequence as a String, not Sequence. No separate lines
143
+ # option.
144
+ #
145
+ # @note If the fastA file has spaces in the sequence, they will be
146
+ # retained. If this is a problem, use #each_record instead.
147
+ #
148
+ # @yield The header and sequence for each record in the fasta
149
+ # file to the block
150
+ #
151
+ # @yieldparam header [String] The header of the fasta record without
152
+ # the leading '>'
153
+ #
154
+ # @yieldparam sequence [String] The sequence of the fasta record
155
+ #
156
+ # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
157
+ def each_record_fast
158
+ begin
159
+ f = Zlib::GzipReader.open(self)
160
+ rescue Zlib::GzipFile::Error => e
161
+ f = self
162
+ end
163
+
164
+ f.each("\n>") do |line|
165
+ header, sequence = parse_line(line)
166
+
167
+ raise ParseFasta::SequenceFormatError if sequence.include? ">"
168
+
169
+ yield(header.strip, sequence)
170
+ end
171
+
172
+ f.close if f.instance_of?(Zlib::GzipReader)
173
+ return f
174
+ end
175
+
140
176
  private
141
177
 
142
178
  def parse_line(line)
@@ -96,4 +96,63 @@ class FastqFile < File
96
96
  f.close if f.instance_of?(Zlib::GzipReader)
97
97
  return f
98
98
  end
99
+
100
+ # Fast version of #each_record
101
+ #
102
+ # @note If the fastQ file has spaces in the sequence, they will be
103
+ # retained. If this is a problem, use #each_record instead.
104
+ #
105
+ # @example Parsing a fastq file
106
+ # FastqFile.open('reads.fq').each_record_fast do |head, seq, desc, qual|
107
+ # # do some fun stuff here!
108
+ # end
109
+ # @example Use the same syntax for gzipped files!
110
+ # FastqFile.open('reads.fq.gz').each_record_fast do |head, seq, desc, qual|
111
+ # # do some fun stuff here!
112
+ # end
113
+ #
114
+ # @yield The header, sequence, description and quality string for
115
+ # each record in the fastq file to the block
116
+ #
117
+ # @yieldparam header [String] The header of the fastq record without
118
+ # the leading '@'
119
+ # @yieldparam sequence [String] The sequence of the fastq record
120
+ # @yieldparam description [String] The description line of the fastq
121
+ # record without the leading '+'
122
+ # @yieldparam quality_string [String] The quality string of the
123
+ # fastq record
124
+ def each_record_fast
125
+ count = 0
126
+ header = ''
127
+ sequence = ''
128
+ description = ''
129
+ quality = ''
130
+
131
+ begin
132
+ f = Zlib::GzipReader.open(self)
133
+ rescue Zlib::GzipFile::Error => e
134
+ f = self
135
+ end
136
+
137
+ f.each_line do |line|
138
+ line.chomp!
139
+
140
+ case count % 4
141
+ when 0
142
+ header = line[1..-1]
143
+ when 1
144
+ sequence = line
145
+ when 2
146
+ description = line[1..-1]
147
+ when 3
148
+ quality = line
149
+ yield(header, sequence, description, quality)
150
+ end
151
+
152
+ count += 1
153
+ end
154
+
155
+ f.close if f.instance_of?(Zlib::GzipReader)
156
+ return f
157
+ end
99
158
  end
@@ -95,6 +95,44 @@ class SeqFile < File
95
95
  end
96
96
  end
97
97
 
98
+ # Fast version of #each_record
99
+ #
100
+ # @note If the sequence file has spaces in the sequence, they will
101
+ # be retained. If this is a problem, use #each_record instead.
102
+ #
103
+ # @example Parse a gzipped fastA file
104
+ # SeqFile.open('reads.fa.gz').each_record_fast do |head, seq|
105
+ # puts [head, seq.length].join "\t"
106
+ # end
107
+ #
108
+ # @example Parse an uncompressed fastQ file
109
+ # SeqFile.open('reads.fq').each_record_fast do |head, seq|
110
+ # puts [head, seq.length].join "\t"
111
+ # end
112
+ #
113
+ # @yieldparam header [String] The header of the record without the
114
+ # leading '>' or '@'
115
+ #
116
+ # @yieldparam sequence [String] The sequence of the record.
117
+ #
118
+ # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
119
+ # and file is a fastA file
120
+ def each_record_fast
121
+ first_char = get_first_char(self)
122
+
123
+ if first_char == '>'
124
+ FastaFile.open(self).each_record_fast do |header, sequence|
125
+ yield(header, sequence)
126
+ end
127
+ elsif first_char == '@'
128
+ FastqFile.open(self).each_record_fast do |head, seq, desc, qual|
129
+ yield(head, seq)
130
+ end
131
+ else
132
+ raise ArgumentError, "Input does not look like FASTA or FASTQ"
133
+ end
134
+ end
135
+
98
136
  private
99
137
 
100
138
  def get_first_char(f)
@@ -17,5 +17,5 @@
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
19
  module ParseFasta
20
- VERSION = "1.8.2"
20
+ VERSION = "1.9.0"
21
21
  end
@@ -148,4 +148,65 @@ describe FastaFile do
148
148
  end
149
149
  end
150
150
  end
151
+
152
+ describe "#each_record_fast" do
153
+ let(:records) { Helpers::RECORDS_FAST }
154
+
155
+ let(:f_handle) { FastaFile.open(@fname).each_record_fast { |s| } }
156
+
157
+ context "with badly catted fasta" do
158
+ it "raises ParseFasta::SequenceFormatError" do
159
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
160
+
161
+ expect { FastaFile.open(fname).each_record_fast {} }.
162
+ to raise_error ParseFasta::SequenceFormatError
163
+ end
164
+ end
165
+
166
+ shared_examples_for "any FastaFile" do
167
+ it "yields proper header and sequence for each record" do
168
+ expect { |b|
169
+ FastaFile.open(@fname).each_record_fast(&b)
170
+ }.to yield_successive_args(*records)
171
+ end
172
+
173
+ it "yields the sequence as a String class" do
174
+ FastaFile.open(@fname).each_record_fast do |_, seq|
175
+ expect(seq).to be_an_instance_of String
176
+ end
177
+ end
178
+ end
179
+
180
+ context "with a gzipped file" do
181
+ before(:each) do
182
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
183
+ end
184
+
185
+ it_behaves_like "any FastaFile"
186
+
187
+ it "closes the GzipReader" do
188
+ expect(f_handle).to be_closed
189
+ end
190
+
191
+ it "returns GzipReader object" do
192
+ expect(f_handle).to be_an_instance_of Zlib::GzipReader
193
+ end
194
+ end
195
+
196
+ context "with a non-gzipped file" do
197
+ before(:each) do
198
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
199
+ end
200
+
201
+ it_behaves_like "any FastaFile"
202
+
203
+ it "doesn't close the FastqFile (approx regular file behavior)" do
204
+ expect(f_handle).not_to be_closed
205
+ end
206
+
207
+ it "returns FastaFile object" do
208
+ expect(f_handle).to be_an_instance_of FastaFile
209
+ end
210
+ end
211
+ end
151
212
  end
@@ -21,7 +21,14 @@ require 'spec_helper'
21
21
  describe FastqFile do
22
22
  let(:records) {
23
23
  [["seq1", "AACCTTGG", "", ")#3gTqN8"],
24
- ["seq2 apples", "ACTG", "seq2 apples", "*ujM"]] }
24
+ ["seq2 apples", "ACTG", "seq2 apples", "*ujM"]]
25
+ }
26
+
27
+ let(:records_fast) {
28
+ [["seq1", "AA CC TT GG", "", ")# 3g Tq N8"],
29
+ ["seq2 apples", "ACTG", "seq2 apples", "*ujM"]]
30
+ }
31
+
25
32
  let(:f_handle) { FastqFile.open(@fname).each_record { |s| } }
26
33
 
27
34
 
@@ -45,6 +52,27 @@ describe FastqFile do
45
52
  end
46
53
  end
47
54
 
55
+ describe "#each_record_fast" do
56
+ before(:each) do
57
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fq.gz"
58
+ end
59
+
60
+ it "yields proper header, sequence, description, and quality" do
61
+ expect { |b|
62
+ FastqFile.open(@fname).each_record_fast(&b)
63
+ }.to yield_successive_args(records_fast[0], records_fast[1])
64
+ end
65
+
66
+ it "yields all params as String" do
67
+ FastqFile.open(@fname).each_record_fast do |h, s, d, q|
68
+ expect(h).to be_an_instance_of String
69
+ expect(s).to be_an_instance_of String
70
+ expect(d).to be_an_instance_of String
71
+ expect(q).to be_an_instance_of String
72
+ end
73
+ end
74
+ end
75
+
48
76
  describe "#to_hash" do
49
77
  let(:records) {
50
78
  { "seq1" => { head: "seq1",
@@ -79,9 +79,8 @@ describe SeqFile do
79
79
  end
80
80
  end
81
81
 
82
- describe "#each_record" do
83
-
84
- context "when input is a fasta file" do
82
+ context "when input is a fasta file" do
83
+ describe "#each_record" do
85
84
  let(:records) { Helpers::RECORDS }
86
85
 
87
86
  let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
@@ -200,8 +199,10 @@ describe SeqFile do
200
199
  end
201
200
  end
202
201
  end
202
+ end
203
203
 
204
- context "when input is bogus" do
204
+ context "when input is bogus" do
205
+ describe "#each_record" do
205
206
  it "raises an ArgumentError with message" do
206
207
  fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
207
208
  err_msg = "Input does not look like FASTA or FASTQ"
@@ -213,4 +214,144 @@ describe SeqFile do
213
214
  end
214
215
  end
215
216
  end
217
+
218
+ #####
219
+
220
+ context "when input is a fasta file" do
221
+ describe "#each_record_fast" do
222
+ let(:records) { Helpers::RECORDS_FAST }
223
+
224
+ let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } }
225
+
226
+ context "with badly catted fasta" do
227
+ it "raises ParseFasta::SequenceFormatError" do
228
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
229
+
230
+ expect { FastaFile.open(fname).to_hash }.
231
+ to raise_error ParseFasta::SequenceFormatError
232
+ end
233
+ end
234
+
235
+ shared_examples_for "parsing a fasta file" do
236
+ it "yields proper header and sequence for each record" do
237
+ expect { |b|
238
+ SeqFile.open(@fname).each_record_fast(&b)
239
+ }.to yield_successive_args(*records)
240
+ end
241
+
242
+ it "yields the sequence as a String class" do
243
+ SeqFile.open(@fname).each_record_fast do |_, seq|
244
+ expect(seq).to be_an_instance_of String
245
+ end
246
+ end
247
+ end
248
+
249
+ context "with a gzipped file" do
250
+ before(:each) do
251
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
252
+ end
253
+
254
+ it_behaves_like "parsing a fasta file"
255
+
256
+ it "closes the GzipReader" do
257
+ expect(f_handle).to be_closed
258
+ end
259
+
260
+ it "returns GzipReader object" do
261
+ expect(f_handle).to be_an_instance_of Zlib::GzipReader
262
+ end
263
+ end
264
+
265
+ context "with a non-gzipped file" do
266
+ before(:each) do
267
+ @fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
268
+ end
269
+
270
+ it_behaves_like "parsing a fasta file"
271
+
272
+ it "doesn't close the File (approx regular file behavior)" do
273
+ expect(f_handle).not_to be_closed
274
+ end
275
+
276
+ it "returns FastaFile object" do
277
+ expect(f_handle).to be_a FastaFile
278
+ end
279
+ end
280
+ end
281
+ end
282
+
283
+ context "when input is a fastq file" do
284
+ let(:records) {
285
+ [["seq1", "AA CC TT GG"],
286
+ ["seq2 apples", "ACTG"]] }
287
+ let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } }
288
+
289
+ shared_examples_for "parsing a fastq file" do
290
+ it "yields only header & sequence" do
291
+ expect { |b|
292
+ SeqFile.open(@fname).each_record_fast(&b)
293
+ }.to yield_successive_args(records[0], records[1])
294
+ end
295
+
296
+ it "yields the sequence as a String class" do
297
+ SeqFile.open(@fname).each_record_fast do |_, seq, _, _|
298
+ expect(seq).to be_an_instance_of String
299
+ end
300
+ end
301
+ end
302
+
303
+ context "with a 4 line per record fastq file" do
304
+ describe "#each_record_fast" do
305
+ context "with a gzipped file" do
306
+ before(:each) do
307
+ @fname =
308
+ "#{File.dirname(__FILE__)}/../../test_files/test.fq.gz"
309
+ end
310
+
311
+ it_behaves_like "parsing a fastq file"
312
+
313
+ it "closes the GzipReader" do
314
+ expect(f_handle).to be_closed
315
+ end
316
+
317
+ it "returns GzipReader object" do
318
+ expect(f_handle).to be_an_instance_of Zlib::GzipReader
319
+ end
320
+ end
321
+
322
+ context "with a non-gzipped file" do
323
+ before(:each) do
324
+ @fname =
325
+ "#{File.dirname(__FILE__)}/../../test_files/test.fq"
326
+ end
327
+
328
+ it_behaves_like "parsing a fastq file"
329
+
330
+ it "doesn't close the SeqFile (approx reg file behav)" do
331
+ expect(f_handle).not_to be_closed
332
+ end
333
+
334
+ it "returns FastqFile object" do
335
+ expect(f_handle).to be_a FastqFile
336
+ end
337
+ end
338
+ end
339
+ end
340
+ end
341
+
342
+ context "when input is bogus" do
343
+ describe "#each_record_fast" do
344
+ it "raises an ArgumentError with message" do
345
+ fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
346
+ err_msg = "Input does not look like FASTA or FASTQ"
347
+
348
+ expect { SeqFile.open(fname).each_record_fast do |h, s|
349
+ puts [h, s].join ' '
350
+ end
351
+ }.to raise_error(ArgumentError, err_msg)
352
+ end
353
+ end
354
+ end
355
+
356
+
216
357
  end
data/spec/spec_helper.rb CHANGED
@@ -32,6 +32,15 @@ module Helpers
32
32
  ["seq 4 > has many '>' in header", "ACTGactg"],
33
33
  ["empty seq at end", ""]]
34
34
 
35
+ RECORDS_FAST = [["empty seq at beginning", ""],
36
+ ["seq1 is fun", "AAC TGG NN N"],
37
+ ["seq2", "AATCCTGNNN"],
38
+ ["empty seq 1", ""],
39
+ ["empty seq 2", ""],
40
+ ["seq3", "yyyyyyyyyyyyyyyNNN"],
41
+ ["seq 4 > has many '>' in header", "ACTGactg"],
42
+ ["empty seq at end", ""]]
43
+
35
44
  RECORDS_MAP = {
36
45
  "empty seq at beginning" => "",
37
46
  "seq1 is fun" => "AACTGGNNN",
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parse_fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.2
4
+ version: 1.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore