parse_fasta 1.8.2 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +9 -0
- data/lib/parse_fasta/fasta_file.rb +36 -0
- data/lib/parse_fasta/fastq_file.rb +59 -0
- data/lib/parse_fasta/seq_file.rb +38 -0
- data/lib/parse_fasta/version.rb +1 -1
- data/spec/lib/fasta_file_spec.rb +61 -0
- data/spec/lib/fastq_file_spec.rb +29 -1
- data/spec/lib/seq_file_spec.rb +145 -4
- data/spec/spec_helper.rb +9 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YWQ1YWNkZmM2Zjc3ZTJkMGE3ZDllMDY4ZDE1MWM2MzAxNjM1MDNlOA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ZTkxYWEzZmRhZGM0NzJmYTM5YTZmMjM5OWIzMmU0YTI2ZjA1YjdkOA==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
Y2VkNDcyMTU3NTVjODAyNTNkNzJlMTkyOTY1ZGE5N2FhMWU0ZDgyOGI0YmJh
|
10
|
+
NGYxM2ExNzI2ZGQyYjAwZjNjNDhlYTRjMWRkNWY1Yzk5ZTlhM2I3NDg4ODJm
|
11
|
+
YWVhOTBiZGEyODI4M2M0NDJlOTEwN2VlNjNjZDMxZmU4ZWQxODM=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YzUxNTViM2NhNTQ2NjMyOWMwZmJjMGRiYjBjYzNkZmY1Y2Y1ZmQyNWUwMzkw
|
14
|
+
OGIwNzY5NTE0OTY4YTIxMDhiZjNiMjdiZDdiNGIxNmZhNjMxZTIzOTBhNzAz
|
15
|
+
NzcwMGRiZDBjNWMzOTVmOGUzMThhMTRkYzcwM2YzNjAwMTVmZmE=
|
data/README.md
CHANGED
@@ -66,6 +66,15 @@ Read fasta file into a hash.
|
|
66
66
|
|
67
67
|
## Versions ##
|
68
68
|
|
69
|
+
### 1.9.0 ###
|
70
|
+
|
71
|
+
Added "fast" versions of `each_record` methods
|
72
|
+
(`each_record_fast`). Basically, they return sequences and quality
|
73
|
+
strings as Ruby `String` objects instead of a `Sequence` or `Quality`
|
74
|
+
objects. Also, if the sequence or quality string has spaces, they will
|
75
|
+
be retained. If this is a problem, use the original `each_record`
|
76
|
+
methods.
|
77
|
+
|
69
78
|
### 1.8.2 ###
|
70
79
|
|
71
80
|
Speed up `FastqFile#each_record`.
|
@@ -137,6 +137,42 @@ class FastaFile < File
|
|
137
137
|
return f
|
138
138
|
end
|
139
139
|
|
140
|
+
# Fast version of #each_record
|
141
|
+
#
|
142
|
+
# Yields the sequence as a String, not Sequence. No separate lines
|
143
|
+
# option.
|
144
|
+
#
|
145
|
+
# @note If the fastA file has spaces in the sequence, they will be
|
146
|
+
# retained. If this is a problem, use #each_record instead.
|
147
|
+
#
|
148
|
+
# @yield The header and sequence for each record in the fasta
|
149
|
+
# file to the block
|
150
|
+
#
|
151
|
+
# @yieldparam header [String] The header of the fasta record without
|
152
|
+
# the leading '>'
|
153
|
+
#
|
154
|
+
# @yieldparam sequence [String] The sequence of the fasta record
|
155
|
+
#
|
156
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
|
157
|
+
def each_record_fast
|
158
|
+
begin
|
159
|
+
f = Zlib::GzipReader.open(self)
|
160
|
+
rescue Zlib::GzipFile::Error => e
|
161
|
+
f = self
|
162
|
+
end
|
163
|
+
|
164
|
+
f.each("\n>") do |line|
|
165
|
+
header, sequence = parse_line(line)
|
166
|
+
|
167
|
+
raise ParseFasta::SequenceFormatError if sequence.include? ">"
|
168
|
+
|
169
|
+
yield(header.strip, sequence)
|
170
|
+
end
|
171
|
+
|
172
|
+
f.close if f.instance_of?(Zlib::GzipReader)
|
173
|
+
return f
|
174
|
+
end
|
175
|
+
|
140
176
|
private
|
141
177
|
|
142
178
|
def parse_line(line)
|
@@ -96,4 +96,63 @@ class FastqFile < File
|
|
96
96
|
f.close if f.instance_of?(Zlib::GzipReader)
|
97
97
|
return f
|
98
98
|
end
|
99
|
+
|
100
|
+
# Fast version of #each_record
|
101
|
+
#
|
102
|
+
# @note If the fastQ file has spaces in the sequence, they will be
|
103
|
+
# retained. If this is a problem, use #each_record instead.
|
104
|
+
#
|
105
|
+
# @example Parsing a fastq file
|
106
|
+
# FastqFile.open('reads.fq').each_record_fast do |head, seq, desc, qual|
|
107
|
+
# # do some fun stuff here!
|
108
|
+
# end
|
109
|
+
# @example Use the same syntax for gzipped files!
|
110
|
+
# FastqFile.open('reads.fq.gz').each_record_fast do |head, seq, desc, qual|
|
111
|
+
# # do some fun stuff here!
|
112
|
+
# end
|
113
|
+
#
|
114
|
+
# @yield The header, sequence, description and quality string for
|
115
|
+
# each record in the fastq file to the block
|
116
|
+
#
|
117
|
+
# @yieldparam header [String] The header of the fastq record without
|
118
|
+
# the leading '@'
|
119
|
+
# @yieldparam sequence [String] The sequence of the fastq record
|
120
|
+
# @yieldparam description [String] The description line of the fastq
|
121
|
+
# record without the leading '+'
|
122
|
+
# @yieldparam quality_string [String] The quality string of the
|
123
|
+
# fastq record
|
124
|
+
def each_record_fast
|
125
|
+
count = 0
|
126
|
+
header = ''
|
127
|
+
sequence = ''
|
128
|
+
description = ''
|
129
|
+
quality = ''
|
130
|
+
|
131
|
+
begin
|
132
|
+
f = Zlib::GzipReader.open(self)
|
133
|
+
rescue Zlib::GzipFile::Error => e
|
134
|
+
f = self
|
135
|
+
end
|
136
|
+
|
137
|
+
f.each_line do |line|
|
138
|
+
line.chomp!
|
139
|
+
|
140
|
+
case count % 4
|
141
|
+
when 0
|
142
|
+
header = line[1..-1]
|
143
|
+
when 1
|
144
|
+
sequence = line
|
145
|
+
when 2
|
146
|
+
description = line[1..-1]
|
147
|
+
when 3
|
148
|
+
quality = line
|
149
|
+
yield(header, sequence, description, quality)
|
150
|
+
end
|
151
|
+
|
152
|
+
count += 1
|
153
|
+
end
|
154
|
+
|
155
|
+
f.close if f.instance_of?(Zlib::GzipReader)
|
156
|
+
return f
|
157
|
+
end
|
99
158
|
end
|
data/lib/parse_fasta/seq_file.rb
CHANGED
@@ -95,6 +95,44 @@ class SeqFile < File
|
|
95
95
|
end
|
96
96
|
end
|
97
97
|
|
98
|
+
# Fast version of #each_record
|
99
|
+
#
|
100
|
+
# @note If the sequence file has spaces in the sequence, they will
|
101
|
+
# be retained. If this is a problem, use #each_record instead.
|
102
|
+
#
|
103
|
+
# @example Parse a gzipped fastA file
|
104
|
+
# SeqFile.open('reads.fa.gz').each_record_fast do |head, seq|
|
105
|
+
# puts [head, seq.length].join "\t"
|
106
|
+
# end
|
107
|
+
#
|
108
|
+
# @example Parse an uncompressed fastQ file
|
109
|
+
#     SeqFile.open('reads.fq').each_record_fast do |head, seq|
|
110
|
+
# puts [head, seq.length].join "\t"
|
111
|
+
# end
|
112
|
+
#
|
113
|
+
# @yieldparam header [String] The header of the record without the
|
114
|
+
# leading '>' or '@'
|
115
|
+
#
|
116
|
+
# @yieldparam sequence [String] The sequence of the record.
|
117
|
+
#
|
118
|
+
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
|
119
|
+
# and file is a fastA file
|
120
|
+
def each_record_fast
|
121
|
+
first_char = get_first_char(self)
|
122
|
+
|
123
|
+
if first_char == '>'
|
124
|
+
FastaFile.open(self).each_record_fast do |header, sequence|
|
125
|
+
yield(header, sequence)
|
126
|
+
end
|
127
|
+
elsif first_char == '@'
|
128
|
+
FastqFile.open(self).each_record_fast do |head, seq, desc, qual|
|
129
|
+
yield(head, seq)
|
130
|
+
end
|
131
|
+
else
|
132
|
+
raise ArgumentError, "Input does not look like FASTA or FASTQ"
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
98
136
|
private
|
99
137
|
|
100
138
|
def get_first_char(f)
|
data/lib/parse_fasta/version.rb
CHANGED
data/spec/lib/fasta_file_spec.rb
CHANGED
@@ -148,4 +148,65 @@ describe FastaFile do
|
|
148
148
|
end
|
149
149
|
end
|
150
150
|
end
|
151
|
+
|
152
|
+
describe "#each_record_fast" do
|
153
|
+
let(:records) { Helpers::RECORDS_FAST }
|
154
|
+
|
155
|
+
let(:f_handle) { FastaFile.open(@fname).each_record_fast { |s| } }
|
156
|
+
|
157
|
+
context "with badly catted fasta" do
|
158
|
+
it "raises ParseFasta::SequenceFormatError" do
|
159
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
160
|
+
|
161
|
+
expect { FastaFile.open(fname).each_record_fast {} }.
|
162
|
+
to raise_error ParseFasta::SequenceFormatError
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
shared_examples_for "any FastaFile" do
|
167
|
+
it "yields proper header and sequence for each record" do
|
168
|
+
expect { |b|
|
169
|
+
FastaFile.open(@fname).each_record_fast(&b)
|
170
|
+
}.to yield_successive_args(*records)
|
171
|
+
end
|
172
|
+
|
173
|
+
it "yields the sequence as a String class" do
|
174
|
+
FastaFile.open(@fname).each_record_fast do |_, seq|
|
175
|
+
expect(seq).to be_an_instance_of String
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
context "with a gzipped file" do
|
181
|
+
before(:each) do
|
182
|
+
@fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
|
183
|
+
end
|
184
|
+
|
185
|
+
it_behaves_like "any FastaFile"
|
186
|
+
|
187
|
+
it "closes the GzipReader" do
|
188
|
+
expect(f_handle).to be_closed
|
189
|
+
end
|
190
|
+
|
191
|
+
it "returns GzipReader object" do
|
192
|
+
expect(f_handle).to be_an_instance_of Zlib::GzipReader
|
193
|
+
end
|
194
|
+
end
|
195
|
+
|
196
|
+
context "with a non-gzipped file" do
|
197
|
+
before(:each) do
|
198
|
+
@fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
|
199
|
+
end
|
200
|
+
|
201
|
+
it_behaves_like "any FastaFile"
|
202
|
+
|
203
|
+
it "doesn't close the FastqFile (approx regular file behavior)" do
|
204
|
+
expect(f_handle).not_to be_closed
|
205
|
+
end
|
206
|
+
|
207
|
+
it "returns FastaFile object" do
|
208
|
+
expect(f_handle).to be_an_instance_of FastaFile
|
209
|
+
end
|
210
|
+
end
|
211
|
+
end
|
151
212
|
end
|
data/spec/lib/fastq_file_spec.rb
CHANGED
@@ -21,7 +21,14 @@ require 'spec_helper'
|
|
21
21
|
describe FastqFile do
|
22
22
|
let(:records) {
|
23
23
|
[["seq1", "AACCTTGG", "", ")#3gTqN8"],
|
24
|
-
["seq2 apples", "ACTG", "seq2 apples", "*ujM"]]
|
24
|
+
["seq2 apples", "ACTG", "seq2 apples", "*ujM"]]
|
25
|
+
}
|
26
|
+
|
27
|
+
let(:records_fast) {
|
28
|
+
[["seq1", "AA CC TT GG", "", ")# 3g Tq N8"],
|
29
|
+
["seq2 apples", "ACTG", "seq2 apples", "*ujM"]]
|
30
|
+
}
|
31
|
+
|
25
32
|
let(:f_handle) { FastqFile.open(@fname).each_record { |s| } }
|
26
33
|
|
27
34
|
|
@@ -45,6 +52,27 @@ describe FastqFile do
|
|
45
52
|
end
|
46
53
|
end
|
47
54
|
|
55
|
+
describe "#each_record_fast" do
|
56
|
+
before(:each) do
|
57
|
+
@fname = "#{File.dirname(__FILE__)}/../../test_files/test.fq.gz"
|
58
|
+
end
|
59
|
+
|
60
|
+
it "yields proper header, sequence, description, and quality" do
|
61
|
+
expect { |b|
|
62
|
+
FastqFile.open(@fname).each_record_fast(&b)
|
63
|
+
}.to yield_successive_args(records_fast[0], records_fast[1])
|
64
|
+
end
|
65
|
+
|
66
|
+
it "yields all params as String" do
|
67
|
+
FastqFile.open(@fname).each_record_fast do |h, s, d, q|
|
68
|
+
expect(h).to be_an_instance_of String
|
69
|
+
expect(s).to be_an_instance_of String
|
70
|
+
expect(d).to be_an_instance_of String
|
71
|
+
expect(q).to be_an_instance_of String
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
48
76
|
describe "#to_hash" do
|
49
77
|
let(:records) {
|
50
78
|
{ "seq1" => { head: "seq1",
|
data/spec/lib/seq_file_spec.rb
CHANGED
@@ -79,9 +79,8 @@ describe SeqFile do
|
|
79
79
|
end
|
80
80
|
end
|
81
81
|
|
82
|
-
|
83
|
-
|
84
|
-
context "when input is a fasta file" do
|
82
|
+
context "when input is a fasta file" do
|
83
|
+
describe "#each_record" do
|
85
84
|
let(:records) { Helpers::RECORDS }
|
86
85
|
|
87
86
|
let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
|
@@ -200,8 +199,10 @@ describe SeqFile do
|
|
200
199
|
end
|
201
200
|
end
|
202
201
|
end
|
202
|
+
end
|
203
203
|
|
204
|
-
|
204
|
+
context "when input is bogus" do
|
205
|
+
describe "#each_record" do
|
205
206
|
it "raises an ArgumentError with message" do
|
206
207
|
fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
|
207
208
|
err_msg = "Input does not look like FASTA or FASTQ"
|
@@ -213,4 +214,144 @@ describe SeqFile do
|
|
213
214
|
end
|
214
215
|
end
|
215
216
|
end
|
217
|
+
|
218
|
+
#####
|
219
|
+
|
220
|
+
context "when input is a fasta file" do
|
221
|
+
describe "#each_record_fast" do
|
222
|
+
let(:records) { Helpers::RECORDS_FAST }
|
223
|
+
|
224
|
+
let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } }
|
225
|
+
|
226
|
+
context "with badly catted fasta" do
|
227
|
+
it "raises ParseFasta::SequenceFormatError" do
|
228
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
229
|
+
|
230
|
+
expect { FastaFile.open(fname).to_hash }.
|
231
|
+
to raise_error ParseFasta::SequenceFormatError
|
232
|
+
end
|
233
|
+
end
|
234
|
+
|
235
|
+
shared_examples_for "parsing a fasta file" do
|
236
|
+
it "yields proper header and sequence for each record" do
|
237
|
+
expect { |b|
|
238
|
+
SeqFile.open(@fname).each_record_fast(&b)
|
239
|
+
}.to yield_successive_args(*records)
|
240
|
+
end
|
241
|
+
|
242
|
+
it "yields the sequence as a String class" do
|
243
|
+
SeqFile.open(@fname).each_record_fast do |_, seq|
|
244
|
+
expect(seq).to be_an_instance_of String
|
245
|
+
end
|
246
|
+
end
|
247
|
+
end
|
248
|
+
|
249
|
+
context "with a gzipped file" do
|
250
|
+
before(:each) do
|
251
|
+
@fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
|
252
|
+
end
|
253
|
+
|
254
|
+
it_behaves_like "parsing a fasta file"
|
255
|
+
|
256
|
+
it "closes the GzipReader" do
|
257
|
+
expect(f_handle).to be_closed
|
258
|
+
end
|
259
|
+
|
260
|
+
it "returns GzipReader object" do
|
261
|
+
expect(f_handle).to be_an_instance_of Zlib::GzipReader
|
262
|
+
end
|
263
|
+
end
|
264
|
+
|
265
|
+
context "with a non-gzipped file" do
|
266
|
+
before(:each) do
|
267
|
+
@fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
|
268
|
+
end
|
269
|
+
|
270
|
+
it_behaves_like "parsing a fasta file"
|
271
|
+
|
272
|
+
it "doesn't close the File (approx regular file behavior)" do
|
273
|
+
expect(f_handle).not_to be_closed
|
274
|
+
end
|
275
|
+
|
276
|
+
it "returns FastaFile object" do
|
277
|
+
expect(f_handle).to be_a FastaFile
|
278
|
+
end
|
279
|
+
end
|
280
|
+
end
|
281
|
+
end
|
282
|
+
|
283
|
+
context "when input is a fastq file" do
|
284
|
+
let(:records) {
|
285
|
+
[["seq1", "AA CC TT GG"],
|
286
|
+
["seq2 apples", "ACTG"]] }
|
287
|
+
let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } }
|
288
|
+
|
289
|
+
shared_examples_for "parsing a fastq file" do
|
290
|
+
it "yields only header & sequence" do
|
291
|
+
expect { |b|
|
292
|
+
SeqFile.open(@fname).each_record_fast(&b)
|
293
|
+
}.to yield_successive_args(records[0], records[1])
|
294
|
+
end
|
295
|
+
|
296
|
+
it "yields the sequence as a String class" do
|
297
|
+
SeqFile.open(@fname).each_record_fast do |_, seq, _, _|
|
298
|
+
expect(seq).to be_an_instance_of String
|
299
|
+
end
|
300
|
+
end
|
301
|
+
end
|
302
|
+
|
303
|
+
context "with a 4 line per record fastq file" do
|
304
|
+
describe "#each_record_fast" do
|
305
|
+
context "with a gzipped file" do
|
306
|
+
before(:each) do
|
307
|
+
@fname =
|
308
|
+
"#{File.dirname(__FILE__)}/../../test_files/test.fq.gz"
|
309
|
+
end
|
310
|
+
|
311
|
+
it_behaves_like "parsing a fastq file"
|
312
|
+
|
313
|
+
it "closes the GzipReader" do
|
314
|
+
expect(f_handle).to be_closed
|
315
|
+
end
|
316
|
+
|
317
|
+
it "returns GzipReader object" do
|
318
|
+
expect(f_handle).to be_an_instance_of Zlib::GzipReader
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
322
|
+
context "with a non-gzipped file" do
|
323
|
+
before(:each) do
|
324
|
+
@fname =
|
325
|
+
"#{File.dirname(__FILE__)}/../../test_files/test.fq"
|
326
|
+
end
|
327
|
+
|
328
|
+
it_behaves_like "parsing a fastq file"
|
329
|
+
|
330
|
+
it "doesn't close the SeqFile (approx reg file behav)" do
|
331
|
+
expect(f_handle).not_to be_closed
|
332
|
+
end
|
333
|
+
|
334
|
+
it "returns FastqFile object" do
|
335
|
+
expect(f_handle).to be_a FastqFile
|
336
|
+
end
|
337
|
+
end
|
338
|
+
end
|
339
|
+
end
|
340
|
+
end
|
341
|
+
|
342
|
+
context "when input is bogus" do
|
343
|
+
describe "#each_record_fast" do
|
344
|
+
it "raises an ArgumentError with message" do
|
345
|
+
fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
|
346
|
+
err_msg = "Input does not look like FASTA or FASTQ"
|
347
|
+
|
348
|
+
expect { SeqFile.open(fname).each_record_fast do |h, s|
|
349
|
+
puts [h, s].join ' '
|
350
|
+
end
|
351
|
+
}.to raise_error(ArgumentError, err_msg)
|
352
|
+
end
|
353
|
+
end
|
354
|
+
end
|
355
|
+
|
356
|
+
|
216
357
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -32,6 +32,15 @@ module Helpers
|
|
32
32
|
["seq 4 > has many '>' in header", "ACTGactg"],
|
33
33
|
["empty seq at end", ""]]
|
34
34
|
|
35
|
+
RECORDS_FAST = [["empty seq at beginning", ""],
|
36
|
+
["seq1 is fun", "AAC TGG NN N"],
|
37
|
+
["seq2", "AATCCTGNNN"],
|
38
|
+
["empty seq 1", ""],
|
39
|
+
["empty seq 2", ""],
|
40
|
+
["seq3", "yyyyyyyyyyyyyyyNNN"],
|
41
|
+
["seq 4 > has many '>' in header", "ACTGactg"],
|
42
|
+
["empty seq at end", ""]]
|
43
|
+
|
35
44
|
RECORDS_MAP = {
|
36
45
|
"empty seq at beginning" => "",
|
37
46
|
"seq1 is fun" => "AACTGGNNN",
|