parse_fasta 1.9.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +178 -0
- data/README.md +42 -215
- data/Rakefile +2 -4
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/parse_fasta/error.rb +39 -0
- data/lib/parse_fasta/record.rb +88 -0
- data/lib/parse_fasta/seq_file.rb +221 -114
- data/lib/parse_fasta/version.rb +2 -2
- data/lib/parse_fasta.rb +5 -20
- data/spec/parse_fasta/record_spec.rb +115 -0
- data/spec/parse_fasta/seq_file_spec.rb +238 -0
- data/spec/parse_fasta_spec.rb +25 -0
- data/spec/spec_helper.rb +2 -44
- data/spec/test_files/cr.fa +1 -0
- data/spec/test_files/cr.fa.gz +0 -0
- data/spec/test_files/cr.fq +3 -0
- data/spec/test_files/cr.fq.gz +0 -0
- data/spec/test_files/cr_nl.fa +4 -0
- data/spec/test_files/cr_nl.fa.gz +0 -0
- data/spec/test_files/cr_nl.fq +8 -0
- data/spec/test_files/cr_nl.fq.gz +0 -0
- data/spec/test_files/multi_blob.fa.gz +0 -0
- data/spec/test_files/multi_blob.fq.gz +0 -0
- data/spec/test_files/not_a_seq_file.txt +1 -0
- data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
- data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
- data/spec/test_files/seqs.fa.gz +0 -0
- data/spec/test_files/seqs.fq +8 -0
- data/spec/test_files/seqs.fq.gz +0 -0
- metadata +49 -24
- data/lib/parse_fasta/fasta_file.rb +0 -232
- data/lib/parse_fasta/fastq_file.rb +0 -160
- data/lib/parse_fasta/quality.rb +0 -54
- data/lib/parse_fasta/sequence.rb +0 -174
- data/spec/lib/fasta_file_spec.rb +0 -212
- data/spec/lib/fastq_file_spec.rb +0 -143
- data/spec/lib/quality_spec.rb +0 -51
- data/spec/lib/seq_file_spec.rb +0 -357
- data/spec/lib/sequence_spec.rb +0 -188
- data/test_files/benchmark.rb +0 -99
- data/test_files/bogus.txt +0 -2
- data/test_files/test.fa.gz +0 -0
- data/test_files/test.fq +0 -8
- data/test_files/test.fq.gz +0 -0
data/spec/lib/seq_file_spec.rb
DELETED
@@ -1,357 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
require 'spec_helper'
|
20
|
-
|
21
|
-
describe SeqFile do
|
22
|
-
|
23
|
-
describe "#to_hash" do
|
24
|
-
context "when input is a fasta file" do
|
25
|
-
let(:records) { Helpers::RECORDS_MAP }
|
26
|
-
let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" }
|
27
|
-
let(:fasta) { SeqFile.open(fname) }
|
28
|
-
|
29
|
-
context "with badly catted fasta" do
|
30
|
-
it "raises ParseFasta::SequenceFormatError" do
|
31
|
-
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
32
|
-
|
33
|
-
expect { FastaFile.open(fname).to_hash }.
|
34
|
-
to raise_error ParseFasta::SequenceFormatError
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
it "reads the records into a hash: header as key and seq as val" do
|
39
|
-
expect(fasta.to_hash).to eq records
|
40
|
-
end
|
41
|
-
|
42
|
-
it "passes the values as Sequence objects" do
|
43
|
-
expect(
|
44
|
-
fasta.to_hash.values.all? { |val| val.instance_of? Sequence }
|
45
|
-
).to eq true
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
context "when input is a fastq file" do
|
50
|
-
let(:records) {
|
51
|
-
{ "seq1" => { head: "seq1",
|
52
|
-
seq: "AACCTTGG",
|
53
|
-
desc: "",
|
54
|
-
qual: ")#3gTqN8" },
|
55
|
-
"seq2 apples" => { head: "seq2 apples",
|
56
|
-
seq: "ACTG",
|
57
|
-
desc: "seq2 apples",
|
58
|
-
qual: "*ujM" }
|
59
|
-
}
|
60
|
-
}
|
61
|
-
let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fq.gz" }
|
62
|
-
let(:fastq) { SeqFile.open(fname) }
|
63
|
-
|
64
|
-
it "reads the records into a hash: header as key and seq as val" do
|
65
|
-
expect(fastq.to_hash).to eq records
|
66
|
-
end
|
67
|
-
|
68
|
-
it "passes the seqs as Sequence objects" do
|
69
|
-
expect(
|
70
|
-
fastq.to_hash.values.all? { |val| val[:seq].instance_of? Sequence }
|
71
|
-
).to eq true
|
72
|
-
end
|
73
|
-
|
74
|
-
it "passes the quals as Quality objects" do
|
75
|
-
expect(
|
76
|
-
fastq.to_hash.values.all? { |val| val[:qual].instance_of? Quality }
|
77
|
-
).to eq true
|
78
|
-
end
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
context "when input is a fasta file" do
|
83
|
-
describe "#each_record" do
|
84
|
-
let(:records) { Helpers::RECORDS }
|
85
|
-
|
86
|
-
let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
|
87
|
-
|
88
|
-
context "with badly catted fasta" do
|
89
|
-
it "raises ParseFasta::SequenceFormatError" do
|
90
|
-
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
91
|
-
|
92
|
-
expect { FastaFile.open(fname).to_hash }.
|
93
|
-
to raise_error ParseFasta::SequenceFormatError
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
shared_examples_for "parsing a fasta file" do
|
98
|
-
it "yields proper header and sequence for each record" do
|
99
|
-
expect { |b|
|
100
|
-
SeqFile.open(@fname).each_record(&b)
|
101
|
-
}.to yield_successive_args(*records)
|
102
|
-
end
|
103
|
-
|
104
|
-
it "yields the sequence as a Sequence class" do
|
105
|
-
SeqFile.open(@fname).each_record do |_, seq|
|
106
|
-
expect(seq).to be_an_instance_of Sequence
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
|
111
|
-
context "with a gzipped file" do
|
112
|
-
before(:each) do
|
113
|
-
@fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
|
114
|
-
end
|
115
|
-
|
116
|
-
it_behaves_like "parsing a fasta file"
|
117
|
-
|
118
|
-
it "closes the GzipReader" do
|
119
|
-
expect(f_handle).to be_closed
|
120
|
-
end
|
121
|
-
|
122
|
-
it "returns GzipReader object" do
|
123
|
-
expect(f_handle).to be_an_instance_of Zlib::GzipReader
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
context "with a non-gzipped file" do
|
128
|
-
before(:each) do
|
129
|
-
@fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
|
130
|
-
end
|
131
|
-
|
132
|
-
it_behaves_like "parsing a fasta file"
|
133
|
-
|
134
|
-
it "doesn't close the File (approx regular file behavior)" do
|
135
|
-
expect(f_handle).not_to be_closed
|
136
|
-
end
|
137
|
-
|
138
|
-
it "returns FastaFile object" do
|
139
|
-
expect(f_handle).to be_a FastaFile
|
140
|
-
end
|
141
|
-
end
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
context "when input is a fastq file" do
|
146
|
-
let(:records) {
|
147
|
-
[["seq1", "AACCTTGG"],
|
148
|
-
["seq2 apples", "ACTG"]] }
|
149
|
-
let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
|
150
|
-
|
151
|
-
shared_examples_for "parsing a fastq file" do
|
152
|
-
it "yields only header & sequence" do
|
153
|
-
expect { |b|
|
154
|
-
SeqFile.open(@fname).each_record(&b)
|
155
|
-
}.to yield_successive_args(records[0], records[1])
|
156
|
-
end
|
157
|
-
|
158
|
-
it "yields the sequence as a Sequence class" do
|
159
|
-
SeqFile.open(@fname).each_record do |_, seq, _, _|
|
160
|
-
expect(seq).to be_an_instance_of Sequence
|
161
|
-
end
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
|
-
context "with a 4 line per record fastq file" do
|
166
|
-
describe "#each_record" do
|
167
|
-
context "with a gzipped file" do
|
168
|
-
before(:each) do
|
169
|
-
@fname =
|
170
|
-
"#{File.dirname(__FILE__)}/../../test_files/test.fq.gz"
|
171
|
-
end
|
172
|
-
|
173
|
-
it_behaves_like "parsing a fastq file"
|
174
|
-
|
175
|
-
it "closes the GzipReader" do
|
176
|
-
expect(f_handle).to be_closed
|
177
|
-
end
|
178
|
-
|
179
|
-
it "returns GzipReader object" do
|
180
|
-
expect(f_handle).to be_an_instance_of Zlib::GzipReader
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
context "with a non-gzipped file" do
|
185
|
-
before(:each) do
|
186
|
-
@fname =
|
187
|
-
"#{File.dirname(__FILE__)}/../../test_files/test.fq"
|
188
|
-
end
|
189
|
-
|
190
|
-
it_behaves_like "parsing a fastq file"
|
191
|
-
|
192
|
-
it "doesn't close the SeqFile (approx reg file behav)" do
|
193
|
-
expect(f_handle).not_to be_closed
|
194
|
-
end
|
195
|
-
|
196
|
-
it "returns FastqFile object" do
|
197
|
-
expect(f_handle).to be_a FastqFile
|
198
|
-
end
|
199
|
-
end
|
200
|
-
end
|
201
|
-
end
|
202
|
-
end
|
203
|
-
|
204
|
-
context "when input is bogus" do
|
205
|
-
describe "#each_record" do
|
206
|
-
it "raises an ArgumentError with message" do
|
207
|
-
fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
|
208
|
-
err_msg = "Input does not look like FASTA or FASTQ"
|
209
|
-
|
210
|
-
expect { SeqFile.open(fname).each_record do |h, s|
|
211
|
-
puts [h, s].join ' '
|
212
|
-
end
|
213
|
-
}.to raise_error(ArgumentError, err_msg)
|
214
|
-
end
|
215
|
-
end
|
216
|
-
end
|
217
|
-
|
218
|
-
#####
|
219
|
-
|
220
|
-
context "when input is a fasta file" do
|
221
|
-
describe "#each_record_fast" do
|
222
|
-
let(:records) { Helpers::RECORDS_FAST }
|
223
|
-
|
224
|
-
let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } }
|
225
|
-
|
226
|
-
context "with badly catted fasta" do
|
227
|
-
it "raises ParseFasta::SequenceFormatError" do
|
228
|
-
fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
|
229
|
-
|
230
|
-
expect { FastaFile.open(fname).to_hash }.
|
231
|
-
to raise_error ParseFasta::SequenceFormatError
|
232
|
-
end
|
233
|
-
end
|
234
|
-
|
235
|
-
shared_examples_for "parsing a fasta file" do
|
236
|
-
it "yields proper header and sequence for each record" do
|
237
|
-
expect { |b|
|
238
|
-
SeqFile.open(@fname).each_record_fast(&b)
|
239
|
-
}.to yield_successive_args(*records)
|
240
|
-
end
|
241
|
-
|
242
|
-
it "yields the sequence as a String class" do
|
243
|
-
SeqFile.open(@fname).each_record_fast do |_, seq|
|
244
|
-
expect(seq).to be_an_instance_of String
|
245
|
-
end
|
246
|
-
end
|
247
|
-
end
|
248
|
-
|
249
|
-
context "with a gzipped file" do
|
250
|
-
before(:each) do
|
251
|
-
@fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz"
|
252
|
-
end
|
253
|
-
|
254
|
-
it_behaves_like "parsing a fasta file"
|
255
|
-
|
256
|
-
it "closes the GzipReader" do
|
257
|
-
expect(f_handle).to be_closed
|
258
|
-
end
|
259
|
-
|
260
|
-
it "returns GzipReader object" do
|
261
|
-
expect(f_handle).to be_an_instance_of Zlib::GzipReader
|
262
|
-
end
|
263
|
-
end
|
264
|
-
|
265
|
-
context "with a non-gzipped file" do
|
266
|
-
before(:each) do
|
267
|
-
@fname = "#{File.dirname(__FILE__)}/../../test_files/test.fa"
|
268
|
-
end
|
269
|
-
|
270
|
-
it_behaves_like "parsing a fasta file"
|
271
|
-
|
272
|
-
it "doesn't close the File (approx regular file behavior)" do
|
273
|
-
expect(f_handle).not_to be_closed
|
274
|
-
end
|
275
|
-
|
276
|
-
it "returns FastaFile object" do
|
277
|
-
expect(f_handle).to be_a FastaFile
|
278
|
-
end
|
279
|
-
end
|
280
|
-
end
|
281
|
-
end
|
282
|
-
|
283
|
-
context "when input is a fastq file" do
|
284
|
-
let(:records) {
|
285
|
-
[["seq1", "AA CC TT GG"],
|
286
|
-
["seq2 apples", "ACTG"]] }
|
287
|
-
let(:f_handle) { SeqFile.open(@fname).each_record_fast { |s| } }
|
288
|
-
|
289
|
-
shared_examples_for "parsing a fastq file" do
|
290
|
-
it "yields only header & sequence" do
|
291
|
-
expect { |b|
|
292
|
-
SeqFile.open(@fname).each_record_fast(&b)
|
293
|
-
}.to yield_successive_args(records[0], records[1])
|
294
|
-
end
|
295
|
-
|
296
|
-
it "yields the sequence as a String class" do
|
297
|
-
SeqFile.open(@fname).each_record_fast do |_, seq, _, _|
|
298
|
-
expect(seq).to be_an_instance_of String
|
299
|
-
end
|
300
|
-
end
|
301
|
-
end
|
302
|
-
|
303
|
-
context "with a 4 line per record fastq file" do
|
304
|
-
describe "#each_record_fast" do
|
305
|
-
context "with a gzipped file" do
|
306
|
-
before(:each) do
|
307
|
-
@fname =
|
308
|
-
"#{File.dirname(__FILE__)}/../../test_files/test.fq.gz"
|
309
|
-
end
|
310
|
-
|
311
|
-
it_behaves_like "parsing a fastq file"
|
312
|
-
|
313
|
-
it "closes the GzipReader" do
|
314
|
-
expect(f_handle).to be_closed
|
315
|
-
end
|
316
|
-
|
317
|
-
it "returns GzipReader object" do
|
318
|
-
expect(f_handle).to be_an_instance_of Zlib::GzipReader
|
319
|
-
end
|
320
|
-
end
|
321
|
-
|
322
|
-
context "with a non-gzipped file" do
|
323
|
-
before(:each) do
|
324
|
-
@fname =
|
325
|
-
"#{File.dirname(__FILE__)}/../../test_files/test.fq"
|
326
|
-
end
|
327
|
-
|
328
|
-
it_behaves_like "parsing a fastq file"
|
329
|
-
|
330
|
-
it "doesn't close the SeqFile (approx reg file behav)" do
|
331
|
-
expect(f_handle).not_to be_closed
|
332
|
-
end
|
333
|
-
|
334
|
-
it "returns FastqFile object" do
|
335
|
-
expect(f_handle).to be_a FastqFile
|
336
|
-
end
|
337
|
-
end
|
338
|
-
end
|
339
|
-
end
|
340
|
-
end
|
341
|
-
|
342
|
-
context "when input is bogus" do
|
343
|
-
describe "#each_record_fast" do
|
344
|
-
it "raises an ArgumentError with message" do
|
345
|
-
fname = "#{File.dirname(__FILE__)}/../../test_files/bogus.txt"
|
346
|
-
err_msg = "Input does not look like FASTA or FASTQ"
|
347
|
-
|
348
|
-
expect { SeqFile.open(fname).each_record_fast do |h, s|
|
349
|
-
puts [h, s].join ' '
|
350
|
-
end
|
351
|
-
}.to raise_error(ArgumentError, err_msg)
|
352
|
-
end
|
353
|
-
end
|
354
|
-
end
|
355
|
-
|
356
|
-
|
357
|
-
end
|
data/spec/lib/sequence_spec.rb
DELETED
@@ -1,188 +0,0 @@
|
|
1
|
-
# Copyright 2014, 2015 Ryan Moore
|
2
|
-
# Contact: moorer@udel.edu
|
3
|
-
#
|
4
|
-
# This file is part of parse_fasta.
|
5
|
-
#
|
6
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
-
# it under the terms of the GNU General Public License as published by
|
8
|
-
# the Free Software Foundation, either version 3 of the License, or
|
9
|
-
# (at your option) any later version.
|
10
|
-
#
|
11
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
-
# GNU General Public License for more details.
|
15
|
-
#
|
16
|
-
# You should have received a copy of the GNU General Public License
|
17
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
-
|
19
|
-
require 'spec_helper'
|
20
|
-
require 'bio'
|
21
|
-
|
22
|
-
describe Sequence do
|
23
|
-
|
24
|
-
# it "has AmbiguousSequenceError" do
|
25
|
-
# expect(Sequence::AmbiguousSequenceError).not_to be nil
|
26
|
-
# end
|
27
|
-
|
28
|
-
it "inherits from String" do
|
29
|
-
expect(Sequence.new('ACTG')).to be_a String
|
30
|
-
end
|
31
|
-
|
32
|
-
describe "::new" do
|
33
|
-
it "removes any spaces in the sequence" do
|
34
|
-
s = "ACT ACT ACT GCT "
|
35
|
-
s_no_spaces = "ACTACTACTGCT"
|
36
|
-
expect(Sequence.new(s)).to eq s_no_spaces
|
37
|
-
end
|
38
|
-
|
39
|
-
context "when sequence has a '>' in it" do
|
40
|
-
it "raises SequenceFormatError" do
|
41
|
-
s = "actg>sequence 3"
|
42
|
-
expect { Sequence.new(s) }.
|
43
|
-
to raise_error ParseFasta::SequenceFormatError
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
describe "#gc" do
|
49
|
-
it "gives the same answer as BioRuby" do
|
50
|
-
s = 'ACtgcGAtcgCgAaTtGgCcnNuU'
|
51
|
-
bioruby_gc = Bio::Sequence::NA.new(s).gc_content
|
52
|
-
expect(Sequence.new(s).gc).to eq bioruby_gc
|
53
|
-
end
|
54
|
-
|
55
|
-
context "when sequence isn't empty" do
|
56
|
-
it "calculates gc" do
|
57
|
-
s = Sequence.new('ActGnu')
|
58
|
-
expect(s.gc).to eq(2 / 5.to_f)
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
context "when sequence is empty" do
|
63
|
-
it "returns 0" do
|
64
|
-
s = Sequence.new('')
|
65
|
-
expect(s.gc).to eq 0
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
|
-
context "there are no A, C, T, G or U (ie only N)" do
|
70
|
-
it "returns 0" do
|
71
|
-
s = Sequence.new('NNNNNnn')
|
72
|
-
expect(s.gc).to eq 0
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
describe "#base_counts" do
|
78
|
-
context "for a DNA sequence with default or falsy argument" do
|
79
|
-
it "returns a map of A, C, T, and G counts" do
|
80
|
-
s = Sequence.new('ACTGactg')
|
81
|
-
expect(s.base_counts).to eq({ a: 2, c: 2, t: 2, g: 2 })
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
context "for a DNA sequence with truthy argument" do
|
86
|
-
it "returns a map of A, C, T, G and N counts" do
|
87
|
-
s = Sequence.new('ACTGNactgn')
|
88
|
-
expect(s.base_counts(1)).to eq({ a: 2, c: 2, t: 2, g: 2, n: 2 })
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
context "for an RNA sequence with falsy or default argument" do
|
93
|
-
it "returns a map of A, C, U, G counts" do
|
94
|
-
s = Sequence.new('ACUGacug')
|
95
|
-
expect(s.base_counts).to eq({ a: 2, c: 2, u: 2, g: 2 })
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
context "for an RNA sequence with truthy argument" do
|
100
|
-
it "returns a map of A, C, U, G and N counts" do
|
101
|
-
s = Sequence.new('ACUGNacugn')
|
102
|
-
expect(s.base_counts(1)).to eq({ a: 2, c: 2, u: 2, g: 2, n: 2 })
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
context "for a sequence with both U and T present" do
|
107
|
-
s = Sequence.new('AaCcTtGgNnUu')
|
108
|
-
err_message = 'ERROR: A sequence contains both T and U'
|
109
|
-
|
110
|
-
it "warns the user about having both U and T present" do
|
111
|
-
expect(s).to receive(:warn).with(err_message)
|
112
|
-
s.base_counts
|
113
|
-
end
|
114
|
-
|
115
|
-
it "returns a map that counts both U's and T's" do
|
116
|
-
expect(s.base_counts).to eq({ a: 2, c: 2, t: 2, u: 2, g: 2 })
|
117
|
-
end
|
118
|
-
|
119
|
-
it "returns a map with T, U and N if truthy argument given" do
|
120
|
-
base_counts = { a: 2, c: 2, t: 2, u: 2, g: 2, n: 2 }
|
121
|
-
expect(s.base_counts(1)).to eq(base_counts)
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
describe "#base_frequencies" do
|
127
|
-
context "with falsy argument" do
|
128
|
-
it "doesn't count ambiguous bases in total bases" do
|
129
|
-
s = Sequence.new('ACTTn')
|
130
|
-
base_freqs = { a: 0.25, c: 0.25, t: 0.5, g: 0.0 }
|
131
|
-
expect(s.base_frequencies).to eq(base_freqs)
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
context "when counting ambiguous bases" do
|
136
|
-
it "does count ambiguous bases in total bases" do
|
137
|
-
s = Sequence.new('ACTTn')
|
138
|
-
base_freqs = { a: 0.2, c: 0.2, t: 0.4, g: 0.0, n: 0.2 }
|
139
|
-
expect(s.base_frequencies(1)).to eq(base_freqs)
|
140
|
-
end
|
141
|
-
end
|
142
|
-
end
|
143
|
-
|
144
|
-
describe "#rev_comp" do
|
145
|
-
# it "raises error if both T and U are present" do
|
146
|
-
# s = Sequence.new("actGU")
|
147
|
-
# err = Sequence::AmbiguousSequenceError
|
148
|
-
# msg = "Sequence is ambiguous -- both T and U present"
|
149
|
-
# expect { s.rev_comp }.to raise_error(err, msg)
|
150
|
-
# end
|
151
|
-
|
152
|
-
# it "warns if non iupac characters are present" do
|
153
|
-
# s = Sequence.new("--..9284ldkjfalsjf")
|
154
|
-
# msg = "WARNING: Sequence contains non IUPAC characters"
|
155
|
-
# expect(s).to receive(:warn).with(msg)
|
156
|
-
# s.rev_comp
|
157
|
-
# end
|
158
|
-
it "returns a reverse complement of the Sequence" do
|
159
|
-
s = Sequence.new("gARKbdctymvhu").rev_comp
|
160
|
-
expect(s).to eq "adbkraghvMYTc"
|
161
|
-
|
162
|
-
s = Sequence.new("ctyMVhgarKBda").rev_comp
|
163
|
-
expect(s).to eq "thVMytcdBKrag"
|
164
|
-
end
|
165
|
-
|
166
|
-
it "leaves non-IUPAC characters alone" do
|
167
|
-
s = Sequence.new("cccc--CCCcccga").rev_comp
|
168
|
-
expect(s).to eq "tcgggGGG--gggg"
|
169
|
-
end
|
170
|
-
|
171
|
-
it "returns a Sequence" do
|
172
|
-
s = Sequence.new("cccc--CCCcccga")
|
173
|
-
expect(s.rev_comp).to be_an_instance_of(Sequence)
|
174
|
-
end
|
175
|
-
|
176
|
-
it "gives back original sequence when called in succession" do
|
177
|
-
s = Sequence.new("cccc--CCCcccga")
|
178
|
-
expect(s.rev_comp.rev_comp).to eq s
|
179
|
-
end
|
180
|
-
|
181
|
-
context "with an empty sequence" do
|
182
|
-
it "returns an empty sequence" do
|
183
|
-
s = Sequence.new("")
|
184
|
-
expect(s.rev_comp).to be_empty
|
185
|
-
end
|
186
|
-
end
|
187
|
-
end
|
188
|
-
end
|
data/test_files/benchmark.rb
DELETED
@@ -1,99 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# Copyright 2014, 2015 Ryan Moore
|
4
|
-
# Contact: moorer@udel.edu
|
5
|
-
|
6
|
-
# This file is part of parse_fasta.
|
7
|
-
|
8
|
-
# parse_fasta is free software: you can redistribute it and/or modify
|
9
|
-
# it under the terms of the GNU General Public License as published by
|
10
|
-
# the Free Software Foundation, either version 3 of the License, or
|
11
|
-
# (at your option) any later version.
|
12
|
-
|
13
|
-
# parse_fasta is distributed in the hope that it will be useful,
|
14
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
-
# GNU General Public License for more details.
|
17
|
-
|
18
|
-
# You should have received a copy of the GNU General Public License
|
19
|
-
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
20
|
-
|
21
|
-
require 'parse_fasta'
|
22
|
-
require 'bio'
|
23
|
-
require 'benchmark'
|
24
|
-
|
25
|
-
def this_parse_fasta fname
|
26
|
-
FastaFile.open(fname, 'r').each_record do |header, sequence|
|
27
|
-
[header, sequence.length].join("\t")
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
def this_parse_fasta_fast fname
|
32
|
-
FastaFile.open(fname, 'r').each_record_fast do |header, sequence|
|
33
|
-
[header, sequence.length].join("\t")
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
def bioruby_parse_fasta fname
|
38
|
-
Bio::FastaFormat.open(fname).each do |entry|
|
39
|
-
[entry.definition, entry.seq.length].join("\t")
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
Benchmark.bmbm do |x|
|
44
|
-
x.report('parse_fasta') { this_parse_fasta(ARGV.first) }
|
45
|
-
x.report('parse_fasta fast') { this_parse_fasta_fast(ARGV.first) }
|
46
|
-
x.report('bioruby') { bioruby_parse_fasta(ARGV.first) }
|
47
|
-
end
|
48
|
-
|
49
|
-
####
|
50
|
-
|
51
|
-
def this_gc(str)
|
52
|
-
Sequence.new(str).gc
|
53
|
-
end
|
54
|
-
|
55
|
-
def bioruby_gc(str)
|
56
|
-
Bio::Sequence::NA.new(str).gc_content
|
57
|
-
end
|
58
|
-
|
59
|
-
# make a random sequence of given length
|
60
|
-
def make_seq(num)
|
61
|
-
num.times.reduce('') { |str, n| str << %w[A a C c T t G g N n].sample }
|
62
|
-
end
|
63
|
-
|
64
|
-
# s1 = make_seq(2000000)
|
65
|
-
# s2 = make_seq(4000000)
|
66
|
-
# s3 = make_seq(8000000)
|
67
|
-
|
68
|
-
# Benchmark.bmbm do |x|
|
69
|
-
# x.report('this_gc 1') { this_gc(s1) }
|
70
|
-
# x.report('bioruby_gc 1') { bioruby_gc(s1) }
|
71
|
-
|
72
|
-
# x.report('this_gc 2') { this_gc(s2) }
|
73
|
-
# x.report('bioruby_gc 2') { bioruby_gc(s2) }
|
74
|
-
|
75
|
-
# x.report('this_gc 3') { this_gc(s3) }
|
76
|
-
# x.report('bioruby_gc 3') { bioruby_gc(s3) }
|
77
|
-
# end
|
78
|
-
|
79
|
-
# fastq = ARGV.first
|
80
|
-
|
81
|
-
def bioruby_fastq(fastq)
|
82
|
-
Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
|
83
|
-
fq.each do |entry|
|
84
|
-
[entry.definition, entry.seq.length].join("\t")
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
def this_fastq(fastq)
|
90
|
-
FastqFile.open(fastq).each_record do |head, seq, desc, qual|
|
91
|
-
[head, seq.length].join("\t")
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
# file is 4 million illumina reads (16,000,000 lines) 1.4gb
|
96
|
-
# Benchmark.bmbm do |x|
|
97
|
-
# x.report('this_fastq') { this_fastq(ARGV.first) }
|
98
|
-
# x.report('bioruby_fastq') { bioruby_fastq(ARGV.first) }
|
99
|
-
# end
|
data/test_files/bogus.txt
DELETED
data/test_files/test.fa.gz
DELETED
Binary file
|
data/test_files/test.fq
DELETED
data/test_files/test.fq.gz
DELETED
Binary file
|