parse_fasta 1.9.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +178 -0
- data/README.md +42 -215
- data/Rakefile +2 -4
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/parse_fasta/error.rb +39 -0
- data/lib/parse_fasta/record.rb +88 -0
- data/lib/parse_fasta/seq_file.rb +221 -114
- data/lib/parse_fasta/version.rb +2 -2
- data/lib/parse_fasta.rb +5 -20
- data/spec/parse_fasta/record_spec.rb +115 -0
- data/spec/parse_fasta/seq_file_spec.rb +238 -0
- data/spec/parse_fasta_spec.rb +25 -0
- data/spec/spec_helper.rb +2 -44
- data/spec/test_files/cr.fa +1 -0
- data/spec/test_files/cr.fa.gz +0 -0
- data/spec/test_files/cr.fq +3 -0
- data/spec/test_files/cr.fq.gz +0 -0
- data/spec/test_files/cr_nl.fa +4 -0
- data/spec/test_files/cr_nl.fa.gz +0 -0
- data/spec/test_files/cr_nl.fq +8 -0
- data/spec/test_files/cr_nl.fq.gz +0 -0
- data/spec/test_files/multi_blob.fa.gz +0 -0
- data/spec/test_files/multi_blob.fq.gz +0 -0
- data/spec/test_files/not_a_seq_file.txt +1 -0
- data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
- data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
- data/spec/test_files/seqs.fa.gz +0 -0
- data/spec/test_files/seqs.fq +8 -0
- data/spec/test_files/seqs.fq.gz +0 -0
- metadata +49 -24
- data/lib/parse_fasta/fasta_file.rb +0 -232
- data/lib/parse_fasta/fastq_file.rb +0 -160
- data/lib/parse_fasta/quality.rb +0 -54
- data/lib/parse_fasta/sequence.rb +0 -174
- data/spec/lib/fasta_file_spec.rb +0 -212
- data/spec/lib/fastq_file_spec.rb +0 -143
- data/spec/lib/quality_spec.rb +0 -51
- data/spec/lib/seq_file_spec.rb +0 -357
- data/spec/lib/sequence_spec.rb +0 -188
- data/test_files/benchmark.rb +0 -99
- data/test_files/bogus.txt +0 -2
- data/test_files/test.fa.gz +0 -0
- data/test_files/test.fq +0 -8
- data/test_files/test.fq.gz +0 -0
@@ -0,0 +1,238 @@
|
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
|
+
# Contact: moorer@udel.edu
|
3
|
+
#
|
4
|
+
# This file is part of parse_fasta.
|
5
|
+
#
|
6
|
+
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
|
19
|
+
require "spec_helper"
|
20
|
+
|
21
|
+
module ParseFasta
|
22
|
+
describe SeqFile do
|
23
|
+
let(:test_dir) {
|
24
|
+
File.join File.dirname(__FILE__), "..", "test_files"
|
25
|
+
}
|
26
|
+
|
27
|
+
let(:fasta) {
|
28
|
+
File.join test_dir, "seqs.fa"
|
29
|
+
}
|
30
|
+
let(:fasta_gz) {
|
31
|
+
File.join test_dir, "seqs.fa.gz"
|
32
|
+
}
|
33
|
+
let(:fastq) {
|
34
|
+
File.join test_dir, "seqs.fq"
|
35
|
+
}
|
36
|
+
let(:fastq_gz) {
|
37
|
+
File.join test_dir, "seqs.fq.gz"
|
38
|
+
}
|
39
|
+
|
40
|
+
let(:fasta_records) {
|
41
|
+
[Record.new(header: "empty seq at beginning",
|
42
|
+
seq: ""),
|
43
|
+
Record.new(header: "seq1 is fun",
|
44
|
+
seq: "AACTGGNNN"),
|
45
|
+
Record.new(header: "seq2",
|
46
|
+
seq: "AATCCTGNNN"),
|
47
|
+
Record.new(header: "empty seq 1",
|
48
|
+
seq: ""),
|
49
|
+
Record.new(header: "empty seq 2",
|
50
|
+
seq: ""),
|
51
|
+
Record.new(header: "seq3",
|
52
|
+
seq: "yyyyyyyyyyyyyyyNNN"),
|
53
|
+
Record.new(header: "seq 4 > has many '>' in header",
|
54
|
+
seq: "ACTGactg"),
|
55
|
+
Record.new(header: "empty seq at end",
|
56
|
+
seq: "")]
|
57
|
+
}
|
58
|
+
let(:fastq_records) {
|
59
|
+
[Record.new(header: "seq1",
|
60
|
+
seq: "AA CC TT GG",
|
61
|
+
desc: "",
|
62
|
+
qual: ")# 3g Tq N8"),
|
63
|
+
Record.new(header: "seq2 @pples",
|
64
|
+
seq: "ACTG",
|
65
|
+
desc: "seq2 +pples",
|
66
|
+
qual: "*ujM")]
|
67
|
+
}
|
68
|
+
|
69
|
+
# to test the line endings
|
70
|
+
let(:line_endings_fastq_records) {
|
71
|
+
[Record.new(header: "apple", seq: "ACTG", desc: "", qual: "IIII"),
|
72
|
+
Record.new(header: "pie", seq: "AACC", desc: "", qual: "BBBB"),]
|
73
|
+
}
|
74
|
+
let(:line_endings_fasta_records) {
|
75
|
+
[Record.new(header: "apple", seq: "ACTG"),
|
76
|
+
Record.new(header: "pie", seq: "AACC"),]
|
77
|
+
}
|
78
|
+
|
79
|
+
|
80
|
+
describe "::open" do
|
81
|
+
context "when the file doesn't exist" do
|
82
|
+
it "raises FileNotFoundError" do
|
83
|
+
expect { SeqFile.open "arstoien" }.
|
84
|
+
to raise_error ParseFasta::Error::FileNotFoundError
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
context "when input looks like neither fastA or fastQ" do
|
89
|
+
it "raises a DataFormatError" do
|
90
|
+
fname = File.join test_dir, "not_a_seq_file.txt"
|
91
|
+
|
92
|
+
expect { SeqFile.open(fname) }.
|
93
|
+
to raise_error ParseFasta::Error::DataFormatError
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
context "when input looks like fastA" do
|
98
|
+
it "sets @type to :fasta" do
|
99
|
+
expect(SeqFile.open(fasta).type).to eq :fasta
|
100
|
+
end
|
101
|
+
|
102
|
+
it "sets @type to :fasta (gzipped)" do
|
103
|
+
expect(SeqFile.open(fasta_gz).type).to eq :fasta
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
context "when input looks like fastQ" do
|
108
|
+
it "sets @type to :fastq" do
|
109
|
+
expect(SeqFile.open(fastq).type).to eq :fastq
|
110
|
+
end
|
111
|
+
|
112
|
+
it "sets @type to :fastq (gzipped)" do
|
113
|
+
expect(SeqFile.open(fastq_gz).type).to eq :fastq
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
it "returns a SeqFile" do
|
118
|
+
expect(SeqFile.open fasta).to be_a SeqFile
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
describe "#each_record" do
|
123
|
+
shared_examples "it yields the records" do
|
124
|
+
it "yields the records" do
|
125
|
+
expect { |b| SeqFile.open(fname).each_record &b }.
|
126
|
+
to yield_successive_args(*records)
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
context "input is fastA" do
|
131
|
+
context "with gzipped fastA" do
|
132
|
+
let(:fname) { File.join test_dir, "seqs.fa.gz" }
|
133
|
+
let(:records) { fasta_records }
|
134
|
+
|
135
|
+
include_examples "it yields the records"
|
136
|
+
end
|
137
|
+
|
138
|
+
context "with gzipped fastA with multiple blobs" do
|
139
|
+
# e.g., $ gzip -c a.fa > c.fa.gz; gzip -c b.fa >> c.fa.gz
|
140
|
+
let(:fname) { File.join test_dir, "multi_blob.fa.gz" }
|
141
|
+
let(:records) { fasta_records + fasta_records }
|
142
|
+
|
143
|
+
include_examples "it yields the records"
|
144
|
+
end
|
145
|
+
|
146
|
+
context "with non-gzipped fastA" do
|
147
|
+
let(:fname) { File.join test_dir, "seqs.fa" }
|
148
|
+
let(:records) { fasta_records }
|
149
|
+
|
150
|
+
|
151
|
+
include_examples "it yields the records"
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
context "input is fastQ" do
|
156
|
+
context "with gzipped fastQ" do
|
157
|
+
let(:fname) { File.join test_dir, "seqs.fq.gz" }
|
158
|
+
let(:records) { fastq_records }
|
159
|
+
|
160
|
+
include_examples "it yields the records"
|
161
|
+
end
|
162
|
+
|
163
|
+
context "with gzipped fastQ with multiple blobs" do
|
164
|
+
# e.g., $ gzip -c a.fq > c.fq.gz; gzip -c b.fq >> c.fq.gz
|
165
|
+
let(:fname) { File.join test_dir, "multi_blob.fq.gz" }
|
166
|
+
let(:records) { fastq_records + fastq_records }
|
167
|
+
|
168
|
+
include_examples "it yields the records"
|
169
|
+
end
|
170
|
+
|
171
|
+
context "with non-gzipped fastQ" do
|
172
|
+
let(:fname) { File.join test_dir, "seqs.fq" }
|
173
|
+
let(:records) { fastq_records }
|
174
|
+
|
175
|
+
include_examples "it yields the records"
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
context "handles non newline line endings" do
|
180
|
+
context "fastQ, non-gz, carriage return only" do
|
181
|
+
let(:fname) { File.join test_dir, "cr.fq" }
|
182
|
+
let(:records) { line_endings_fastq_records }
|
183
|
+
|
184
|
+
include_examples "it yields the records"
|
185
|
+
end
|
186
|
+
|
187
|
+
context "fastQ, gz, carriage return only" do
|
188
|
+
let(:fname) { File.join test_dir, "cr.fq.gz" }
|
189
|
+
let(:records) { line_endings_fastq_records }
|
190
|
+
|
191
|
+
include_examples "it yields the records"
|
192
|
+
end
|
193
|
+
|
194
|
+
context "fastQ, non-gz, carriage return and newline" do
|
195
|
+
let(:fname) { File.join test_dir, "cr_nl.fq" }
|
196
|
+
let(:records) { line_endings_fastq_records }
|
197
|
+
|
198
|
+
include_examples "it yields the records"
|
199
|
+
end
|
200
|
+
|
201
|
+
context "fastQ, gz, carriage return and newline" do
|
202
|
+
let(:fname) { File.join test_dir, "cr_nl.fq.gz" }
|
203
|
+
let(:records) { line_endings_fastq_records }
|
204
|
+
|
205
|
+
include_examples "it yields the records"
|
206
|
+
end
|
207
|
+
|
208
|
+
context "fastA, non-gz, carriage return only" do
|
209
|
+
let(:fname) { File.join test_dir, "cr.fa" }
|
210
|
+
let(:records) { line_endings_fasta_records }
|
211
|
+
|
212
|
+
include_examples "it yields the records"
|
213
|
+
end
|
214
|
+
|
215
|
+
context "fastA, gz, carriage return only" do
|
216
|
+
let(:fname) { File.join test_dir, "cr.fa.gz" }
|
217
|
+
let(:records) { line_endings_fasta_records }
|
218
|
+
|
219
|
+
include_examples "it yields the records"
|
220
|
+
end
|
221
|
+
|
222
|
+
context "fastA, non-gz, carriage return and newline" do
|
223
|
+
let(:fname) { File.join test_dir, "cr_nl.fa" }
|
224
|
+
let(:records) { line_endings_fasta_records }
|
225
|
+
|
226
|
+
include_examples "it yields the records"
|
227
|
+
end
|
228
|
+
|
229
|
+
context "fastA, gz, carriage return and newline" do
|
230
|
+
let(:fname) { File.join test_dir, "cr_nl.fa.gz" }
|
231
|
+
let(:records) { line_endings_fasta_records }
|
232
|
+
|
233
|
+
include_examples "it yields the records"
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
|
+
# Contact: moorer@udel.edu
|
3
|
+
#
|
4
|
+
# This file is part of parse_fasta.
|
5
|
+
#
|
6
|
+
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
|
19
|
+
require 'spec_helper'
|
20
|
+
|
21
|
+
describe ParseFasta do
|
22
|
+
it 'has a version number' do
|
23
|
+
expect(ParseFasta::VERSION).not_to be nil
|
24
|
+
end
|
25
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2014
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
2
|
# Contact: moorer@udel.edu
|
3
3
|
#
|
4
4
|
# This file is part of parse_fasta.
|
@@ -19,47 +19,5 @@
|
|
19
19
|
require 'coveralls'
|
20
20
|
Coveralls.wear!
|
21
21
|
|
22
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
22
23
|
require 'parse_fasta'
|
23
|
-
|
24
|
-
module Helpers
|
25
|
-
|
26
|
-
RECORDS = [["empty seq at beginning", ""],
|
27
|
-
["seq1 is fun", "AACTGGNNN"],
|
28
|
-
["seq2", "AATCCTGNNN"],
|
29
|
-
["empty seq 1", ""],
|
30
|
-
["empty seq 2", ""],
|
31
|
-
["seq3", "yyyyyyyyyyyyyyyNNN"],
|
32
|
-
["seq 4 > has many '>' in header", "ACTGactg"],
|
33
|
-
["empty seq at end", ""]]
|
34
|
-
|
35
|
-
RECORDS_FAST = [["empty seq at beginning", ""],
|
36
|
-
["seq1 is fun", "AAC TGG NN N"],
|
37
|
-
["seq2", "AATCCTGNNN"],
|
38
|
-
["empty seq 1", ""],
|
39
|
-
["empty seq 2", ""],
|
40
|
-
["seq3", "yyyyyyyyyyyyyyyNNN"],
|
41
|
-
["seq 4 > has many '>' in header", "ACTGactg"],
|
42
|
-
["empty seq at end", ""]]
|
43
|
-
|
44
|
-
RECORDS_MAP = {
|
45
|
-
"empty seq at beginning" => "",
|
46
|
-
"seq1 is fun" => "AACTGGNNN",
|
47
|
-
"seq2" => "AATCCTGNNN",
|
48
|
-
"empty seq 1" => "",
|
49
|
-
"empty seq 2" => "",
|
50
|
-
"seq3" => "yyyyyyyyyyyyyyyNNN",
|
51
|
-
"seq 4 > has many '>' in header" => "ACTGactg",
|
52
|
-
"empty seq at end" => ""
|
53
|
-
}
|
54
|
-
|
55
|
-
TRUTHY_RECORDS = [["empty seq at beginning", []],
|
56
|
-
["seq1 is fun", ["AACTGGNNN"]],
|
57
|
-
["seq2", ["AAT", "CCTGNNN"]],
|
58
|
-
["empty seq 1", []],
|
59
|
-
["empty seq 2", []],
|
60
|
-
["seq3", ["yyyyyyyyyy", "yyyyy", "NNN"]],
|
61
|
-
["seq 4 > has many '>' in header", ["ACTG" ,
|
62
|
-
"actg"]],
|
63
|
-
["empty seq at end", []]]
|
64
|
-
|
65
|
-
end
|
@@ -0,0 +1 @@
|
|
1
|
+
>apple
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
this is not a seq file
|
File without changes
|
File without changes
|
Binary file
|
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.2
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-10-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -112,39 +112,50 @@ description: Provides nice, programmatic access to fasta and fastq files, as wel
|
|
112
112
|
as providing Sequence and Quality helper classes. No need for BioRuby ;)
|
113
113
|
email:
|
114
114
|
- moorer@udel.edu
|
115
|
-
executables:
|
115
|
+
executables:
|
116
|
+
- console
|
117
|
+
- setup
|
116
118
|
extensions: []
|
117
119
|
extra_rdoc_files: []
|
118
120
|
files:
|
119
121
|
- .coveralls.yml
|
120
122
|
- .gitignore
|
123
|
+
- .rspec
|
121
124
|
- .travis.yml
|
122
125
|
- .yardopts
|
126
|
+
- CHANGELOG.md
|
123
127
|
- COPYING
|
124
128
|
- Gemfile
|
125
129
|
- README.md
|
126
130
|
- Rakefile
|
131
|
+
- bin/console
|
132
|
+
- bin/setup
|
127
133
|
- lib/parse_fasta.rb
|
128
|
-
- lib/parse_fasta/fasta_file.rb
|
129
|
-
- lib/parse_fasta/fastq_file.rb
|
130
|
-
- lib/parse_fasta/quality.rb
|
134
|
+
- lib/parse_fasta/error.rb
|
135
|
+
- lib/parse_fasta/record.rb
|
131
136
|
- lib/parse_fasta/seq_file.rb
|
132
|
-
- lib/parse_fasta/sequence.rb
|
133
137
|
- lib/parse_fasta/version.rb
|
134
138
|
- parse_fasta.gemspec
|
135
|
-
- spec/lib/fasta_file_spec.rb
|
136
|
-
- spec/lib/fastq_file_spec.rb
|
137
|
-
- spec/lib/quality_spec.rb
|
138
|
-
- spec/lib/seq_file_spec.rb
|
139
|
-
- spec/lib/sequence_spec.rb
|
139
|
+
- spec/parse_fasta/record_spec.rb
|
140
|
+
- spec/parse_fasta/seq_file_spec.rb
|
141
|
+
- spec/parse_fasta_spec.rb
|
140
142
|
- spec/spec_helper.rb
|
141
|
-
- test_files/bad.fa
|
142
|
-
- test_files/benchmark.rb
|
143
|
-
- test_files/bogus.txt
|
144
|
-
- test_files/test.fa
|
145
|
-
- test_files/test.fa.gz
|
146
|
-
- test_files/test.fq
|
147
|
-
- test_files/test.fq.gz
|
143
|
+
- spec/test_files/cr.fa
|
144
|
+
- spec/test_files/cr.fa.gz
|
145
|
+
- spec/test_files/cr.fq
|
146
|
+
- spec/test_files/cr.fq.gz
|
147
|
+
- spec/test_files/cr_nl.fa
|
148
|
+
- spec/test_files/cr_nl.fa.gz
|
149
|
+
- spec/test_files/cr_nl.fq
|
150
|
+
- spec/test_files/cr_nl.fq.gz
|
151
|
+
- spec/test_files/multi_blob.fa.gz
|
152
|
+
- spec/test_files/multi_blob.fq.gz
|
153
|
+
- spec/test_files/not_a_seq_file.txt
|
154
|
+
- spec/test_files/poorly_catted.fa
|
155
|
+
- spec/test_files/seqs.fa
|
156
|
+
- spec/test_files/seqs.fa.gz
|
157
|
+
- spec/test_files/seqs.fq
|
158
|
+
- spec/test_files/seqs.fq.gz
|
148
159
|
homepage: https://github.com/mooreryan/parse_fasta
|
149
160
|
licenses:
|
150
161
|
- ! 'GPLv3: http://www.gnu.org/licenses/gpl.txt'
|
@@ -170,10 +181,24 @@ signing_key:
|
|
170
181
|
specification_version: 4
|
171
182
|
summary: Easy-peasy parsing of fasta & fastq files!
|
172
183
|
test_files:
|
173
|
-
- spec/lib/fasta_file_spec.rb
|
174
|
-
- spec/lib/fastq_file_spec.rb
|
175
|
-
- spec/lib/quality_spec.rb
|
176
|
-
- spec/lib/seq_file_spec.rb
|
177
|
-
- spec/lib/sequence_spec.rb
|
184
|
+
- spec/parse_fasta/record_spec.rb
|
185
|
+
- spec/parse_fasta/seq_file_spec.rb
|
186
|
+
- spec/parse_fasta_spec.rb
|
178
187
|
- spec/spec_helper.rb
|
188
|
+
- spec/test_files/cr.fa
|
189
|
+
- spec/test_files/cr.fa.gz
|
190
|
+
- spec/test_files/cr.fq
|
191
|
+
- spec/test_files/cr.fq.gz
|
192
|
+
- spec/test_files/cr_nl.fa
|
193
|
+
- spec/test_files/cr_nl.fa.gz
|
194
|
+
- spec/test_files/cr_nl.fq
|
195
|
+
- spec/test_files/cr_nl.fq.gz
|
196
|
+
- spec/test_files/multi_blob.fa.gz
|
197
|
+
- spec/test_files/multi_blob.fq.gz
|
198
|
+
- spec/test_files/not_a_seq_file.txt
|
199
|
+
- spec/test_files/poorly_catted.fa
|
200
|
+
- spec/test_files/seqs.fa
|
201
|
+
- spec/test_files/seqs.fa.gz
|
202
|
+
- spec/test_files/seqs.fq
|
203
|
+
- spec/test_files/seqs.fq.gz
|
179
204
|
has_rdoc:
|