parse_fasta 1.9.2 → 2.0.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +8 -8
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/README.md +42 -215
  6. data/Rakefile +2 -4
  7. data/bin/console +14 -0
  8. data/bin/setup +8 -0
  9. data/lib/parse_fasta/error.rb +39 -0
  10. data/lib/parse_fasta/record.rb +88 -0
  11. data/lib/parse_fasta/seq_file.rb +221 -114
  12. data/lib/parse_fasta/version.rb +2 -2
  13. data/lib/parse_fasta.rb +5 -20
  14. data/spec/parse_fasta/record_spec.rb +115 -0
  15. data/spec/parse_fasta/seq_file_spec.rb +238 -0
  16. data/spec/parse_fasta_spec.rb +25 -0
  17. data/spec/spec_helper.rb +2 -44
  18. data/spec/test_files/cr.fa +1 -0
  19. data/spec/test_files/cr.fa.gz +0 -0
  20. data/spec/test_files/cr.fq +3 -0
  21. data/spec/test_files/cr.fq.gz +0 -0
  22. data/spec/test_files/cr_nl.fa +4 -0
  23. data/spec/test_files/cr_nl.fa.gz +0 -0
  24. data/spec/test_files/cr_nl.fq +8 -0
  25. data/spec/test_files/cr_nl.fq.gz +0 -0
  26. data/spec/test_files/multi_blob.fa.gz +0 -0
  27. data/spec/test_files/multi_blob.fq.gz +0 -0
  28. data/spec/test_files/not_a_seq_file.txt +1 -0
  29. data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
  30. data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
  31. data/spec/test_files/seqs.fa.gz +0 -0
  32. data/spec/test_files/seqs.fq +8 -0
  33. data/spec/test_files/seqs.fq.gz +0 -0
  34. metadata +49 -24
  35. data/lib/parse_fasta/fasta_file.rb +0 -232
  36. data/lib/parse_fasta/fastq_file.rb +0 -160
  37. data/lib/parse_fasta/quality.rb +0 -54
  38. data/lib/parse_fasta/sequence.rb +0 -174
  39. data/spec/lib/fasta_file_spec.rb +0 -212
  40. data/spec/lib/fastq_file_spec.rb +0 -143
  41. data/spec/lib/quality_spec.rb +0 -51
  42. data/spec/lib/seq_file_spec.rb +0 -357
  43. data/spec/lib/sequence_spec.rb +0 -188
  44. data/test_files/benchmark.rb +0 -99
  45. data/test_files/bogus.txt +0 -2
  46. data/test_files/test.fa.gz +0 -0
  47. data/test_files/test.fq +0 -8
  48. data/test_files/test.fq.gz +0 -0
@@ -0,0 +1,238 @@
1
+ # Copyright 2014 - 2016 Ryan Moore
2
+ # Contact: moorer@udel.edu
3
+ #
4
+ # This file is part of parse_fasta.
5
+ #
6
+ # parse_fasta is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # parse_fasta is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "spec_helper"
20
+
21
+ module ParseFasta
22
+ describe SeqFile do
23
+ let(:test_dir) {
24
+ File.join File.dirname(__FILE__), "..", "test_files"
25
+ }
26
+
27
+ let(:fasta) {
28
+ File.join test_dir, "seqs.fa"
29
+ }
30
+ let(:fasta_gz) {
31
+ File.join test_dir, "seqs.fa.gz"
32
+ }
33
+ let(:fastq) {
34
+ File.join test_dir, "seqs.fq"
35
+ }
36
+ let(:fastq_gz) {
37
+ File.join test_dir, "seqs.fq.gz"
38
+ }
39
+
40
+ let(:fasta_records) {
41
+ [Record.new(header: "empty seq at beginning",
42
+ seq: ""),
43
+ Record.new(header: "seq1 is fun",
44
+ seq: "AACTGGNNN"),
45
+ Record.new(header: "seq2",
46
+ seq: "AATCCTGNNN"),
47
+ Record.new(header: "empty seq 1",
48
+ seq: ""),
49
+ Record.new(header: "empty seq 2",
50
+ seq: ""),
51
+ Record.new(header: "seq3",
52
+ seq: "yyyyyyyyyyyyyyyNNN"),
53
+ Record.new(header: "seq 4 > has many '>' in header",
54
+ seq: "ACTGactg"),
55
+ Record.new(header: "empty seq at end",
56
+ seq: "")]
57
+ }
58
+ let(:fastq_records) {
59
+ [Record.new(header: "seq1",
60
+ seq: "AA CC TT GG",
61
+ desc: "",
62
+ qual: ")# 3g Tq N8"),
63
+ Record.new(header: "seq2 @pples",
64
+ seq: "ACTG",
65
+ desc: "seq2 +pples",
66
+ qual: "*ujM")]
67
+ }
68
+
69
+ # to test the line endings
70
+ let(:line_endings_fastq_records) {
71
+ [Record.new(header: "apple", seq: "ACTG", desc: "", qual: "IIII"),
72
+ Record.new(header: "pie", seq: "AACC", desc: "", qual: "BBBB"),]
73
+ }
74
+ let(:line_endings_fasta_records) {
75
+ [Record.new(header: "apple", seq: "ACTG"),
76
+ Record.new(header: "pie", seq: "AACC"),]
77
+ }
78
+
79
+
80
+ describe "::open" do
81
+ context "when the file doesn't exist" do
82
+ it "raises FileNotFoundError" do
83
+ expect { SeqFile.open "arstoien" }.
84
+ to raise_error ParseFasta::Error::FileNotFoundError
85
+ end
86
+ end
87
+
88
+ context "when input looks like neither fastA or fastQ" do
89
+ it "raises a DataFormatError" do
90
+ fname = File.join test_dir, "not_a_seq_file.txt"
91
+
92
+ expect { SeqFile.open(fname) }.
93
+ to raise_error ParseFasta::Error::DataFormatError
94
+ end
95
+ end
96
+
97
+ context "when input looks like fastA" do
98
+ it "sets @type to :fasta" do
99
+ expect(SeqFile.open(fasta).type).to eq :fasta
100
+ end
101
+
102
+ it "sets @type to :fasta (gzipped)" do
103
+ expect(SeqFile.open(fasta_gz).type).to eq :fasta
104
+ end
105
+ end
106
+
107
+ context "when input looks like fastQ" do
108
+ it "sets @type to :fastq" do
109
+ expect(SeqFile.open(fastq).type).to eq :fastq
110
+ end
111
+
112
+ it "sets @type to :fastq (gzipped)" do
113
+ expect(SeqFile.open(fastq_gz).type).to eq :fastq
114
+ end
115
+ end
116
+
117
+ it "returns a SeqFile" do
118
+ expect(SeqFile.open fasta).to be_a SeqFile
119
+ end
120
+ end
121
+
122
+ describe "#each_record" do
123
+ shared_examples "it yields the records" do
124
+ it "yields the records" do
125
+ expect { |b| SeqFile.open(fname).each_record &b }.
126
+ to yield_successive_args(*records)
127
+ end
128
+ end
129
+
130
+ context "input is fastA" do
131
+ context "with gzipped fastA" do
132
+ let(:fname) { File.join test_dir, "seqs.fa.gz" }
133
+ let(:records) { fasta_records }
134
+
135
+ include_examples "it yields the records"
136
+ end
137
+
138
+ context "with gzipped fastA with multiple blobs" do
139
+ # e.g., $ gzip -c a.fa > c.fa.gz; gzip -c b.fa >> c.fa.gz
140
+ let(:fname) { File.join test_dir, "multi_blob.fa.gz" }
141
+ let(:records) { fasta_records + fasta_records }
142
+
143
+ include_examples "it yields the records"
144
+ end
145
+
146
+ context "with non-gzipped fastA" do
147
+ let(:fname) { File.join test_dir, "seqs.fa" }
148
+ let(:records) { fasta_records }
149
+
150
+
151
+ include_examples "it yields the records"
152
+ end
153
+ end
154
+
155
+ context "input is fastQ" do
156
+ context "with gzipped fastQ" do
157
+ let(:fname) { File.join test_dir, "seqs.fq.gz" }
158
+ let(:records) { fastq_records }
159
+
160
+ include_examples "it yields the records"
161
+ end
162
+
163
+ context "with gzipped fastQ with multiple blobs" do
164
+ # e.g., $ gzip -c a.fq > c.fq.gz; gzip -c b.fq >> c.fq.gz
165
+ let(:fname) { File.join test_dir, "multi_blob.fq.gz" }
166
+ let(:records) { fastq_records + fastq_records }
167
+
168
+ include_examples "it yields the records"
169
+ end
170
+
171
+ context "with non-gzipped fastQ" do
172
+ let(:fname) { File.join test_dir, "seqs.fq" }
173
+ let(:records) { fastq_records }
174
+
175
+ include_examples "it yields the records"
176
+ end
177
+ end
178
+
179
+ context "handles non newline line endings" do
180
+ context "fastQ, non-gz, carriage return only" do
181
+ let(:fname) { File.join test_dir, "cr.fq" }
182
+ let(:records) { line_endings_fastq_records }
183
+
184
+ include_examples "it yields the records"
185
+ end
186
+
187
+ context "fastQ, gz, carriage return only" do
188
+ let(:fname) { File.join test_dir, "cr.fq.gz" }
189
+ let(:records) { line_endings_fastq_records }
190
+
191
+ include_examples "it yields the records"
192
+ end
193
+
194
+ context "fastQ, non-gz, carriage return and newline" do
195
+ let(:fname) { File.join test_dir, "cr_nl.fq" }
196
+ let(:records) { line_endings_fastq_records }
197
+
198
+ include_examples "it yields the records"
199
+ end
200
+
201
+ context "fastQ, gz, carriage return and newline" do
202
+ let(:fname) { File.join test_dir, "cr_nl.fq.gz" }
203
+ let(:records) { line_endings_fastq_records }
204
+
205
+ include_examples "it yields the records"
206
+ end
207
+
208
+ context "fastA, non-gz, carriage return only" do
209
+ let(:fname) { File.join test_dir, "cr.fa" }
210
+ let(:records) { line_endings_fasta_records }
211
+
212
+ include_examples "it yields the records"
213
+ end
214
+
215
+ context "fastA, gz, carriage return only" do
216
+ let(:fname) { File.join test_dir, "cr.fa.gz" }
217
+ let(:records) { line_endings_fasta_records }
218
+
219
+ include_examples "it yields the records"
220
+ end
221
+
222
+ context "fastA, non-gz, carriage return and newline" do
223
+ let(:fname) { File.join test_dir, "cr_nl.fa" }
224
+ let(:records) { line_endings_fasta_records }
225
+
226
+ include_examples "it yields the records"
227
+ end
228
+
229
+ context "fastA, gz, carriage return and newline" do
230
+ let(:fname) { File.join test_dir, "cr_nl.fa.gz" }
231
+ let(:records) { line_endings_fasta_records }
232
+
233
+ include_examples "it yields the records"
234
+ end
235
+ end
236
+ end
237
+ end
238
+ end
@@ -0,0 +1,25 @@
1
+ # Copyright 2014 - 2016 Ryan Moore
2
+ # Contact: moorer@udel.edu
3
+ #
4
+ # This file is part of parse_fasta.
5
+ #
6
+ # parse_fasta is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # parse_fasta is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require 'spec_helper'
20
+
21
+ describe ParseFasta do
22
+ it 'has a version number' do
23
+ expect(ParseFasta::VERSION).not_to be nil
24
+ end
25
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2014, 2015 Ryan Moore
1
+ # Copyright 2014 - 2016 Ryan Moore
2
2
  # Contact: moorer@udel.edu
3
3
  #
4
4
  # This file is part of parse_fasta.
@@ -19,47 +19,5 @@
19
19
  require 'coveralls'
20
20
  Coveralls.wear!
21
21
 
22
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
22
23
  require 'parse_fasta'
23
-
24
- module Helpers
25
-
26
- RECORDS = [["empty seq at beginning", ""],
27
- ["seq1 is fun", "AACTGGNNN"],
28
- ["seq2", "AATCCTGNNN"],
29
- ["empty seq 1", ""],
30
- ["empty seq 2", ""],
31
- ["seq3", "yyyyyyyyyyyyyyyNNN"],
32
- ["seq 4 > has many '>' in header", "ACTGactg"],
33
- ["empty seq at end", ""]]
34
-
35
- RECORDS_FAST = [["empty seq at beginning", ""],
36
- ["seq1 is fun", "AAC TGG NN N"],
37
- ["seq2", "AATCCTGNNN"],
38
- ["empty seq 1", ""],
39
- ["empty seq 2", ""],
40
- ["seq3", "yyyyyyyyyyyyyyyNNN"],
41
- ["seq 4 > has many '>' in header", "ACTGactg"],
42
- ["empty seq at end", ""]]
43
-
44
- RECORDS_MAP = {
45
- "empty seq at beginning" => "",
46
- "seq1 is fun" => "AACTGGNNN",
47
- "seq2" => "AATCCTGNNN",
48
- "empty seq 1" => "",
49
- "empty seq 2" => "",
50
- "seq3" => "yyyyyyyyyyyyyyyNNN",
51
- "seq 4 > has many '>' in header" => "ACTGactg",
52
- "empty seq at end" => ""
53
- }
54
-
55
- TRUTHY_RECORDS = [["empty seq at beginning", []],
56
- ["seq1 is fun", ["AACTGGNNN"]],
57
- ["seq2", ["AAT", "CCTGNNN"]],
58
- ["empty seq 1", []],
59
- ["empty seq 2", []],
60
- ["seq3", ["yyyyyyyyyy", "yyyyy", "NNN"]],
61
- ["seq 4 > has many '>' in header", ["ACTG" ,
62
- "actg"]],
63
- ["empty seq at end", []]]
64
-
65
- end
@@ -0,0 +1 @@
1
+ >apple
Binary file
@@ -0,0 +1 @@
1
+ @apple
2
+
3
+
Binary file
@@ -0,0 +1,4 @@
1
+ >apple
2
+ ACTG
3
+ >pie
4
+ AACC
Binary file
@@ -0,0 +1,8 @@
1
+ @apple
2
+ ACTG
3
+ +
4
+ IIII
5
+ @pie
6
+ AACC
7
+ +
8
+ BBBB
Binary file
Binary file
Binary file
@@ -0,0 +1 @@
1
+ this is not a seq file
File without changes
Binary file
@@ -0,0 +1,8 @@
1
+ @seq1
2
+ AA CC TT GG
3
+ +
4
+ )# 3g Tq N8
5
+ @seq2 @pples
6
+ ACTG
7
+ +seq2 +pples
8
+ *ujM
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parse_fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-20 00:00:00.000000000 Z
11
+ date: 2016-10-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -112,39 +112,50 @@ description: Provides nice, programmatic access to fasta and fastq files, as wel
112
112
  as providing Sequence and Quality helper classes. No need for BioRuby ;)
113
113
  email:
114
114
  - moorer@udel.edu
115
- executables: []
115
+ executables:
116
+ - console
117
+ - setup
116
118
  extensions: []
117
119
  extra_rdoc_files: []
118
120
  files:
119
121
  - .coveralls.yml
120
122
  - .gitignore
123
+ - .rspec
121
124
  - .travis.yml
122
125
  - .yardopts
126
+ - CHANGELOG.md
123
127
  - COPYING
124
128
  - Gemfile
125
129
  - README.md
126
130
  - Rakefile
131
+ - bin/console
132
+ - bin/setup
127
133
  - lib/parse_fasta.rb
128
- - lib/parse_fasta/fasta_file.rb
129
- - lib/parse_fasta/fastq_file.rb
130
- - lib/parse_fasta/quality.rb
134
+ - lib/parse_fasta/error.rb
135
+ - lib/parse_fasta/record.rb
131
136
  - lib/parse_fasta/seq_file.rb
132
- - lib/parse_fasta/sequence.rb
133
137
  - lib/parse_fasta/version.rb
134
138
  - parse_fasta.gemspec
135
- - spec/lib/fasta_file_spec.rb
136
- - spec/lib/fastq_file_spec.rb
137
- - spec/lib/quality_spec.rb
138
- - spec/lib/seq_file_spec.rb
139
- - spec/lib/sequence_spec.rb
139
+ - spec/parse_fasta/record_spec.rb
140
+ - spec/parse_fasta/seq_file_spec.rb
141
+ - spec/parse_fasta_spec.rb
140
142
  - spec/spec_helper.rb
141
- - test_files/bad.fa
142
- - test_files/benchmark.rb
143
- - test_files/bogus.txt
144
- - test_files/test.fa
145
- - test_files/test.fa.gz
146
- - test_files/test.fq
147
- - test_files/test.fq.gz
143
+ - spec/test_files/cr.fa
144
+ - spec/test_files/cr.fa.gz
145
+ - spec/test_files/cr.fq
146
+ - spec/test_files/cr.fq.gz
147
+ - spec/test_files/cr_nl.fa
148
+ - spec/test_files/cr_nl.fa.gz
149
+ - spec/test_files/cr_nl.fq
150
+ - spec/test_files/cr_nl.fq.gz
151
+ - spec/test_files/multi_blob.fa.gz
152
+ - spec/test_files/multi_blob.fq.gz
153
+ - spec/test_files/not_a_seq_file.txt
154
+ - spec/test_files/poorly_catted.fa
155
+ - spec/test_files/seqs.fa
156
+ - spec/test_files/seqs.fa.gz
157
+ - spec/test_files/seqs.fq
158
+ - spec/test_files/seqs.fq.gz
148
159
  homepage: https://github.com/mooreryan/parse_fasta
149
160
  licenses:
150
161
  - ! 'GPLv3: http://www.gnu.org/licenses/gpl.txt'
@@ -170,10 +181,24 @@ signing_key:
170
181
  specification_version: 4
171
182
  summary: Easy-peasy parsing of fasta & fastq files!
172
183
  test_files:
173
- - spec/lib/fasta_file_spec.rb
174
- - spec/lib/fastq_file_spec.rb
175
- - spec/lib/quality_spec.rb
176
- - spec/lib/seq_file_spec.rb
177
- - spec/lib/sequence_spec.rb
184
+ - spec/parse_fasta/record_spec.rb
185
+ - spec/parse_fasta/seq_file_spec.rb
186
+ - spec/parse_fasta_spec.rb
178
187
  - spec/spec_helper.rb
188
+ - spec/test_files/cr.fa
189
+ - spec/test_files/cr.fa.gz
190
+ - spec/test_files/cr.fq
191
+ - spec/test_files/cr.fq.gz
192
+ - spec/test_files/cr_nl.fa
193
+ - spec/test_files/cr_nl.fa.gz
194
+ - spec/test_files/cr_nl.fq
195
+ - spec/test_files/cr_nl.fq.gz
196
+ - spec/test_files/multi_blob.fa.gz
197
+ - spec/test_files/multi_blob.fq.gz
198
+ - spec/test_files/not_a_seq_file.txt
199
+ - spec/test_files/poorly_catted.fa
200
+ - spec/test_files/seqs.fa
201
+ - spec/test_files/seqs.fa.gz
202
+ - spec/test_files/seqs.fq
203
+ - spec/test_files/seqs.fq.gz
179
204
  has_rdoc: