parse_fasta 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/CHANGELOG.md +4 -0
- data/README.md +10 -0
- data/lib/parse_fasta/record.rb +15 -4
- data/lib/parse_fasta/seq_file.rb +15 -6
- data/lib/parse_fasta/version.rb +1 -1
- data/parse_fasta.gemspec +1 -0
- data/spec/parse_fasta/record_spec.rb +26 -3
- data/spec/parse_fasta/seq_file_spec.rb +31 -1
- data/spec/test_files/with_rec_sep_in_seq.fa +4 -0
- metadata +24 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5dc5b2c4063257a491082c62b9ef6faaecd694fa
|
4
|
+
data.tar.gz: 0f45d1a46360f65b83cada74be2b59e84d6c3179
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 26898676ec187d85ded69405b1b193ece436e49693f4cdcc32dd72e313b3143b3d1bbf14d749bad0e5ae7ff45c39b8fb31056bf2fc3aab8e9eb10dee54fbc326
|
7
|
+
data.tar.gz: 0fd4292d811cec77a2f0cb20c8f89fdb06a74ec3f6b2488ae0505a42abfcf4722f81b8f446fd4f940b918b1798f9dff44974214b1b3abd4469df83902267d8aa
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -24,6 +24,8 @@ Or install it yourself as:
|
|
24
24
|
|
25
25
|
Provides nice, programmatic access to fasta and fastq files. It's faster and more lightweight than BioRuby. And more fun!
|
26
26
|
|
27
|
+
It takes care of a lot of whacky edge cases like parsing multi-blob gzipped files, and being strict on formatting by default.
|
28
|
+
|
27
29
|
## Documentation ##
|
28
30
|
|
29
31
|
Checkout
|
@@ -93,3 +95,11 @@ ParseFasta::SeqFile.open(ARGV[0]).each_record do |rec|
|
|
93
95
|
puts rec
|
94
96
|
end
|
95
97
|
```
|
98
|
+
|
99
|
+
Sometimes your fasta file might have record separators (`>`) within the "sequence". For example, CD-HIT's `.clstr` files have headers within what would be the sequence part of the record. `ParseFasta` is really strict about formatting and will raise an error when trying to read these types of files. If you would like to parse them, use the `check_fasta_seq: false` flag like so:
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
ParseFasta::SeqFile.open(ARGV[0], check_fasta_seq: false).each_record do |rec|
|
103
|
+
puts rec
|
104
|
+
end
|
105
|
+
```
|
data/lib/parse_fasta/record.rb
CHANGED
@@ -39,6 +39,8 @@ module ParseFasta
|
|
39
39
|
#
|
40
40
|
# @example Init a new Record object for a fastA record
|
41
41
|
# Record.new header: "apple", seq: "actg"
|
42
|
+
# @example Init a new Record object for a fastA record without checking for '>' in the sequence.
|
43
|
+
# Record.new header: "apple", seq: "pie>good", check_fasta_seq: false
|
42
44
|
# @example Init a new Record object for a fastQ record
|
43
45
|
# Record.new header: "apple", seq: "actd", desc: "", qual: "IIII"
|
44
46
|
#
|
@@ -46,9 +48,16 @@ module ParseFasta
|
|
46
48
|
# @param seq [String] the sequence of the record
|
47
49
|
# @param desc [String] the description line of a fastQ record
|
48
50
|
# @param qual [String] the quality string of a fastQ record
|
51
|
+
# @param check_fasta_seq [Bool] Pass false if you don't want to
|
52
|
+
# check for '>' characters in the sequence. Defaults to true,
|
53
|
+
# which checks for '>' in the sequence and raises an error.
|
49
54
|
#
|
50
|
-
# @raise [ParseFasta::Error::SequenceFormatError] if a fastA
|
51
|
-
# character in it
|
55
|
+
# @raise [ParseFasta::Error::SequenceFormatError] if a fastA
|
56
|
+
# sequence has a '>' character in it, and :check_fasta_seq is
|
57
|
+
# NOT set to false.
|
58
|
+
#
|
59
|
+
# @todo This is destructive with respect to the input seq
|
60
|
+
# arg. Does it need to be?
|
52
61
|
def initialize args = {}
|
53
62
|
@header = args.fetch :header
|
54
63
|
@id = @header.split(" ")[0]
|
@@ -61,9 +70,11 @@ module ParseFasta
|
|
61
70
|
seq = args.fetch(:seq)
|
62
71
|
seq.tr!(" \t\n\r", "")
|
63
72
|
|
64
|
-
|
73
|
+
do_check_fasta_seq = args.fetch :check_fasta_seq, true
|
74
|
+
|
75
|
+
if fastq? || (!fastq? && !do_check_fasta_seq)
|
65
76
|
@seq = seq
|
66
|
-
else
|
77
|
+
else
|
67
78
|
@seq = check_fasta_seq(seq)
|
68
79
|
end
|
69
80
|
end
|
data/lib/parse_fasta/seq_file.rb
CHANGED
@@ -26,14 +26,18 @@ module ParseFasta
|
|
26
26
|
|
27
27
|
# @param fname [String] the name of the fastA or fastQ file to
|
28
28
|
# parse
|
29
|
+
# @param type [Symbol] whether the file is :fasta or :fastq
|
30
|
+
# @param check_fasta_seq [Bool] keyword arg for whether to check
|
31
|
+
# for '>' in the sequence of fastA files.
|
29
32
|
#
|
30
33
|
# @raise [ParseFasta::Error::FileNotFoundError] if the file is not
|
31
34
|
# found
|
32
35
|
# @raise [ParseFasta::Error::DataFormatError] if the file doesn't
|
33
36
|
# start with a '>' or a '@'
|
34
|
-
def initialize fname
|
37
|
+
def initialize fname, args = {}
|
35
38
|
type = check_file fname
|
36
39
|
|
40
|
+
@check_fasta_seq = args.fetch :check_fasta_seq, true
|
37
41
|
@fname = fname
|
38
42
|
@type = type
|
39
43
|
end
|
@@ -41,8 +45,8 @@ module ParseFasta
|
|
41
45
|
# An alias for SeqFile.new
|
42
46
|
#
|
43
47
|
# @return [SeqFile] a SeqFile object
|
44
|
-
def self.open fname
|
45
|
-
self.new fname
|
48
|
+
def self.open fname, args = {}
|
49
|
+
self.new fname, args
|
46
50
|
end
|
47
51
|
|
48
52
|
# Analogous to IO#each_line, SeqFile#each_record is used to go
|
@@ -69,7 +73,8 @@ module ParseFasta
|
|
69
73
|
# the info of the record
|
70
74
|
#
|
71
75
|
# @raise [ParseFasta::Error::SequenceFormatError] if a fastA file
|
72
|
-
# contains a record with a '>' character in the header
|
76
|
+
# contains a record with a '>' character in the sequence, and the
|
77
|
+
# SeqFile object was not initialized with check_fasta_seq: false
|
73
78
|
def each_record &b
|
74
79
|
line_parser = "parse_#{@type}_lines"
|
75
80
|
|
@@ -117,7 +122,9 @@ module ParseFasta
|
|
117
122
|
if header.empty? && line.start_with?(">")
|
118
123
|
header = line[1, len] # drop the '>'
|
119
124
|
elsif line.start_with? ">"
|
120
|
-
yield Record.new(header: header.strip,
|
125
|
+
yield Record.new(header: header.strip,
|
126
|
+
seq: sequence,
|
127
|
+
check_fasta_seq: @check_fasta_seq)
|
121
128
|
|
122
129
|
header = line[1, len]
|
123
130
|
sequence = ""
|
@@ -166,7 +173,9 @@ module ParseFasta
|
|
166
173
|
end
|
167
174
|
|
168
175
|
# yield the final seq
|
169
|
-
yield Record.new(header: header.strip,
|
176
|
+
yield Record.new(header: header.strip,
|
177
|
+
seq: sequence,
|
178
|
+
check_fasta_seq: @check_fasta_seq)
|
170
179
|
end
|
171
180
|
|
172
181
|
def parse_fastq_lines file_reader, &b
|
data/lib/parse_fasta/version.rb
CHANGED
data/parse_fasta.gemspec
CHANGED
@@ -30,4 +30,5 @@ Gem::Specification.new do |spec|
|
|
30
30
|
spec.add_development_dependency "yard", "~> 0.8"
|
31
31
|
spec.add_development_dependency "rdiscount"
|
32
32
|
spec.add_development_dependency "coveralls", "~> 0.7"
|
33
|
+
spec.add_development_dependency "benchmark-ips", "~> 2.7", ">= 2.7.2"
|
33
34
|
end
|
@@ -72,11 +72,34 @@ module ParseFasta
|
|
72
72
|
end
|
73
73
|
|
74
74
|
context "when seq has a '>' in it" do
|
75
|
-
|
76
|
-
|
75
|
+
context "with default strictness" do
|
76
|
+
it "raises SequenceFormatError" do
|
77
|
+
str = "actg>sequence 3"
|
77
78
|
|
78
|
-
|
79
|
+
expect { Record.new header: header, seq: str }.
|
79
80
|
to raise_error ParseFasta::Error::SequenceFormatError
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
context "with lenient checking" do
|
85
|
+
it "does NOT raise error" do
|
86
|
+
str = "actg>sequence 3"
|
87
|
+
|
88
|
+
expect { Record.new(header: header,
|
89
|
+
seq: str,
|
90
|
+
check_fasta_seq: false) }.
|
91
|
+
not_to raise_error
|
92
|
+
end
|
93
|
+
|
94
|
+
it "gives the sequence as is" do
|
95
|
+
str = "actg>sequence 3"
|
96
|
+
|
97
|
+
rec = Record.new(header: header,
|
98
|
+
seq: str.dup,
|
99
|
+
check_fasta_seq: false)
|
100
|
+
|
101
|
+
expect(rec.seq).to eq str.tr(" ", "")
|
102
|
+
end
|
80
103
|
end
|
81
104
|
end
|
82
105
|
end
|
@@ -36,6 +36,9 @@ module ParseFasta
|
|
36
36
|
let(:fastq_gz) {
|
37
37
|
File.join test_dir, "seqs.fq.gz"
|
38
38
|
}
|
39
|
+
let(:with_rec_sep_in_seq) {
|
40
|
+
File.join test_dir, "with_rec_sep_in_seq.fa"
|
41
|
+
}
|
39
42
|
|
40
43
|
let(:fasta_records) {
|
41
44
|
[Record.new(header: "empty seq at beginning",
|
@@ -65,6 +68,14 @@ module ParseFasta
|
|
65
68
|
desc: "seq2 +pples",
|
66
69
|
qual: "*ujM")]
|
67
70
|
}
|
71
|
+
let(:with_rec_sep_in_seq_records) {
|
72
|
+
[Record.new(header: "seq1",
|
73
|
+
seq: "AAAA>TTTT",
|
74
|
+
check_fasta_seq: false),
|
75
|
+
Record.new(header: "seq2",
|
76
|
+
seq: "TTTT>AAAA",
|
77
|
+
check_fasta_seq: false)]
|
78
|
+
}
|
68
79
|
|
69
80
|
# to test the line endings
|
70
81
|
let(:line_endings_fastq_records) {
|
@@ -150,6 +161,25 @@ module ParseFasta
|
|
150
161
|
|
151
162
|
include_examples "it yields the records"
|
152
163
|
end
|
164
|
+
|
165
|
+
context "when the fasta file has '>' in a seq" do
|
166
|
+
context "when the check_fasta_seq flag is false" do
|
167
|
+
it "yields records even with '>' in the sequence" do
|
168
|
+
expect { |b|
|
169
|
+
SeqFile.open(with_rec_sep_in_seq,
|
170
|
+
check_fasta_seq: false).each_record &b
|
171
|
+
}.to yield_successive_args(*with_rec_sep_in_seq_records)
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
context "when the check_fasta_seq flag is default" do
|
176
|
+
it "raises SequenceFormatError" do
|
177
|
+
expect { |b|
|
178
|
+
SeqFile.open(with_rec_sep_in_seq).each_record &b
|
179
|
+
}.to raise_error ParseFasta::Error::SequenceFormatError
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
153
183
|
end
|
154
184
|
|
155
185
|
context "input is fastQ" do
|
@@ -235,4 +265,4 @@ module ParseFasta
|
|
235
265
|
end
|
236
266
|
end
|
237
267
|
end
|
238
|
-
end
|
268
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.0
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -108,6 +108,26 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0.7'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: benchmark-ips
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.7'
|
118
|
+
- - ">="
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: 2.7.2
|
121
|
+
type: :development
|
122
|
+
prerelease: false
|
123
|
+
version_requirements: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
125
|
+
- - "~>"
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '2.7'
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: 2.7.2
|
111
131
|
description: Provides nice, programmatic access to fasta and fastq files, as well
|
112
132
|
as providing Sequence and Quality helper classes. No need for BioRuby ;)
|
113
133
|
email:
|
@@ -157,6 +177,7 @@ files:
|
|
157
177
|
- spec/test_files/seqs.fq
|
158
178
|
- spec/test_files/seqs.fq.gz
|
159
179
|
- spec/test_files/test.rb
|
180
|
+
- spec/test_files/with_rec_sep_in_seq.fa
|
160
181
|
homepage: https://github.com/mooreryan/parse_fasta
|
161
182
|
licenses:
|
162
183
|
- 'GPLv3: http://www.gnu.org/licenses/gpl.txt'
|
@@ -203,4 +224,5 @@ test_files:
|
|
203
224
|
- spec/test_files/seqs.fq
|
204
225
|
- spec/test_files/seqs.fq.gz
|
205
226
|
- spec/test_files/test.rb
|
227
|
+
- spec/test_files/with_rec_sep_in_seq.fa
|
206
228
|
has_rdoc:
|