parse_fasta 2.2.0 → 2.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/CHANGELOG.md +4 -0
- data/README.md +10 -0
- data/lib/parse_fasta/record.rb +15 -4
- data/lib/parse_fasta/seq_file.rb +15 -6
- data/lib/parse_fasta/version.rb +1 -1
- data/parse_fasta.gemspec +1 -0
- data/spec/parse_fasta/record_spec.rb +26 -3
- data/spec/parse_fasta/seq_file_spec.rb +31 -1
- data/spec/test_files/with_rec_sep_in_seq.fa +4 -0
- metadata +24 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5dc5b2c4063257a491082c62b9ef6faaecd694fa
|
4
|
+
data.tar.gz: 0f45d1a46360f65b83cada74be2b59e84d6c3179
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 26898676ec187d85ded69405b1b193ece436e49693f4cdcc32dd72e313b3143b3d1bbf14d749bad0e5ae7ff45c39b8fb31056bf2fc3aab8e9eb10dee54fbc326
|
7
|
+
data.tar.gz: 0fd4292d811cec77a2f0cb20c8f89fdb06a74ec3f6b2488ae0505a42abfcf4722f81b8f446fd4f940b918b1798f9dff44974214b1b3abd4469df83902267d8aa
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -24,6 +24,8 @@ Or install it yourself as:
|
|
24
24
|
|
25
25
|
Provides nice, programmatic access to fasta and fastq files. It's faster and more lightweight than BioRuby. And more fun!
|
26
26
|
|
27
|
+
It takes care of a lot of whacky edge cases like parsing multi-blob gzipped files, and being strict on formatting by default.
|
28
|
+
|
27
29
|
## Documentation ##
|
28
30
|
|
29
31
|
Checkout
|
@@ -93,3 +95,11 @@ ParseFasta::SeqFile.open(ARGV[0]).each_record do |rec|
|
|
93
95
|
puts rec
|
94
96
|
end
|
95
97
|
```
|
98
|
+
|
99
|
+
Sometimes your fasta file might have record separators (`>`) within the "sequence". For example, CD-HIT's `.clstr` files have headers within what would be the sequence part of the record. `ParseFasta` is really strict about formatting and will raise an error when trying to read these types of files. If you would like to parse them, use the `check_fasta_seq: false` flag like so:
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
ParseFasta::SeqFile.open(ARGV[0], check_fasta_seq: false).each_record do |rec|
|
103
|
+
puts rec
|
104
|
+
end
|
105
|
+
```
|
data/lib/parse_fasta/record.rb
CHANGED
@@ -39,6 +39,8 @@ module ParseFasta
|
|
39
39
|
#
|
40
40
|
# @example Init a new Record object for a fastA record
|
41
41
|
# Record.new header: "apple", seq: "actg"
|
42
|
+
# @example Init a new Record object for a fastA record without checking for '>' in the sequence.
|
43
|
+
# Record.new header: "apple", seq: "pie>good", check_fasta_seq: false
|
42
44
|
# @example Init a new Record object for a fastQ record
|
43
45
|
# Record.new header: "apple", seq: "actd", desc: "", qual: "IIII"
|
44
46
|
#
|
@@ -46,9 +48,16 @@ module ParseFasta
|
|
46
48
|
# @param seq [String] the sequence of the record
|
47
49
|
# @param desc [String] the description line of a fastQ record
|
48
50
|
# @param qual [String] the quality string of a fastQ record
|
51
|
+
# @param check_fasta_seq [Bool] Pass false if you don't want to
|
52
|
+
# check for '>' characters in the sequence. Defaults to true,
|
53
|
+
# which checks for '>' in the sequence and raises an error.
|
49
54
|
#
|
50
|
-
# @raise [ParseFasta::Error::SequenceFormatError] if a fastA
|
51
|
-
# character in it
|
55
|
+
# @raise [ParseFasta::Error::SequenceFormatError] if a fastA
|
56
|
+
# sequence has a '>' character in it, and :check_fasta_seq is
|
57
|
+
# NOT set to false.
|
58
|
+
#
|
59
|
+
# @todo This is destructive with respect to the input seq
|
60
|
+
# arg. Does it need to be?
|
52
61
|
def initialize args = {}
|
53
62
|
@header = args.fetch :header
|
54
63
|
@id = @header.split(" ")[0]
|
@@ -61,9 +70,11 @@ module ParseFasta
|
|
61
70
|
seq = args.fetch(:seq)
|
62
71
|
seq.tr!(" \t\n\r", "")
|
63
72
|
|
64
|
-
|
73
|
+
do_check_fasta_seq = args.fetch :check_fasta_seq, true
|
74
|
+
|
75
|
+
if fastq? || (!fastq? && !do_check_fasta_seq)
|
65
76
|
@seq = seq
|
66
|
-
else
|
77
|
+
else
|
67
78
|
@seq = check_fasta_seq(seq)
|
68
79
|
end
|
69
80
|
end
|
data/lib/parse_fasta/seq_file.rb
CHANGED
@@ -26,14 +26,18 @@ module ParseFasta
|
|
26
26
|
|
27
27
|
# @param fname [String] the name of the fastA or fastQ file to
|
28
28
|
# parse
|
29
|
+
# @param type [Symbol] whether the file is :fasta or :fastq
|
30
|
+
# @param check_fasta_seq [Bool] keyword arg for whether to check
|
31
|
+
# for '>' in the sequence of fastA files.
|
29
32
|
#
|
30
33
|
# @raise [ParseFasta::Error::FileNotFoundError] if the file is not
|
31
34
|
# found
|
32
35
|
# @raise [ParseFasta::Error::DataFormatError] if the file doesn't
|
33
36
|
# start with a '>' or a '@'
|
34
|
-
def initialize fname
|
37
|
+
def initialize fname, args = {}
|
35
38
|
type = check_file fname
|
36
39
|
|
40
|
+
@check_fasta_seq = args.fetch :check_fasta_seq, true
|
37
41
|
@fname = fname
|
38
42
|
@type = type
|
39
43
|
end
|
@@ -41,8 +45,8 @@ module ParseFasta
|
|
41
45
|
# An alias for SeqFile.new
|
42
46
|
#
|
43
47
|
# @return [SeqFile] a SeqFile object
|
44
|
-
def self.open fname
|
45
|
-
self.new fname
|
48
|
+
def self.open fname, args = {}
|
49
|
+
self.new fname, args
|
46
50
|
end
|
47
51
|
|
48
52
|
# Analogous to IO#each_line, SeqFile#each_record is used to go
|
@@ -69,7 +73,8 @@ module ParseFasta
|
|
69
73
|
# the info of the record
|
70
74
|
#
|
71
75
|
# @raise [ParseFasta::Error::SequenceFormatError] if a fastA file
|
72
|
-
# contains a record with a '>' character in the header
|
76
|
+
# contains a record with a '>' character in the header, and the
|
77
|
+
# SeqFile object was not initialized with check_fasta_seq: false
|
73
78
|
def each_record &b
|
74
79
|
line_parser = "parse_#{@type}_lines"
|
75
80
|
|
@@ -117,7 +122,9 @@ module ParseFasta
|
|
117
122
|
if header.empty? && line.start_with?(">")
|
118
123
|
header = line[1, len] # drop the '>'
|
119
124
|
elsif line.start_with? ">"
|
120
|
-
yield Record.new(header: header.strip,
|
125
|
+
yield Record.new(header: header.strip,
|
126
|
+
seq: sequence,
|
127
|
+
check_fasta_seq: @check_fasta_seq)
|
121
128
|
|
122
129
|
header = line[1, len]
|
123
130
|
sequence = ""
|
@@ -166,7 +173,9 @@ module ParseFasta
|
|
166
173
|
end
|
167
174
|
|
168
175
|
# yield the final seq
|
169
|
-
yield Record.new(header: header.strip,
|
176
|
+
yield Record.new(header: header.strip,
|
177
|
+
seq: sequence,
|
178
|
+
check_fasta_seq: @check_fasta_seq)
|
170
179
|
end
|
171
180
|
|
172
181
|
def parse_fastq_lines file_reader, &b
|
data/lib/parse_fasta/version.rb
CHANGED
data/parse_fasta.gemspec
CHANGED
@@ -30,4 +30,5 @@ Gem::Specification.new do |spec|
|
|
30
30
|
spec.add_development_dependency "yard", "~> 0.8"
|
31
31
|
spec.add_development_dependency "rdiscount"
|
32
32
|
spec.add_development_dependency "coveralls", "~> 0.7"
|
33
|
+
spec.add_development_dependency "benchmark-ips", "~> 2.7", ">= 2.7.2"
|
33
34
|
end
|
@@ -72,11 +72,34 @@ module ParseFasta
|
|
72
72
|
end
|
73
73
|
|
74
74
|
context "when seq has a '>' in it" do
|
75
|
-
|
76
|
-
|
75
|
+
context "with default strictness" do
|
76
|
+
it "raises SequenceFormatError" do
|
77
|
+
str = "actg>sequence 3"
|
77
78
|
|
78
|
-
|
79
|
+
expect { Record.new header: header, seq: str }.
|
79
80
|
to raise_error ParseFasta::Error::SequenceFormatError
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
context "with lenient checking" do
|
85
|
+
it "does NOT raise error" do
|
86
|
+
str = "actg>sequence 3"
|
87
|
+
|
88
|
+
expect { Record.new(header: header,
|
89
|
+
seq: str,
|
90
|
+
check_fasta_seq: false) }.
|
91
|
+
not_to raise_error
|
92
|
+
end
|
93
|
+
|
94
|
+
it "gives the sequence as is" do
|
95
|
+
str = "actg>sequence 3"
|
96
|
+
|
97
|
+
rec = Record.new(header: header,
|
98
|
+
seq: str.dup,
|
99
|
+
check_fasta_seq: false)
|
100
|
+
|
101
|
+
expect(rec.seq).to eq str.tr(" ", "")
|
102
|
+
end
|
80
103
|
end
|
81
104
|
end
|
82
105
|
end
|
@@ -36,6 +36,9 @@ module ParseFasta
|
|
36
36
|
let(:fastq_gz) {
|
37
37
|
File.join test_dir, "seqs.fq.gz"
|
38
38
|
}
|
39
|
+
let(:with_rec_sep_in_seq) {
|
40
|
+
File.join test_dir, "with_rec_sep_in_seq.fa"
|
41
|
+
}
|
39
42
|
|
40
43
|
let(:fasta_records) {
|
41
44
|
[Record.new(header: "empty seq at beginning",
|
@@ -65,6 +68,14 @@ module ParseFasta
|
|
65
68
|
desc: "seq2 +pples",
|
66
69
|
qual: "*ujM")]
|
67
70
|
}
|
71
|
+
let(:with_rec_sep_in_seq_records) {
|
72
|
+
[Record.new(header: "seq1",
|
73
|
+
seq: "AAAA>TTTT",
|
74
|
+
check_fasta_seq: false),
|
75
|
+
Record.new(header: "seq2",
|
76
|
+
seq: "TTTT>AAAA",
|
77
|
+
check_fasta_seq: false)]
|
78
|
+
}
|
68
79
|
|
69
80
|
# to test the line endings
|
70
81
|
let(:line_endings_fastq_records) {
|
@@ -150,6 +161,25 @@ module ParseFasta
|
|
150
161
|
|
151
162
|
include_examples "it yields the records"
|
152
163
|
end
|
164
|
+
|
165
|
+
context "when the fasta file has '>' in a seq" do
|
166
|
+
context "when the check_fasta_seq flag is false" do
|
167
|
+
it "yields records even with '>' in the sequence" do
|
168
|
+
expect { |b|
|
169
|
+
SeqFile.open(with_rec_sep_in_seq,
|
170
|
+
check_fasta_seq: false).each_record &b
|
171
|
+
}.to yield_successive_args(*with_rec_sep_in_seq_records)
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
175
|
+
context "when the check_fasta_seq flag is default" do
|
176
|
+
it "raises SequenceFormatError" do
|
177
|
+
expect { |b|
|
178
|
+
SeqFile.open(with_rec_sep_in_seq).each_record &b
|
179
|
+
}.to raise_error ParseFasta::Error::SequenceFormatError
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
153
183
|
end
|
154
184
|
|
155
185
|
context "input is fastQ" do
|
@@ -235,4 +265,4 @@ module ParseFasta
|
|
235
265
|
end
|
236
266
|
end
|
237
267
|
end
|
238
|
-
end
|
268
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parse_fasta
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -108,6 +108,26 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0.7'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: benchmark-ips
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.7'
|
118
|
+
- - ">="
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: 2.7.2
|
121
|
+
type: :development
|
122
|
+
prerelease: false
|
123
|
+
version_requirements: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
125
|
+
- - "~>"
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '2.7'
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: 2.7.2
|
111
131
|
description: Provides nice, programmatic access to fasta and fastq files, as well
|
112
132
|
as providing Sequence and Quality helper classes. No need for BioRuby ;)
|
113
133
|
email:
|
@@ -157,6 +177,7 @@ files:
|
|
157
177
|
- spec/test_files/seqs.fq
|
158
178
|
- spec/test_files/seqs.fq.gz
|
159
179
|
- spec/test_files/test.rb
|
180
|
+
- spec/test_files/with_rec_sep_in_seq.fa
|
160
181
|
homepage: https://github.com/mooreryan/parse_fasta
|
161
182
|
licenses:
|
162
183
|
- 'GPLv3: http://www.gnu.org/licenses/gpl.txt'
|
@@ -203,4 +224,5 @@ test_files:
|
|
203
224
|
- spec/test_files/seqs.fq
|
204
225
|
- spec/test_files/seqs.fq.gz
|
205
226
|
- spec/test_files/test.rb
|
227
|
+
- spec/test_files/with_rec_sep_in_seq.fa
|
206
228
|
has_rdoc:
|