parse_fasta 1.9.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +178 -0
- data/README.md +42 -215
- data/Rakefile +2 -4
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/parse_fasta/error.rb +39 -0
- data/lib/parse_fasta/record.rb +88 -0
- data/lib/parse_fasta/seq_file.rb +221 -114
- data/lib/parse_fasta/version.rb +2 -2
- data/lib/parse_fasta.rb +5 -20
- data/spec/parse_fasta/record_spec.rb +115 -0
- data/spec/parse_fasta/seq_file_spec.rb +238 -0
- data/spec/parse_fasta_spec.rb +25 -0
- data/spec/spec_helper.rb +2 -44
- data/spec/test_files/cr.fa +1 -0
- data/spec/test_files/cr.fa.gz +0 -0
- data/spec/test_files/cr.fq +3 -0
- data/spec/test_files/cr.fq.gz +0 -0
- data/spec/test_files/cr_nl.fa +4 -0
- data/spec/test_files/cr_nl.fa.gz +0 -0
- data/spec/test_files/cr_nl.fq +8 -0
- data/spec/test_files/cr_nl.fq.gz +0 -0
- data/spec/test_files/multi_blob.fa.gz +0 -0
- data/spec/test_files/multi_blob.fq.gz +0 -0
- data/spec/test_files/not_a_seq_file.txt +1 -0
- data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
- data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
- data/spec/test_files/seqs.fa.gz +0 -0
- data/spec/test_files/seqs.fq +8 -0
- data/spec/test_files/seqs.fq.gz +0 -0
- metadata +49 -24
- data/lib/parse_fasta/fasta_file.rb +0 -232
- data/lib/parse_fasta/fastq_file.rb +0 -160
- data/lib/parse_fasta/quality.rb +0 -54
- data/lib/parse_fasta/sequence.rb +0 -174
- data/spec/lib/fasta_file_spec.rb +0 -212
- data/spec/lib/fastq_file_spec.rb +0 -143
- data/spec/lib/quality_spec.rb +0 -51
- data/spec/lib/seq_file_spec.rb +0 -357
- data/spec/lib/sequence_spec.rb +0 -188
- data/test_files/benchmark.rb +0 -99
- data/test_files/bogus.txt +0 -2
- data/test_files/test.fa.gz +0 -0
- data/test_files/test.fq +0 -8
- data/test_files/test.fq.gz +0 -0
data/lib/parse_fasta/seq_file.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2014
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
2
|
# Contact: moorer@udel.edu
|
3
3
|
#
|
4
4
|
# This file is part of parse_fasta.
|
@@ -16,132 +16,239 @@
|
|
16
16
|
# You should have received a copy of the GNU General Public License
|
17
17
|
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
#
|
30
|
-
# @return [Hash] A hash with headers as keys, sequences as the
|
31
|
-
# values (Sequence objects)
|
32
|
-
#
|
33
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
|
34
|
-
# and file is a fastA file
|
35
|
-
def to_hash
|
36
|
-
first_char = get_first_char(self)
|
37
|
-
|
38
|
-
if first_char == '>'
|
39
|
-
FastaFile.open(self).to_hash
|
40
|
-
elsif first_char == '@'
|
41
|
-
FastqFile.open(self).to_hash
|
42
|
-
else
|
43
|
-
raise ArgumentError, "Input does not look like FASTA or FASTQ"
|
19
|
+
require "zlib"
|
20
|
+
|
21
|
+
# Returns the first character of a sequence file, transparently
# handling gzipped input.
#
# @param fname [String] path to a plain-text or gzipped file
#
# @return [String, nil] the first character of the (decompressed)
#   content, or nil if the file has no content
#
# @raise [ParseFasta::Error::FileNotFoundError] if fname does not
#   exist
def get_first_char fname
  # File.exists? was deprecated for years and removed in Ruby 3.2.
  unless File.exist? fname
    raise ParseFasta::Error::FileNotFoundError,
          "No such file or directory -- #{fname}"
  end

  begin
    f = Zlib::GzipReader.open fname
  rescue Zlib::GzipFile::Error
    # Not gzip data, so read it as an ordinary file.
    f = File.open fname
  end

  begin
    # getc returns nil at EOF, so an empty file yields nil rather
    # than raising StopIteration the way Enumerator#peek does.
    f.getc
  ensure
    # Always release the handle, even if reading raised.
    f.close
  end
end
|
39
|
+
|
40
|
+
# Decides whether a file is fastA or fastQ by looking at its first
# character.
#
# @param fname [String] path to the file to classify
#
# @return [Symbol] :fasta when the file starts with ">", :fastq when
#   it starts with "@"
#
# @raise [ParseFasta::Error::DataFormatError] if the first character
#   is neither ">" nor "@"
def check_file fname
  case get_first_char(fname)
  when ">" then :fasta
  when "@" then :fastq
  else
    raise ParseFasta::Error::DataFormatError,
          "The file does not look like fastA or fastQ " +
          "-- #{fname}"
  end
end
|
53
|
+
|
54
|
+
module ParseFasta
|
55
|
+
class SeqFile
|
56
|
+
# @!attribute type
|
57
|
+
# @return [Symbol] the type of the SeqFile (:fasta or :fastq)
|
58
|
+
attr_accessor :type
|
59
|
+
|
60
|
+
# @param fname [String] the name of the fastA or fastQ file to
|
61
|
+
# parse
|
62
|
+
#
|
63
|
+
# @raise [ParseFasta::Error::FileNotFoundError] if the file is not
|
64
|
+
# found
|
65
|
+
# @raise [ParseFasta::Error::DataFormatError] if the file doesn't
|
66
|
+
# start with a '>' or a '@'
|
67
|
+
def initialize fname
|
68
|
+
type = check_file fname
|
69
|
+
|
70
|
+
@fname = fname
|
71
|
+
@type = type
|
72
|
+
end
|
46
73
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
FastaFile.open(self).each_record do |header, sequence|
|
87
|
-
yield(header, sequence)
|
74
|
+
# An alias for SeqFile.new
|
75
|
+
#
|
76
|
+
# @return [SeqFile] a SeqFile object
|
77
|
+
def self.open fname
|
78
|
+
self.new fname
|
79
|
+
end
|
80
|
+
|
81
|
+
# Analogous to IO#each_line, SeqFile#each_record is used to go
|
82
|
+
# through a fastA or fastQ file record by record. It will accept
|
83
|
+
# gzipped files as well.
|
84
|
+
#
|
85
|
+
# If the input is a fastA file, then the record that is yielded
|
86
|
+
# will have the desc and qual instance variables be nil. If it is
|
87
|
+
# a fastQ record then those instance variables will not be nil.
|
88
|
+
#
|
89
|
+
# @example Parsing a fastA file
|
90
|
+
# ParseFasta::SeqFile.open("seqs.fa").each_record do |rec|
|
91
|
+
# puts [rec.header, rec.seq].join "\t"
|
92
|
+
#
|
93
|
+
# rec.desc.nil? #=> true
|
94
|
+
# rec.qual.nil? #=> true
|
95
|
+
# end
|
96
|
+
# @example Parsing a gzipped fastQ file
|
97
|
+
# ParseFasta::SeqFile.open("seqs.fq.gz").each_record do |rec|
|
98
|
+
# puts [rec.header, rec.seq, rec.desc, rec.qual].join "\t"
|
99
|
+
# end
|
100
|
+
#
|
101
|
+
# @yieldparam record [ParseFasta::Record] A Record object with all
|
102
|
+
# the info of the record
|
103
|
+
#
|
104
|
+
# @raise [ParseFasta::Error::SequenceFormatError] if a fastA file
|
105
|
+
# contains a record with a '>' character in the header
|
106
|
+
def each_record &b
|
107
|
+
line_parser = "parse_#{@type}_lines"
|
108
|
+
|
109
|
+
if gzipped? @fname
|
110
|
+
each_record_gzipped line_parser, &b
|
111
|
+
else
|
112
|
+
each_record_non_gzipped line_parser, &b
|
88
113
|
end
|
89
|
-
|
90
|
-
|
91
|
-
|
114
|
+
end
|
115
|
+
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
def each_record_non_gzipped line_parser, &b
|
120
|
+
File.open(@fname, "rt") do |f|
|
121
|
+
self.send line_parser, f, &b
|
92
122
|
end
|
93
|
-
else
|
94
|
-
raise ArgumentError, "Input does not look like FASTA or FASTQ"
|
95
123
|
end
|
96
|
-
end
|
97
124
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
125
|
+
# Runs the given line parser over every gzip member ("blob") stored
# in @fname, so that files made by concatenating several gzip
# streams are read in full.
#
# @param line_parser [String, Symbol] name of the method that is
#   sent the gzip reader and the block
# @param b [Proc] block forwarded to the line parser
def each_record_gzipped line_parser, &b
  # Binary mode: the compressed bytes must never be subject to
  # newline or encoding translation.
  File.open(@fname, "rb") do |file|
    loop do
      gz_reader = Zlib::GzipReader.new file

      send line_parser, gz_reader, &b

      # Bytes the reader pulled from the file beyond the end of the
      # member it just decoded (nil if there were none).
      unused = gz_reader.unused

      # Release the reader without closing the underlying file.
      gz_reader.finish

      # Step back to the start of the next member. The offset is
      # relative to the current position: the reader may not have
      # reached the end of the file yet, so seeking from the end
      # would land in the middle of a later member.
      file.seek(-unused.length, IO::SEEK_CUR) if unused

      # Nothing left in the file means there are no more blobs.
      break if file.eof?
    end
  end
end
|
145
|
+
|
146
|
+
def parse_fasta_line line, header, sequence, &b
|
147
|
+
line.chomp!
|
148
|
+
len = line.length
|
149
|
+
|
150
|
+
if header.empty? && line.start_with?(">")
|
151
|
+
header = line[1, len] # drop the '>'
|
152
|
+
elsif line.start_with? ">"
|
153
|
+
yield Record.new(header: header.strip, seq: sequence)
|
154
|
+
|
155
|
+
header = line[1, len]
|
156
|
+
sequence = ""
|
157
|
+
else
|
158
|
+
sequence << line
|
126
159
|
end
|
127
|
-
|
128
|
-
|
129
|
-
|
160
|
+
|
161
|
+
[header, sequence]
|
162
|
+
end
|
163
|
+
|
164
|
+
def parse_fastq_line line, header, seq, desc, qual, count, &b
|
165
|
+
line.chomp!
|
166
|
+
|
167
|
+
case count
|
168
|
+
when 0
|
169
|
+
header = line[1..-1]
|
170
|
+
when 1
|
171
|
+
seq = line
|
172
|
+
when 2
|
173
|
+
desc = line[1..-1]
|
174
|
+
when 3
|
175
|
+
count = -1
|
176
|
+
qual = line
|
177
|
+
|
178
|
+
yield Record.new(header: header,
|
179
|
+
seq: seq,
|
180
|
+
desc: desc,
|
181
|
+
qual: qual)
|
182
|
+
else
|
183
|
+
raise ParseFasta::Error::ParseFastaError,
|
184
|
+
"Something went wrong in parse_fastq_line"
|
130
185
|
end
|
131
|
-
|
132
|
-
|
186
|
+
|
187
|
+
count += 1
|
188
|
+
|
189
|
+
[header, seq, desc, qual, count]
|
133
190
|
end
|
134
|
-
end
|
135
191
|
|
136
|
-
|
192
|
+
def parse_fasta_lines file_reader, &b
|
193
|
+
header = ""
|
194
|
+
sequence = ""
|
137
195
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
196
|
+
line_reader = which_line_reader file_reader
|
197
|
+
file_reader.send(*line_reader) do |line|
|
198
|
+
header, sequence = parse_fasta_line line, header, sequence, &b
|
199
|
+
end
|
200
|
+
|
201
|
+
# yield the final seq
|
202
|
+
yield Record.new(header: header.strip, seq: sequence)
|
203
|
+
end
|
204
|
+
|
205
|
+
def parse_fastq_lines file_reader, &b
|
206
|
+
count = 0
|
207
|
+
header = ""
|
208
|
+
seq = ""
|
209
|
+
desc = ""
|
210
|
+
qual = ""
|
211
|
+
|
212
|
+
line_reader = which_line_reader file_reader
|
213
|
+
file_reader.send(*line_reader) do |line|
|
214
|
+
header, seq, desc, qual, count =
|
215
|
+
parse_fastq_line line, header, seq, desc, qual, count, &b
|
216
|
+
end
|
143
217
|
end
|
144
218
|
|
145
|
-
|
219
|
+
# Reports whether a file can be opened as gzip data.
#
# @param fname [String] path of the file to probe
#
# @return [Boolean] true if Zlib::GzipReader accepts the file, false
#   if it raises Zlib::GzipFile::Error
def gzipped? fname
  # The block form closes the reader when the block returns, so no
  # ensure clause or nil guard is needed.
  Zlib::GzipReader.open(fname) { true }
rescue Zlib::GzipFile::Error
  false
end
|
229
|
+
|
230
|
+
# The Zlib::GzipReader can't handle files where the line separator
|
231
|
+
# is \r. This could all be avoided by using IO.popen("gzip -cd
|
232
|
+
# #{fname}", "rt"), but will gzip always be available?
|
233
|
+
def which_line_reader file_reader
|
234
|
+
line_reader = [:each_line]
|
235
|
+
# a valid fasta file must have at least two lines, the header
|
236
|
+
# and the sequence
|
237
|
+
begin
|
238
|
+
enum = file_reader.each_line
|
239
|
+
# if this was ruby v2.3, then we could just call .size on enum
|
240
|
+
2.times do
|
241
|
+
enum.next
|
242
|
+
end
|
243
|
+
rescue StopIteration
|
244
|
+
# Zlib::GzipReader can handle \n and \r\n, but not \r, so if
|
245
|
+
# we get here, the file has \r only for line endings
|
246
|
+
line_reader = [:each, "\r"]
|
247
|
+
ensure
|
248
|
+
file_reader.rewind
|
249
|
+
end
|
250
|
+
|
251
|
+
line_reader
|
252
|
+
end
|
146
253
|
end
|
147
254
|
end
|
data/lib/parse_fasta/version.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2014
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
2
|
# Contact: moorer@udel.edu
|
3
3
|
#
|
4
4
|
# This file is part of parse_fasta.
|
@@ -17,5 +17,5 @@
|
|
17
17
|
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
18
|
|
19
19
|
module ParseFasta
|
20
|
-
VERSION = "
|
20
|
+
VERSION = "2.0.0"
|
21
21
|
end
|
data/lib/parse_fasta.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2014
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
2
|
# Contact: moorer@udel.edu
|
3
3
|
#
|
4
4
|
# This file is part of parse_fasta.
|
@@ -16,25 +16,10 @@
|
|
16
16
|
# You should have received a copy of the GNU General Public License
|
17
17
|
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
18
|
|
19
|
-
require
|
20
|
-
require
|
21
|
-
require
|
22
|
-
require
|
23
|
-
require 'parse_fasta/sequence'
|
24
|
-
require 'parse_fasta/quality'
|
19
|
+
require "parse_fasta/version"
|
20
|
+
require "parse_fasta/record"
|
21
|
+
require "parse_fasta/seq_file"
|
22
|
+
require "parse_fasta/error"
|
25
23
|
|
26
24
|
module ParseFasta
|
27
|
-
|
28
|
-
class Error < StandardError
|
29
|
-
end
|
30
|
-
|
31
|
-
# Error raised when FASTA file is malformed
|
32
|
-
class DataFormatError < IOError
|
33
|
-
def message
|
34
|
-
"Data format error -- check input file"
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
class SequenceFormatError < Error
|
39
|
-
end
|
40
25
|
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
|
+
# Contact: moorer@udel.edu
|
3
|
+
#
|
4
|
+
# This file is part of parse_fasta.
|
5
|
+
#
|
6
|
+
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
|
19
|
+
require "spec_helper"
|
20
|
+
|
21
|
+
module ParseFasta
|
22
|
+
describe Record do
|
23
|
+
let(:header) { "apple pie is good"}
|
24
|
+
let(:seq) { "ACTG" }
|
25
|
+
let(:desc) { "apple" }
|
26
|
+
let(:qual) { "abcd" }
|
27
|
+
|
28
|
+
let(:fasta_rec) {
|
29
|
+
Record.new header: header,
|
30
|
+
seq: "A C\t\t T G\r"
|
31
|
+
}
|
32
|
+
let(:fastq_rec) {
|
33
|
+
Record.new header: header,
|
34
|
+
seq: "A C\t\t T G\r",
|
35
|
+
desc: desc,
|
36
|
+
qual: " a b \tcd "
|
37
|
+
}
|
38
|
+
|
39
|
+
describe "::new" do
|
40
|
+
context "fastA input" do
|
41
|
+
it "sets :header" do
|
42
|
+
expect(fasta_rec.header).to eq header
|
43
|
+
end
|
44
|
+
|
45
|
+
it "sets :seq" do
|
46
|
+
expect(fasta_rec.seq).to eq seq
|
47
|
+
end
|
48
|
+
|
49
|
+
it "sets :desc to nil" do
|
50
|
+
expect(fasta_rec.desc).to eq nil
|
51
|
+
end
|
52
|
+
|
53
|
+
it "sets :qual to nil" do
|
54
|
+
expect(fasta_rec.qual).to eq nil
|
55
|
+
end
|
56
|
+
|
57
|
+
context "when seq has a '>' in it" do
|
58
|
+
it "raises SequenceFormatError" do
|
59
|
+
str = "actg>sequence 3"
|
60
|
+
|
61
|
+
expect { Record.new header: header, seq: str }.
|
62
|
+
to raise_error ParseFasta::Error::SequenceFormatError
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
# A fastQ record carries all four fields, so every accessor is
# expected to be populated (unlike the fastA case, where desc and
# qual are nil).
context "fastQ input" do
  it "sets :header" do
    expect(fastq_rec.header).to eq header
  end

  it "sets :seq" do
    expect(fastq_rec.seq).to eq seq
  end

  it "sets :desc" do
    expect(fastq_rec.desc).to eq desc
  end

  it "sets :qual" do
    expect(fastq_rec.qual).to eq qual
  end

  context "when seq has a '>' in it" do
    it "does NOT raise SequenceFormatError" do
      str = "actg>sequence 3"

      expect { Record.new header: header,
                          seq: str,
                          desc: desc,
                          qual: qual }.
        not_to raise_error
    end
  end
end
|
96
|
+
end
|
97
|
+
|
98
|
+
describe "#==" do
|
99
|
+
it "returns true if each of the attr_accessors are ==" do
|
100
|
+
rec1 = Record.new header: "a", seq: "a", desc: "", qual: "A"
|
101
|
+
rec2 = Record.new header: "a", seq: "a", desc: "", qual: "A"
|
102
|
+
|
103
|
+
expect(rec1 == rec2).to eq true
|
104
|
+
end
|
105
|
+
|
106
|
+
it "returns false otherwise" do
|
107
|
+
rec1 = Record.new header: "a", seq: "a", desc: "", qual: "A"
|
108
|
+
rec2 = Record.new header: "a", seq: "a", desc: "", qual: "b"
|
109
|
+
|
110
|
+
expect(rec1 == rec2).to eq false
|
111
|
+
end
|
112
|
+
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|