parse_fasta 1.9.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +178 -0
- data/README.md +42 -215
- data/Rakefile +2 -4
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/parse_fasta/error.rb +39 -0
- data/lib/parse_fasta/record.rb +88 -0
- data/lib/parse_fasta/seq_file.rb +221 -114
- data/lib/parse_fasta/version.rb +2 -2
- data/lib/parse_fasta.rb +5 -20
- data/spec/parse_fasta/record_spec.rb +115 -0
- data/spec/parse_fasta/seq_file_spec.rb +238 -0
- data/spec/parse_fasta_spec.rb +25 -0
- data/spec/spec_helper.rb +2 -44
- data/spec/test_files/cr.fa +1 -0
- data/spec/test_files/cr.fa.gz +0 -0
- data/spec/test_files/cr.fq +3 -0
- data/spec/test_files/cr.fq.gz +0 -0
- data/spec/test_files/cr_nl.fa +4 -0
- data/spec/test_files/cr_nl.fa.gz +0 -0
- data/spec/test_files/cr_nl.fq +8 -0
- data/spec/test_files/cr_nl.fq.gz +0 -0
- data/spec/test_files/multi_blob.fa.gz +0 -0
- data/spec/test_files/multi_blob.fq.gz +0 -0
- data/spec/test_files/not_a_seq_file.txt +1 -0
- data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
- data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
- data/spec/test_files/seqs.fa.gz +0 -0
- data/spec/test_files/seqs.fq +8 -0
- data/spec/test_files/seqs.fq.gz +0 -0
- metadata +49 -24
- data/lib/parse_fasta/fasta_file.rb +0 -232
- data/lib/parse_fasta/fastq_file.rb +0 -160
- data/lib/parse_fasta/quality.rb +0 -54
- data/lib/parse_fasta/sequence.rb +0 -174
- data/spec/lib/fasta_file_spec.rb +0 -212
- data/spec/lib/fastq_file_spec.rb +0 -143
- data/spec/lib/quality_spec.rb +0 -51
- data/spec/lib/seq_file_spec.rb +0 -357
- data/spec/lib/sequence_spec.rb +0 -188
- data/test_files/benchmark.rb +0 -99
- data/test_files/bogus.txt +0 -2
- data/test_files/test.fa.gz +0 -0
- data/test_files/test.fq +0 -8
- data/test_files/test.fq.gz +0 -0
data/lib/parse_fasta/seq_file.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2014
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
2
|
# Contact: moorer@udel.edu
|
3
3
|
#
|
4
4
|
# This file is part of parse_fasta.
|
@@ -16,132 +16,239 @@
|
|
16
16
|
# You should have received a copy of the GNU General Public License
|
17
17
|
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
#
|
30
|
-
# @return [Hash] A hash with headers as keys, sequences as the
|
31
|
-
# values (Sequence objects)
|
32
|
-
#
|
33
|
-
# @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
|
34
|
-
# and file is a fastA file
|
35
|
-
def to_hash
|
36
|
-
first_char = get_first_char(self)
|
37
|
-
|
38
|
-
if first_char == '>'
|
39
|
-
FastaFile.open(self).to_hash
|
40
|
-
elsif first_char == '@'
|
41
|
-
FastqFile.open(self).to_hash
|
42
|
-
else
|
43
|
-
raise ArgumentError, "Input does not look like FASTA or FASTQ"
|
19
|
+
require "zlib"
|
20
|
+
|
21
|
+
def get_first_char fname
|
22
|
+
if File.exists? fname
|
23
|
+
begin
|
24
|
+
f = Zlib::GzipReader.open fname
|
25
|
+
rescue Zlib::GzipFile::Error
|
26
|
+
f = File.open fname
|
27
|
+
ensure
|
28
|
+
# f.close
|
44
29
|
end
|
30
|
+
|
31
|
+
first_char = f.each_char.peek[0]
|
32
|
+
f.close
|
33
|
+
return first_char
|
34
|
+
else
|
35
|
+
raise ParseFasta::Error::FileNotFoundError,
|
36
|
+
"No such file or directory -- #{fname}"
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def check_file fname
|
41
|
+
first_char = get_first_char fname
|
42
|
+
|
43
|
+
if first_char == ">"
|
44
|
+
:fasta
|
45
|
+
elsif first_char == "@"
|
46
|
+
:fastq
|
47
|
+
else
|
48
|
+
raise ParseFasta::Error::DataFormatError,
|
49
|
+
"The file does not look like fastA or fastQ " +
|
50
|
+
"-- #{fname}"
|
45
51
|
end
|
52
|
+
end
|
53
|
+
|
54
|
+
module ParseFasta
|
55
|
+
class SeqFile
|
56
|
+
# @!attribute type
|
57
|
+
# @return [Symbol] the type of the SeqFile (:fasta or :fastq)
|
58
|
+
attr_accessor :type
|
59
|
+
|
60
|
+
# @param fname [String] the name of the fastA or fastQ file to
|
61
|
+
# parse
|
62
|
+
#
|
63
|
+
# @raise [ParseFasta::Error::FileNotFoundError] if the file is not
|
64
|
+
# found
|
65
|
+
# @raise [ParseFasta::Error::DataFormatError] if the file doesn't
|
66
|
+
# start with a '>' or a '@'
|
67
|
+
def initialize fname
|
68
|
+
type = check_file fname
|
69
|
+
|
70
|
+
@fname = fname
|
71
|
+
@type = type
|
72
|
+
end
|
46
73
|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
FastaFile.open(self).each_record do |header, sequence|
|
87
|
-
yield(header, sequence)
|
74
|
+
# An alias for SeqFile.new
|
75
|
+
#
|
76
|
+
# @return [SeqFile] a SeqFile object
|
77
|
+
def self.open fname
|
78
|
+
self.new fname
|
79
|
+
end
|
80
|
+
|
81
|
+
# Analogous to IO#each_line, SeqFile#each_record is used to go
|
82
|
+
# through a fastA or fastQ file record by record. It will accept
|
83
|
+
# gzipped files as well.
|
84
|
+
#
|
85
|
+
# If the input is a fastA file, then the record that is yielded
|
86
|
+
# will have the desc and qual instance variables be nil. If it is
|
87
|
+
# a fastQ record then those instance variables will not be nil.
|
88
|
+
#
|
89
|
+
# @example Parsing a fastA file
|
90
|
+
# ParseFasta::SeqFile.open("seqs.fa").each_record do |rec|
|
91
|
+
# puts [rec.header, rec.seq].join "\t"
|
92
|
+
#
|
93
|
+
# rec.desc.nil? #=> true
|
94
|
+
# rec.qual.nil? #=> true
|
95
|
+
# end
|
96
|
+
# @example Parsing a gzipped fastQ file
|
97
|
+
# ParseFasta::SeqFile.open("seqs.fq.gz").each_record do |rec|
|
98
|
+
# puts [rec.header, rec.seq, rec.desc, rec.qual].join "\t"
|
99
|
+
# end
|
100
|
+
#
|
101
|
+
# @yieldparam record [ParseFasta::Record] A Record object with all
|
102
|
+
# the info of the record
|
103
|
+
#
|
104
|
+
# @raise [ParseFasta::Error::SequenceFormatError] if a fastA file
|
105
|
+
# contains a record with a '>' character in the sequence
|
106
|
+
def each_record &b
|
107
|
+
line_parser = "parse_#{@type}_lines"
|
108
|
+
|
109
|
+
if gzipped? @fname
|
110
|
+
each_record_gzipped line_parser, &b
|
111
|
+
else
|
112
|
+
each_record_non_gzipped line_parser, &b
|
88
113
|
end
|
89
|
-
|
90
|
-
|
91
|
-
|
114
|
+
end
|
115
|
+
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
def each_record_non_gzipped line_parser, &b
|
120
|
+
File.open(@fname, "rt") do |f|
|
121
|
+
self.send line_parser, f, &b
|
92
122
|
end
|
93
|
-
else
|
94
|
-
raise ArgumentError, "Input does not look like FASTA or FASTQ"
|
95
123
|
end
|
96
|
-
end
|
97
124
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
125
|
+
def each_record_gzipped line_parser, &b
|
126
|
+
File.open(@fname, "rt") do |file|
|
127
|
+
loop do
|
128
|
+
begin
|
129
|
+
gz_reader = Zlib::GzipReader.new file
|
130
|
+
|
131
|
+
self.send line_parser, gz_reader, &b
|
132
|
+
|
133
|
+
# check if there are any more blobs to read
|
134
|
+
if (unused = gz_reader.unused)
|
135
|
+
# rewind to the start of the last blob
|
136
|
+
file.seek -unused.length, IO::SEEK_END
|
137
|
+
else
|
138
|
+
# there are no more blobs to read
|
139
|
+
break
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
def parse_fasta_line line, header, sequence, &b
|
147
|
+
line.chomp!
|
148
|
+
len = line.length
|
149
|
+
|
150
|
+
if header.empty? && line.start_with?(">")
|
151
|
+
header = line[1, len] # drop the '>'
|
152
|
+
elsif line.start_with? ">"
|
153
|
+
yield Record.new(header: header.strip, seq: sequence)
|
154
|
+
|
155
|
+
header = line[1, len]
|
156
|
+
sequence = ""
|
157
|
+
else
|
158
|
+
sequence << line
|
126
159
|
end
|
127
|
-
|
128
|
-
|
129
|
-
|
160
|
+
|
161
|
+
[header, sequence]
|
162
|
+
end
|
163
|
+
|
164
|
+
def parse_fastq_line line, header, seq, desc, qual, count, &b
|
165
|
+
line.chomp!
|
166
|
+
|
167
|
+
case count
|
168
|
+
when 0
|
169
|
+
header = line[1..-1]
|
170
|
+
when 1
|
171
|
+
seq = line
|
172
|
+
when 2
|
173
|
+
desc = line[1..-1]
|
174
|
+
when 3
|
175
|
+
count = -1
|
176
|
+
qual = line
|
177
|
+
|
178
|
+
yield Record.new(header: header,
|
179
|
+
seq: seq,
|
180
|
+
desc: desc,
|
181
|
+
qual: qual)
|
182
|
+
else
|
183
|
+
raise ParseFasta::Error::ParseFastaError,
|
184
|
+
"Something went wrong in parse_fastq_line"
|
130
185
|
end
|
131
|
-
|
132
|
-
|
186
|
+
|
187
|
+
count += 1
|
188
|
+
|
189
|
+
[header, seq, desc, qual, count]
|
133
190
|
end
|
134
|
-
end
|
135
191
|
|
136
|
-
|
192
|
+
def parse_fasta_lines file_reader, &b
|
193
|
+
header = ""
|
194
|
+
sequence = ""
|
137
195
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
196
|
+
line_reader = which_line_reader file_reader
|
197
|
+
file_reader.send(*line_reader) do |line|
|
198
|
+
header, sequence = parse_fasta_line line, header, sequence, &b
|
199
|
+
end
|
200
|
+
|
201
|
+
# yield the final seq
|
202
|
+
yield Record.new(header: header.strip, seq: sequence)
|
203
|
+
end
|
204
|
+
|
205
|
+
def parse_fastq_lines file_reader, &b
|
206
|
+
count = 0
|
207
|
+
header = ""
|
208
|
+
seq = ""
|
209
|
+
desc = ""
|
210
|
+
qual = ""
|
211
|
+
|
212
|
+
line_reader = which_line_reader file_reader
|
213
|
+
file_reader.send(*line_reader) do |line|
|
214
|
+
header, seq, desc, qual, count =
|
215
|
+
parse_fastq_line line, header, seq, desc, qual, count, &b
|
216
|
+
end
|
143
217
|
end
|
144
218
|
|
145
|
-
|
219
|
+
def gzipped? fname
|
220
|
+
begin
|
221
|
+
f = Zlib::GzipReader.open fname
|
222
|
+
return true
|
223
|
+
rescue Zlib::GzipFile::Error => e
|
224
|
+
return false
|
225
|
+
ensure
|
226
|
+
f.close if f
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
# The Zlib::GzipReader can't handle files where the line separator
|
231
|
+
# is \r. This could all be avoided by using IO.popen("gzip -cd
|
232
|
+
# #{fname}", "rt"), but will gzip always be available?
|
233
|
+
def which_line_reader file_reader
|
234
|
+
line_reader = [:each_line]
|
235
|
+
# a valid fasta file must have at least two lines, the header
|
236
|
+
# and the sequence
|
237
|
+
begin
|
238
|
+
enum = file_reader.each_line
|
239
|
+
# if this was ruby v2.3, then we could just call .size on enum
|
240
|
+
2.times do
|
241
|
+
enum.next
|
242
|
+
end
|
243
|
+
rescue StopIteration
|
244
|
+
# Zlib::GzipReader can handle \n and \r\n, but not \r, so if
|
245
|
+
# we get here, the file has \r only for line endings
|
246
|
+
line_reader = [:each, "\r"]
|
247
|
+
ensure
|
248
|
+
file_reader.rewind
|
249
|
+
end
|
250
|
+
|
251
|
+
line_reader
|
252
|
+
end
|
146
253
|
end
|
147
254
|
end
|
data/lib/parse_fasta/version.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2014
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
2
|
# Contact: moorer@udel.edu
|
3
3
|
#
|
4
4
|
# This file is part of parse_fasta.
|
@@ -17,5 +17,5 @@
|
|
17
17
|
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
18
|
|
19
19
|
module ParseFasta
|
20
|
-
VERSION = "1.9.2"
|
20
|
+
VERSION = "2.0.0"
|
21
21
|
end
|
data/lib/parse_fasta.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Copyright 2014
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
2
|
# Contact: moorer@udel.edu
|
3
3
|
#
|
4
4
|
# This file is part of parse_fasta.
|
@@ -16,25 +16,10 @@
|
|
16
16
|
# You should have received a copy of the GNU General Public License
|
17
17
|
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
18
|
|
19
|
-
require
|
20
|
-
require
|
21
|
-
require
|
22
|
-
require
|
23
|
-
require 'parse_fasta/sequence'
|
24
|
-
require 'parse_fasta/quality'
|
19
|
+
require "parse_fasta/version"
|
20
|
+
require "parse_fasta/record"
|
21
|
+
require "parse_fasta/seq_file"
|
22
|
+
require "parse_fasta/error"
|
25
23
|
|
26
24
|
module ParseFasta
|
27
|
-
|
28
|
-
class Error < StandardError
|
29
|
-
end
|
30
|
-
|
31
|
-
# Error raised when FASTA file is malformed
|
32
|
-
class DataFormatError < IOError
|
33
|
-
def message
|
34
|
-
"Data format error -- check input file"
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
class SequenceFormatError < Error
|
39
|
-
end
|
40
25
|
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
# Copyright 2014 - 2016 Ryan Moore
|
2
|
+
# Contact: moorer@udel.edu
|
3
|
+
#
|
4
|
+
# This file is part of parse_fasta.
|
5
|
+
#
|
6
|
+
# parse_fasta is free software: you can redistribute it and/or modify
|
7
|
+
# it under the terms of the GNU General Public License as published by
|
8
|
+
# the Free Software Foundation, either version 3 of the License, or
|
9
|
+
# (at your option) any later version.
|
10
|
+
#
|
11
|
+
# parse_fasta is distributed in the hope that it will be useful,
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
14
|
+
# GNU General Public License for more details.
|
15
|
+
#
|
16
|
+
# You should have received a copy of the GNU General Public License
|
17
|
+
# along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
|
18
|
+
|
19
|
+
require "spec_helper"
|
20
|
+
|
21
|
+
module ParseFasta
|
22
|
+
describe Record do
|
23
|
+
let(:header) { "apple pie is good"}
|
24
|
+
let(:seq) { "ACTG" }
|
25
|
+
let(:desc) { "apple" }
|
26
|
+
let(:qual) { "abcd" }
|
27
|
+
|
28
|
+
let(:fasta_rec) {
|
29
|
+
Record.new header: header,
|
30
|
+
seq: "A C\t\t T G\r"
|
31
|
+
}
|
32
|
+
let(:fastq_rec) {
|
33
|
+
Record.new header: header,
|
34
|
+
seq: "A C\t\t T G\r",
|
35
|
+
desc: desc,
|
36
|
+
qual: " a b \tcd "
|
37
|
+
}
|
38
|
+
|
39
|
+
describe "::new" do
|
40
|
+
context "fastA input" do
|
41
|
+
it "sets :header" do
|
42
|
+
expect(fasta_rec.header).to eq header
|
43
|
+
end
|
44
|
+
|
45
|
+
it "sets :seq" do
|
46
|
+
expect(fasta_rec.seq).to eq seq
|
47
|
+
end
|
48
|
+
|
49
|
+
it "sets :desc to nil" do
|
50
|
+
expect(fasta_rec.desc).to eq nil
|
51
|
+
end
|
52
|
+
|
53
|
+
it "sets :qual to nil" do
|
54
|
+
expect(fasta_rec.qual).to eq nil
|
55
|
+
end
|
56
|
+
|
57
|
+
context "when seq has a '>' in it" do
|
58
|
+
it "raises SequenceFormatError" do
|
59
|
+
str = "actg>sequence 3"
|
60
|
+
|
61
|
+
expect { Record.new header: header, seq: str }.
|
62
|
+
to raise_error ParseFasta::Error::SequenceFormatError
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
context "fastQ input" do
|
68
|
+
it "sets :header" do
|
69
|
+
expect(fastq_rec.header).to eq header
|
70
|
+
end
|
71
|
+
|
72
|
+
it "sets :seq" do
|
73
|
+
expect(fastq_rec.seq).to eq seq
|
74
|
+
end
|
75
|
+
|
76
|
+
it "sets :desc" do
|
77
|
+
expect(fastq_rec.desc).to eq desc
|
78
|
+
end
|
79
|
+
|
80
|
+
it "sets :qual" do
|
81
|
+
expect(fastq_rec.qual).to eq qual
|
82
|
+
end
|
83
|
+
|
84
|
+
context "when seq has a '>' in it" do
|
85
|
+
it "does NOT raise SequenceFormatError" do
|
86
|
+
str = "actg>sequence 3"
|
87
|
+
|
88
|
+
expect { Record.new header: header,
|
89
|
+
seq: str,
|
90
|
+
desc: desc,
|
91
|
+
qual: qual }.
|
92
|
+
not_to raise_error
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
describe "#==" do
|
99
|
+
it "returns true if each of the attr_accessors are ==" do
|
100
|
+
rec1 = Record.new header: "a", seq: "a", desc: "", qual: "A"
|
101
|
+
rec2 = Record.new header: "a", seq: "a", desc: "", qual: "A"
|
102
|
+
|
103
|
+
expect(rec1 == rec2).to eq true
|
104
|
+
end
|
105
|
+
|
106
|
+
it "returns false otherwise" do
|
107
|
+
rec1 = Record.new header: "a", seq: "a", desc: "", qual: "A"
|
108
|
+
rec2 = Record.new header: "a", seq: "a", desc: "", qual: "b"
|
109
|
+
|
110
|
+
expect(rec1 == rec2).to eq false
|
111
|
+
end
|
112
|
+
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|