parse_fasta 1.9.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +8 -8
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/README.md +42 -215
  6. data/Rakefile +2 -4
  7. data/bin/console +14 -0
  8. data/bin/setup +8 -0
  9. data/lib/parse_fasta/error.rb +39 -0
  10. data/lib/parse_fasta/record.rb +88 -0
  11. data/lib/parse_fasta/seq_file.rb +221 -114
  12. data/lib/parse_fasta/version.rb +2 -2
  13. data/lib/parse_fasta.rb +5 -20
  14. data/spec/parse_fasta/record_spec.rb +115 -0
  15. data/spec/parse_fasta/seq_file_spec.rb +238 -0
  16. data/spec/parse_fasta_spec.rb +25 -0
  17. data/spec/spec_helper.rb +2 -44
  18. data/spec/test_files/cr.fa +1 -0
  19. data/spec/test_files/cr.fa.gz +0 -0
  20. data/spec/test_files/cr.fq +3 -0
  21. data/spec/test_files/cr.fq.gz +0 -0
  22. data/spec/test_files/cr_nl.fa +4 -0
  23. data/spec/test_files/cr_nl.fa.gz +0 -0
  24. data/spec/test_files/cr_nl.fq +8 -0
  25. data/spec/test_files/cr_nl.fq.gz +0 -0
  26. data/spec/test_files/multi_blob.fa.gz +0 -0
  27. data/spec/test_files/multi_blob.fq.gz +0 -0
  28. data/spec/test_files/not_a_seq_file.txt +1 -0
  29. data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
  30. data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
  31. data/spec/test_files/seqs.fa.gz +0 -0
  32. data/spec/test_files/seqs.fq +8 -0
  33. data/spec/test_files/seqs.fq.gz +0 -0
  34. metadata +49 -24
  35. data/lib/parse_fasta/fasta_file.rb +0 -232
  36. data/lib/parse_fasta/fastq_file.rb +0 -160
  37. data/lib/parse_fasta/quality.rb +0 -54
  38. data/lib/parse_fasta/sequence.rb +0 -174
  39. data/spec/lib/fasta_file_spec.rb +0 -212
  40. data/spec/lib/fastq_file_spec.rb +0 -143
  41. data/spec/lib/quality_spec.rb +0 -51
  42. data/spec/lib/seq_file_spec.rb +0 -357
  43. data/spec/lib/sequence_spec.rb +0 -188
  44. data/test_files/benchmark.rb +0 -99
  45. data/test_files/bogus.txt +0 -2
  46. data/test_files/test.fa.gz +0 -0
  47. data/test_files/test.fq +0 -8
  48. data/test_files/test.fq.gz +0 -0
@@ -1,4 +1,4 @@
1
- # Copyright 2014, 2015 Ryan Moore
1
+ # Copyright 2014 - 2016 Ryan Moore
2
2
  # Contact: moorer@udel.edu
3
3
  #
4
4
  # This file is part of parse_fasta.
@@ -16,132 +16,239 @@
16
16
  # You should have received a copy of the GNU General Public License
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
- # Provides a class that will parse either fastA or fastQ files,
20
- # depending on what the user provides. Handles, gzipped files.
21
- class SeqFile < File
22
-
23
- # Returns the records in the sequence file as a hash map with the
24
- # headers as keys and the Sequences as values. For a fastq file,
25
- # acts the same as `FastaFile#to_hash`
26
- #
27
- # @example Read a fastA into a hash table.
28
- # seqs = SeqFile.open('reads.fa').to_hash
29
- #
30
- # @return [Hash] A hash with headers as keys, sequences as the
31
- # values (Sequence objects)
32
- #
33
- # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
34
- # and file is a fastA file
35
- def to_hash
36
- first_char = get_first_char(self)
37
-
38
- if first_char == '>'
39
- FastaFile.open(self).to_hash
40
- elsif first_char == '@'
41
- FastqFile.open(self).to_hash
42
- else
43
- raise ArgumentError, "Input does not look like FASTA or FASTQ"
19
+ require "zlib"
20
+
21
+ def get_first_char fname
22
+ if File.exists? fname
23
+ begin
24
+ f = Zlib::GzipReader.open fname
25
+ rescue Zlib::GzipFile::Error
26
+ f = File.open fname
27
+ ensure
28
+ # f.close
44
29
  end
30
+
31
+ first_char = f.each_char.peek[0]
32
+ f.close
33
+ return first_char
34
+ else
35
+ raise ParseFasta::Error::FileNotFoundError,
36
+ "No such file or directory -- #{fname}"
37
+ end
38
+ end
39
+
40
+ def check_file fname
41
+ first_char = get_first_char fname
42
+
43
+ if first_char == ">"
44
+ :fasta
45
+ elsif first_char == "@"
46
+ :fastq
47
+ else
48
+ raise ParseFasta::Error::DataFormatError,
49
+ "The file does not look like fastA or fastQ " +
50
+ "-- #{fname}"
45
51
  end
52
+ end
53
+
54
+ module ParseFasta
55
+ class SeqFile
56
+ # @!attribute type
57
+ # @return [Symbol] the type of the SeqFile (:fasta or :fastq)
58
+ attr_accessor :type
59
+
60
+ # @param fname [String] the name of the fastA or fastQ file to
61
+ # parse
62
+ #
63
+ # @raise [ParseFasta::Error::FileNotFoundError] if the file is not
64
+ # found
65
+ # @raise [ParseFasta::Error::DataFormatError] if the file doesn't
66
+ # start with a '>' or a '@'
67
+ def initialize fname
68
+ type = check_file fname
69
+
70
+ @fname = fname
71
+ @type = type
72
+ end
46
73
 
47
- # Analagous to IO#each_line, #each_record will go through a fastA or
48
- # fastQ file record by record.
49
- #
50
- # This #each_record is used in a similar fashion as
51
- # FastaFile#each_record except that it yields the header and the
52
- # sequence regardless of whether the input is a fastA file or a
53
- # fastQ file.
54
- #
55
- # If the input is a fastQ file, this method will yield the header
56
- # and the sequence and ignore the description and the quality
57
- # string. This SeqFile class should only be used if your program
58
- # needs to work on either fastA or fastQ files, thus it ignores the
59
- # quality string and description and treats either file type as if
60
- # it were a fastA file.
61
- #
62
- # If you need the description or quality, you should use
63
- # FastqFile#each_record instead.
64
- #
65
- # @example Parse a gzipped fastA file
66
- # SeqFile.open('reads.fa.gz').each_record do |head, seq|
67
- # puts [head, seq.length].join "\t"
68
- # end
69
- #
70
- # @example Parse an uncompressed fastQ file
71
- # SeqFile.open('reads.fq.gz').each_record do |head, seq|
72
- # puts [head, seq.length].join "\t"
73
- # end
74
- #
75
- # @yieldparam header [String] The header of the record without the
76
- # leading '>' or '@'
77
- #
78
- # @yieldparam sequence [Sequence] The sequence of the record.
79
- #
80
- # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
81
- # and file is a fastA file
82
- def each_record
83
- first_char = get_first_char(self)
84
-
85
- if first_char == '>'
86
- FastaFile.open(self).each_record do |header, sequence|
87
- yield(header, sequence)
74
+ # An alias for SeqFile.new
75
+ #
76
+ # @return [SeqFile] a SeqFile object
77
+ def self.open fname
78
+ self.new fname
79
+ end
80
+
81
+ # Analogous to IO#each_line, SeqFile#each_record is used to go
82
+ # through a fastA or fastQ file record by record. It will accept
83
+ # gzipped files as well.
84
+ #
85
+ # If the input is a fastA file, then the record that is yielded
86
+ # will have the desc and qual instance variables be nil. If it is
87
+ # a fastQ record then those instance variables will not be nil.
88
+ #
89
+ # @example Parsing a fastA file
90
+ # ParseFasta::SeqFile.open("seqs.fa").each_record do |rec|
91
+ # puts [rec.header, rec.seq].join "\t"
92
+ #
93
+ # rec.desc.nil? #=> true
94
+ # rec.qual.nil? #=> true
95
+ # end
96
+ # @example Parsing a gzipped fastQ file
97
+ # ParseFasta::SeqFile.open("seqs.fq.gz").each_record do |rec|
98
+ # puts [rec.header, rec.seq, rec.desc, rec.qual].join "\t"
99
+ # end
100
+ #
101
+ # @yieldparam record [ParseFasta::Record] A Record object with all
102
+ # the info of the record
103
+ #
104
+ # @raise [ParseFasta::Error::SequenceFormatError] if a fastA file
105
+ # contains a record with a '>' character in the sequence
106
+ def each_record &b
107
+ line_parser = "parse_#{@type}_lines"
108
+
109
+ if gzipped? @fname
110
+ each_record_gzipped line_parser, &b
111
+ else
112
+ each_record_non_gzipped line_parser, &b
88
113
  end
89
- elsif first_char == '@'
90
- FastqFile.open(self).each_record do |head, seq, desc, qual|
91
- yield(head, seq)
114
+ end
115
+
116
+
117
+ private
118
+
119
+ def each_record_non_gzipped line_parser, &b
120
+ File.open(@fname, "rt") do |f|
121
+ self.send line_parser, f, &b
92
122
  end
93
- else
94
- raise ArgumentError, "Input does not look like FASTA or FASTQ"
95
123
  end
96
- end
97
124
 
98
- # Fast version of #each_record
99
- #
100
- # @note If the sequence file has spaces in the sequence, they will
101
- # be retained. If this is a problem, use #each_record instead.
102
- #
103
- # @example Parse a gzipped fastA file
104
- # SeqFile.open('reads.fa.gz').each_record_fast do |head, seq|
105
- # puts [head, seq.length].join "\t"
106
- # end
107
- #
108
- # @example Parse an uncompressed fastQ file
109
- # SeqFile.open('reads.fq.gz').each_record_fast do |head, seq|
110
- # puts [head, seq.length].join "\t"
111
- # end
112
- #
113
- # @yieldparam header [String] The header of the record without the
114
- # leading '>' or '@'
115
- #
116
- # @yieldparam sequence [String] The sequence of the record.
117
- #
118
- # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
119
- # and file is a fastA file
120
- def each_record_fast
121
- first_char = get_first_char(self)
122
-
123
- if first_char == '>'
124
- FastaFile.open(self).each_record_fast do |header, sequence|
125
- yield(header, sequence)
125
+ def each_record_gzipped line_parser, &b
126
+ File.open(@fname, "rt") do |file|
127
+ loop do
128
+ begin
129
+ gz_reader = Zlib::GzipReader.new file
130
+
131
+ self.send line_parser, gz_reader, &b
132
+
133
+ # check if there are any more blobs to read
134
+ if (unused = gz_reader.unused)
135
+ # rewind to the start of the next unread blob
136
+ file.seek -unused.length, IO::SEEK_END
137
+ else
138
+ # there are no more blobs to read
139
+ break
140
+ end
141
+ end
142
+ end
143
+ end
144
+ end
145
+
146
+ def parse_fasta_line line, header, sequence, &b
147
+ line.chomp!
148
+ len = line.length
149
+
150
+ if header.empty? && line.start_with?(">")
151
+ header = line[1, len] # drop the '>'
152
+ elsif line.start_with? ">"
153
+ yield Record.new(header: header.strip, seq: sequence)
154
+
155
+ header = line[1, len]
156
+ sequence = ""
157
+ else
158
+ sequence << line
126
159
  end
127
- elsif first_char == '@'
128
- FastqFile.open(self).each_record_fast do |head, seq, desc, qual|
129
- yield(head, seq)
160
+
161
+ [header, sequence]
162
+ end
163
+
164
+ def parse_fastq_line line, header, seq, desc, qual, count, &b
165
+ line.chomp!
166
+
167
+ case count
168
+ when 0
169
+ header = line[1..-1]
170
+ when 1
171
+ seq = line
172
+ when 2
173
+ desc = line[1..-1]
174
+ when 3
175
+ count = -1
176
+ qual = line
177
+
178
+ yield Record.new(header: header,
179
+ seq: seq,
180
+ desc: desc,
181
+ qual: qual)
182
+ else
183
+ raise ParseFasta::Error::ParseFastaError,
184
+ "Something went wrong in parse_fastq_line"
130
185
  end
131
- else
132
- raise ArgumentError, "Input does not look like FASTA or FASTQ"
186
+
187
+ count += 1
188
+
189
+ [header, seq, desc, qual, count]
133
190
  end
134
- end
135
191
 
136
- private
192
+ def parse_fasta_lines file_reader, &b
193
+ header = ""
194
+ sequence = ""
137
195
 
138
- def get_first_char(f)
139
- begin
140
- handle = Zlib::GzipReader.open(f)
141
- rescue Zlib::GzipFile::Error => e
142
- handle = f
196
+ line_reader = which_line_reader file_reader
197
+ file_reader.send(*line_reader) do |line|
198
+ header, sequence = parse_fasta_line line, header, sequence, &b
199
+ end
200
+
201
+ # yield the final seq
202
+ yield Record.new(header: header.strip, seq: sequence)
203
+ end
204
+
205
+ def parse_fastq_lines file_reader, &b
206
+ count = 0
207
+ header = ""
208
+ seq = ""
209
+ desc = ""
210
+ qual = ""
211
+
212
+ line_reader = which_line_reader file_reader
213
+ file_reader.send(*line_reader) do |line|
214
+ header, seq, desc, qual, count =
215
+ parse_fastq_line line, header, seq, desc, qual, count, &b
216
+ end
143
217
  end
144
218
 
145
- handle.each_line.peek[0]
219
+ def gzipped? fname
220
+ begin
221
+ f = Zlib::GzipReader.open fname
222
+ return true
223
+ rescue Zlib::GzipFile::Error => e
224
+ return false
225
+ ensure
226
+ f.close if f
227
+ end
228
+ end
229
+
230
+ # The Zlib::GzipReader can't handle files where the line separator
231
+ # is \r. This could all be avoided by using IO.popen("gzip -cd
232
+ # #{fname}", "rt"), but will gzip always be available?
233
+ def which_line_reader file_reader
234
+ line_reader = [:each_line]
235
+ # a valid fasta file must have at least two lines, the header
236
+ # and the sequence
237
+ begin
238
+ enum = file_reader.each_line
239
+ # if this was ruby v2.3, then we could just call .size on enum
240
+ 2.times do
241
+ enum.next
242
+ end
243
+ rescue StopIteration
244
+ # Zlib::GzipReader can handle \n and \r\n, but not \r, so if
245
+ # we get here, the file has \r only for line endings
246
+ line_reader = [:each, "\r"]
247
+ ensure
248
+ file_reader.rewind
249
+ end
250
+
251
+ line_reader
252
+ end
146
253
  end
147
254
  end
@@ -1,4 +1,4 @@
1
- # Copyright 2014, 2015 Ryan Moore
1
+ # Copyright 2014 - 2016 Ryan Moore
2
2
  # Contact: moorer@udel.edu
3
3
  #
4
4
  # This file is part of parse_fasta.
@@ -17,5 +17,5 @@
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
19
  module ParseFasta
20
- VERSION = "1.9.2"
20
+ VERSION = "2.0.0"
21
21
  end
data/lib/parse_fasta.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2014, 2015 Ryan Moore
1
+ # Copyright 2014 - 2016 Ryan Moore
2
2
  # Contact: moorer@udel.edu
3
3
  #
4
4
  # This file is part of parse_fasta.
@@ -16,25 +16,10 @@
16
16
  # You should have received a copy of the GNU General Public License
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
- require 'parse_fasta/version'
20
- require 'parse_fasta/fasta_file'
21
- require 'parse_fasta/fastq_file'
22
- require 'parse_fasta/seq_file'
23
- require 'parse_fasta/sequence'
24
- require 'parse_fasta/quality'
19
+ require "parse_fasta/version"
20
+ require "parse_fasta/record"
21
+ require "parse_fasta/seq_file"
22
+ require "parse_fasta/error"
25
23
 
26
24
  module ParseFasta
27
-
28
- class Error < StandardError
29
- end
30
-
31
- # Error raised when FASTA file is malformed
32
- class DataFormatError < IOError
33
- def message
34
- "Data format error -- check input file"
35
- end
36
- end
37
-
38
- class SequenceFormatError < Error
39
- end
40
25
  end
@@ -0,0 +1,115 @@
1
+ # Copyright 2014 - 2016 Ryan Moore
2
+ # Contact: moorer@udel.edu
3
+ #
4
+ # This file is part of parse_fasta.
5
+ #
6
+ # parse_fasta is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # parse_fasta is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "spec_helper"
20
+
21
+ module ParseFasta
22
+ describe Record do
23
+ let(:header) { "apple pie is good"}
24
+ let(:seq) { "ACTG" }
25
+ let(:desc) { "apple" }
26
+ let(:qual) { "abcd" }
27
+
28
+ let(:fasta_rec) {
29
+ Record.new header: header,
30
+ seq: "A C\t\t T G\r"
31
+ }
32
+ let(:fastq_rec) {
33
+ Record.new header: header,
34
+ seq: "A C\t\t T G\r",
35
+ desc: desc,
36
+ qual: " a b \tcd "
37
+ }
38
+
39
+ describe "::new" do
40
+ context "fastA input" do
41
+ it "sets :header" do
42
+ expect(fasta_rec.header).to eq header
43
+ end
44
+
45
+ it "sets :seq" do
46
+ expect(fasta_rec.seq).to eq seq
47
+ end
48
+
49
+ it "sets :desc to nil" do
50
+ expect(fasta_rec.desc).to eq nil
51
+ end
52
+
53
+ it "sets :qual to nil" do
54
+ expect(fasta_rec.qual).to eq nil
55
+ end
56
+
57
+ context "when seq has a '>' in it" do
58
+ it "raises SequenceFormatError" do
59
+ str = "actg>sequence 3"
60
+
61
+ expect { Record.new header: header, seq: str }.
62
+ to raise_error ParseFasta::Error::SequenceFormatError
63
+ end
64
+ end
65
+ end
66
+
67
+ context "fastQ input" do
68
+ it "sets :header" do
69
+ expect(fastq_rec.header).to eq header
70
+ end
71
+
72
+ it "sets :seq" do
73
+ expect(fastq_rec.seq).to eq seq
74
+ end
75
+
76
+ it "sets :desc to nil" do
77
+ expect(fastq_rec.desc).to eq desc
78
+ end
79
+
80
+ it "sets :qual to nil" do
81
+ expect(fastq_rec.qual).to eq qual
82
+ end
83
+
84
+ context "when seq has a '>' in it" do
85
+ it "does NOT rais SequenceFormatError" do
86
+ str = "actg>sequence 3"
87
+
88
+ expect { Record.new header: header,
89
+ seq: str,
90
+ desc: desc,
91
+ qual: qual }.
92
+ not_to raise_error
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+ describe "#==" do
99
+ it "returns true if each of the attr_accessors are ==" do
100
+ rec1 = Record.new header: "a", seq: "a", desc: "", qual: "A"
101
+ rec2 = Record.new header: "a", seq: "a", desc: "", qual: "A"
102
+
103
+ expect(rec1 == rec2).to eq true
104
+ end
105
+
106
+ it "returns false otherwise" do
107
+ rec1 = Record.new header: "a", seq: "a", desc: "", qual: "A"
108
+ rec2 = Record.new header: "a", seq: "a", desc: "", qual: "b"
109
+
110
+ expect(rec1 == rec2).to eq false
111
+ end
112
+
113
+ end
114
+ end
115
+ end