parse_fasta 1.9.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +8 -8
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -0
  4. data/CHANGELOG.md +178 -0
  5. data/README.md +42 -215
  6. data/Rakefile +2 -4
  7. data/bin/console +14 -0
  8. data/bin/setup +8 -0
  9. data/lib/parse_fasta/error.rb +39 -0
  10. data/lib/parse_fasta/record.rb +88 -0
  11. data/lib/parse_fasta/seq_file.rb +221 -114
  12. data/lib/parse_fasta/version.rb +2 -2
  13. data/lib/parse_fasta.rb +5 -20
  14. data/spec/parse_fasta/record_spec.rb +115 -0
  15. data/spec/parse_fasta/seq_file_spec.rb +238 -0
  16. data/spec/parse_fasta_spec.rb +25 -0
  17. data/spec/spec_helper.rb +2 -44
  18. data/spec/test_files/cr.fa +1 -0
  19. data/spec/test_files/cr.fa.gz +0 -0
  20. data/spec/test_files/cr.fq +3 -0
  21. data/spec/test_files/cr.fq.gz +0 -0
  22. data/spec/test_files/cr_nl.fa +4 -0
  23. data/spec/test_files/cr_nl.fa.gz +0 -0
  24. data/spec/test_files/cr_nl.fq +8 -0
  25. data/spec/test_files/cr_nl.fq.gz +0 -0
  26. data/spec/test_files/multi_blob.fa.gz +0 -0
  27. data/spec/test_files/multi_blob.fq.gz +0 -0
  28. data/spec/test_files/not_a_seq_file.txt +1 -0
  29. data/{test_files/bad.fa → spec/test_files/poorly_catted.fa} +0 -0
  30. data/{test_files/test.fa → spec/test_files/seqs.fa} +0 -0
  31. data/spec/test_files/seqs.fa.gz +0 -0
  32. data/spec/test_files/seqs.fq +8 -0
  33. data/spec/test_files/seqs.fq.gz +0 -0
  34. metadata +49 -24
  35. data/lib/parse_fasta/fasta_file.rb +0 -232
  36. data/lib/parse_fasta/fastq_file.rb +0 -160
  37. data/lib/parse_fasta/quality.rb +0 -54
  38. data/lib/parse_fasta/sequence.rb +0 -174
  39. data/spec/lib/fasta_file_spec.rb +0 -212
  40. data/spec/lib/fastq_file_spec.rb +0 -143
  41. data/spec/lib/quality_spec.rb +0 -51
  42. data/spec/lib/seq_file_spec.rb +0 -357
  43. data/spec/lib/sequence_spec.rb +0 -188
  44. data/test_files/benchmark.rb +0 -99
  45. data/test_files/bogus.txt +0 -2
  46. data/test_files/test.fa.gz +0 -0
  47. data/test_files/test.fq +0 -8
  48. data/test_files/test.fq.gz +0 -0
@@ -1,4 +1,4 @@
1
- # Copyright 2014, 2015 Ryan Moore
1
+ # Copyright 2014 - 2016 Ryan Moore
2
2
  # Contact: moorer@udel.edu
3
3
  #
4
4
  # This file is part of parse_fasta.
@@ -16,132 +16,239 @@
16
16
  # You should have received a copy of the GNU General Public License
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
- # Provides a class that will parse either fastA or fastQ files,
20
- # depending on what the user provides. Handles gzipped files.
21
- class SeqFile < File
22
-
23
- # Returns the records in the sequence file as a hash map with the
24
- # headers as keys and the Sequences as values. For a fastq file,
25
- # acts the same as `FastaFile#to_hash`
26
- #
27
- # @example Read a fastA into a hash table.
28
- # seqs = SeqFile.open('reads.fa').to_hash
29
- #
30
- # @return [Hash] A hash with headers as keys, sequences as the
31
- # values (Sequence objects)
32
- #
33
- # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
34
- # and file is a fastA file
35
- def to_hash
36
- first_char = get_first_char(self)
37
-
38
- if first_char == '>'
39
- FastaFile.open(self).to_hash
40
- elsif first_char == '@'
41
- FastqFile.open(self).to_hash
42
- else
43
- raise ArgumentError, "Input does not look like FASTA or FASTQ"
19
+ require "zlib"
20
+
21
+ def get_first_char fname
22
+ if File.exists? fname
23
+ begin
24
+ f = Zlib::GzipReader.open fname
25
+ rescue Zlib::GzipFile::Error
26
+ f = File.open fname
27
+ ensure
28
+ # f.close
44
29
  end
30
+
31
+ first_char = f.each_char.peek[0]
32
+ f.close
33
+ return first_char
34
+ else
35
+ raise ParseFasta::Error::FileNotFoundError,
36
+ "No such file or directory -- #{fname}"
37
+ end
38
+ end
39
+
40
+ def check_file fname
41
+ first_char = get_first_char fname
42
+
43
+ if first_char == ">"
44
+ :fasta
45
+ elsif first_char == "@"
46
+ :fastq
47
+ else
48
+ raise ParseFasta::Error::DataFormatError,
49
+ "The file does not look like fastA or fastQ " +
50
+ "-- #{fname}"
45
51
  end
52
+ end
53
+
54
+ module ParseFasta
55
+ class SeqFile
56
+ # @!attribute type
57
+ # @return [Symbol] the type of the SeqFile (:fasta or :fastq)
58
+ attr_accessor :type
59
+
60
+ # @param fname [String] the name of the fastA or fastQ file to
61
+ # parse
62
+ #
63
+ # @raise [ParseFasta::Error::FileNotFoundError] if the file is not
64
+ # found
65
+ # @raise [ParseFasta::Error::DataFormatError] if the file doesn't
66
+ # start with a '>' or a '@'
67
+ def initialize fname
68
+ type = check_file fname
69
+
70
+ @fname = fname
71
+ @type = type
72
+ end
46
73
 
47
- # Analogous to IO#each_line, #each_record will go through a fastA or
48
- # fastQ file record by record.
49
- #
50
- # This #each_record is used in a similar fashion as
51
- # FastaFile#each_record except that it yields the header and the
52
- # sequence regardless of whether the input is a fastA file or a
53
- # fastQ file.
54
- #
55
- # If the input is a fastQ file, this method will yield the header
56
- # and the sequence and ignore the description and the quality
57
- # string. This SeqFile class should only be used if your program
58
- # needs to work on either fastA or fastQ files, thus it ignores the
59
- # quality string and description and treats either file type as if
60
- # it were a fastA file.
61
- #
62
- # If you need the description or quality, you should use
63
- # FastqFile#each_record instead.
64
- #
65
- # @example Parse a gzipped fastA file
66
- # SeqFile.open('reads.fa.gz').each_record do |head, seq|
67
- # puts [head, seq.length].join "\t"
68
- # end
69
- #
70
- # @example Parse an uncompressed fastQ file
71
- # SeqFile.open('reads.fq.gz').each_record do |head, seq|
72
- # puts [head, seq.length].join "\t"
73
- # end
74
- #
75
- # @yieldparam header [String] The header of the record without the
76
- # leading '>' or '@'
77
- #
78
- # @yieldparam sequence [Sequence] The sequence of the record.
79
- #
80
- # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
81
- # and file is a fastA file
82
- def each_record
83
- first_char = get_first_char(self)
84
-
85
- if first_char == '>'
86
- FastaFile.open(self).each_record do |header, sequence|
87
- yield(header, sequence)
74
+ # An alias for SeqFile.new
75
+ #
76
+ # @return [SeqFile] a SeqFile object
77
+ def self.open fname
78
+ self.new fname
79
+ end
80
+
81
+ # Analogous to IO#each_line, SeqFile#each_record is used to go
82
+ # through a fastA or fastQ file record by record. It will accept
83
+ # gzipped files as well.
84
+ #
85
+ # If the input is a fastA file, then the record that is yielded
86
+ # will have the desc and qual instance variables be nil. If it is
87
+ # a fastQ record then those instance variables will not be nil.
88
+ #
89
+ # @example Parsing a fastA file
90
+ # ParseFasta::SeqFile.open("seqs.fa").each_record do |rec|
91
+ # puts [rec.header, rec.seq].join "\t"
92
+ #
93
+ # rec.desc.nil? #=> true
94
+ # rec.qual.nil? #=> true
95
+ # end
96
+ # @example Parsing a gzipped fastQ file
97
+ # ParseFasta::SeqFile.open("seqs.fq.gz").each_record do |rec|
98
+ # puts [rec.header, rec.seq, rec.desc, rec.qual].join "\t"
99
+ # end
100
+ #
101
+ # @yieldparam record [ParseFasta::Record] A Record object with all
102
+ # the info of the record
103
+ #
104
+ # @raise [ParseFasta::Error::SequenceFormatError] if a fastA file
105
+ # contains a record with a '>' character in the header
106
+ def each_record &b
107
+ line_parser = "parse_#{@type}_lines"
108
+
109
+ if gzipped? @fname
110
+ each_record_gzipped line_parser, &b
111
+ else
112
+ each_record_non_gzipped line_parser, &b
88
113
  end
89
- elsif first_char == '@'
90
- FastqFile.open(self).each_record do |head, seq, desc, qual|
91
- yield(head, seq)
114
+ end
115
+
116
+
117
+ private
118
+
119
+ def each_record_non_gzipped line_parser, &b
120
+ File.open(@fname, "rt") do |f|
121
+ self.send line_parser, f, &b
92
122
  end
93
- else
94
- raise ArgumentError, "Input does not look like FASTA or FASTQ"
95
123
  end
96
- end
97
124
 
98
- # Fast version of #each_record
99
- #
100
- # @note If the sequence file has spaces in the sequence, they will
101
- # be retained. If this is a problem, use #each_record instead.
102
- #
103
- # @example Parse a gzipped fastA file
104
- # SeqFile.open('reads.fa.gz').each_record_fast do |head, seq|
105
- # puts [head, seq.length].join "\t"
106
- # end
107
- #
108
- # @example Parse an uncompressed fastQ file
109
- # SeqFile.open('reads.fq.gz').each_record_fast do |head, seq|
110
- # puts [head, seq.length].join "\t"
111
- # end
112
- #
113
- # @yieldparam header [String] The header of the record without the
114
- # leading '>' or '@'
115
- #
116
- # @yieldparam sequence [String] The sequence of the record.
117
- #
118
- # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
119
- # and file is a fastA file
120
- def each_record_fast
121
- first_char = get_first_char(self)
122
-
123
- if first_char == '>'
124
- FastaFile.open(self).each_record_fast do |header, sequence|
125
- yield(header, sequence)
125
+ def each_record_gzipped line_parser, &b
126
+ File.open(@fname, "rt") do |file|
127
+ loop do
128
+ begin
129
+ gz_reader = Zlib::GzipReader.new file
130
+
131
+ self.send line_parser, gz_reader, &b
132
+
133
+ # check if there are any more blobs to read
134
+ if (unused = gz_reader.unused)
135
+ # rewind to the start of the last blob
136
+ file.seek -unused.length, IO::SEEK_END
137
+ else
138
+ # there are no more blobs to read
139
+ break
140
+ end
141
+ end
142
+ end
143
+ end
144
+ end
145
+
146
+ def parse_fasta_line line, header, sequence, &b
147
+ line.chomp!
148
+ len = line.length
149
+
150
+ if header.empty? && line.start_with?(">")
151
+ header = line[1, len] # drop the '>'
152
+ elsif line.start_with? ">"
153
+ yield Record.new(header: header.strip, seq: sequence)
154
+
155
+ header = line[1, len]
156
+ sequence = ""
157
+ else
158
+ sequence << line
126
159
  end
127
- elsif first_char == '@'
128
- FastqFile.open(self).each_record_fast do |head, seq, desc, qual|
129
- yield(head, seq)
160
+
161
+ [header, sequence]
162
+ end
163
+
164
+ def parse_fastq_line line, header, seq, desc, qual, count, &b
165
+ line.chomp!
166
+
167
+ case count
168
+ when 0
169
+ header = line[1..-1]
170
+ when 1
171
+ seq = line
172
+ when 2
173
+ desc = line[1..-1]
174
+ when 3
175
+ count = -1
176
+ qual = line
177
+
178
+ yield Record.new(header: header,
179
+ seq: seq,
180
+ desc: desc,
181
+ qual: qual)
182
+ else
183
+ raise ParseFasta::Error::ParseFastaError,
184
+ "Something went wrong in parse_fastq_line"
130
185
  end
131
- else
132
- raise ArgumentError, "Input does not look like FASTA or FASTQ"
186
+
187
+ count += 1
188
+
189
+ [header, seq, desc, qual, count]
133
190
  end
134
- end
135
191
 
136
- private
192
+ def parse_fasta_lines file_reader, &b
193
+ header = ""
194
+ sequence = ""
137
195
 
138
- def get_first_char(f)
139
- begin
140
- handle = Zlib::GzipReader.open(f)
141
- rescue Zlib::GzipFile::Error => e
142
- handle = f
196
+ line_reader = which_line_reader file_reader
197
+ file_reader.send(*line_reader) do |line|
198
+ header, sequence = parse_fasta_line line, header, sequence, &b
199
+ end
200
+
201
+ # yield the final seq
202
+ yield Record.new(header: header.strip, seq: sequence)
203
+ end
204
+
205
+ def parse_fastq_lines file_reader, &b
206
+ count = 0
207
+ header = ""
208
+ seq = ""
209
+ desc = ""
210
+ qual = ""
211
+
212
+ line_reader = which_line_reader file_reader
213
+ file_reader.send(*line_reader) do |line|
214
+ header, seq, desc, qual, count =
215
+ parse_fastq_line line, header, seq, desc, qual, count, &b
216
+ end
143
217
  end
144
218
 
145
- handle.each_line.peek[0]
219
+ def gzipped? fname
220
+ begin
221
+ f = Zlib::GzipReader.open fname
222
+ return true
223
+ rescue Zlib::GzipFile::Error => e
224
+ return false
225
+ ensure
226
+ f.close if f
227
+ end
228
+ end
229
+
230
+ # The Zlib::GzipReader can't handle files where the line separator
231
+ # is \r. This could all be avoided by using IO.popen("gzip -cd
232
+ # #{fname}", "rt"), but will gzip always be available?
233
+ def which_line_reader file_reader
234
+ line_reader = [:each_line]
235
+ # a valid fasta file must have at least two lines, the header
236
+ # and the sequence
237
+ begin
238
+ enum = file_reader.each_line
239
+ # if this was ruby v2.3, then we could just call .size on enum
240
+ 2.times do
241
+ enum.next
242
+ end
243
+ rescue StopIteration
244
+ # Zlib::GzipReader can handle \n and \r\n, but not \r, so if
245
+ # we get here, the file has \r only for line endings
246
+ line_reader = [:each, "\r"]
247
+ ensure
248
+ file_reader.rewind
249
+ end
250
+
251
+ line_reader
252
+ end
146
253
  end
147
254
  end
@@ -1,4 +1,4 @@
1
- # Copyright 2014, 2015 Ryan Moore
1
+ # Copyright 2014 - 2016 Ryan Moore
2
2
  # Contact: moorer@udel.edu
3
3
  #
4
4
  # This file is part of parse_fasta.
@@ -17,5 +17,5 @@
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
19
  module ParseFasta
20
- VERSION = "1.9.2"
20
+ VERSION = "2.0.0"
21
21
  end
data/lib/parse_fasta.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2014, 2015 Ryan Moore
1
+ # Copyright 2014 - 2016 Ryan Moore
2
2
  # Contact: moorer@udel.edu
3
3
  #
4
4
  # This file is part of parse_fasta.
@@ -16,25 +16,10 @@
16
16
  # You should have received a copy of the GNU General Public License
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
- require 'parse_fasta/version'
20
- require 'parse_fasta/fasta_file'
21
- require 'parse_fasta/fastq_file'
22
- require 'parse_fasta/seq_file'
23
- require 'parse_fasta/sequence'
24
- require 'parse_fasta/quality'
19
+ require "parse_fasta/version"
20
+ require "parse_fasta/record"
21
+ require "parse_fasta/seq_file"
22
+ require "parse_fasta/error"
25
23
 
26
24
  module ParseFasta
27
-
28
- class Error < StandardError
29
- end
30
-
31
- # Error raised when FASTA file is malformed
32
- class DataFormatError < IOError
33
- def message
34
- "Data format error -- check input file"
35
- end
36
- end
37
-
38
- class SequenceFormatError < Error
39
- end
40
25
  end
@@ -0,0 +1,115 @@
1
+ # Copyright 2014 - 2016 Ryan Moore
2
+ # Contact: moorer@udel.edu
3
+ #
4
+ # This file is part of parse_fasta.
5
+ #
6
+ # parse_fasta is free software: you can redistribute it and/or modify
7
+ # it under the terms of the GNU General Public License as published by
8
+ # the Free Software Foundation, either version 3 of the License, or
9
+ # (at your option) any later version.
10
+ #
11
+ # parse_fasta is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
+
19
+ require "spec_helper"
20
+
21
+ module ParseFasta
22
+ describe Record do
23
+ let(:header) { "apple pie is good"}
24
+ let(:seq) { "ACTG" }
25
+ let(:desc) { "apple" }
26
+ let(:qual) { "abcd" }
27
+
28
+ let(:fasta_rec) {
29
+ Record.new header: header,
30
+ seq: "A C\t\t T G\r"
31
+ }
32
+ let(:fastq_rec) {
33
+ Record.new header: header,
34
+ seq: "A C\t\t T G\r",
35
+ desc: desc,
36
+ qual: " a b \tcd "
37
+ }
38
+
39
+ describe "::new" do
40
+ context "fastA input" do
41
+ it "sets :header" do
42
+ expect(fasta_rec.header).to eq header
43
+ end
44
+
45
+ it "sets :seq" do
46
+ expect(fasta_rec.seq).to eq seq
47
+ end
48
+
49
+ it "sets :desc to nil" do
50
+ expect(fasta_rec.desc).to eq nil
51
+ end
52
+
53
+ it "sets :qual to nil" do
54
+ expect(fasta_rec.qual).to eq nil
55
+ end
56
+
57
+ context "when seq has a '>' in it" do
58
+ it "raises SequenceFormatError" do
59
+ str = "actg>sequence 3"
60
+
61
+ expect { Record.new header: header, seq: str }.
62
+ to raise_error ParseFasta::Error::SequenceFormatError
63
+ end
64
+ end
65
+ end
66
+
67
+ context "fastQ input" do
68
+ it "sets :header" do
69
+ expect(fastq_rec.header).to eq header
70
+ end
71
+
72
+ it "sets :seq" do
73
+ expect(fastq_rec.seq).to eq seq
74
+ end
75
+
76
+ it "sets :desc to nil" do
77
+ expect(fastq_rec.desc).to eq desc
78
+ end
79
+
80
+ it "sets :qual to nil" do
81
+ expect(fastq_rec.qual).to eq qual
82
+ end
83
+
84
+ context "when seq has a '>' in it" do
85
+ it "does NOT rais SequenceFormatError" do
86
+ str = "actg>sequence 3"
87
+
88
+ expect { Record.new header: header,
89
+ seq: str,
90
+ desc: desc,
91
+ qual: qual }.
92
+ not_to raise_error
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+ describe "#==" do
99
+ it "returns true if each of the attr_accessors are ==" do
100
+ rec1 = Record.new header: "a", seq: "a", desc: "", qual: "A"
101
+ rec2 = Record.new header: "a", seq: "a", desc: "", qual: "A"
102
+
103
+ expect(rec1 == rec2).to eq true
104
+ end
105
+
106
+ it "returns false otherwise" do
107
+ rec1 = Record.new header: "a", seq: "a", desc: "", qual: "A"
108
+ rec2 = Record.new header: "a", seq: "a", desc: "", qual: "b"
109
+
110
+ expect(rec1 == rec2).to eq false
111
+ end
112
+
113
+ end
114
+ end
115
+ end