parse_fasta 2.2.0 → 2.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e2fe1d91cfec3272d6f28872c5f048edd518b290
4
- data.tar.gz: ef220e3c20cba089556a727a1cbe739f9a869037
3
+ metadata.gz: 5dc5b2c4063257a491082c62b9ef6faaecd694fa
4
+ data.tar.gz: 0f45d1a46360f65b83cada74be2b59e84d6c3179
5
5
  SHA512:
6
- metadata.gz: 909f37e235e112841ec124768e30ccbccd92e50065446ca132475b7c148dd63f5a4d6ce08be7a3194c8bd15cd12cd8e243008739ae26c0c6b9d09bb37b0e1cac
7
- data.tar.gz: e6a9080b6f836a1b14987b3b9c8185c3a05997583d82d209f7b9f8c5f83b98fad4a5ce02c66bc9880e1a872feeb4fee1cbac34f98714ec37cb8022832009325c
6
+ metadata.gz: 26898676ec187d85ded69405b1b193ece436e49693f4cdcc32dd72e313b3143b3d1bbf14d749bad0e5ae7ff45c39b8fb31056bf2fc3aab8e9eb10dee54fbc326
7
+ data.tar.gz: 0fd4292d811cec77a2f0cb20c8f89fdb06a74ec3f6b2488ae0505a42abfcf4722f81b8f446fd4f940b918b1798f9dff44974214b1b3abd4469df83902267d8aa
data/.gitignore CHANGED
@@ -21,4 +21,6 @@ tmp
21
21
  *.a
22
22
  mkmf.log
23
23
  .ruby-*
24
- .idea
24
+ .idea
25
+ time.html
26
+ big_test_files
@@ -1,5 +1,9 @@
1
1
  ## Versions ##
2
2
 
3
+ ### 2.3.0 ###
4
+
5
+ Allow parsing of fastA files with `>` characters in the sequence with the `check_fasta_seq: false` option.
6
+
3
7
  ### 2.2.0 ###
4
8
 
5
9
  Add `id` attribute to `Record`.
data/README.md CHANGED
@@ -24,6 +24,8 @@ Or install it yourself as:
24
24
 
25
25
  Provides nice, programmatic access to fasta and fastq files. It's faster and more lightweight than BioRuby. And more fun!
26
26
 
27
+ It handles a lot of wacky edge cases, like parsing multi-blob gzipped files, and it is strict about formatting by default.
28
+
27
29
  ## Documentation ##
28
30
 
29
31
  Checkout
@@ -93,3 +95,11 @@ ParseFasta::SeqFile.open(ARGV[0]).each_record do |rec|
93
95
  puts rec
94
96
  end
95
97
  ```
98
+
99
+ Sometimes your fasta file might have record separators (`>`) within the "sequence". For example, CD-HIT's `.clstr` files have headers within what would be the sequence part of the record. `ParseFasta` is really strict about formatting and will raise an error when trying to read these types of files. If you would like to parse them, use the `check_fasta_seq: false` flag like so:
100
+
101
+ ```ruby
102
+ ParseFasta::SeqFile.open(ARGV[0], check_fasta_seq: false).each_record do |rec|
103
+ puts rec
104
+ end
105
+ ```
@@ -39,6 +39,8 @@ module ParseFasta
39
39
  #
40
40
  # @example Init a new Record object for a fastA record
41
41
  # Record.new header: "apple", seq: "actg"
42
+ # @example Init a new Record object for a fastA record without checking for '>' in the sequence.
43
+ # Record.new header: "apple", seq: "pie>good", check_fasta_seq: false
42
44
  # @example Init a new Record object for a fastQ record
43
45
  # Record.new header: "apple", seq: "actd", desc: "", qual: "IIII"
44
46
  #
@@ -46,9 +48,16 @@ module ParseFasta
46
48
  # @param seq [String] the sequence of the record
47
49
  # @param desc [String] the description line of a fastQ record
48
50
  # @param qual [String] the quality string of a fastQ record
51
+ # @param check_fasta_seq [Boolean] Pass false if you don't want to
52
+ # check for '>' characters in the sequence. Defaults to true,
53
+ # which checks for '>' in the sequence and raises an error.
49
54
  #
50
- # @raise [ParseFasta::Error::SequenceFormatError] if a fastA sequence has a '>'
51
- # character in it
55
+ # @raise [ParseFasta::Error::SequenceFormatError] if a fastA
56
+ # sequence has a '>' character in it, and :check_fasta_seq is
57
+ # NOT set to false.
58
+ #
59
+ # @todo This is destructive with respect to the input seq
60
+ # arg. Does it need to be?
52
61
  def initialize args = {}
53
62
  @header = args.fetch :header
54
63
  @id = @header.split(" ")[0]
@@ -61,9 +70,11 @@ module ParseFasta
61
70
  seq = args.fetch(:seq)
62
71
  seq.tr!(" \t\n\r", "")
63
72
 
64
- if fastq? # is fastQ
73
+ do_check_fasta_seq = args.fetch :check_fasta_seq, true
74
+
75
+ if fastq? || (!fastq? && !do_check_fasta_seq)
65
76
  @seq = seq
66
- else # is fastA
77
+ else
67
78
  @seq = check_fasta_seq(seq)
68
79
  end
69
80
  end
@@ -26,14 +26,18 @@ module ParseFasta
26
26
 
27
27
  # @param fname [String] the name of the fastA or fastQ file to
28
28
  # parse
29
+ # @param type [Symbol] whether the file is :fasta or :fastq
30
+ # @param check_fasta_seq [Boolean] keyword arg for whether to check
31
+ # for '>' in the sequence of fastA files.
29
32
  #
30
33
  # @raise [ParseFasta::Error::FileNotFoundError] if the file is not
31
34
  # found
32
35
  # @raise [ParseFasta::Error::DataFormatError] if the file doesn't
33
36
  # start with a '>' or a '@'
34
- def initialize fname
37
+ def initialize fname, args = {}
35
38
  type = check_file fname
36
39
 
40
+ @check_fasta_seq = args.fetch :check_fasta_seq, true
37
41
  @fname = fname
38
42
  @type = type
39
43
  end
@@ -41,8 +45,8 @@ module ParseFasta
41
45
  # An alias for SeqFile.new
42
46
  #
43
47
  # @return [SeqFile] a SeqFile object
44
- def self.open fname
45
- self.new fname
48
+ def self.open fname, args = {}
49
+ self.new fname, args
46
50
  end
47
51
 
48
52
  # Analogous to IO#each_line, SeqFile#each_record is used to go
@@ -69,7 +73,8 @@ module ParseFasta
69
73
  # the info of the record
70
74
  #
71
75
  # @raise [ParseFasta::Error::SequenceFormatError] if a fastA file
72
- # contains a record with a '>' character in the header
76
+ # contains a record with a '>' character in the sequence, and the
77
+ # SeqFile object was not initialized with check_fasta_seq: false
73
78
  def each_record &b
74
79
  line_parser = "parse_#{@type}_lines"
75
80
 
@@ -117,7 +122,9 @@ module ParseFasta
117
122
  if header.empty? && line.start_with?(">")
118
123
  header = line[1, len] # drop the '>'
119
124
  elsif line.start_with? ">"
120
- yield Record.new(header: header.strip, seq: sequence)
125
+ yield Record.new(header: header.strip,
126
+ seq: sequence,
127
+ check_fasta_seq: @check_fasta_seq)
121
128
 
122
129
  header = line[1, len]
123
130
  sequence = ""
@@ -166,7 +173,9 @@ module ParseFasta
166
173
  end
167
174
 
168
175
  # yield the final seq
169
- yield Record.new(header: header.strip, seq: sequence)
176
+ yield Record.new(header: header.strip,
177
+ seq: sequence,
178
+ check_fasta_seq: @check_fasta_seq)
170
179
  end
171
180
 
172
181
  def parse_fastq_lines file_reader, &b
@@ -17,5 +17,5 @@
17
17
  # along with parse_fasta. If not, see <http://www.gnu.org/licenses/>.
18
18
 
19
19
  module ParseFasta
20
- VERSION = "2.2.0"
20
+ VERSION = "2.3.0"
21
21
  end
@@ -30,4 +30,5 @@ Gem::Specification.new do |spec|
30
30
  spec.add_development_dependency "yard", "~> 0.8"
31
31
  spec.add_development_dependency "rdiscount"
32
32
  spec.add_development_dependency "coveralls", "~> 0.7"
33
+ spec.add_development_dependency "benchmark-ips", "~> 2.7", ">= 2.7.2"
33
34
  end
@@ -72,11 +72,34 @@ module ParseFasta
72
72
  end
73
73
 
74
74
  context "when seq has a '>' in it" do
75
- it "raises SequenceFormatError" do
76
- str = "actg>sequence 3"
75
+ context "with default strictness" do
76
+ it "raises SequenceFormatError" do
77
+ str = "actg>sequence 3"
77
78
 
78
- expect { Record.new header: header, seq: str }.
79
+ expect { Record.new header: header, seq: str }.
79
80
  to raise_error ParseFasta::Error::SequenceFormatError
81
+ end
82
+ end
83
+
84
+ context "with lenient checking" do
85
+ it "does NOT raise error" do
86
+ str = "actg>sequence 3"
87
+
88
+ expect { Record.new(header: header,
89
+ seq: str,
90
+ check_fasta_seq: false) }.
91
+ not_to raise_error
92
+ end
93
+
94
+ it "gives the sequence as is" do
95
+ str = "actg>sequence 3"
96
+
97
+ rec = Record.new(header: header,
98
+ seq: str.dup,
99
+ check_fasta_seq: false)
100
+
101
+ expect(rec.seq).to eq str.tr(" ", "")
102
+ end
80
103
  end
81
104
  end
82
105
  end
@@ -36,6 +36,9 @@ module ParseFasta
36
36
  let(:fastq_gz) {
37
37
  File.join test_dir, "seqs.fq.gz"
38
38
  }
39
+ let(:with_rec_sep_in_seq) {
40
+ File.join test_dir, "with_rec_sep_in_seq.fa"
41
+ }
39
42
 
40
43
  let(:fasta_records) {
41
44
  [Record.new(header: "empty seq at beginning",
@@ -65,6 +68,14 @@ module ParseFasta
65
68
  desc: "seq2 +pples",
66
69
  qual: "*ujM")]
67
70
  }
71
+ let(:with_rec_sep_in_seq_records) {
72
+ [Record.new(header: "seq1",
73
+ seq: "AAAA>TTTT",
74
+ check_fasta_seq: false),
75
+ Record.new(header: "seq2",
76
+ seq: "TTTT>AAAA",
77
+ check_fasta_seq: false)]
78
+ }
68
79
 
69
80
  # to test the line endings
70
81
  let(:line_endings_fastq_records) {
@@ -150,6 +161,25 @@ module ParseFasta
150
161
 
151
162
  include_examples "it yields the records"
152
163
  end
164
+
165
+ context "when the fasta file has '>' in a seq" do
166
+ context "when the check_fasta_seq flag is false" do
167
+ it "yields records even with '>' in the sequence" do
168
+ expect { |b|
169
+ SeqFile.open(with_rec_sep_in_seq,
170
+ check_fasta_seq: false).each_record &b
171
+ }.to yield_successive_args(*with_rec_sep_in_seq_records)
172
+ end
173
+ end
174
+
175
+ context "when the check_fasta_seq flag is default" do
176
+ it "raises SequenceFormatError" do
177
+ expect { |b|
178
+ SeqFile.open(with_rec_sep_in_seq).each_record &b
179
+ }.to raise_error ParseFasta::Error::SequenceFormatError
180
+ end
181
+ end
182
+ end
153
183
  end
154
184
 
155
185
  context "input is fastQ" do
@@ -235,4 +265,4 @@ module ParseFasta
235
265
  end
236
266
  end
237
267
  end
238
- end
268
+ end
@@ -0,0 +1,4 @@
1
+ >seq1
2
+ AAAA>TTTT
3
+ >seq2
4
+ TTTT>AAAA
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parse_fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.0
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-08 00:00:00.000000000 Z
11
+ date: 2017-10-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -108,6 +108,26 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0.7'
111
+ - !ruby/object:Gem::Dependency
112
+ name: benchmark-ips
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '2.7'
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ version: 2.7.2
121
+ type: :development
122
+ prerelease: false
123
+ version_requirements: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - "~>"
126
+ - !ruby/object:Gem::Version
127
+ version: '2.7'
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: 2.7.2
111
131
  description: Provides nice, programmatic access to fasta and fastq files, as well
112
132
  as providing Sequence and Quality helper classes. No need for BioRuby ;)
113
133
  email:
@@ -157,6 +177,7 @@ files:
157
177
  - spec/test_files/seqs.fq
158
178
  - spec/test_files/seqs.fq.gz
159
179
  - spec/test_files/test.rb
180
+ - spec/test_files/with_rec_sep_in_seq.fa
160
181
  homepage: https://github.com/mooreryan/parse_fasta
161
182
  licenses:
162
183
  - 'GPLv3: http://www.gnu.org/licenses/gpl.txt'
@@ -203,4 +224,5 @@ test_files:
203
224
  - spec/test_files/seqs.fq
204
225
  - spec/test_files/seqs.fq.gz
205
226
  - spec/test_files/test.rb
227
+ - spec/test_files/with_rec_sep_in_seq.fa
206
228
  has_rdoc: