RubyGems - parse_fasta - Versions diffs - 2.2.0 → 2.3.0 - Mend

parse_fasta 2.2.0 → 2.3.0

Files changed (12) hide show

checksums.yaml +4 -4
data/.gitignore +3 -1
data/CHANGELOG.md +4 -0
data/README.md +10 -0
data/lib/parse_fasta/record.rb +15 -4
data/lib/parse_fasta/seq_file.rb +15 -6
data/lib/parse_fasta/version.rb +1 -1
data/parse_fasta.gemspec +1 -0
data/spec/parse_fasta/record_spec.rb +26 -3
data/spec/parse_fasta/seq_file_spec.rb +31 -1
data/spec/test_files/with_rec_sep_in_seq.fa +4 -0
metadata +24 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e2fe1d91cfec3272d6f28872c5f048edd518b290
-  data.tar.gz: ef220e3c20cba089556a727a1cbe739f9a869037
+  metadata.gz: 5dc5b2c4063257a491082c62b9ef6faaecd694fa
+  data.tar.gz: 0f45d1a46360f65b83cada74be2b59e84d6c3179
 SHA512:
-  metadata.gz: 909f37e235e112841ec124768e30ccbccd92e50065446ca132475b7c148dd63f5a4d6ce08be7a3194c8bd15cd12cd8e243008739ae26c0c6b9d09bb37b0e1cac
-  data.tar.gz: e6a9080b6f836a1b14987b3b9c8185c3a05997583d82d209f7b9f8c5f83b98fad4a5ce02c66bc9880e1a872feeb4fee1cbac34f98714ec37cb8022832009325c
+  metadata.gz: 26898676ec187d85ded69405b1b193ece436e49693f4cdcc32dd72e313b3143b3d1bbf14d749bad0e5ae7ff45c39b8fb31056bf2fc3aab8e9eb10dee54fbc326
+  data.tar.gz: 0fd4292d811cec77a2f0cb20c8f89fdb06a74ec3f6b2488ae0505a42abfcf4722f81b8f446fd4f940b918b1798f9dff44974214b1b3abd4469df83902267d8aa

data/.gitignore CHANGED

@@ -21,4 +21,6 @@ tmp
 *.a
 mkmf.log
 .ruby-*
-.idea
+.idea
+time.html
+big_test_files

data/CHANGELOG.md CHANGED

@@ -1,5 +1,9 @@
 ## Versions ##
+### 2.3.0 ###
+Allow parsing of fastA files with `>` characters in the sequence with the `check_fasta_seq: false` option.
 ### 2.2.0 ###
 Add `id` attribute to `Record`.

data/README.md CHANGED

@@ -24,6 +24,8 @@ Or install it yourself as:
 Provides nice, programmatic access to fasta and fastq files. It's faster and more lightweight than BioRuby. And more fun!
+It takes care of a lot of whacky edge cases like parsing multi-blob gzipped files, and being strict on formatting by default.
 ## Documentation ##
 Checkout
@@ -93,3 +95,11 @@ ParseFasta::SeqFile.open(ARGV[0]).each_record do |rec|
   puts rec
 end
 ```
+Sometimes your fasta file might have record separators (`>`) within the "sequence". For example, CD-HIT's `.clstr` files have headers within what would be the sequence part of the record. `ParseFasta` is really strict about formatting and will raise an error when trying to read these types of files. If you would like to parse them, use the `check_fasta_seq: false` flag like so:
+```ruby
+ParseFasta::SeqFile.open(ARGV[0], check_fasta_seq: false).each_record do |rec|
+  puts rec
+end
+```

data/lib/parse_fasta/record.rb CHANGED

@@ -39,6 +39,8 @@ module ParseFasta
     #
     # @example Init a new Record object for a fastA record
     #   Record.new header: "apple", seq: "actg"
+    # @example Init a new Record object for a fastA record without checking for '>' in the sequence.
+    #   Record.new header: "apple", seq: "pie>good", check_fasta_seq: false
     # @example Init a new Record object for a fastQ record
     #   Record.new header: "apple", seq: "actd", desc: "", qual: "IIII"
     #
@@ -46,9 +48,16 @@ module ParseFasta
     # @param seq [String] the sequence of the record
     # @param desc [String] the description line of a fastQ record
     # @param qual [String] the quality string of a fastQ record
+    # @param check_fasta_seq [Bool] Pass false if you don't want to
+    #   check for '>' characters in the sequence. Defaults to true,
+    #   which checks for '>' in the sequence and raises an error.
     #
-    # @raise [ParseFasta::Error::SequenceFormatError] if a fastA sequence has a '>'
-    #   character in it
+    # @raise [ParseFasta::Error::SequenceFormatError] if a fastA
+    #   sequence has a '>' character in it, and :check_fasta_seq is
+    #   NOT set to false.
+    #
+    # @todo This is destructive with respect to the input seq
+    #   arg. Does it need to be?
     def initialize args = {}
       @header = args.fetch :header
       @id = @header.split(" ")[0]
@@ -61,9 +70,11 @@ module ParseFasta
       seq = args.fetch(:seq)
       seq.tr!(" \t\n\r", "")
-      if fastq? # is fastQ
+      do_check_fasta_seq = args.fetch :check_fasta_seq, true
+      if fastq? || (!fastq? && !do_check_fasta_seq)
         @seq = seq
-      else # is fastA
+      else
         @seq = check_fasta_seq(seq)
       end
     end

data/lib/parse_fasta/seq_file.rb CHANGED

@@ -26,14 +26,18 @@ module ParseFasta
     # @param fname [String] the name of the fastA or fastQ file to
     #   parse
+    # @param type [Symbol] whether the file is :fasta or :fastq
+    # @param check_fasta_seq [Bool] keyword arg for whether to check
+    #   for '>' in the sequence of fastA files.
     #
     # @raise [ParseFasta::Error::FileNotFoundError] if the file is not
     #   found
     # @raise [ParseFasta::Error::DataFormatError] if the file doesn't
     #   start with a '>' or a '@'
-    def initialize fname
+    def initialize fname, args = {}
       type = check_file fname
+      @check_fasta_seq = args.fetch :check_fasta_seq, true
       @fname = fname
       @type = type
     end
@@ -41,8 +45,8 @@ module ParseFasta
     # An alias for SeqFile.new
     #
     # @return [SeqFile] a SeqFile object
-    def self.open fname
-      self.new fname
+    def self.open fname, args = {}
+      self.new fname, args
     end
     # Analogous to IO#each_line, SeqFile#each_record is used to go
@@ -69,7 +73,8 @@ module ParseFasta
     #   the info of the record
     #
     # @raise [ParseFasta::Error::SequenceFormatError] if a fastA file
-    #   contains a record with a '>' character in the header
+    #   contains a record with a '>' character in the sequence, and the
+    #   SeqFile object was not initialized with check_fasta_seq: false
     def each_record &b
       line_parser = "parse_#{@type}_lines"
@@ -117,7 +122,9 @@ module ParseFasta
       if header.empty? && line.start_with?(">")
         header = line[1, len] # drop the '>'
       elsif line.start_with? ">"
-        yield Record.new(header: header.strip, seq: sequence)
+        yield Record.new(header: header.strip,
+                         seq: sequence,
+                         check_fasta_seq: @check_fasta_seq)
         header = line[1, len]
         sequence = ""
@@ -166,7 +173,9 @@ module ParseFasta
       end
       # yield the final seq
-      yield Record.new(header: header.strip, seq: sequence)
+      yield Record.new(header: header.strip,
+                       seq: sequence,
+                       check_fasta_seq: @check_fasta_seq)
     end
     def parse_fastq_lines file_reader, &b

data/lib/parse_fasta/version.rb CHANGED

@@ -17,5 +17,5 @@
 # along with parse_fasta.  If not, see <http://www.gnu.org/licenses/>.
 module ParseFasta
-  VERSION = "2.2.0"
+  VERSION = "2.3.0"
 end

data/parse_fasta.gemspec CHANGED

@@ -30,4 +30,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "yard", "~> 0.8"
   spec.add_development_dependency "rdiscount"
   spec.add_development_dependency "coveralls", "~> 0.7"
+  spec.add_development_dependency "benchmark-ips", "~> 2.7", ">= 2.7.2"
 end

data/spec/parse_fasta/record_spec.rb CHANGED

@@ -72,11 +72,34 @@ module ParseFasta
         end
         context "when seq has a '>' in it" do
-          it "raises SequenceFormatError" do
-            str = "actg>sequence 3"
+          context "with default strictness" do
+            it "raises SequenceFormatError" do
+              str = "actg>sequence 3"
-            expect { Record.new header: header, seq: str }.
+              expect { Record.new header: header, seq: str }.
                 to raise_error ParseFasta::Error::SequenceFormatError
+            end
+          end
+          context "with lenient checking" do
+            it "does NOT raise error" do
+              str = "actg>sequence 3"
+              expect { Record.new(header: header,
+                                  seq: str,
+                                  check_fasta_seq: false) }.
+                not_to raise_error
+            end
+            it "gives the sequence as is" do
+              str = "actg>sequence 3"
+              rec = Record.new(header: header,
+                               seq: str.dup,
+                               check_fasta_seq: false)
+              expect(rec.seq).to eq str.tr(" ", "")
+            end
           end
         end
       end

data/spec/parse_fasta/seq_file_spec.rb CHANGED

@@ -36,6 +36,9 @@ module ParseFasta
     let(:fastq_gz) {
       File.join test_dir, "seqs.fq.gz"
     }
+    let(:with_rec_sep_in_seq) {
+      File.join test_dir, "with_rec_sep_in_seq.fa"
+    }
     let(:fasta_records) {
       [Record.new(header: "empty seq at beginning",
@@ -65,6 +68,14 @@ module ParseFasta
                   desc:   "seq2 +pples",
                   qual:   "*ujM")]
     }
+    let(:with_rec_sep_in_seq_records) {
+      [Record.new(header: "seq1",
+                  seq: "AAAA>TTTT",
+                  check_fasta_seq: false),
+       Record.new(header: "seq2",
+                  seq: "TTTT>AAAA",
+                  check_fasta_seq: false)]
+    }
     # to test the line endings
     let(:line_endings_fastq_records) {
@@ -150,6 +161,25 @@ module ParseFasta
           include_examples "it yields the records"
         end
+        context "when the fasta file has '>' in a seq" do
+          context "when the check_fasta_seq flag is false" do
+            it "yields records even with '>' in the sequence" do
+              expect { |b|
+                SeqFile.open(with_rec_sep_in_seq,
+                             check_fasta_seq: false).each_record &b
+              }.to yield_successive_args(*with_rec_sep_in_seq_records)
+            end
+          end
+          context "when the check_fasta_seq flag is default" do
+            it "raises SequenceFormatError" do
+              expect { |b|
+                SeqFile.open(with_rec_sep_in_seq).each_record &b
+              }.to raise_error ParseFasta::Error::SequenceFormatError
+            end
+          end
+        end
       end
       context "input is fastQ" do
@@ -235,4 +265,4 @@ module ParseFasta
       end
     end
   end
-end
+end

data/spec/test_files/with_rec_sep_in_seq.fa ADDED

@@ -0,0 +1,4 @@
+>seq1
+AAAA>TTTT
+>seq2
+TTTT>AAAA

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parse_fasta
 version: !ruby/object:Gem::Version
-  version: 2.2.0
+  version: 2.3.0
 platform: ruby
 authors:
 - Ryan Moore
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-11-08 00:00:00.000000000 Z
+date: 2017-10-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -108,6 +108,26 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: benchmark-ips
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.7'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.7.2
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.7'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.7.2
 description: Provides nice, programmatic access to fasta and fastq files, as well
   as providing Sequence and Quality helper classes. No need for BioRuby ;)
 email:
@@ -157,6 +177,7 @@ files:
 - spec/test_files/seqs.fq
 - spec/test_files/seqs.fq.gz
 - spec/test_files/test.rb
+- spec/test_files/with_rec_sep_in_seq.fa
 homepage: https://github.com/mooreryan/parse_fasta
 licenses:
 - 'GPLv3: http://www.gnu.org/licenses/gpl.txt'
@@ -203,4 +224,5 @@ test_files:
 - spec/test_files/seqs.fq
 - spec/test_files/seqs.fq.gz
 - spec/test_files/test.rb
+- spec/test_files/with_rec_sep_in_seq.fa
 has_rdoc: