RubyGems - parse_fasta - Versions diffs - 2.2.0 → 2.3.0 - Mend

parse_fasta 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/.gitignore +3 -1
data/CHANGELOG.md +4 -0
data/README.md +10 -0
data/lib/parse_fasta/record.rb +15 -4
data/lib/parse_fasta/seq_file.rb +15 -6
data/lib/parse_fasta/version.rb +1 -1
data/parse_fasta.gemspec +1 -0
data/spec/parse_fasta/record_spec.rb +26 -3
data/spec/parse_fasta/seq_file_spec.rb +31 -1
data/spec/test_files/with_rec_sep_in_seq.fa +4 -0
metadata +24 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e2fe1d91cfec3272d6f28872c5f048edd518b290
-  data.tar.gz: ef220e3c20cba089556a727a1cbe739f9a869037
+  metadata.gz: 5dc5b2c4063257a491082c62b9ef6faaecd694fa
+  data.tar.gz: 0f45d1a46360f65b83cada74be2b59e84d6c3179
 SHA512:
-  metadata.gz: 909f37e235e112841ec124768e30ccbccd92e50065446ca132475b7c148dd63f5a4d6ce08be7a3194c8bd15cd12cd8e243008739ae26c0c6b9d09bb37b0e1cac
-  data.tar.gz: e6a9080b6f836a1b14987b3b9c8185c3a05997583d82d209f7b9f8c5f83b98fad4a5ce02c66bc9880e1a872feeb4fee1cbac34f98714ec37cb8022832009325c
+  metadata.gz: 26898676ec187d85ded69405b1b193ece436e49693f4cdcc32dd72e313b3143b3d1bbf14d749bad0e5ae7ff45c39b8fb31056bf2fc3aab8e9eb10dee54fbc326
+  data.tar.gz: 0fd4292d811cec77a2f0cb20c8f89fdb06a74ec3f6b2488ae0505a42abfcf4722f81b8f446fd4f940b918b1798f9dff44974214b1b3abd4469df83902267d8aa

data/.gitignore CHANGED

@@ -21,4 +21,6 @@ tmp
 *.a
 mkmf.log
 .ruby-*
-.idea
+.idea
+time.html
+big_test_files

data/CHANGELOG.md CHANGED

@@ -1,5 +1,9 @@
 ## Versions ##
+### 2.3.0 ###
+Allow parsing of fastA files with `>` characters in the sequence with the `check_fasta_seq: false` option.
 ### 2.2.0 ###
 Add `id` attribute to `Record`.

data/README.md CHANGED

@@ -24,6 +24,8 @@ Or install it yourself as:
 Provides nice, programmatic access to fasta and fastq files. It's faster and more lightweight than BioRuby. And more fun!
+It takes care of a lot of whacky edge cases like parsing multi-blob gzipped files, and being strict on formatting by default.
 ## Documentation ##
 Checkout
@@ -93,3 +95,11 @@ ParseFasta::SeqFile.open(ARGV[0]).each_record do |rec|
   puts rec
 end
 ```
+Sometimes your fasta file might have record separators (`>`) within the "sequence". For example, CD-HIT's `.clstr` files have headers within what would be the sequence part of the record. `ParseFasta` is really strict about formatting and will raise an error when trying to read these types of files. If you would like to parse them, use the `check_fasta_seq: false` flag like so:
+```ruby
+ParseFasta::SeqFile.open(ARGV[0], check_fasta_seq: false).each_record do |rec|
+  puts rec
+end
+```

data/lib/parse_fasta/record.rb CHANGED

@@ -39,6 +39,8 @@ module ParseFasta
     #
     # @example Init a new Record object for a fastA record
     #   Record.new header: "apple", seq: "actg"
+    # @example Init a new Record object for a fastA record without checking for '>' in the sequence.
+    #   Record.new header: "apple", seq: "pie>good", check_fasta_seq: false
     # @example Init a new Record object for a fastQ record
     #   Record.new header: "apple", seq: "actd", desc: "", qual: "IIII"
     #
@@ -46,9 +48,16 @@ module ParseFasta
     # @param seq [String] the sequence of the record
     # @param desc [String] the description line of a fastQ record
     # @param qual [String] the quality string of a fastQ record
+    # @param check_fasta_seq [Bool] Pass false if you don't want to
+    #   check for '>' characters in the sequence. Defaults to true,
+    #   which checks for '>' in the sequence and raises an error.
     #
-    # @raise [ParseFasta::Error::SequenceFormatError] if a fastA sequence has a '>'
-    #   character in it
+    # @raise [ParseFasta::Error::SequenceFormatError] if a fastA
+    #   sequence has a '>' character in it, and :check_fasta_seq is
+    #   NOT set to false.
+    #
+    # @todo This is destructive with respect to the input seq
+    #   arg. Does it need to be?
     def initialize args = {}
       @header = args.fetch :header
       @id = @header.split(" ")[0]
@@ -61,9 +70,11 @@ module ParseFasta
       seq = args.fetch(:seq)
       seq.tr!(" \t\n\r", "")
-      if fastq? # is fastQ
+      do_check_fasta_seq = args.fetch :check_fasta_seq, true
+      if fastq? || (!fastq? && !do_check_fasta_seq)
         @seq = seq
-      else # is fastA
+      else
         @seq = check_fasta_seq(seq)
       end
     end

data/lib/parse_fasta/seq_file.rb CHANGED

@@ -26,14 +26,18 @@ module ParseFasta
     # @param fname [String] the name of the fastA or fastQ file to
     #   parse
+    # @param type [Symbol] whether the file is :fasta or :fastq
+    # @param check_fasta_seq [Bool] keyword arg for whether to check
+    #   for '>' in the sequence of fastA files.
     #
     # @raise [ParseFasta::Error::FileNotFoundError] if the file is not
     #   found
     # @raise [ParseFasta::Error::DataFormatError] if the file doesn't
     #   start with a '>' or a '@'
-    def initialize fname
+    def initialize fname, args = {}
       type = check_file fname
+      @check_fasta_seq = args.fetch :check_fasta_seq, true
       @fname = fname
       @type = type
     end
@@ -41,8 +45,8 @@ module ParseFasta
     # An alias for SeqFile.new
     #
     # @return [SeqFile] a SeqFile object
-    def self.open fname
-      self.new fname
+    def self.open fname, args = {}
+      self.new fname, args
     end
     # Analogous to IO#each_line, SeqFile#each_record is used to go
@@ -69,7 +73,8 @@ module ParseFasta
     #   the info of the record
     #
     # @raise [ParseFasta::Error::SequenceFormatError] if a fastA file
-    #   contains a record with a '>' character in the header
+    #   contains a record with a '>' character in the sequence, and the
+    #   SeqFile object was not initialized with check_fasta_seq: false
     def each_record &b
       line_parser = "parse_#{@type}_lines"
@@ -117,7 +122,9 @@ module ParseFasta
       if header.empty? && line.start_with?(">")
         header = line[1, len] # drop the '>'
       elsif line.start_with? ">"
-        yield Record.new(header: header.strip, seq: sequence)
+        yield Record.new(header: header.strip,
+                         seq: sequence,
+                         check_fasta_seq: @check_fasta_seq)
         header = line[1, len]
         sequence = ""
@@ -166,7 +173,9 @@ module ParseFasta
       end
       # yield the final seq
-      yield Record.new(header: header.strip, seq: sequence)
+      yield Record.new(header: header.strip,
+                       seq: sequence,
+                       check_fasta_seq: @check_fasta_seq)
     end
     def parse_fastq_lines file_reader, &b

data/lib/parse_fasta/version.rb CHANGED

@@ -17,5 +17,5 @@
 # along with parse_fasta.  If not, see <http://www.gnu.org/licenses/>.
 module ParseFasta
-  VERSION = "2.2.0"
+  VERSION = "2.3.0"
 end

data/parse_fasta.gemspec CHANGED

@@ -30,4 +30,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "yard", "~> 0.8"
   spec.add_development_dependency "rdiscount"
   spec.add_development_dependency "coveralls", "~> 0.7"
+  spec.add_development_dependency "benchmark-ips", "~> 2.7", ">= 2.7.2"
 end

data/spec/parse_fasta/record_spec.rb CHANGED

@@ -72,11 +72,34 @@ module ParseFasta
         end
         context "when seq has a '>' in it" do
-          it "raises SequenceFormatError" do
-            str = "actg>sequence 3"
+          context "with default strictness" do
+            it "raises SequenceFormatError" do
+              str = "actg>sequence 3"
-            expect { Record.new header: header, seq: str }.
+              expect { Record.new header: header, seq: str }.
                 to raise_error ParseFasta::Error::SequenceFormatError
+            end
+          end
+          context "with lenient checking" do
+            it "does NOT raise error" do
+              str = "actg>sequence 3"
+              expect { Record.new(header: header,
+                                  seq: str,
+                                  check_fasta_seq: false) }.
+                not_to raise_error
+            end
+            it "gives the sequence as is" do
+              str = "actg>sequence 3"
+              rec = Record.new(header: header,
+                               seq: str.dup,
+                               check_fasta_seq: false)
+              expect(rec.seq).to eq str.tr(" ", "")
+            end
           end
         end
       end

data/spec/parse_fasta/seq_file_spec.rb CHANGED

@@ -36,6 +36,9 @@ module ParseFasta
     let(:fastq_gz) {
       File.join test_dir, "seqs.fq.gz"
     }
+    let(:with_rec_sep_in_seq) {
+      File.join test_dir, "with_rec_sep_in_seq.fa"
+    }
     let(:fasta_records) {
       [Record.new(header: "empty seq at beginning",
@@ -65,6 +68,14 @@ module ParseFasta
                   desc:   "seq2 +pples",
                   qual:   "*ujM")]
     }
+    let(:with_rec_sep_in_seq_records) {
+      [Record.new(header: "seq1",
+                  seq: "AAAA>TTTT",
+                  check_fasta_seq: false),
+       Record.new(header: "seq2",
+                  seq: "TTTT>AAAA",
+                  check_fasta_seq: false)]
+    }
     # to test the line endings
     let(:line_endings_fastq_records) {
@@ -150,6 +161,25 @@ module ParseFasta
           include_examples "it yields the records"
         end
+        context "when the fasta file has '>' in a seq" do
+          context "when the check_fasta_seq flag is false" do
+            it "yields records even with '>' in the sequence" do
+              expect { |b|
+                SeqFile.open(with_rec_sep_in_seq,
+                             check_fasta_seq: false).each_record &b
+              }.to yield_successive_args(*with_rec_sep_in_seq_records)
+            end
+          end
+          context "when the check_fasta_seq flag is default" do
+            it "raises SequenceFormatError" do
+              expect { |b|
+                SeqFile.open(with_rec_sep_in_seq).each_record &b
+              }.to raise_error ParseFasta::Error::SequenceFormatError
+            end
+          end
+        end
       end
       context "input is fastQ" do
@@ -235,4 +265,4 @@ module ParseFasta
       end
     end
   end
-end
+end

data/spec/test_files/with_rec_sep_in_seq.fa ADDED

@@ -0,0 +1,4 @@
+>seq1
+AAAA>TTTT
+>seq2
+TTTT>AAAA

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parse_fasta
 version: !ruby/object:Gem::Version
-  version: 2.2.0
+  version: 2.3.0
 platform: ruby
 authors:
 - Ryan Moore
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-11-08 00:00:00.000000000 Z
+date: 2017-10-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -108,6 +108,26 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: benchmark-ips
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.7'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.7.2
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.7'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.7.2
 description: Provides nice, programmatic access to fasta and fastq files, as well
   as providing Sequence and Quality helper classes. No need for BioRuby ;)
 email:
@@ -157,6 +177,7 @@ files:
 - spec/test_files/seqs.fq
 - spec/test_files/seqs.fq.gz
 - spec/test_files/test.rb
+- spec/test_files/with_rec_sep_in_seq.fa
 homepage: https://github.com/mooreryan/parse_fasta
 licenses:
 - 'GPLv3: http://www.gnu.org/licenses/gpl.txt'
@@ -203,4 +224,5 @@ test_files:
 - spec/test_files/seqs.fq
 - spec/test_files/seqs.fq.gz
 - spec/test_files/test.rb
+- spec/test_files/with_rec_sep_in_seq.fa
 has_rdoc: