RubyGems - parse_fasta - Versions diffs - 1.8.0 → 1.8.1 - Mend

parse_fasta 1.8.0 → 1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

checksums.yaml +8 -8
data/.gitignore +1 -0
data/README.md +22 -1
data/lib/parse_fasta/fasta_file.rb +28 -2
data/lib/parse_fasta/seq_file.rb +6 -0
data/lib/parse_fasta/sequence.rb +6 -0
data/lib/parse_fasta/version.rb +1 -1
data/lib/parse_fasta.rb +7 -0
data/spec/lib/fasta_file_spec.rb +18 -0
data/spec/lib/seq_file_spec.rb +18 -0
data/spec/lib/sequence_spec.rb +8 -0
data/spec/spec_helper.rb +4 -1
data/test_files/bad.fa +5 -0
data/test_files/test.fa +3 -0
data/test_files/test.fa.gz +0 -0
data/test_files/test.fq.gz +0 -0
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    MmIxNzgzM2E2MzkzYTc1ZjM2MWE0NzIxN2ZmZmRkMGUxMDQ0Y2MzYw==
+    YzlkNTQ5NGQ5YTFlNzVkOTJjOTJkMTM2YmUwN2FlMjhmOTg2ZDZlMQ==
   data.tar.gz: !binary |-
-    Njc4ZTVlN2EwMTVjZDkwNWZmYTM2MzcwY2FkY2NlNjVmNmJiNWFmYw==
+    OThhYTU5NTAzYzlkMTg2N2IxOWNjYTExMWEyODRiY2Q2OGFhMzQ4MQ==
 SHA512:
   metadata.gz: !binary |-
-    ZjNiOTc2NTM0OTc0M2U4NDNkYzhlNTBlNjM3MzI4Mjc4YWY4YmE4MjRhMDdj
-    MGRiNWNiMmExZTZjYmRhZDk0YzcxMTQ1OGQ0NzdkYTVmMmQ0Mzg1YmU0NmFl
-    NmNjYWM1MmY4ZWRhNTY5MDAzNTk1YzcyN2IzYWE2MTNkMzAwNDg=
+    NzMyYTNmZmQ0YThlMThkZmE3ZjZhZjAzNDM2MGQ4ZTcwODhkODY3NzI2NzU1
+    NDQzYmU1ZDBiZjljYzVhZmNlMDIzZDMxMDc4Zjk3N2E1YTAxOTUzZTIyOGNj
+    NzdjOWJiODA2ZDA0NGNmMjFkOGI1ZjgxZWY3NTRmMTQ1MDc5MTU=
   data.tar.gz: !binary |-
-    Mjk5ZjU3YWI3YTJlN2Q4NWJjMWY3NDczOTBhNzI3NzlkMDViZjRhZGFkN2I4
-    NmU5YzFhYTI4ZDc2N2RhYTE5ODdkODE5NTQ5ZjJmNzNmNjEyNzY3NzJiZTk3
-    ODA2MWI3YjUzZDdmMTE5MGM1MDA3ZTk1NmMyNGU3NjFmOTIyMWY=
+    ODU2NzMwZTk3ZmE0ZTIxYzMwOWVkMWUyY2U4MTE3YzAzMzI5MzU1ZDAzNWE3
+    OGQ3ODk2ZjQwYTNjNTJlZTVjYzg3MGU5YzliZjAyYjQ4ZDNmNjRlNzE2YmJk
+    MmY2OTRhYjI3NTM3ODFmYWYwNDk2ZjQ0YzI3YjIxMzI3MGU3MmE=

data/.gitignore CHANGED Viewed

@@ -20,3 +20,4 @@ tmp
 *.o
 *.a
 mkmf.log
+.ruby-*

data/README.md CHANGED Viewed

@@ -27,7 +27,7 @@ lightweight than BioRuby. And more fun! ;)
 ## Documentation ##
 Checkout
-[parse_fasta docs](http://rubydoc.info/gems/parse_fasta/1.8.0/frames)
+[parse_fasta docs](http://rubydoc.info/gems/parse_fasta)
 for the full api documentation.
 ## Usage ##
@@ -73,6 +73,27 @@ Add `Sequence#rev_comp`. It can handle IUPAC characters. Since
 an amino acid string, things will get weird as it will complement the
 IUPAC characters in the AA string and leave others.
+#### 1.8.1 ####
+An error will be raised if a fasta file has a `>` in the
+sequence. Sometimes files are not terminated with a newline
+character. If this is the case, then catting two fasta files will
+smush the first header of the second file right in with the last
+sequence of the first file. This is bad, raise an error! ;)
+Example
+    >seq1
+    ACTG>seq2
+    ACTG
+    >seq3
+    ACTG
+This will raise `ParseFasta::SequenceFormatError`.
+Also, headers with lots of `>` within are fine now.
 ### 1.7 ###
 Add `SeqFile#to_hash`, `FastaFile#to_hash` and `FastqFile#to_hash`.

data/lib/parse_fasta/fasta_file.rb CHANGED Viewed

@@ -51,6 +51,8 @@ class FastaFile < File
   #
   # @return [Hash] A hash with headers as keys, sequences as the
   #   values (Sequence objects)
+  #
+  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
   def to_hash
     hash = {}
     self.each_record do |head, seq|
@@ -88,6 +90,8 @@ class FastaFile < File
   #   fasta record. If `separate_lines` is falsy (the default
   #   behavior), will be Sequence, but if truthy will be
   #   Array<String>.
+  #
+  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
   def each_record(separate_lines=nil)
     begin
       f = Zlib::GzipReader.open(self)
@@ -100,11 +104,33 @@ class FastaFile < File
         header, sequence = parse_line_separately(line)
         yield(header.strip, sequence)
       end
+      # f.each_with_index(">") do |line, idx|
+      #   if idx.zero?
+      #     if line != ">"
+      #       raise ParseFasta::DataFormatError
+      #     end
+      #   else
+      #     header, sequence = parse_line_separately(line)
+      #     yield(header.strip, sequence)
+      #   end
+      # end
     else
       f.each("\n>") do |line|
         header, sequence = parse_line(line)
         yield(header.strip, Sequence.new(sequence || ""))
       end
+      # f.each_with_index(sep=/^>/) do |line, idx|
+      #   if idx.zero?
+      #     if line != ">"
+      #       raise ParseFasta::DataFormatError
+      #     end
+      #   else
+      #     header, sequence = parse_line(line)
+      #     yield(header.strip, Sequence.new(sequence || ""))
+      #   end
+      # end
     end
     f.close if f.instance_of?(Zlib::GzipReader)
@@ -114,12 +140,12 @@ class FastaFile < File
   private
   def parse_line(line)
-    line.split("\n", 2).map { |s| s.gsub(/\n|>/, '') }
+    line.split("\n", 2).map { |s| s.gsub(/\n|^>|>$/, '') }
   end
   def parse_line_separately(line)
     header, sequence =
-      line.split("\n", 2).map { |s| s.gsub(/>/, '') }
+      line.split("\n", 2).map { |s| s.gsub(/^>|>$/, '') }
     if sequence.nil?
       sequences = []

data/lib/parse_fasta/seq_file.rb CHANGED Viewed

@@ -29,6 +29,9 @@ class SeqFile < File
   #
   # @return [Hash] A hash with headers as keys, sequences as the
   #   values (Sequence objects)
+  #
+  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
+  #   and file is a fastA file
   def to_hash
     first_char = get_first_char(self)
@@ -73,6 +76,9 @@ class SeqFile < File
   #   leading '>' or '@'
   #
   # @yieldparam sequence [Sequence] The sequence of the record.
+  #
+  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>',
+  #   and file is a fastA file
   def each_record
     first_char = get_first_char(self)

data/lib/parse_fasta/sequence.rb CHANGED Viewed

@@ -35,7 +35,13 @@ class Sequence < String
   #
   # @example Removes whitespace
   #   Sequence.new "AA CC TT" #=> "AACCTT"
+  #
+  # @raise [ParseFasta::SequenceFormatError] if sequence has a '>'
   def initialize(str)
+    if str.match(/>/)
+      raise ParseFasta::SequenceFormatError
+    end
     super(str.gsub(/ +/, ""))
   end

data/lib/parse_fasta/version.rb CHANGED Viewed

@@ -17,5 +17,5 @@
 # along with parse_fasta.  If not, see <http://www.gnu.org/licenses/>.
 module ParseFasta
-  VERSION = "1.8.0"
+  VERSION = "1.8.1"
 end

data/lib/parse_fasta.rb CHANGED Viewed

@@ -24,10 +24,17 @@ require 'parse_fasta/sequence'
 require 'parse_fasta/quality'
 module ParseFasta
+  class Error < StandardError
+  end
   # Error raised when FASTA file is malformed
   class DataFormatError < IOError
     def message
       "Data format error -- check input file"
     end
   end
+  class SequenceFormatError < Error
+  end
 end

data/spec/lib/fasta_file_spec.rb CHANGED Viewed

@@ -49,6 +49,15 @@ describe FastaFile do
     let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" }
     let(:fasta) { FastaFile.open(fname) }
+    context "with badly catted fasta" do
+      it "raises ParseFasta::SequenceFormatError" do
+        fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
+        expect { FastaFile.open(fname).to_hash }.
+          to raise_error ParseFasta::SequenceFormatError
+      end
+    end
     it "reads the records into a hash: header as key and seq as val" do
       expect(fasta.to_hash).to eq records
     end
@@ -66,6 +75,15 @@ describe FastaFile do
     let(:truthy_records) { Helpers::TRUTHY_RECORDS }
     let(:f_handle) { FastaFile.open(@fname).each_record { |s| } }
+    context "with badly catted fasta" do
+      it "raises ParseFasta::SequenceFormatError" do
+        fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
+        expect { FastaFile.open(fname).each_record {} }.
+          to raise_error ParseFasta::SequenceFormatError
+      end
+    end
     shared_examples_for "any FastaFile" do
       context "with no arguments" do
         it "yields proper header and sequence for each record" do

data/spec/lib/seq_file_spec.rb CHANGED Viewed

@@ -26,6 +26,15 @@ describe SeqFile do
       let(:fname) { "#{File.dirname(__FILE__)}/../../test_files/test.fa.gz" }
       let(:fasta) { SeqFile.open(fname) }
+      context "with badly catted fasta" do
+        it "raises ParseFasta::SequenceFormatError" do
+          fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
+          expect { FastaFile.open(fname).to_hash }.
+            to raise_error ParseFasta::SequenceFormatError
+        end
+      end
       it "reads the records into a hash: header as key and seq as val" do
         expect(fasta.to_hash).to eq records
       end
@@ -77,6 +86,15 @@ describe SeqFile do
       let(:f_handle) { SeqFile.open(@fname).each_record { |s| } }
+      context "with badly catted fasta" do
+        it "raises ParseFasta::SequenceFormatError" do
+          fname = "#{File.dirname(__FILE__)}/../../test_files/bad.fa"
+          expect { FastaFile.open(fname).to_hash }.
+            to raise_error ParseFasta::SequenceFormatError
+        end
+      end
       shared_examples_for "parsing a fasta file" do
         it "yields proper header and sequence for each record" do
           expect { |b|

data/spec/lib/sequence_spec.rb CHANGED Viewed

@@ -35,6 +35,14 @@ describe Sequence do
       s_no_spaces = "ACTACTACTGCT"
       expect(Sequence.new(s)).to eq s_no_spaces
     end
+    context "when sequence has a '>' in it" do
+      it "raises SequenceFormatError" do
+        s = "actg>sequence 3"
+        expect { Sequence.new(s) }.
+          to raise_error ParseFasta::SequenceFormatError
+      end
+    end
   end
   describe "#gc" do

data/spec/spec_helper.rb CHANGED Viewed

@@ -29,6 +29,7 @@ module Helpers
              ["empty seq 1", ""],
              ["empty seq 2", ""],
              ["seq3", "yyyyyyyyyyyyyyyNNN"],
+             ["seq 4 > has many '>' in header", "ACTGactg"],
              ["empty seq at end", ""]]
   RECORDS_MAP = {
@@ -38,16 +39,18 @@ module Helpers
     "empty seq 1" => "",
     "empty seq 2" => "",
     "seq3" => "yyyyyyyyyyyyyyyNNN",
+    "seq 4 > has many '>' in header" => "ACTGactg",
     "empty seq at end" => ""
   }
   TRUTHY_RECORDS = [["empty seq at beginning", []],
                     ["seq1 is fun", ["AACTGGNNN"]],
                     ["seq2", ["AAT", "CCTGNNN"]],
                     ["empty seq 1", []],
                     ["empty seq 2", []],
                     ["seq3", ["yyyyyyyyyy", "yyyyy", "NNN"]],
+                    ["seq 4 > has many '>' in header", ["ACTG" ,
+                                                        "actg"]],
                     ["empty seq at end", []]]
 end

data/test_files/bad.fa ADDED Viewed

@@ -0,0 +1,5 @@
+>seq1
+ACTG>seq2
+ACTG
+>seq3
+ACTG

data/test_files/test.fa CHANGED Viewed

@@ -15,4 +15,7 @@ yyyyyyyyyy
 yyyyy
 NNN
+>seq 4 > has many '>' in header
+ACTG
+actg
 >empty seq at end

data/test_files/test.fa.gz CHANGED Viewed

Binary file

data/test_files/test.fq.gz CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parse_fasta
 version: !ruby/object:Gem::Version
-  version: 1.8.0
+  version: 1.8.1
 platform: ruby
 authors:
 - Ryan Moore
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-11-17 00:00:00.000000000 Z
+date: 2016-03-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -138,6 +138,7 @@ files:
 - spec/lib/seq_file_spec.rb
 - spec/lib/sequence_spec.rb
 - spec/spec_helper.rb
+- test_files/bad.fa
 - test_files/benchmark.rb
 - test_files/bogus.txt
 - test_files/test.fa