RubyGems - parse_fasta - Versions diffs - 1.9.1 → 1.9.2 - Mend

parse_fasta 1.9.1 → 1.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +8 -8
data/README.md +9 -46
data/lib/parse_fasta/fasta_file.rb +44 -8
data/lib/parse_fasta/version.rb +1 -1
data/test_files/benchmark.rb +13 -6
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    MDAyZjJjNDJlMGZkM2Y2YmZmOTNmZTIwOTE2ZmVlMTRlYjhiNThkNA==
+    NmM5ZWYwOGM5YWIxMzU2YjBmZTk4Y2I5YzI0NjY0MzUwM2YwMjgyOA==
   data.tar.gz: !binary |-
-    ZmQxMDI5MDI4MjEyN2UyMjQ3ZGQwNDU2YzZhYmI0ZTNjMGQ2ZWFiOQ==
+    NzI2NDY1MWZmYmUwNDUxMTk2MmI4YjgwYWVlYjcyZDI4MDUzMzk4NA==
 SHA512:
   metadata.gz: !binary |-
-    NDZjNTk2OTUzYmM1MzE3MmJmOWJlMGM0NjQxY2RlODRiY2RiOTNlNWRiNmM1
-    NTA3OWNmMzdiZTRlNmVlZWZiOTU1ZjhkZjE4YTA3ZjQ5OTFhOWQ3NjViOTM3
-    N2IwY2I5YjI1MDIwMDczY2M2MzNmZTE4OTdiMzI0NWQ4ODcwMGE=
+    ODY1ZTQ1MzU4MTc2MDhhMjA0OThiYzM4Yzk4YjJiZjU4ZGY4MGM5NTRjYTE5
+    OWZkODk0M2ZmODE5ODY1MjE3NTQ5MzgyNTFjMTk2NzU2NGVjN2NkNGUzYzA3
+    ODliNjRlOGJjOGJhNjhlMWZmMmU1NjkyMjgwNzAyODQ1MDExOTI=
   data.tar.gz: !binary |-
-    NmNkZTI0NzE3NjU5M2VmNzgzYTk0MDM1NGM2NTdmNDA2YWY0ZmY2OWRhNTE4
-    ODE1MzIxYTcyNzhhMTJmMTc5NmE5MjI3OWEyYjUwMzM5NzJhYzAxZjJkYTE1
-    YTNmMTcyNWE1ZDg1NTNhMTg0ZDU3N2I0Y2E0YzlmOWZhOWRmNmE=
+    YWY4NWU3NDFiYTVmMmE1Y2MxMDI3ZjE3NTIyY2Q1N2Q2ZDQxM2ZlZjI4NjUy
+    MWM5OTZhNzEzZWNmMGVlYTQ1MDc1MzViMDBkOTQ0YzQyY2IxYjlmOGQwNzRh
+    YmIyOTg2Yjk0OTFlNWVhOGU3MTMzM2I1ZGY0ZjlkMzExZGNkZDk=

data/README.md CHANGED Viewed

@@ -66,6 +66,10 @@ Read fasta file into a hash.
 ## Versions ##
+### 1.9.2 ###
+Speed up fastA `each_record` and `each_record_fast`.
 ### 1.9.1 ###
 Speed up fastQ `each_record` and `each_record_fast`. Courtesy of
@@ -221,60 +225,19 @@ Last version with File monkey patch.
 ## Benchmark ##
-**NOTE**: These benchmarks are against an older version of
-  `parse_fasta`.
 Some quick and dirty benchmarks against `BioRuby`.
 ### FastaFile#each_record ###
-Calculating sequence length for each fasta record with both the
-`each_record` method from this gem and using the `FastaFormat` class
-from BioRuby. You can see the test script in `benchmark.rb`.
-The test file contained 2,009,897 illumina reads and the file size
-was 1.1 gigabytes. Here are the results from Ruby's `Benchmark` class:
+You can see the test script in `benchmark.rb`.
-                      user     system      total        real
-    parse_fasta  64.530000   1.740000  66.270000 ( 67.081502)
-    bioruby     116.250000   2.260000 118.510000 (120.223710)
+                           user     system      total        real
+    parse_fasta        1.920000   0.160000   2.080000 (  2.145932)
+    parse_fasta fast   1.210000   0.160000   1.370000 (  1.377770)
+    bioruby            4.330000   0.290000   4.620000 (  4.655567)
 Hot dog! It's faster :)
-### FastqFile#each_record ###
-The same sequence length test as above, but this time with a fastq
-file containing 4,000,000 illumina reads.
-                        user     system      total        real
-    this_fastq     62.610000   1.660000  64.270000 ( 64.389408)
-    bioruby_fastq 165.500000   2.100000 167.600000 (167.969636)
-### Sequence#gc ###
-The test is done on random strings matching `/[AaCcTtGgUu]/`. `this_gc`
-is `Sequence.new(str).gc`, and `bioruby_gc` is
-`Bio::Sequence::NA.new(str).gc_content`.
-To see how the methods scale, the test 1 string was 2,000,000 bases,
-test 2 was 4,000,000 and test 3 was 8,000,000 bases.
-                       user     system      total        real
-    this_gc 1      0.030000   0.000000   0.030000 (  0.029145)
-    bioruby_gc 1   2.030000   0.010000   2.040000 (  2.157512)
-    this_gc 2      0.060000   0.000000   0.060000 (  0.059408)
-    bioruby_gc 2   4.060000   0.020000   4.080000 (  4.334159)
-    this_gc 3      0.120000   0.000000   0.120000 (  0.185434)
-    bioruby_gc 3   8.060000   0.020000   8.080000 (  8.659071)
-Nice!
-Troll: "When will you find the GC of an 8,000,000 base sequence?"
-Me: "Step off, troll!"
 ## Notes ##
 Only the `SeqFile` class actually checks to make sure that you passed

data/lib/parse_fasta/fasta_file.rb CHANGED Viewed

@@ -116,10 +116,28 @@ class FastaFile < File
       #   end
       # end
     else
-      f.each("\n>") do |line|
-        header, sequence = parse_line(line)
-        yield(header.strip, Sequence.new(sequence || ""))
+      header = ""
+      sequence = ""
+      f.each_line do |line|
+        line.chomp!
+        len = line.length
+        if header.empty? && line.start_with?(">")
+          header = line[1, len]
+        elsif line.start_with?(">")
+          yield(header.strip, Sequence.new(sequence || ""))
+          header = line[1, len]
+          sequence = ""
+        else
+          raise ParseFasta::SequenceFormatError if sequence.include? ">"
+          sequence << line
+        end
       end
+      yield(header, Sequence.new(sequence || ""))
+      # f.each("\n>") do |line|
+      #     header, sequence = parse_line(line)
+      #     yield(header.strip, Sequence.new(sequence || ""))
+      #   end
       # f.each_with_index(sep=/^>/) do |line, idx|
       #   if idx.zero?
@@ -161,13 +179,31 @@ class FastaFile < File
       f = self
     end
-    f.each("\n>") do |line|
-      header, sequence = parse_line(line)
+    header = ""
+    sequence = ""
+    f.each_line do |line|
+      line.chomp!
+      len = line.length
+      if header.empty? && line.start_with?(">")
+        header = line[1, len]
+      elsif line.start_with?(">")
+        yield(header.strip, sequence)
+        header = line[1, len]
+        sequence = ""
+      else
+        raise ParseFasta::SequenceFormatError if sequence.include? ">"
+        sequence << line
+      end
+    end
+    yield(header, sequence)
-      raise ParseFasta::SequenceFormatError if sequence.include? ">"
+    # f.each("\n>") do |line|
+    #   header, sequence = parse_line(line)
-      yield(header.strip, sequence)
-    end
+    #   raise ParseFasta::SequenceFormatError if sequence.include? ">"
+    #   yield(header.strip, sequence)
+    # end
     f.close if f.instance_of?(Zlib::GzipReader)
     return f

data/lib/parse_fasta/version.rb CHANGED Viewed

@@ -17,5 +17,5 @@
 # along with parse_fasta.  If not, see <http://www.gnu.org/licenses/>.
 module ParseFasta
-  VERSION = "1.9.1"
+  VERSION = "1.9.2"
 end

data/test_files/benchmark.rb CHANGED Viewed

@@ -28,16 +28,23 @@ def this_parse_fasta fname
   end
 end
+def this_parse_fasta_fast fname
+  FastaFile.open(fname, 'r').each_record_fast do |header, sequence|
+    [header, sequence.length].join("\t")
+  end
+end
 def bioruby_parse_fasta fname
   Bio::FastaFormat.open(fname).each do |entry|
     [entry.definition, entry.seq.length].join("\t")
   end
 end
-# Benchmark.bmbm do |x|
-#   x.report('parse_fasta') { this_parse_fasta(ARGV.first) }
-#   x.report('bioruby')     { bioruby_parse_fasta(ARGV.first) }
-# end
+Benchmark.bmbm do |x|
+  x.report('parse_fasta') { this_parse_fasta(ARGV.first) }
+  x.report('parse_fasta fast') { this_parse_fasta_fast(ARGV.first) }
+  x.report('bioruby')     { bioruby_parse_fasta(ARGV.first) }
+end
 ####
@@ -72,8 +79,8 @@ end
 # fastq = ARGV.first
 def bioruby_fastq(fastq)
-  Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
-    fq.each do |entry|
+  Bio::FlatFile.open(Bio::Fastq, fastq) do |fq|
+    fq.each do |entry|
       [entry.definition, entry.seq.length].join("\t")
     end
   end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parse_fasta
 version: !ruby/object:Gem::Version
-  version: 1.9.1
+  version: 1.9.2
 platform: ruby
 authors:
 - Ryan Moore
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-20 00:00:00.000000000 Z
+date: 2016-05-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler