RubyGems - fastest-csv - Versions diffs - 0.0.1 - Mend

fastest-csv 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

data/.gitignore ADDED Viewed

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in fastest-csv.gemspec
+gemspec

data/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2012 Maarten Oelering
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,68 @@
+# FastestCSV
+Fastest CSV class for MRI Ruby. Faster than faster_csv and fasterer-csv.
+Uses native C code to parse CSV lines. Not (yet) compatible with JRuby.
+Supports standard CSV according to RFC4180. Not the so-called "csv" from Excel.
+The interface is a subset of the CSV interface in Ruby 1.9.3. The options parameter is not supported.
+Originally developed to parse large CSV log files from PowerMTA.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'fastest-csv'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install fastest-csv
+## Usage
+Parse single line
+    FastestCSV.parse_line("one,two,three")
+     => ["one", "two", "three"]
+    "one,two,three".parse_csv
+     => ["one", "two", "three"]
+Parse file without header
+    FastestCSV.foreach("path/to/file.csv") do |row|
+      # row is an array of field values
+    end
+Parse file with header
+    FastestCSV.open("path/to/file.csv") do |csv|
+      fields = csv.shift
+      while values = csv.shift
+        #
+      end
+    end
+Parse file in array of arrays
+    rows = FastestCSV.read("path/to/file.csv")
+Parse string in array of arrays
+    rows = FastestCSV.parse(csv_data)
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Added some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ #!/usr/bin/env rake
2	+ require "bundler/gem_tasks"

data/ext/csv_parser/extconf.rb ADDED Viewed

@@ -0,0 +1,9 @@
+#!/usr/bin/ruby -w
+require 'mkmf'
+if RUBY_VERSION =~ /1.8/ then
+  $CPPFLAGS += " -DRUBY_18"
+end
+create_makefile('csv_parser')

data/ext/csv_parser/parser.c ADDED Viewed

@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) Maarten Oelering, BrightCode BV
+ */
+#include "ruby.h"
+#ifdef RUBY_18
+  #include "rubyio.h"
+#else
+  #include "ruby/io.h"
+#endif
+/* default allocated size is 16 */
+#define DEF_ARRAY_LEN 32
+static VALUE cFastestCSV;
+/*
+ * FastestCSV.parse_line(str) -> Array or nil
+ *
+ * Parses the first CSV record found in str.
+ *
+ * Returns nil when str is nil or empty. Otherwise returns an array of fields:
+ * an empty unquoted field becomes nil, a quoted field is always a String
+ * (possibly empty). Parsing stops at the first CR or LF that is not inside a
+ * quoted field. A quoted field that is still open at the end of str is
+ * dropped. Raises TypeError when str is neither nil nor a String.
+ *
+ * Parser states:
+ *   0 - outside a quoted field
+ *   1 - inside a quoted field
+ *   2 - a quote was seen inside a quoted field: either the closing quote or
+ *       the first half of an escaped quote ("")
+ */
+static VALUE parse_line(VALUE self, VALUE str)
+{
+    char stack_buf[4096];  /* scratch space for lines of ordinary length */
+    char *value;           /* field value being built, never longer than the line */
+    const char *ptr;
+    long len;              /* long, so very long lines are not truncated */
+    long index = 0;
+    long i;
+    int state = 0;
+    char c;
+    VALUE array;
+
+    if (NIL_P(str))
+        return Qnil;
+    /* reject non-strings instead of reading them as if they were strings */
+    Check_Type(str, T_STRING);
+    ptr = RSTRING_PTR(str);
+    len = RSTRING_LEN(str);
+    if (len == 0)
+        return Qnil;
+    array = rb_ary_new2(DEF_ARRAY_LEN);
+    /*
+     * A line-sized array on the stack can overflow the stack for very long
+     * lines, so only short lines use the stack buffer; longer ones use the
+     * heap. NOTE: the heap buffer is not released if an allocation below
+     * raises.
+     */
+    value = (len <= (long) sizeof(stack_buf)) ? stack_buf : ALLOC_N(char, len);
+    for (i = 0; i < len; i++)
+    {
+        c = ptr[i];
+        switch (c)
+        {
+            case ',':
+                if (state == 0) {
+                    rb_ary_push(array, (index == 0 ? Qnil: rb_str_new(value, index)));
+                    index = 0;
+                }
+                else if (state == 1) {
+                    value[index++] = c;  /* comma is data inside quotes */
+                }
+                else if (state == 2) {
+                    /* previous quote closed the field */
+                    rb_ary_push(array, rb_str_new(value, index));
+                    index = 0;
+                    state = 0;  /* outside quoted */
+                }
+                break;
+            case '"':
+                if (state == 0) {
+                    state = 1;  /* in quoted */
+                }
+                else if (state == 1) {
+                    state = 2;  /* quote in quoted */
+                }
+                else if (state == 2) {
+                    value[index++] = c;  /* escaped quote */
+                    state = 1;  /* in quoted */
+                }
+                break;
+            case 13:  /* \r */
+            case 10:  /* \n */
+                if (state == 1) { /* quoted */
+                    value[index++] = c;
+                }
+                else {
+                    /* only do first line */
+                    i = len;
+                }
+                break;
+            default:
+                value[index++] = c;
+        }
+    }
+    /* flush the last field; an unterminated quoted field (state 1) is dropped */
+    if (state == 0) {
+        rb_ary_push(array, (index == 0 ? Qnil: rb_str_new(value, index)));
+    }
+    else if (state == 2) {
+        rb_ary_push(array, rb_str_new(value, index));
+    }
+    if (value != stack_buf)
+        xfree(value);
+    return array;
+}
+/*
+ * Extension initializer (follows the Init_<name> naming used for the
+ * csv_parser extension). Defines the FastestCSV class under Object and
+ * registers parse_line as a singleton method taking one argument.
+ */
+void Init_csv_parser()
+{
+    cFastestCSV = rb_define_class("FastestCSV", rb_cObject);
+    rb_define_singleton_method(cFastestCSV, "parse_line", parse_line, 1);
+}

data/fastest-csv.gemspec ADDED Viewed

@@ -0,0 +1,19 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/fastest-csv/version', __FILE__)
+Gem::Specification.new do |gem|
+  gem.authors       = ["Maarten Oelering"]
+  gem.email         = ["maarten@brightcode.nl"]
+  gem.description   = %q{Fastest standard CSV parser for MRI Ruby}
+  gem.summary       = %q{Fastest standard CSV parser for MRI Ruby}
+  gem.homepage      = "https://github.com/brightcode/fastest-csv"
+  gem.files         = `git ls-files`.split($\)
+  #gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.name          = "fastest-csv"
+  gem.require_paths = ["lib"]
+  gem.version       = FastestCSV::VERSION
+  gem.extensions    = ['ext/csv_parser/extconf.rb']
+end

data/lib/fastest-csv.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ require 'fastest_csv'

data/lib/fastest-csv/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# FastestCSV is opened here only to carry the gem version constant.
+class FastestCSV
+  # Gem version string; fastest-csv.gemspec reads it as FastestCSV::VERSION.
+  VERSION = "0.0.1"
+end

data/lib/fastest_csv.rb ADDED Viewed

@@ -0,0 +1,91 @@
+require 'csv_parser'
+require 'stringio'
+class FastestCSV
+  # This method opens an accounting file and passes each record to the provided +block+.
+  def self.foreach(path, &block)
+    open(path) do |reader|
+      reader.each(&block)
+    end
+  end
+  # This method opens a csv file. It will pass a Reader object to the provided block,
+  # or return a Reader object when no block is provided.
+  def self.open(path, mode = "rb")
+    csv = new(File.open(path, mode))
+    if block_given?
+      begin
+        yield csv
+      ensure
+        csv.close
+      end
+    else
+      csv
+    end
+  end
+  def self.read(path)
+    open(path, "rb") { |csv| csv.read }
+  end
+  def self.readlines(path)
+    read(path)
+  end
+  def self.parse(data, &block)
+    csv = new(StringIO.new(data))
+    if block.nil?
+      begin
+        csv.read
+      ensure
+        csv.close
+      end
+    else
+      csv.each(&block)
+    end
+  end
+  def initialize(io)
+    @io = io
+  end
+  def each
+    while row = shift
+      yield row
+    end
+  end
+  def read
+    table = Array.new
+    each {|row| table << row}
+    table
+  end
+  alias_method :readlines, :read
+  def shift
+    if line = @io.gets
+      FastestCSV.parse_line(line)
+    else
+      nil
+    end
+  end
+  alias_method :gets,     :shift
+  alias_method :readline, :shift
+  def close
+    @io.close
+  end
+  def closed?
+    @io.closed?
+  end
+end
+# Core extension: adds a CSV-parsing shortcut to String.
+class String
+  # Equivalent to <tt>FastestCSV.parse_line(self)</tt>; takes no options.
+  def parse_csv
+    FastestCSV.parse_line(self)
+  end
+end

data/test/tc_csv_parsing.rb ADDED Viewed

@@ -0,0 +1,126 @@
+#
+# Tests copied from faster_csv by James Edward Gray II
+#
+require 'test/unit'
+require 'fastest_csv'
+#
+# Following tests are my interpretation of the
+# {CSV RFC}[http://www.ietf.org/rfc/rfc4180.txt].  I only deviate from that
+# document in one place (intentionally) and that is to make the default row
+# separator <tt>$/</tt>.
+#
+# Exercises FastestCSV.parse_line against tables of [input, expected fields]
+# pairs covering quoting, escaped quotes, empty fields and embedded line breaks.
+class TestCSVParsing < Test::Unit::TestCase
+  # Mixed line: plain, padded, empty and quoted fields, including escaped quotes.
+  def test_mastering_regex_example
+    ex = %Q{Ten Thousand,10000, 2710 ,,"10,000","It's ""10 Grand"", baby",10K}
+    assert_equal( [ "Ten Thousand", "10000", " 2710 ", nil, "10,000",
+                    "It's \"10 Grand\", baby", "10K" ],
+                  FastestCSV.parse_line(ex) )
+  end
+  # Pulled from:  http://www.ruby-lang.org/cgi-bin/cvsweb.cgi/ruby/test/csv/test_csv.rb?rev=1.12.2.2;content-type=text%2Fplain
+  # Each entry is [input line, expected fields]; empty unquoted fields are
+  # expected as nil and empty quoted fields as "".
+  def test_std_lib_csv
+    [ ["\t", ["\t"]],
+      ["foo,\"\"\"\"\"\",baz", ["foo", "\"\"", "baz"]],
+      ["foo,\"\"\"bar\"\"\",baz", ["foo", "\"bar\"", "baz"]],
+      ["\"\"\"\n\",\"\"\"\n\"", ["\"\n", "\"\n"]],
+      ["foo,\"\r\n\",baz", ["foo", "\r\n", "baz"]],
+      ["\"\"", [""]],
+      ["foo,\"\"\"\",baz", ["foo", "\"", "baz"]],
+      ["foo,\"\r.\n\",baz", ["foo", "\r.\n", "baz"]],
+      ["foo,\"\r\",baz", ["foo", "\r", "baz"]],
+      ["foo,\"\",baz", ["foo", "", "baz"]],
+      ["\",\"", [","]],
+      ["foo", ["foo"]],
+      [",,", [nil, nil, nil]],
+      [",", [nil, nil]],
+      ["foo,\"\n\",baz", ["foo", "\n", "baz"]],
+      ["foo,,baz", ["foo", nil, "baz"]],
+      ["\"\"\"\r\",\"\"\"\r\"", ["\"\r", "\"\r"]],
+      ["\",\",\",\"", [",", ","]],
+      ["foo,bar,", ["foo", "bar", nil]],
+      [",foo,bar", [nil, "foo", "bar"]],
+      ["foo,bar", ["foo", "bar"]],
+      [";", [";"]],
+      ["\t,\t", ["\t", "\t"]],
+      ["foo,\"\r\n\r\",baz", ["foo", "\r\n\r", "baz"]],
+      ["foo,\"\r\n\n\",baz", ["foo", "\r\n\n", "baz"]],
+      ["foo,\"foo,bar\",baz", ["foo", "foo,bar", "baz"]],
+      [";,;", [";", ";"]] ].each do |csv_test|
+      assert_equal(csv_test.last, FastestCSV.parse_line(csv_test.first))
+    end
+    # Second table: every entry also appears in the table above.
+    [ ["foo,\"\"\"\"\"\",baz", ["foo", "\"\"", "baz"]],
+      ["foo,\"\"\"bar\"\"\",baz", ["foo", "\"bar\"", "baz"]],
+      ["foo,\"\r\n\",baz", ["foo", "\r\n", "baz"]],
+      ["\"\"", [""]],
+      ["foo,\"\"\"\",baz", ["foo", "\"", "baz"]],
+      ["foo,\"\r.\n\",baz", ["foo", "\r.\n", "baz"]],
+      ["foo,\"\r\",baz", ["foo", "\r", "baz"]],
+      ["foo,\"\",baz", ["foo", "", "baz"]],
+      ["foo", ["foo"]],
+      [",,", [nil, nil, nil]],
+      [",", [nil, nil]],
+      ["foo,\"\n\",baz", ["foo", "\n", "baz"]],
+      ["foo,,baz", ["foo", nil, "baz"]],
+      ["foo,bar", ["foo", "bar"]],
+      ["foo,\"\r\n\n\",baz", ["foo", "\r\n\n", "baz"]],
+      ["foo,\"foo,bar\",baz", ["foo", "foo,bar", "baz"]] ].each do |csv_test|
+      assert_equal(csv_test.last, FastestCSV.parse_line(csv_test.first))
+     end
+  end
+  # From:  http://ruby-talk.org/cgi-bin/scat.rb/ruby/ruby-core/6496
+  # Escaped quotes at field edges, trailing empty fields, and line breaks
+  # inside and outside quoted fields.
+  def test_aras_edge_cases
+    [ [%Q{a,b},               ["a", "b"]],
+      [%Q{a,"""b"""},         ["a", "\"b\""]],
+      [%Q{a,"""b"},           ["a", "\"b"]],
+      [%Q{a,"b"""},           ["a", "b\""]],
+      [%Q{a,"\nb"""},         ["a", "\nb\""]],
+      [%Q{a,"""\nb"},         ["a", "\"\nb"]],
+      [%Q{a,"""\nb\n"""},     ["a", "\"\nb\n\""]],
+      [%Q{a,"""\nb\n""",\nc}, ["a", "\"\nb\n\"", nil]],
+      [%Q{a,,,},              ["a", nil, nil, nil]],
+      [%Q{,},                 [nil, nil]],
+      [%Q{"",""},             ["", ""]],
+      [%Q{""""},              ["\""]],
+      [%Q{"""",""},           ["\"",""]],
+      [%Q{,""},               [nil,""]],
+      [%Q{,"\r"},             [nil,"\r"]],
+      [%Q{"\r\n,"},           ["\r\n,"]],
+      [%Q{"\r\n,",},          ["\r\n,", nil]] ].each do |edge_case|
+        assert_equal(edge_case.last, FastestCSV.parse_line(edge_case.first))
+      end
+  end
+  # Empty input and input that starts with a line break.
+  def test_james_edge_cases
+    # A read at eof? should return nil.
+    assert_equal(nil, FastestCSV.parse_line(""))
+    #
+    # With CSV it's impossible to tell an empty line from a line containing a
+    # single +nil+ field.  The standard CSV library returns <tt>[nil]</tt>
+    # in these cases, but <tt>Array.new</tt> makes more sense to me.
+    #
+    #assert_equal(Array.new, FastestCSV.parse_line("\n1,2,3\n"))
+    # FastestCSV is expected to return [nil] here: only the first (empty) line
+    # is parsed.
+    assert_equal([nil], FastestCSV.parse_line("\n1,2,3\n"))
+  end
+  # Quoted fields containing LF and CRLF sequences.
+  def test_rob_edge_cases
+    [ [%Q{"a\nb"},                         ["a\nb"]],
+      [%Q{"\n\n\n"},                       ["\n\n\n"]],
+      [%Q{a,"b\n\nc"},                     ['a', "b\n\nc"]],
+      [%Q{,"\r\n"},                        [nil,"\r\n"]],
+      [%Q{,"\r\n."},                       [nil,"\r\n."]],
+      [%Q{"a\na","one newline"},           ["a\na", 'one newline']],
+      [%Q{"a\n\na","two newlines"},        ["a\n\na", 'two newlines']],
+      [%Q{"a\r\na","one CRLF"},            ["a\r\na", 'one CRLF']],
+      [%Q{"a\r\n\r\na","two CRLFs"},       ["a\r\n\r\na", 'two CRLFs']],
+      [%Q{with blank,"start\n\nfinish"\n}, ['with blank', "start\n\nfinish"]],
+    ].each do |edge_case|
+      assert_equal(edge_case.last, FastestCSV.parse_line(edge_case.first))
+    end
+  end
+end

data/test/tc_interface.rb ADDED Viewed

@@ -0,0 +1,119 @@
+#
+# Tests copied from faster_csv by James Edward Gray II
+#
+require 'test/unit'
+require 'fastest_csv'
+class TestFastestCSVInterface < Test::Unit::TestCase
+  def setup
+    @path = File.join(File.dirname(__FILE__), "temp_test_data.csv")
+    File.open(@path, "w") do |file|
+      file << "1,2,3\r\n"
+      file << "4,5\r\n"
+    end
+    @expected = [%w{1 2 3}, %w{4 5}]
+  end
+  def teardown
+    File.unlink(@path)
+  end
+  ### Test Read Interface ###
+  def test_foreach
+    FastestCSV.foreach(@path) do |row|
+      assert_equal(@expected.shift, row)
+    end
+  end
+  def test_open_and_close
+    csv = FastestCSV.open(@path, "r+")
+    assert_not_nil(csv)
+    assert_instance_of(FastestCSV, csv)
+    assert_equal(false, csv.closed?)
+    csv.close
+    assert(csv.closed?)
+    ret = FastestCSV.open(@path) do |csv|
+      assert_instance_of(FastestCSV, csv)
+      "Return value."
+    end
+    assert(csv.closed?)
+    assert_equal("Return value.", ret)
+  end
+  def test_parse
+    data = File.read(@path)
+    assert_equal( @expected,
+                  FastestCSV.parse(data) )
+    FastestCSV.parse(data) do |row|
+      assert_equal(@expected.shift, row)
+    end
+  end
+  #def test_parse_line
+  #  row = FasterCSV.parse_line("1;2;3", :col_sep => ";")
+  #  assert_not_nil(row)
+  #  assert_instance_of(Array, row)
+  #  assert_equal(%w{1 2 3}, row)
+  #
+  #  # shortcut interface
+  #  row = "1;2;3".parse_csv(:col_sep => ";")
+  #  assert_not_nil(row)
+  #  assert_instance_of(Array, row)
+  #  assert_equal(%w{1 2 3}, row)
+  #end
+  def test_parse_line_with_empty_lines
+    assert_equal(nil,       FastestCSV.parse_line(""))  # to signal eof
+    #assert_equal(Array.new, FastestCSV.parse_line("\n1,2,3"))
+    assert_equal([nil], FastestCSV.parse_line("\n1,2,3"))
+  end
+  def test_read_and_readlines
+    assert_equal( @expected,
+                  FastestCSV.read(@path) )
+    assert_equal( @expected,
+                  FastestCSV.readlines(@path))
+    data = FastestCSV.open(@path) do |csv|
+      csv.read
+    end
+    assert_equal(@expected, data)
+    data = FastestCSV.open(@path) do |csv|
+      csv.readlines
+    end
+    assert_equal(@expected, data)
+  end
+  #def test_table
+  #  table = FastestCSV.table(@path)
+  #  assert_instance_of(FastestCSV::Table, table)
+  #  assert_equal([[:"1", :"2", :"3"], [4, 5, nil]], table.to_a)
+  #end
+  def test_shift  # aliased as gets() and readline()
+    FastestCSV.open(@path, "r+") do |csv|
+      assert_equal(@expected.shift, csv.shift)
+      assert_equal(@expected.shift, csv.shift)
+      assert_equal(nil, csv.shift)
+    end
+  end
+  def test_long_line # ruby's regex parser may have problems with long rows
+    File.unlink(@path)
+    long_field_length = 2800
+    File.open(@path, "w") do |file|
+      file << "1,2,#{'3' * long_field_length}\r\n"
+    end
+    @expected = [%w{1 2} + ['3' * long_field_length]]
+    test_shift
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,61 @@
+--- !ruby/object:Gem::Specification
+name: fastest-csv
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- Maarten Oelering
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-06-28 00:00:00.000000000 Z
+dependencies: []
+description: Fastest standard CSV parser for MRI Ruby
+email:
+- maarten@brightcode.nl
+executables: []
+extensions:
+- ext/csv_parser/extconf.rb
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE
+- README.md
+- Rakefile
+- ext/csv_parser/extconf.rb
+- ext/csv_parser/parser.c
+- fastest-csv.gemspec
+- lib/fastest-csv.rb
+- lib/fastest-csv/version.rb
+- lib/fastest_csv.rb
+- test/tc_csv_parsing.rb
+- test/tc_interface.rb
+homepage: https://github.com/brightcode/fastest-csv
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Fastest standard CSV parser for MRI Ruby
+test_files:
+- test/tc_csv_parsing.rb
+- test/tc_interface.rb