RubyGems - fastcsv - Versions diffs - 0.0.1 - Mend

fastcsv 0.0.1

Files changed (18) hide show

data/ext/fastcsv/fastcsv.rl ADDED Viewed

@@ -0,0 +1,356 @@
+#include <ruby.h>
+#include <ruby/encoding.h>
+// CSV specifications.
+// http://tools.ietf.org/html/rfc4180
+// http://w3c.github.io/csvw/syntax/#ebnf
+// CSV implementation.
+// https://github.com/ruby/ruby/blob/master/lib/csv.rb
+// Ruby C extensions help.
+// https://github.com/ruby/ruby/blob/trunk/README.EXT
+// http://rxr.whitequark.org/mri/source
+// Ragel help.
+// https://www.mail-archive.com/ragel-users@complang.org/
+// Tags the local `field` string with an encoding. Expects `field`,
+// `internal_index` and `external_encoding` to be in scope where it expands.
+// With a valid internal_index (>= 0) the bytes are labelled with that
+// encoding and then transcoded to external_encoding; otherwise they are only
+// labelled as external_encoding, with no conversion.
+# define ASSOCIATE_INDEX \
+  if (internal_index >= 0) { \
+    rb_enc_associate_index(field, internal_index); \
+    field = rb_str_encode(field, rb_enc_from_encoding(external_encoding), 0, Qnil); \
+  } \
+  else { \
+    rb_enc_associate_index(field, rb_enc_to_index(external_encoding)); \
+  }
+static VALUE mModule, rb_eParseError;
+static ID s_read, s_to_str;
+%%{
+  machine fastcsv;
+  action new_line {
+    curline++;
+  }
+  action open_quote {
+    unclosed_line = curline;
+  }
+  action close_quote {
+    unclosed_line = 0;
+  }
+  action read_unquoted {
+    if (p == ts) {
+      // Unquoted empty fields are nil, not "", in Ruby.
+      field = Qnil;
+    }
+    else if (p > ts) {
+      field = rb_str_new(ts, p - ts);
+      ASSOCIATE_INDEX;
+    }
+  }
+  action read_quoted {
+    // Builds `field` from the bytes in [ts, p). A quote character is dropped
+    // unless the byte just before it was itself a dropped quote, so a doubled
+    // quote collapses to a single one.
+    // NOTE(review): ts presumably points at the opening quote, which is
+    // dropped by the same rule - confirm against the generated scanner.
+    if (p == ts) {
+      field = rb_str_new2("");
+      ASSOCIATE_INDEX;
+    }
+    // @note If we add an action on '""', we can skip some steps if no '""' is found.
+    else if (p > ts) {
+      // Operating on ts in-place produces odd behavior, FYI.
+      // The scratch buffer needs no initial memcpy: every byte that ends up in
+      // the field is written by the loop below.
+      char *copy = ALLOC_N(char, p - ts);
+      char *reader = ts, *writer = copy;
+      int escaped = 0;
+      while (p > reader) {
+        if (*reader == quote_char && !escaped) {
+          // Skip the escaping character.
+          escaped = 1;
+        }
+        else {
+          escaped = 0;
+          *writer++ = *reader;
+        }
+        reader++;
+      }
+      field = rb_str_new(copy, writer - copy);
+      // rb_str_new copied the bytes, so release the scratch buffer before
+      // ASSOCIATE_INDEX, whose rb_str_encode call may raise. Memory obtained
+      // with ALLOC_N must be released with xfree, not free.
+      xfree(copy);
+      ASSOCIATE_INDEX;
+    }
+  }
+  action new_field {
+    rb_ary_push(row, field);
+    field = Qnil;
+  }
+  action new_row {
+    // Closes the current row: the pending field is appended unless it is nil
+    // and the row is still empty, then the row is yielded to the caller's
+    // block and a fresh array is started for the next row.
+    if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field
+      rb_ary_push(row, field);
+      field = Qnil;
+    }
+    rb_yield(row);
+    row = rb_ary_new();
+  }
+  action last_row {
+    // Runs on entering EOF (the 0 sentinel byte): appends the pending field
+    // under the same condition as new_row, but yields only when the row ended
+    // up non-empty.
+    if (!NIL_P(field) || RARRAY_LEN(row)) {
+      rb_ary_push(row, field);
+    }
+    if (RARRAY_LEN(row)) {
+      rb_yield(row);
+    }
+  }
+  EOF = 0 >last_row;
+  quote_char = '"';
+  col_sep = ',' >new_field;
+  row_sep = ('\r' '\n'? | '\n') @new_line;
+  unquoted = (any* -- quote_char -- col_sep -- row_sep - EOF) %read_unquoted;
+  quoted = quote_char >open_quote (any - quote_char - EOF | quote_char quote_char | row_sep)* %read_quoted quote_char >close_quote;
+  field = unquoted | quoted;
+  # fields = (field col_sep)* field?;
+  # file = (fields row_sep >new_row)* fields?;
+  # @see Ragel Guide: 6.3 Scanners
+  # Remember that an unquoted field can be zero-length.
+  main := |*
+    field col_sep EOF?;
+    field row_sep >new_row EOF?;
+    field EOF;
+  *|;
+  # Non-scanner version requires very large buffer.
+  # main := file $/{
+  #   if (!NIL_P(field) || RARRAY_LEN(row)) {
+  #     rb_ary_push(row, field);
+  #     rb_yield(row);
+  #   }
+  # };
+}%%
+%% write data;
+#define BUFSIZE 16384
+// Parses CSV from a data source and hands each row to the caller's block
+// (rows are yielded by the scanner's new_row and last_row actions).
+//
+// Arguments (via argc/argv): a required data source that responds to #read
+// (consumed in chunks through a growable buffer) or to #to_str (scanned in a
+// single pass), and an optional options Hash or nil. The only option read
+// here is :encoding, a String of the form "name" or "name:name", handled
+// like parse_mode_enc in Ruby's io.c.
+//
+// The initial buffer size is BUFSIZE unless the receiver has a non-nil
+// @buffer_size instance variable.
+//
+// Returns nil. Raises ArgumentError for an unusable data source, a non-Hash
+// options argument or a non-String :encoding, and the ParseError class when
+// all input has been consumed and the scanner state is below
+// fastcsv_first_final.
+VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
+  int cs, act, have = 0, curline = 1, io = 0;
+  char *ts = 0, *te = 0, *buf = 0, *eof = 0;
+  VALUE port, opts;
+  VALUE row = rb_ary_new(), field = Qnil, bufsize = Qnil;
+  int done = 0, unclosed_line = 0, buffer_size = 0, taint = 0;
+  int internal_index = 0, external_index = rb_enc_to_index(rb_default_external_encoding());
+  rb_encoding *external_encoding = rb_default_external_encoding();
+  VALUE option;
+  char quote_char = '"'; //, *col_sep = ",", *row_sep = "\r\n";
+  rb_scan_args(argc, argv, "11", &port, &opts);
+  taint = OBJ_TAINTED(port);
+  // Prefer streaming through #read; otherwise fall back to a String obtained
+  // through #to_str.
+  io = rb_respond_to(port, s_read);
+  if (!io) {
+    if (rb_respond_to(port, s_to_str)) {
+      port = rb_funcall(port, s_to_str, 0);
+      StringValue(port);
+    }
+    else {
+      rb_raise(rb_eArgError, "data has to respond to #read or #to_str");
+    }
+  }
+  if (NIL_P(opts)) {
+    opts = rb_hash_new();
+  }
+  else if (TYPE(opts) != T_HASH) {
+    rb_raise(rb_eArgError, "options has to be a Hash or nil");
+  }
+  // @note Add machines for common CSV dialects, or see if we can use "when"
+  // from Chapter 6 to compare the character to the host program's variable.
+  // option = rb_hash_aref(opts, ID2SYM(rb_intern("quote_char")));
+  // if (TYPE(option) == T_STRING && RSTRING_LEN(option) == 1) {
+  //   quote_char = *StringValueCStr(option);
+  // }
+  // else if (!NIL_P(option)) {
+  //   rb_raise(rb_eArgError, ":quote_char has to be a single character String");
+  // }
+  // option = rb_hash_aref(opts, ID2SYM(rb_intern("col_sep")));
+  // if (TYPE(option) == T_STRING) {
+  //   col_sep = StringValueCStr(option);
+  // }
+  // else if (!NIL_P(option)) {
+  //   rb_raise(rb_eArgError, ":col_sep has to be a String");
+  // }
+  // option = rb_hash_aref(opts, ID2SYM(rb_intern("row_sep")));
+  // if (TYPE(option) == T_STRING) {
+  //   row_sep = StringValueCStr(option);
+  // }
+  // else if (!NIL_P(option)) {
+  //   rb_raise(rb_eArgError, ":row_sep has to be a String");
+  // }
+  option = rb_hash_aref(opts, ID2SYM(rb_intern("encoding")));
+  if (TYPE(option) == T_STRING) {
+    // @see parse_mode_enc in Ruby's io.c
+    // The name before the last ':' (or the whole string when there is no
+    // ':') selects internal_index; the name after it selects the encoding
+    // that fields are transcoded to.
+    const char *string = StringValueCStr(option), *pointer;
+    char internal_encoding_name[ENCODING_MAXNAMELEN + 1];
+    pointer = strrchr(string, ':');
+    if (pointer) {
+      long len = (pointer++) - string;
+      if (len == 0 || len > ENCODING_MAXNAMELEN) {
+        internal_index = -1;
+      }
+      else {
+        memcpy(internal_encoding_name, string, len);
+        internal_encoding_name[len] = '\0';
+        string = internal_encoding_name;
+        internal_index = rb_enc_find_index(internal_encoding_name);
+      }
+    }
+    else {
+      internal_index = rb_enc_find_index(string);
+    }
+    if (internal_index < 0 && internal_index != -2) {
+      rb_warn("Unsupported encoding %s ignored", string);
+    }
+    if (pointer) {
+      external_index = rb_enc_find_index(pointer);
+      if (external_index >= 0) {
+        external_encoding = rb_enc_from_index(external_index);
+      }
+      else {
+        // Report the name that failed to resolve (the part after ':'), not
+        // the first encoding name.
+        rb_warn("Unsupported encoding %s ignored", pointer);
+      }
+    }
+    else if (internal_index >= 0) {
+      external_encoding = rb_enc_from_index(internal_index);
+    }
+  }
+  else if (!NIL_P(option)) {
+    rb_raise(rb_eArgError, ":encoding has to be a String");
+  }
+  buffer_size = BUFSIZE;
+  if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
+    bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
+    if (!NIL_P(bufsize)) {
+      buffer_size = NUM2INT(bufsize);
+    }
+  }
+  // Only the streaming path needs its own buffer; a String is scanned in
+  // place.
+  if (io) {
+    buf = ALLOC_N(char, buffer_size);
+  }
+  %% write init;
+  while (!done) {
+    VALUE str;
+    char *p, *pe;
+    int len, space = buffer_size - have, tokstart_diff, tokend_diff;
+    if (io) {
+      if (space == 0) {
+         // The buffer is full of an unfinished token: grow it by BUFSIZE and
+         // re-point ts/te into the (possibly moved) allocation.
+         tokstart_diff = ts - buf;
+         tokend_diff = te - buf;
+         buffer_size += BUFSIZE;
+         REALLOC_N(buf, char, buffer_size);
+         space = buffer_size - have;
+         ts = buf + tokstart_diff;
+         te = buf + tokend_diff;
+      }
+      p = buf + have;
+      str = rb_funcall(port, s_read, 1, INT2FIX(space));
+      if (NIL_P(str)) {
+        // StringIO#read returns nil for empty string.
+        len = 0;
+      }
+      else {
+        len = RSTRING_LEN(str);
+        memcpy(p, StringValuePtr(str), len);
+      }
+      if (len < space) {
+        // EOF actions don't work in scanners, so we add a sentinel value.
+        // @see http://www.complang.org/pipermail/ragel-users/2007-May/001516.html
+        // @see https://github.com/leeonix/lua-csv-ragel/blob/master/src/csv.rl
+        p[len++] = 0;
+        done = 1;
+      }
+    }
+    else {
+      p = RSTRING_PTR(port);
+      len = RSTRING_LEN(port);
+      p[len++] = 0;
+      done = 1;
+    }
+    pe = p + len;
+    // if (done) {
+    //   // This triggers the eof action in the non-scanner version.
+    //   eof = pe;
+    // }
+    %% write exec;
+    if (done && cs < fastcsv_first_final) {
+      // buf came from ALLOC_N/REALLOC_N, so it must be released with xfree.
+      if (buf != NULL) {
+        xfree(buf);
+      }
+      if (unclosed_line) {
+        rb_raise(rb_eParseError, "Unclosed quoted field on line %d.", unclosed_line);
+      }
+      // Ruby raises different errors for illegal quoting, depending on whether
+      // a quoted string is followed by a string ("Unclosed quoted field on line
+      // %d.") or by a string ending in a quote ("Missing or stray quote in line
+      // %d"). These precisions are kind of bogus, but we can try using $!.
+      else {
+        rb_raise(rb_eParseError, "Illegal quoting in line %d.", curline);
+      }
+    }
+    if (ts == 0) {
+      have = 0;
+    }
+    else if (io) {
+      // Keep the unfinished token: slide it to the front of the buffer so the
+      // next read appends right after it.
+      have = pe - ts;
+      memmove(buf, ts, have);
+      te = buf + (te - ts);
+      ts = buf;
+    }
+  }
+  if (buf != NULL) {
+    xfree(buf);
+  }
+  return Qnil;
+}
+// Extension entry point: defines the FastCSV module with its ParseError
+// class, a readable and writable buffer_size attribute on the module's
+// singleton class and the raw_parse singleton method, then caches the
+// method IDs that fastcsv() uses to probe the data source.
+void Init_fastcsv() {
+  mModule = rb_define_module("FastCSV");
+  rb_eParseError = rb_define_class_under(mModule, "ParseError", rb_eStandardError);
+  rb_define_attr(rb_singleton_class(mModule), "buffer_size", 1, 1);
+  rb_define_singleton_method(mModule, "raw_parse", fastcsv, -1);
+  s_to_str = rb_intern("to_str");
+  s_read = rb_intern("read");
+}

data/fastcsv.gemspec ADDED Viewed

@@ -0,0 +1,24 @@
+# -*- encoding: utf-8 -*-
+# Gem specification for fastcsv 0.0.1.
+Gem::Specification.new do |s|
+  s.name        = "fastcsv"
+  s.version     = '0.0.1'
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Open North"]
+  s.email       = ["info@opennorth.ca"]
+  s.homepage    = "http://github.com/opennorth/fastcsv"
+  s.summary     = %q{A fast Ragel-based CSV parser}
+  s.license     = 'MIT'
+  # The file lists below are taken from `git ls-files`, so the gemspec has to
+  # be evaluated inside a git checkout of the project.
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  # Build script for the native extension.
+  s.extensions    = ["ext/fastcsv/extconf.rb"]
+  # Development-only dependencies.
+  s.add_development_dependency('coveralls')
+  s.add_development_dependency('json', '~> 1.7.7') # to silence coveralls warning
+  s.add_development_dependency('rake')
+  s.add_development_dependency('rake-compiler')
+  s.add_development_dependency('rspec', '~> 3.1')
+end

data/lib/fastcsv.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ require 'fastcsv/fastcsv'

data/spec/fastcsv_spec.rb ADDED Viewed

@@ -0,0 +1,218 @@
+require 'spec_helper'
+require 'csv'
+RSpec.shared_examples 'a CSV parser' do
+  let :simple do
+    "foo\nbar\nbaz"
+  end
+  [
+    # Single tokens.
+    "",
+    "x",
+    %(""),
+    %("x"),
+    ",",
+    "\n",
+    # Last tokens.
+    "x,y",
+    %(x,"y"),
+    "x,",
+    "x\n",
+    # Line endings.
+    "\n\n\n",
+    "\r\r\r",
+    "\r\n\r\n\r\n",
+    "foo\rbar\rbaz\r",
+    "foo\nbar\nbaz\n",
+    "foo\r\nbar\r\nbaz\r\n",
+    # Repetition.
+    "x,x,x",
+    "x\nx\nx",
+    %("x","x","x"),
+    %("x"\n"x"\n"x"),
+    ",,,",
+    ",\n,\n,",
+    # Blank.
+    %(,""),
+    %("",),
+    "\n\n\nfoo\n\n\n",
+    # Whitespace.
+    "   x",
+    "x   ",
+    "   x   ",
+    # Tab.
+    " x",
+    "x  ",
+    "  x  ",
+    # Quoting.
+    %(foo,"bar,baz",bzz),
+    %(foo,"bar\nbaz",bzz),
+    %(foo,"""bar""baz""bzz""",zzz),
+    # Buffers.
+    "01234567890" * 2_000, # 20,000 > BUFSIZE
+    "0123456789," * 2_000,
+    # Uneven rows.
+    "1,2,3\n1,2",
+    "1,2\n1,2,3",
+    # Uneven data types.
+    "2000-01-01,2,x\nx,2000-01-01,2",
+  ].each do |csv|
+    it "should parse: #{csv}" do
+      expect(parse(csv)).to eq(CSV.parse(csv))
+    end
+  end
+  [
+    # Whitespace.
+    # @note Ruby's CSV library has inexplicably inconsistent error messages for
+    #   the same class of error.
+    [%(   "x"), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
+    [%("x"   ), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
+    [%(   "x"   ), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
+    # Tab.
+    [%(	"x"), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
+    [%("x"	), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
+    [%(	"x"	), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
+    # Quoted next to unquoted.
+    [%("x"x), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
+    [%(x"x"), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
+    [%(x"x"x), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
+    [%("x"x"x"), 'Missing or stray quote in line %d', 'Illegal quoting in line %d.'],
+    # Unclosed quote.
+    [%("x), 'Unclosed quoted field on line %d.', 'Unclosed quoted field on line %d.'],
+    # Quote in unquoted field.
+    [%(x"x), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
+    # Unescaped quote in quoted field.
+    [%("x"x"), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
+  ].each do |csv,csv_error,fastcsv_error|
+    it "should raise an error on: #{csv.inspect.gsub('\"', '"')}" do
+      expect{CSV.parse(csv)}.to raise_error(CSV::MalformedCSVError, csv_error % 1)
+      expect{parse(csv)}.to raise_error(FastCSV::ParseError, fastcsv_error % 1)
+    end
+    it "should raise an error with the correct line number on: #{"\n#{csv}\n".inspect.gsub('\"', '"')}" do
+      csv = "\n#{csv}\n"
+      expect{CSV.parse(csv)}.to raise_error(CSV::MalformedCSVError, csv_error % 2)
+      expect{parse(csv)}.to raise_error(FastCSV::ParseError, fastcsv_error % 2)
+    end
+  end
+  it 'should raise an error on mixed row separators are' do
+    csv = "foo\rbar\nbaz\r\n"
+    expect{CSV.parse(csv)}.to raise_error(CSV::MalformedCSVError, 'Unquoted fields do not allow \r or \n (line 2).')
+    skip
+  end
+  it 'should raise an error if no block is given' do
+    expect{parse_without_block('x')}.to raise_error(LocalJumpError, 'no block given')
+  end
+  it 'should not raise an error if no block and empty input' do
+    expect{parse_without_block('')}.to_not raise_error
+  end
+  it 'should raise an error if the options are not a Hash or nil' do
+    expect{parse('', '')}.to raise_error(ArgumentError, 'options has to be a Hash or nil')
+  end
+  it 'should allow nil buffer size' do
+    FastCSV.buffer_size = nil
+    expect(parse(simple)).to eq(CSV.parse(simple))
+    FastCSV.buffer_size = nil
+  end
+  it 'should recover from a zero buffer size' do
+    FastCSV.buffer_size = 0
+    expect(parse(simple)).to eq(CSV.parse(simple))
+    FastCSV.buffer_size = nil
+  end
+end
+RSpec.describe FastCSV do
+  context "with String" do
+    def parse(csv, options = nil)
+      rows = []
+      FastCSV.raw_parse(csv, options){|row| rows << row}
+      rows
+    end
+    def parse_without_block(csv, options = nil)
+      FastCSV.raw_parse(csv, options)
+    end
+    include_examples 'a CSV parser'
+    it 'should not raise an error on negative buffer size' do
+      FastCSV.buffer_size = -1
+      expect{parse(simple)}.to_not raise_error
+      FastCSV.buffer_size = nil
+    end
+  end
+  context "with StringIO" do
+    def parse(csv, options = nil)
+      rows = []
+      FastCSV.raw_parse(StringIO.new(csv), options){|row| rows << row}
+      rows
+    end
+    def parse_without_block(csv, options = nil)
+      FastCSV.raw_parse(StringIO.new(csv), options)
+    end
+    include_examples 'a CSV parser'
+    it 'should raise an error on negative buffer size' do
+      FastCSV.buffer_size = -1
+      expect{parse(simple)}.to raise_error(NoMemoryError)
+      FastCSV.buffer_size = nil
+    end
+  end
+  def parse_with_encoding(basename, encoding)
+    filename = File.expand_path(File.join('..', 'fixtures', basename), __FILE__)
+    options = {encoding: encoding}
+    File.open(filename) do |io|
+      rows = []
+      FastCSV.raw_parse(io, options){|row| rows << row}
+      expected = CSV.read(filename, options)
+      expect(rows).to eq(expected)
+      expect(rows[0][0].encoding).to eq(expected[0][0].encoding)
+    end
+  end
+  it 'should encode the input' do
+    parse_with_encoding('iso-8859-1.csv', 'iso-8859-1')
+  end
+  it 'should encode the input with a blank internal encoding' do
+    parse_with_encoding('utf-8.csv', ':utf-8')
+  end
+  it 'should transcode the input' do
+    parse_with_encoding('iso-8859-1.csv', 'iso-8859-1:utf-8')
+  end
+  it 'should invalid encoding' do
+    parse_with_encoding('utf-8.csv', 'invalid')
+  end
+  it 'should raise an error if the input is not a String or IO' do
+    expect{FastCSV.raw_parse(nil)}.to raise_error(ArgumentError, 'data has to respond to #read or #to_str')
+  end
+end