RubyGems - csvreader - Versions diffs - 0.5.0 → 0.6.0 - Mend

csvreader 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

checksums.yaml +4 -4
data/Manifest.txt +7 -1
data/lib/csvreader.rb +12 -2
data/lib/csvreader/buffer.rb +8 -3
data/lib/csvreader/parser.rb +47 -336
data/lib/csvreader/parser_std.rb +255 -0
data/lib/csvreader/parser_strict.rb +269 -0
data/lib/csvreader/parser_tab.rb +57 -0
data/lib/csvreader/reader.rb +40 -100
data/lib/csvreader/reader_hash.rb +88 -0
data/lib/csvreader/version.rb +1 -1
data/test/helper.rb +4 -0
data/test/test_parser.rb +0 -3
data/test/test_parser_formats.rb +8 -11
data/test/test_parser_java.rb +219 -0
data/test/{test_parser_rfc4180.rb → test_parser_strict.rb} +17 -20
data/test/test_parser_tab.rb +48 -0
data/test/test_reader.rb +15 -16
metadata +9 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ea1d667219773e3a355c81f815d91e92340d61a1
-  data.tar.gz: ba7a43ccb5e110fc1f6eca76ca2a74a62f1131fb
+  metadata.gz: c87e1cac5f0988f4423a0c5aaf96d2a625bf4d60
+  data.tar.gz: 5af8f5875ac0e18ade4cc793ba8ad658f905d1df
 SHA512:
-  metadata.gz: 0543a4338d2d12e36da16acdad9abff28633e519baa1d92044d1ca8f5e3472d835d00a10d8b19c24561b06e0d724f87414495600f4c83eef7c9e033474b4c09e
-  data.tar.gz: 8df669bc86f2066b2650a67bda5698fae7b6d58766b9c318f47958b0499671d0a4d39e862b8d3af842105a2777a3cf7ad05168380c338a3053fd3d363697abfb
+  metadata.gz: c9528101aa8a2db3a8e0dfb3685e6d15fcd262a76ed16f69b34ca9d54003e772f9441eb1673e11886ee14ac3347a99c22bd06662a8191214189f5c57f0ecfe7b
+  data.tar.gz: acc9ada28d539dbc7ce1a2178e904ba247f511327f5828eebfdda78b21b263ca5d153d8fc234d7483cb60229f764094bc3c2fbeefa2381335d4e21a30487a828

data/Manifest.txt CHANGED Viewed

@@ -6,7 +6,11 @@ Rakefile
 lib/csvreader.rb
 lib/csvreader/buffer.rb
 lib/csvreader/parser.rb
+lib/csvreader/parser_std.rb
+lib/csvreader/parser_strict.rb
+lib/csvreader/parser_tab.rb
 lib/csvreader/reader.rb
+lib/csvreader/reader_hash.rb
 lib/csvreader/version.rb
 test/data/beer.csv
 test/data/beer11.csv
@@ -14,6 +18,8 @@ test/data/shakespeare.csv
 test/helper.rb
 test/test_parser.rb
 test/test_parser_formats.rb
-test/test_parser_rfc4180.rb
+test/test_parser_java.rb
+test/test_parser_strict.rb
+test/test_parser_tab.rb
 test/test_reader.rb
 test/test_reader_hash.rb

data/lib/csvreader.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 # encoding: utf-8
-require 'csv'
-require 'json'
 require 'pp'
 require 'logger'
@@ -10,8 +9,19 @@ require 'logger'
 # our own code
 require 'csvreader/version' # let version always go first
 require 'csvreader/buffer'
+require 'csvreader/parser_std'      # best practices pre-configured out-of-the-box
+require 'csvreader/parser_strict'   # flexible (strict - no leading/trailing space triming, blanks, etc.), configure for different formats/dialects
+require 'csvreader/parser_tab'
 require 'csvreader/parser'
 require 'csvreader/reader'
+require 'csvreader/reader_hash'
+## add convenience / shortcut alias
+Csv     = CsvReader
+CsvHash = CsvHashReader
 puts CsvReader.banner   # say hello

data/lib/csvreader/buffer.rb CHANGED Viewed

@@ -1,7 +1,12 @@
 # encoding: utf-8
 class CsvReader
-class BufferIO   ## todo: find a better name - why? why not? is really just for reading (keep io?)
+class Buffer   ## todo: find a better name:
+               ##   BufferedReader
+               ##   BufferedInput
+               ##   BufferI
+               ## - why? why not? is really just for reading (keep io?)
   def initialize( data )
     # create the IO object we will read from
     @io = data.is_a?(String) ? StringIO.new(data) : data
@@ -20,7 +25,7 @@ class BufferIO   ## todo: find a better name - why? why not? is really just for
   def peek
     if @buf.size == 0 && @io.eof?
-      puts "peek - hitting eof!!!"
+      ## puts "peek - hitting eof!!!"
       return  "\0"   ## return NUL char (0) for now
     end
@@ -33,5 +38,5 @@ class BufferIO   ## todo: find a better name - why? why not? is really just for
     @buf.first
   end # method peek
-end # class BufferIO
+end # class Buffer
 end # class CsvReader

data/lib/csvreader/parser.rb CHANGED Viewed

@@ -2,363 +2,74 @@
 class CsvReader
 class Parser
+## use/allow different "backends" e.g. ParserStd, ParserStrict, ParserTab, etc.
+##   parser must support parse method (with and without block)
+##    e.g.  records = parse( data )
+##             -or-
+##          parse( data ) do |record|
+##          end
-## char constants
-DOUBLE_QUOTE = "\""
-BACKSLASH    = "\\"    ## use BACKSLASH_ESCAPE ??
-COMMENT      = "#"      ## use COMMENT_HASH or HASH or ??
-SPACE        = " "      ##   \s == ASCII 32 (dec)            =    (Space)
-TAB          = "\t"     ##   \t == ASCII 0x09 (hex)          = HT (Tab/horizontal tab)
-LF	         = "\n"     ##   \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
-CR	         = "\r"     ##   \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
-###################################
-## add simple logger with debug flag/switch
-#
-#  use Parser.debug = true   # to turn on
-#
-#  todo/fix: use logutils instead of std logger - why? why not?
-def self.logger() @@logger ||= Logger.new( STDOUT ); end
-def logger()  self.class.logger; end
-attr_reader :config   ## todo/fix: change config to proper dialect class/struct - why? why not?
-def initialize( sep:         ',',
-                quote:       DOUBLE_QUOTE, ## note: set to nil for no quote
-                doublequote: true,
-                escape:      BACKSLASH,   ## note: set to nil for no escapes
-                trim:        true,   ## note: will toggle between human/default and strict mode parser!!!
-                na:          ['\N', 'NA'],  ## note: set to nil for no null vales / not availabe (na)
-                quoted_empty:   '',   ## note: only available in strict mode (e.g. trim=false)
-                unquoted_empty: ''    ## note: only available in strict mode (e.g. trim=false)
-               )
-  @config = {}   ## todo/fix: change config to proper dialect class/struct - why? why not?
-  @config[:sep]          = sep
-  @config[:quote]        = quote
-  @config[:doublequote]  = doublequote
-  @config[:escape]  = escape
-  @config[:trim]         = trim
-  @config[:na]     = na
-  @config[:quoted_empty] = quoted_empty
-  @config[:unquoted_empty] = unquoted_empty
-end
-def strict?
-  ## note:  use trim for separating two different parsers / code paths:
-  ##   - human with trim leading and trailing whitespace and
-  ##   - strict with no leading and trailing whitespaces allowed
-  ## for now use - trim == false for strict version flag alias
-  ##   todo/fix: add strict flag - why? why not?
-  @config[:trim] ? false : true
-end
-DEFAULT = new( sep: ',', trim: true )
-RFC4180 = new( sep: ',', trim: false )
-EXCEL   = new( sep: ',', trim: false )
-def self.default()  DEFAULT; end    ## alternative alias for DEFAULT
-def self.rfc4180()  RFC4180; end    ## alternative alias for RFC4180
-def self.excel()    EXCEL; end      ## alternative alias for EXCEL
-def parse_field( io, sep: )
-  logger.debug "parse field - sep: >#{sep}< (#{sep.ord})"  if logger.debug?
-  value = ""
-  skip_spaces( io )   ## strip leading spaces
-  if (c=io.peek; c=="," || c==LF || c==CR || io.eof?) ## empty field
-     ## return value; do nothing
-  elsif io.peek == DOUBLE_QUOTE
-    logger.debug "start double_quote field - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-    io.getc  ## eat-up double_quote
-    loop do
-      while (c=io.peek; !(c==DOUBLE_QUOTE || io.eof?))
-        value << io.getc   ## eat-up everything unit quote (")
-      end
+DEFAULT = ParserStd.new
-      break if io.eof?
+RFC4180 = ParserStrict.new
+STRICT  = ParserStrict.new  ## note: make strict its own instance (so you can change config without "breaking" rfc4180)
+EXCEL   = ParserStrict.new   ## note: make excel its own instance (so you can change configs without "breaking" rfc4180/strict)
-      io.getc ## eat-up double_quote
+MYSQL   = ParserStrict.new( sep: "\t",
+                            quote: false,
+                            escape: true,
+                            null: "\\N" )
-      if io.peek == DOUBLE_QUOTE  ## doubled up quote?
-        value << io.getc   ## add doube quote and continue!!!!
-      else
-        break
-      end
-    end
+POSTGRES = POSTGRESQL = ParserStrict.new( doublequote: false,
+                                          escape: true,
+                                          unquoted_empty_null: true )
-    ## note: always eat-up all trailing spaces (" ") and tabs (\t)
-    skip_spaces( io )
-    logger.debug "end double_quote field - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-  else
-    logger.debug "start reg field - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-    ## consume simple value
-    ##   until we hit "," or "\n" or "\r"
-    ##    note: will eat-up quotes too!!!
-    while (c=io.peek; !(c=="," || c==LF || c==CR || io.eof?))
-      logger.debug "  add char >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-      value << io.getc   ## eat-up all spaces (" ") and tabs (\t)
-    end
-    value = value.strip   ## strip all trailing spaces
-    logger.debug "end reg field - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-  end
-  value
-end
-def parse_field_strict( io, sep: )
-  logger.debug "parse field (strict) - sep: >#{sep}< (#{sep.ord})"  if logger.debug?
-  value = ""
-  if (c=io.peek; c==sep || c==LF || c==CR || io.eof?) ## empty unquoted field
-     value = config[:unquoted_empty]   ## defaults to "" (might be set to nil if needed)
-     ## return value; do nothing
-  elsif config[:quote] && io.peek == config[:quote]
-    logger.debug "start quote field (strict) - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-    io.getc  ## eat-up double_quote
-    loop do
-      while (c=io.peek; !(c==config[:quote] || io.eof?))
-        value << io.getc   ## eat-up everything unit quote (")
-      end
-      break if io.eof?
-      io.getc ## eat-up double_quote
-      if config[:doublequote] && io.peek == config[:quote]  ## doubled up quote?
-        value << io.getc   ## add doube quote and continue!!!!
-      else
-        break
-      end
-    end
-    value = config[:quoted_empty]  if value == ""   ## defaults to "" (might be set to nil if needed)
-    logger.debug "end double_quote field (strict) - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-  else
-    logger.debug "start reg field (strict) - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-    ## consume simple value
-    ##   until we hit "," or "\n" or "\r" or stroy "\"" double quote
-    while (c=io.peek; !(c==sep || c==LF || c==CR || c==config[:quote] || io.eof?))
-      logger.debug "  add char >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-      value << io.getc
-    end
-    logger.debug "end reg field (strict) - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-  end
-  value
-end
-def parse_record( io, sep: )
-  values = []
-  loop do
-     value = parse_field( io, sep: sep )
-     logger.debug "value: »#{value}«"  if logger.debug?
-     values << value
-     if io.eof?
-        break
-     elsif (c=io.peek; c==LF || c==CR)
-       skip_newlines( io )
-       break
-     elsif io.peek == ","
-       io.getc   ## eat-up FS(,)
-     else
-       puts "*** csv parse error: found >#{io.peek} (#{io.peek.ord})< - FS (,) or RS (\\n) expected!!!!"
-       exit(1)
-     end
-  end
-  values
-end
+POSTGRES_TEXT = POSTGRESQL_TEXT = ParserStrict.new( sep: "\t",
+                                                    quote: false,
+                                                    escape: true,
+                                                    null: "\\N" )
+TAB     = ParserTab.new
-def parse_record_strict( io, sep: )
-  values = []
+def self.default()         DEFAULT;         end ## alternative alias for DEFAULT
+def self.strict()          STRICT;          end ## alternative alias for STRICT
+def self.rfc4180()         RFC4180;         end ## alternative alias for RFC4180
+def self.excel()           EXCEL;           end ## alternative alias for EXCEL
+def self.mysql()           MYSQL;           end
+def self.postgresql()      POSTGRESQL;      end
+def self.postgres()        postgresql;      end
+def self.postgresql_text() POSTGRESQL_TEXT; end
+def self.postgres_text()   postgresql_text; end
+def self.tab()             TAB;             end
-  loop do
-     value = parse_field_strict( io, sep: sep )
-     logger.debug "value: »#{value}«"  if logger.debug?
-     values << value
-     if io.eof?
-        break
-     elsif (c=io.peek; c==LF || c==CR)
-       skip_newline( io )   ## note: singular / single newline only (NOT plural)
-       break
-     elsif io.peek == sep
-       io.getc   ## eat-up FS (,)
-     else
-       puts "*** csv parse error (strict): found >#{io.peek} (#{io.peek.ord})< - FS (,) or RS (\\n) expected!!!!"
-       exit(1)
-     end
-  end
-  values
-end
-def skip_newlines( io )
-  return if io.eof?
+end # class Parser
-  while (c=io.peek; c==LF || c==CR)
-    io.getc    ## eat-up all \n and \r
-  end
-end
-def skip_newline( io )    ## note: singular (strict) version
-  return if io.eof?
+####################################
+# define errors / exceptions
+#   for all parsers for (re)use
-  ## only skip CR LF or LF or CR
-  if io.peek == CR
-    io.getc ## eat-up
-    io.getc  if io.peek == LF
-  elsif io.peek == LF
-    io.getc ## eat-up
-  else
-    # do nothing
-  end
+class Error < StandardError
 end
+####
+# todo/check:
+#  use "common" error class - why? why not?
+class ParseError < Error
+  attr_reader :message
-def skip_until_eol( io )
-  return if io.eof?
-  while (c=io.peek; !(c==LF || c==CR || io.eof?))
-    io.getc    ## eat-up all until end of line
+  def initialize( message )
+    @message = message
   end
-end
-def skip_spaces( io )
-  return if io.eof?
-  while (c=io.peek; c==SPACE || c==TAB)
-    io.getc   ## note: always eat-up all spaces (" ") and tabs (\t)
+  def to_s
+    "*** csv parse error: #{@message}"
   end
-end
-def parse_lines_human( io, sep:, &block )
-  loop do
-    break if io.eof?
-    skip_spaces( io )
-    if io.peek == COMMENT        ## comment line
-      logger.debug "skipping comment - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-      skip_until_eol( io )
-      skip_newlines( io )
-    elsif (c=io.peek; c==LF || c==CR || io.eof?)
-      logger.debug "skipping blank - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-      skip_newlines( io )
-    else
-      logger.debug "start record - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-      record = parse_record( io, sep: sep )
-      ## note: requires block - enforce? how? why? why not?
-      block.call( record )   ## yield( record )
-    end
-  end  # loop
-end # method parse_lines_human
-def parse_lines_strict( io, sep:, &block )
-  ## no leading and trailing whitespaces trimmed/stripped
-  ## no comments skipped
-  ## no blanks skipped
-  ## - follows strict rules of
-  ##  note: this csv format is NOT recommended;
-  ##    please, use a format with comments, leading and trailing whitespaces, etc.
-  ##    only added for checking compatibility
-  loop do
-    break if io.eof?
-    logger.debug "start record (strict) - peek >#{io.peek}< (#{io.peek.ord})"  if logger.debug?
-    record = parse_record_strict( io, sep: sep )
-    ## note: requires block - enforce? how? why? why not?
-    block.call( record )   ## yield( record )
-  end  # loop
-end # method parse_lines_strict
-def parse_lines( io_maybe, sep: config[:sep], &block )
-  ## find a better name for io_maybe
-  ##   make sure io is a wrapped into BufferIO!!!!!!
-  if io_maybe.is_a?( BufferIO )    ### allow (re)use of BufferIO if managed from "outside"
-    io = io_maybe
-  else
-    io = BufferIO.new( io_maybe )
-  end
-  if strict?
-    parse_lines_strict( io, sep: sep, &block )
-  else
-    parse_lines_human( io, sep: sep, &block )
-  end
-end  ## parse_lines
-##   fix: add optional block  - lets you use it like foreach!!!
-##    make foreach an alias of parse with block - why? why not?
-##
-##   unifiy with (make one) parse and parse_lines!!!! - why? why not?
-def parse( io_maybe, sep: config[:sep], limit: nil )
-  records = []
-  parse_lines( io_maybe, sep: sep  ) do |record|
-    records << record
-    ## set limit to 1 for processing "single" line (that is, get one record)
-    break  if limit && limit >= records.size
-  end
-  records
-end ## method parse
-end # class Parser
+end # class ParseError
 end # class CsvReader