RubyGems - censive - Versions diffs - 0.20 → 0.21 - Mend

censive 0.20 → 0.21

This diff shows the differences between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/censive.gemspec +1 -1
data/lib/censive.rb +34 -9
metadata +2 -6
data/lib/censive.rb-20230208182732 +0 -266
data/lib/censive.rb-20230208195221 +0 -276
data/lib/censive.rb-20230209050227 +0 -282
data/lib/flay.rb +0 -227

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c8daaabf3200a72964f44377e4a1a5723e7790a96cb00f76433666cccdc73809
-  data.tar.gz: b34491a185ccdd3e79d0877107d0ad33e9c9487b974398c18a911313795009b8
+  metadata.gz: 5dffdaf597e038881e378eb30acb7c44cde08de1f9e40e2180076eaa11356c68
+  data.tar.gz: f9d7f77ac597a5d5a86fc1adcad430802ab20bd306bf5856f1191f57ff22f872
 SHA512:
-  metadata.gz: cc739653c328fd1c49e6a17d1aebc4a1e14f0707f252847cd134f07b9636ee4e043fed5a10b0d57550357d3ecd016621adfa69f8b6162765634e3e5759923804
-  data.tar.gz: 0a34a13b24778d300d3e0cfb274f60c94263a960398984fec7d742280d66439a6453561f1913c9f607bf20ff4d7fd52172c09faad76bf460a236264f0cca53bc
+  metadata.gz: a0187489ebac8a9011f0f77dc9d52ca821ab080271f3eca6a1a40409b587534a9f4608d1f3b65a0253e587c242d01465e3cd773377f8d00b2fbd1723db4b5650
+  data.tar.gz: 94f2e7a204d8b40e058f41d193add0002d169d5d244e81c6895e465de159c6a953f09e313689891f7d12c05bead3baa41ad6fd525a8e297143758553e39ef1ba

data/censive.gemspec CHANGED

@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name        = "censive"
-  s.version     = "0.20"
+  s.version     = "0.21"
   s.author      = "Steve Shreeve"
   s.email       = "steve.shreeve@gmail.com"
   s.summary     =

data/lib/censive.rb CHANGED

@@ -4,7 +4,7 @@
 # censive - A quick and lightweight CSV handling library for Ruby
 #
 # Author: Steve Shreeve (steve.shreeve@gmail.com)
-#   Date: Feb 10, 2023
+#   Date: Feb 14, 2023
 #
 # https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
 # https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
@@ -20,7 +20,7 @@
 # TODO:
 # 1. Support IO streaming
 # 2. Review all encodings, we may be losing speed when mixing encodings
-# 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
+# 3. Speedup possible if our @unquoted regex reads beyond @eol's
 # 4. Will using String#freeze give us a speed up?
 # 5. Implement support for scan_until(string) <= right now only regex is valid
 # ============================================================================
@@ -28,7 +28,7 @@
 require "strscan"
 class Censive < StringScanner
-  attr :encoding
+  attr :encoding, :out
   def self.parse(...)
     new(...).parse
@@ -114,12 +114,6 @@ class Censive < StringScanner
   # ==[ Parser ]==
   def parse
-    # TODO: crazy optimization if NO QUOTES in rest
-    # unless rest.include?(@quote)
-    #   @rows = rest...
-    # end
     @rows = []
     while row = next_row
       @rows << row
@@ -188,6 +182,7 @@ class Censive < StringScanner
   def export(**opts)
     out = opts.empty? ? self : self.class.writer(**opts)
     each {|row| out << row }
+    out.out
   end
   # ==[ Helpers ]==
@@ -252,3 +247,33 @@ class Censive < StringScanner
     abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
   end
 end
+if __FILE__ == $0
+  str = DATA.gets("\n\n").chomp
+  # str = File.read(ARGV.first || "lc-2023.csv")
+  # str = File.open("KEN_ALL.CSV", "r:cp932").read
+  # require "stringio"
+  # csv = Censive.new(str, excel: true, relax: true)
+  # out = "" # StringIO.new
+  # csv.export(out: out) # (excel: true) # sep: "|")
+  # puts out # .string
+  puts Censive.new(str, excel: true, relax: true, out: "").export
+end
+__END__
+"Don",="007",10,"Ed"
+Name,Age,,,Shoe,,,
+"Alice",27,5
+Bob,33,10 1/2
+Charlie or "Chuck",=B2 + B3,9
+Subtotal,=sum(B2:B5),="01234"
+A,B,C,D
+A,B,"C",D
+A,B,C",D
+A,B,"C",D
+123,"CHO, JOELLE "JOJO"",456
+123,"CHO, JOELLE ""JOJO""",456
+=,=x,x=,="x",="","","=",123,0123,="123",="0123"
+,=x,x=,x,,,,,,=,,123,="0123",123,,="0123"

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: censive
 version: !ruby/object:Gem::Version
-  version: '0.20'
+  version: '0.21'
 platform: ruby
 authors:
 - Steve Shreeve
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-02-10 00:00:00.000000000 Z
+date: 2023-02-14 00:00:00.000000000 Z
 dependencies: []
 description: A quick and lightweight CSV handling library for Ruby
 email: steve.shreeve@gmail.com
@@ -29,10 +29,6 @@ files:
 - diagram/diagram.dot
 - diagram/diagram.rl
 - lib/censive.rb
-- lib/censive.rb-20230208182732
-- lib/censive.rb-20230208195221
-- lib/censive.rb-20230209050227
-- lib/flay.rb
 - lib/test-censive.rb
 - lib/test-csv.rb
 - test/a-uses-tabs-and-single-quotes-and-no-trailing-newline.tsv

data/lib/censive.rb-20230208182732 DELETED

@@ -1,266 +0,0 @@
-#!/usr/bin/env ruby
-# ============================================================================
-# censive - A quick and lightweight CSV handling library for Ruby
-#
-# Author: Steve Shreeve (steve.shreeve@gmail.com)
-#   Date: Feb 8, 2023
-#
-# https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
-# https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
-#
-# Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
-# ============================================================================
-# GOALS:
-# 1. Faster than Ruby's default CSV library
-# 2. Lightweight code with streamlined and optimized logic
-# 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
-#
-# TODO:
-# 1. Support IO streaming
-# 2. Review all encodings, we may be losing speed when mixing encodings
-# 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
-# 4. Will using String#freeze give us a speed up?
-# 5. Implement support for scan_until(string) <= right now only regex is valid
-# ============================================================================
-require "strscan"
-class Censive < StringScanner
-  attr :encoding
-  def self.parse(...)
-    new(...).parse
-  end
-  def self.writer(obj=nil, **opts, &code)
-    case obj
-    when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
-    when IO,nil then new(out: obj, **opts, &code)
-    else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
-    end
-  end
-  def initialize(str=nil,
-    drop:     false   , # drop trailing empty fields?
-    encoding: nil     , # character encoding
-    excel:    false   , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
-    mode:     :compact, # export mode: compact or full
-    out:      nil     , # output stream, needs to respond to <<
-    quote:    '"'     , # quote character
-    relax:    false   , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
-    rowsep:   "\n"    , # row separator for export
-    sep:      ","     , # column separator character
-    strip:    false   , # strip fields when reading
-    **opts              # grab bag
-  )
-    # initialize data source
-    if str && str.size < 100 && File.readable?(str)
-      str = File.open(str, encoding ? "r:#{encoding}" : "r").read
-    else
-      str ||= ""
-      str = str.encode(encoding) if encoding
-    end
-    super(str)
-    reset
-    # config options
-    @drop     = drop
-    @encoding = str.encoding
-    @excel    = excel
-    @mode     = mode
-    @out      = out || $stdout
-    @relax    = relax
-    @strip    = strip
-    # config strings
-    @quote    = quote
-    @rowsep   = rowsep
-    @sep      = sep
-    # static strings
-    @cr       = "\r"
-    @lf       = "\n"
-    @es       = ""
-    @eq       = "="
-    # combinations
-    @esc      = (@quote * 2)
-    @seq      = [@sep, @eq].join # used for parsing in excel mode
-    #!# TODO: come up with a clean way to escape/encode all this
-    #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
-    # regexes
-    @eoc      = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
-    @eol      = /#{@cr}#{@lf}?|#{@lf}/o                # end of line
-    @escapes  = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
-    @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
-    @quotes   = /#{@quote}/o
-    @seps     = /#{@sep}+/o
-    @quoted   = @excel ? /(?:=)?#{@quote}/o : @quote
-    @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
-    @leadzero = /\A0\d*\z/
-  end
-  def reset(str=nil)
-    @rows = nil
-    @cols = @cells = 0
-    #!# TODO: reset all encodings?
-    self.string = str if str
-    @encoding = string.encoding
-    super()
-  end
-  # ==[ Parser ]==
-  def parse
-    @rows = []
-    while row = next_row
-      @rows << row
-      count = row.size
-      @cols = count if count > @cols
-      @cells += count
-    end
-    @rows
-  end
-  def next_row
-    token = next_token or return
-    row = []
-    row.push(*token)
-    row.push(*token) while token = next_token
-    row
-  end
-  def next_token
-    if scan(@quoted) # quoted cell
-      token = ""
-      while true
-        token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
-        token << @quote and next if scan(@quote)
-        scan(@eoc) and break
-        @relax or bomb "invalid character after quote"
-        token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
-      end
-      scan(@sep)
-      @strip ? token.strip : token
-    elsif match = scan(@unquoted) # unquoted cell(s)
-      if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
-        unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
-          match << (scan_until(@eoc) or bomb "stray quote")
-          scan(@sep)
-        end
-      end
-      tokens = match.split(@sep, -1)
-      @strip ? tokens.map!(&:strip) : tokens
-    elsif scan(@sep)
-      match = scan(@seps)
-      match ? match.split(@sep, -1) : @es
-    else
-      scan(@eol)
-      nil
-    end
-  end
-  def each
-    @rows ||= parse
-    @rows.each {|row| yield row }
-  end
-  def export(**opts)
-    out = opts.empty? ? self : self.class.writer(**opts)
-    each {|row| out << row }
-  end
-  # ==[ Helpers ]==
-  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
-  def grok(str)
-    if idx = str.index(@escapes)
-      $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
-    else
-      0
-    end
-  end
-  # output a row
-  def <<(row)
-    # drop trailing empty columns
-    row.pop while row.last.empty? if @drop
-    s,q = @sep, @quote
-    out = case @mode
-    when :compact
-      case @excel ? 2 : grok(row.join)
-      when 0
-        row
-      when 1
-        row.map do |col|
-          col.match?(@quotable) ? "#{q}#{col}#{q}" : col
-        end
-      else
-        row.map do |col|
-          @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
-          case grok(col)
-          when 0 then col
-          when 1 then "#{q}#{col}#{q}"
-          else        "#{q}#{col.gsub(q, @esc)}#{q}"
-          end
-        end
-      end
-    when :full
-      if @excel
-        row.map do |col|
-          col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
-        end
-      else
-        row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
-      end
-    end.join(s)
-    @out << out + @rowsep
-  end
-  def stats
-    wide = string.size.to_s.size
-    puts "%#{wide}d rows"    % @rows.size
-    puts "%#{wide}d columns" % @cols
-    puts "%#{wide}d cells"   % @cells
-    puts "%#{wide}d bytes"   % string.size
-  end
-  def bomb(msg)
-    abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
-  end
-end
-if __FILE__ == $0
-  raw = DATA.gets("\n\n").chomp
-# raw = File.read(ARGV.first || "lc-2023.csv")
-  csv = Censive.new(raw, excel: true, relax: true)
-  csv.export # (excel: true) # sep: "|")
-end
-__END__
-"Don",="007",10,"Ed"
-Name,Age,,,Shoe,,,
-"Alice",27,5
-Bob,33,10 1/2
-Charlie or "Chuck",=B2 + B3,9
-Subtotal,=sum(B2:B5),="01234"
-A,B,C,D
-A,B,"C",D
-A,B,C",D
-A,B,"C",D
-# first line works in "relax" mode, bottom line is compliant
-123,"CHO, JOELLE "JOJO"",456
-123,"CHO, JOELLE ""JOJO""",456
-# Excel mode checking
-=,=x,x=,="x",="","","=",123,0123,="123",="0123"
-,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off

data/lib/censive.rb-20230208195221 DELETED

@@ -1,276 +0,0 @@
-#!/usr/bin/env ruby
-# ============================================================================
-# censive - A quick and lightweight CSV handling library for Ruby
-#
-# Author: Steve Shreeve (steve.shreeve@gmail.com)
-#   Date: Feb 8, 2023
-#
-# https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
-# https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
-#
-# Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
-# ============================================================================
-# GOALS:
-# 1. Faster than Ruby's default CSV library
-# 2. Lightweight code with streamlined and optimized logic
-# 3. Support most non-compliant CSV variations (eg - @excel, @relax, @strip)
-#
-# TODO:
-# 1. Support IO streaming
-# 2. Review all encodings, we may be losing speed when mixing encodings
-# 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
-# 4. Will using String#freeze give us a speed up?
-# 5. Implement support for scan_until(string) <= right now only regex is valid
-# ============================================================================
-require "strscan"
-class Censive < StringScanner
-  attr :encoding
-  def self.parse(...)
-    new(...).parse
-  end
-  def self.writer(obj=nil, **opts, &code)
-    case obj
-    when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
-    when IO,nil then new(out: obj, **opts, &code)
-    else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
-    end
-  end
-  def initialize(str=nil,
-    drop:     false   , # drop trailing empty fields?
-    encoding: nil     , # character encoding
-    excel:    false   , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
-    mode:     :compact, # export mode: compact or full
-    out:      nil     , # output stream, needs to respond to <<
-    quote:    '"'     , # quote character
-    relax:    false   , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
-    rowsep:   "\n"    , # row separator for export
-    sep:      ","     , # column separator character
-    strip:    false   , # strip fields when reading
-    **opts              # grab bag
-  )
-    # initialize data source
-    if str && str.size < 100 && File.readable?(str)
-      str = File.open(str, encoding ? "r:#{encoding}" : "r").read
-    else
-      str ||= ""
-      str = str.encode(encoding) if encoding
-    end
-    super(str)
-    reset
-    # config options
-    @drop     = drop
-    @encoding = str.encoding
-    @excel    = excel
-    @mode     = mode
-    @out      = out || $stdout
-    @relax    = relax
-    @strip    = strip
-    # config strings
-    @quote    = quote
-    @rowsep   = rowsep
-    @sep      = sep
-    # static strings
-    @cr       = "\r"
-    @lf       = "\n"
-    @es       = ""
-    @eq       = "="
-    # combinations
-    @esc      = (@quote * 2)
-    @seq      = [@sep, @eq].join # used for parsing in excel mode
-    #!# TODO: come up with a clean way to escape/encode all this
-    #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
-    # regexes
-    @eoc      = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
-    @eol      = /#{@cr}#{@lf}?|#{@lf}/o                # end of line
-    @escapes  = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
-    @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
-    @quotes   = /#{@quote}/o
-    @seps     = /#{@sep}+/o
-    @quoted   = @excel ? /(?:=)?#{@quote}/o : @quote
-    @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}]*/o
-    @leadzero = /\A0\d*\z/
-  end
-  def reset(str=nil)
-    @rows = nil
-    @cols = @cells = 0
-    #!# TODO: reset all encodings?
-    self.string = str if str
-    @encoding = string.encoding
-    super()
-  end
-  # ==[ Parser ]==
-  def parse
-    @rows = []
-    @hold = []
-    while row = next_row
-      @rows << row
-      count = row.size
-      @cols = count if count > @cols
-      @cells += count
-    end
-    @rows
-  end
-  def next_row
-    token = next_token or return
-    row = []
-    row.push(*token)
-    row.push(*token) while token = next_token
-    row
-  end
-  def next_token
-    @hold.empty? or return @hold.shift
-    if scan(@quoted) # quoted cell
-      token = ""
-      while true
-        token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
-        token << @quote and next if scan(@quote)
-        scan(@eoc) and break
-        @relax or bomb "invalid character after quote"
-        token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
-      end
-      scan(@sep)
-      @strip ? token.strip : token
-    elsif match = scan(@unquoted) # unquoted cell(s)
-      if check(@quote) && !match.chomp!(@sep) && !match.end_with?(@cr, @lf)
-        unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
-          match << (scan_until(@eoc) or bomb "stray quote")
-          scan(@sep)
-        end
-      end
-      match.split(@eol, -1).each_with_index do |line, i|
-        if line.empty?
-          @hold.push(nil)
-        else
-          @hold.push(nil) if i > 0
-          cells = line.split(@sep, -1)
-          @hold.push(@strip ? cells.map!(&:strip) : cells)
-        end
-      end
-      @hold.shift
-    elsif scan(@sep)
-      match = scan(@seps)
-      match ? match.split(@sep, -1) : @es
-    else
-      scan(@eol)
-      nil
-    end
-  end
-  def each
-    @rows ||= parse
-    @rows.each {|row| yield row }
-  end
-  def export(**opts)
-    out = opts.empty? ? self : self.class.writer(**opts)
-    each {|row| out << row }
-  end
-  # ==[ Helpers ]==
-  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
-  def grok(str)
-    if idx = str.index(@escapes)
-      $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
-    else
-      0
-    end
-  end
-  # output a row
-  def <<(row)
-    # drop trailing empty columns
-    row.pop while row.last.empty? if @drop
-    s,q = @sep, @quote
-    out = case @mode
-    when :compact
-      case @excel ? 2 : grok(row.join)
-      when 0
-        row
-      when 1
-        row.map do |col|
-          col.match?(@quotable) ? "#{q}#{col}#{q}" : col
-        end
-      else
-        row.map do |col|
-          @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
-          case grok(col)
-          when 0 then col
-          when 1 then "#{q}#{col}#{q}"
-          else        "#{q}#{col.gsub(q, @esc)}#{q}"
-          end
-        end
-      end
-    when :full
-      if @excel
-        row.map do |col|
-          col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
-        end
-      else
-        row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
-      end
-    end.join(s)
-    @out << out + @rowsep
-  end
-  def stats
-    wide = string.size.to_s.size
-    puts "%#{wide}d rows"    % @rows.size
-    puts "%#{wide}d columns" % @cols
-    puts "%#{wide}d cells"   % @cells
-    puts "%#{wide}d bytes"   % string.size
-  end
-  def bomb(msg)
-    abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
-  end
-end
-if __FILE__ == $0
-  raw = DATA.gets("\n\n").chomp
-# raw = File.read(ARGV.first || "lc-2023.csv")
-  csv = Censive.new(raw, excel: true, relax: true)
-  csv.export # (excel: true) # sep: "|")
-end
-__END__
-"Don",="007",10,11,"Ed",20
-Name,Age,,,Shoe,,,
-"Alice",27,5
-Bob,33,10 1/2
-Charlie or "Chuck",=B2 + B3,9
-Subtotal,=sum(B2:B5),="01234"
-A,B,C,D
-A,B,"C",D
-A,B,C",D
-A,B,"C",D
-# first line works in "relax" mode, bottom line is compliant
-123,"CHO, JOELLE "JOJO"",456
-123,"CHO, JOELLE ""JOJO""",456
-# Excel mode checking
-=,=x,x=,="x",="","","=",123,0123,="123",="0123"
-,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off

data/lib/censive.rb-20230209050227 DELETED

@@ -1,282 +0,0 @@
-#!/usr/bin/env ruby
-# ============================================================================
-# censive - A quick and lightweight CSV handling library for Ruby
-#
-# Author: Steve Shreeve (steve.shreeve@gmail.com)
-#   Date: Feb 9, 2023
-#
-# https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
-# https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
-#
-# Thanks to Sutou Kouhei (kou) for his excellent advice on using scan
-# ============================================================================
-# GOALS:
-# 1. Faster than Ruby's default CSV library
-# 2. Lightweight code with streamlined and optimized logic
-# 3. Support most non-compliant CSV variations (@excel, @relax, etc)
-# 4. Support most commonly used CSV options (@sep, @quote, @strip, @drop, etc)
-#
-# TODO:
-# 1. Support IO streaming
-# 2. Review all encodings, we may be losing speed when mixing encodings
-# 3. Huge speedup possible if our @unquoted regex reads beyond @cr?@lf's
-# 4. Will using String#freeze give us a speed up?
-# 5. Implement support for scan_until(string) <= right now only regex is valid
-# ============================================================================
-require "strscan"
-class Censive < StringScanner
-  attr :encoding
-  def self.parse(...)
-    new(...).parse
-  end
-  def self.writer(obj=nil, **opts, &code)
-    case obj
-    when String then File.open(obj, "w") {|io| yield new(out: io, **opts, &code) }
-    when IO,nil then new(out: obj, **opts, &code)
-    else abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
-    end
-  end
-  def initialize(str=nil,
-    drop:     false   , # drop trailing empty columns?
-    encoding: nil     , # character encoding
-    excel:    false   , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
-    mode:     :compact, # export mode: compact or full
-    out:      nil     , # output stream, needs to respond to <<
-    quote:    '"'     , # quote character
-    relax:    false   , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
-    rowsep:   "\n"    , # row separator for export
-    sep:      ","     , # column separator character
-    strip:    false   , # strip columns when reading
-    **opts              # grab bag
-  )
-    # initialize data source
-    if str && str.size < 100 && File.readable?(str)
-      str = File.open(str, encoding ? "r:#{encoding}" : "r").read
-    else
-      str ||= ""
-      str = str.encode(encoding) if encoding
-    end
-    super(str)
-    reset
-    # config options
-    @cheat    = true
-    @drop     = drop
-    @encoding = str.encoding
-    @excel    = excel
-    @mode     = mode
-    @out      = out || $stdout
-    @relax    = relax
-    @strip    = strip
-    # config strings
-    @quote    = quote
-    @rowsep   = rowsep
-    @sep      = sep
-    # static strings
-    @cr       = "\r"
-    @lf       = "\n"
-    @es       = ""
-    @eq       = "="
-    # combinations
-    @esc      = (@quote * 2)
-    @seq      = [@sep, @eq].join # used for parsing in excel mode
-    #!# TODO: come up with a clean way to escape/encode all this
-    #!# TODO: maybe define @tokens = "#{@quote}#{@sep}#{@cr}#{@lf}", etc.
-    # regexes
-    @eoc      = /(?=#{"\\" + @sep}|#{@cr}|#{@lf}|\z)/o # end of cell
-    @eol      = /#{@cr}#{@lf}?|#{@lf}/o                # end of line
-    @escapes  = /(#{@quote})|#{"\\"+@sep}|#{@cr}|#{@lf}/o
-    @quotable = /#{"\\"+@sep}|#{@cr}|#{@lf}/o
-    @quotes   = /#{@quote}/o
-    @seps     = /#{@sep}+/o
-    @quoted   = @excel ? /(?:=)?#{@quote}/o : @quote
-    @unquoted = /[^#{@sep}#{@cr}#{@lf}][^#{@quote}#{@cr}#{@lf}]*/o
-    @leadzero = /\A0\d*\z/
-  end
-  def reset(str=nil)
-    @rows = nil
-    @cols = @cells = 0
-    #!# TODO: reset all encodings?
-    self.string = str if str
-    @encoding = string.encoding
-    super()
-  end
-  # ==[ Parser ]==
-  def parse
-    @rows = []
-    while row = next_row
-      @rows << row
-      count = row.size
-      @cols = count if count > @cols
-      @cells += count
-    end
-    @rows
-  end
-  def next_row
-    if @cheat and line = scan_until(@eol)
-      row = line.chomp!.split(@sep, -1)
-      row.each do |col|
-        next if (saw = col.count(@quote)).zero?
-        next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
-        @cheat = false
-        break
-      end if line.include?(@quote)
-      @cheat and return @strip ? row.each(&:strip!) : row
-      unscan
-    end
-    token = next_token or return
-    row = []
-    row.push(*token)
-    row.push(*token) while token = next_token
-    row
-  end
-  def next_token
-    if scan(@quoted) # quoted cell
-      token = ""
-      while true
-        token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
-        token << @quote and next if scan(@quote)
-        scan(@eoc) and break
-        @relax or bomb "invalid character after quote"
-        token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
-      end
-      scan(@sep)
-      @strip ? token.strip : token
-    elsif match = scan(@unquoted) # unquoted cell(s)
-      if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
-        unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
-          match << (scan_until(@eoc) or bomb "stray quote")
-          scan(@sep)
-        end
-      end
-      tokens = match.split(@sep, -1)
-      @strip ? tokens.map!(&:strip) : tokens
-    elsif scan(@sep)
-      match = scan(@seps)
-      match ? match.split(@sep, -1) : @es
-    else
-      scan(@eol)
-      nil
-    end
-  end
-  def each
-    @rows ||= parse
-    @rows.each {|row| yield row }
-  end
-  def export(**opts)
-    out = opts.empty? ? self : self.class.writer(**opts)
-    each {|row| out << row }
-  end
-  # ==[ Helpers ]==
-  # returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
-  def grok(str)
-    if idx = str.index(@escapes)
-      $1 ? 2 : str.index(@quotes, idx) ? 2 : 1
-    else
-      0
-    end
-  end
-  # output a row
-  def <<(row)
-    # drop trailing empty columns
-    row.pop while row.last.empty? if @drop
-    s,q = @sep, @quote
-    out = case @mode
-    when :compact
-      case @excel ? 2 : grok(row.join)
-      when 0
-        row
-      when 1
-        row.map do |col|
-          col.match?(@quotable) ? "#{q}#{col}#{q}" : col
-        end
-      else
-        row.map do |col|
-          @excel && col =~ @leadzero ? "=#{q}#{col}#{q}" :
-          case grok(col)
-          when 0 then col
-          when 1 then "#{q}#{col}#{q}"
-          else        "#{q}#{col.gsub(q, @esc)}#{q}"
-          end
-        end
-      end
-    when :full
-      if @excel
-        row.map do |col|
-          col =~ @leadzero ? "=#{q}#{col}#{q}" : "#{q}#{col.gsub(q, @esc)}#{q}"
-        end
-      else
-        row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
-      end
-    end.join(s)
-    @out << out + @rowsep
-  end
-  def stats
-    wide = string.size.to_s.size
-    puts "%#{wide}d rows"    % @rows.size
-    puts "%#{wide}d columns" % @cols
-    puts "%#{wide}d cells"   % @cells
-    puts "%#{wide}d bytes"   % string.size
-  end
-  def bomb(msg)
-    abort "\n#{File.basename($0)}: #{msg} at character #{pos} near '#{string[pos-4,7]}'"
-  end
-end
-if __FILE__ == $0
-  # raw = DATA.gets("\n\n").chomp
-  # raw = File.read(ARGV.first || "lc-2023.csv")
-  raw = File.open("KEN_ALL.CSV", "r:cp932").read
-  csv = Censive.new(raw, excel: true, relax: true)
-  csv.export # (excel: true) # sep: "|")
-end
-__END__
-"Don",="007",10,"Ed"
-Name,Age,,,Shoe,,,
-"Alice",27,5
-Bob,33,10 1/2
-Charlie or "Chuck",=B2 + B3,9
-Subtotal,=sum(B2:B5),="01234"
-A,B,C,D
-A,B,"C",D
-A,B,C",D
-A,B,"C",D
-# first line works in "relax" mode, bottom line is compliant
-123,"CHO, JOELLE "JOJO"",456
-123,"CHO, JOELLE ""JOJO""",456
-# Excel mode checking
-=,=x,x=,="x",="","","=",123,0123,="123",="0123"
-,=x,x=,x,,,,,,=,,123,="0123",123,,="0123" # <= a little off

data/lib/flay.rb DELETED

@@ -1,227 +0,0 @@
-#!/usr/bin/env ruby
-# ============================================================================
-# flay - A quick and lightweight benchmarking tool for Ruby
-#
-# Author: Steve Shreeve (steve.shreeve@gmail.com)
-#   Date: Feb 9, 2023
-# ============================================================================
-# GOALS:
-# 1. Provide a simple way to benchmark various code
-# 2. Easy to configure and start comparing results
-#
-# TODO:
-# 1. Everything
-# ============================================================================
-class Hash
-  alias_method :default_lookup, :[]
-  def [](key, miss=nil)
-    key?(sym = key.to_sym) and return default_lookup(sym) || miss
-    ary = key.to_s.split(/(?:[.\/\[]|\][.\/]?)/)
-    val = ary.inject(self) do |obj, sub|
-      if    obj == self        then default_lookup(sub.to_sym)
-      elsif obj == nil         then break
-      elsif sub =~ /\A-?\d*\z/ then obj[sub.to_i]
-      else                          obj[sub.to_sym]
-      end
-    end or miss
-  end
-  def method_missing(name, *args)
-    name !~ /=$/ ? self[name, *args] : self[$`.to_sym] = args.first
-  end
-end
-config = {
-  environments: [
-    {
-      name: "Environment 1",
-      before: <<~"|",
-        # Environment 1 before
-      |
-      after: <<~"|",
-        # Environment 1 after
-      |
-    },
-    {
-      name: "Environment 2",
-      before: <<~"|",
-        # Environment 1 before
-      |
-      after: <<~"|",
-        # Environment 1 after
-      |
-    },
-  ],
-  contexts: [
-    {
-      name: "Context 1",
-      before: <<~"|",
-        # context 1 before
-      |
-      script: <<~"|",
-        a = [*1..1e5]
-        a.sum
-      |
-      after: <<~"|",
-        # context 1 after
-      |
-    },
-    {
-      name: "Context 2",
-      before: <<~"|",
-        # context 2 before
-      |
-      after: <<~"|",
-        # context 2 after
-      |
-    },
-  ],
-  tasks: [
-    {
-      name: "Task 1",
-      runs: 35,
-      before: <<~"|",
-        # Task 1 before
-      |
-      after: <<~"|",
-        # Task 1 after
-      |
-    },
-    {
-      name: "Task 2",
-      secs: 30,
-      before: <<~"|",
-        # Task 2 before
-      |
-      after: <<~"|",
-        # Task 2 after
-      |
-    },
-  ],
-}
-# ==[ Helpers ]==
-def wrapper(object, type=nil)
-  puts case type
-  when :environment then template_for_environment object
-  when :context     then template_for_context     object
-  when :task        then template_for_task        object
-  else                   section                  object
-  end
-end
-def wrap(list, type=nil, **opts)
-  list.each do |item|
-    wrapper(item, type)
-    yield item
-  end
-end
-def section(text, wide=78, left=0)
-  [
-    "# ".ljust(wide, "="),
-    "# #{text}",
-    "# ".ljust(wide, "="),
-  ].join("\n")
-end
-def hr(text, wide=78, left=0)
-  [ " " * left, "# ==[ ", text, " ]" ].join.ljust(wide, "=")
-end
-# ==[ Templates ]==
-def template_for_environment(environment)
-  <<~"|"
-    #{ section "Environment: #{environment.name} " }
-    # ==[ Code before environment ]==
-    #{ environment.before }
-  |
-end
-def template_for_context(context)
-  <<~"|"
-    #{ section "Context: #{context.name} " }
-    # ==[ Code before context ]==
-    #{ context.before }
-  |
-end
-def template_for_task(task)
-  <<~"|"
-    #{ section "Task: #{task.name} " }
-    # ==[ Code before task ]==
-    #{ task.before }
-    # ==[ Calculate the duration of a loop of empty runs ]==
-    if #{ task.runs } == 1
-      __flay_before_empty = 0
-      __flay_after_empty  = 0
-    else
-      __flay_before_empty = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-      __flay_runs = 0
-      while __flay_runs < #{ task.runs } # this empty loop improves accuracy
-        __flay_runs += 1
-      end
-      __flay_after_empty = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-    end
-    # ==[ Calculate the duration of a loop of script runs ]==
-    if #{ task.runs } == 1
-      __flay_before_script = 0
-      __flay_after_script  = 0
-    else
-      __flay_before_script = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-      __flay_runs = 0
-      while __flay_runs < #{ task.runs }
-        # ==[ Before script ]==
-        #{ task.script }
-        # ==[ After script ]==
-        __flay_runs += 1
-      end
-      __flay_after_script = Process.clock_gettime(Process::CLOCK_MONOTONIC)
-    end
-    # ==[ Code after task ]==
-    #{ task.after }
-    # ==[ Write out timestamps ]==
-    __flay_duration = (__flay_after_script - __flay_before_script) -
-                      (__flay_after_empty  - __flay_before_empty )
-    File.write("/dev/null", __flay_duration.inspect)
-  |
-end
-# ==[ Workflow ]==
-environments = config.environments
-contexts     = config.contexts
-tasks        = config.tasks
-wrap(environments, :environment) do |environment|
-  wrap(tasks, :task) do |task|
-    wrap(contexts, :context) do |context|
-    end
-  end
-end