RubyGems - censive - Versions diffs - 0.7 → 0.9 - Mend

censive 0.7 → 0.9

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8a757fa8bbc5ddf364889e4b7feca2001f3784e8d0b2ff70a1b0349691a34aae
-  data.tar.gz: 68dced562eb0dc9b7ad300447091ceb74c04a55201e88cc9fffbe1ba3bbc534d
+  metadata.gz: cbca33c415269ae1fafea5297f2b409879a46c37c884a0a7017bca322bcff2a6
+  data.tar.gz: ac021ddf3d7503aebc5791b0912c6409a0888627060b532e65f6eb72b94965a3
 SHA512:
-  metadata.gz: c48d7e2bd3d1a7baa5fb2fae7b0553de665737849e9a50721f704a1a1f67c758c545dfe53d21f32ce386b20ea21f04c67ee8d765bf20653774b9475ebb60711f
-  data.tar.gz: 411d59006ebcb6a07161186b56f73a8dcc73beeaecbe14e786ad237935c62fd6ef0631483c8f297399098b0dea2387863f7be8c878568e0558804e5bd20b55ee
+  metadata.gz: 8095c0c7704e3a6ee66930b36f0131b38d52a68cdd066d9677e8ceb58c4ecd7ce7eed496c78b1841cabe845b8c82624ca808b33a7cf7ec4c8fd504b287b3ffb5
+  data.tar.gz: 2e363b63b37977784a38c06e091f3201a1cd7a13138e8101e0e41ca49c47b3c4b433e7e6f2843a6816ddcbf9c1c8293da0d858f6be38bd0d3d82ed5dbd904bfe

data/README.md CHANGED Viewed

@@ -12,7 +12,7 @@ data = File.read('data.csv')
 # write out a tab-separated tsv file
 Censive.writer('out.tsv', sep: "\t", mode: :full) do |out|
-  Censive.new(data).each do |row|
+  Censive.new(data, excel: true, relax: true).each do |row|
     out << row
   end
 end

data/censive.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name        = "censive"
-  s.version     = "0.7"
+  s.version     = "0.9"
   s.author      = "Steve Shreeve"
   s.email       = "steve.shreeve@gmail.com"
   s.summary     = "A quick and lightweight CSV handling library for Ruby"

data/lib/censive.rb CHANGED Viewed

@@ -5,6 +5,9 @@
 #
 # Author: Steve Shreeve (steve.shreeve@gmail.com)
 #   Date: Jan 30, 2023
+#
+# Thanks to https://crystal-lang.org/api/1.7.2/CSV.html (Crystal's CSV library)
+# and, also https://github.com/ruby/strscan/blob/master/ext/strscan/strscan.c
 # ==============================================================================
 # The goals are:
 #
@@ -15,10 +18,8 @@
 #
 # 1. Option to support IO streaming
 # 2. Option to strip whitespace
-# 3. Option to change output line endings
-# 4. Option to force quotes in output
-# 5. Option to allow reading excel CSV (="Text" for cells)
-# 6. Confirm file encodings such as UTF-8, UTF-16, etc.
+# 3. Option to support headers in readers and writers
+# 4. Confirm file encodings such as UTF-8, UTF-16, etc.
 #
 # NOTE: Only getch and scan_until advance strscan's position
 # ==============================================================================
@@ -39,6 +40,7 @@ class Censive < StringScanner
     drop:  false   , # enable to drop trailing separators
     eol:   "\n"    , # desired line endings for exports
+    excel: false   , # literals (="01"), formulas (=A1 + B2), see http://bit.ly/3Y7jIvc
     mode:  :compact, # export mode: compact or full
     out:   nil     , # output IO/file
     relax: false   , # relax parsing of quotes
@@ -48,56 +50,69 @@ class Censive < StringScanner
     super(str || '')
     reset
-    @sep   = sep  .freeze
-    @quote = quote.freeze
+    @sep    = sep  .freeze
+    @quote  = quote.freeze
+    @drop   = drop
+    @eol    = eol.freeze
+    @excel  = excel
+    @mode   = mode
+    @out    = out
+    @relax  = relax
-    @drop  = drop
-    @eol   = eol.freeze
-    @mode  = mode
-    @out   = out
-    @relax = relax
+    @es     = ""   .freeze
+    @cr     = "\r" .freeze
+    @lf     = "\n" .freeze
+    @eq     = "="  .freeze
+    @esc    = (@quote * 2).freeze
-    @es    = ""   .freeze
-    @cr    = "\r" .freeze
-    @lf    = "\n" .freeze
-    @esc   = (@quote * 2).freeze
+    @tokens = [@sep,@quote,@cr,@lf,@es,nil]
   end
   def reset(str=nil)
     self.string = str if str
     super()
-    @char  = string[pos]
-    @flag  = nil
+    @char = peek(1)
+    @flag = nil
-    @rows  = nil
-    @cols  = @cells = 0
+    @rows = nil
+    @cols = @cells = 0
   end
   # ==[ Lexer ]==
   def next_char
     getch
-    @char = string[pos]
+    @char = peek(1)
   end
   def next_token
+    # process and clear @flag
     case @flag
-    when @es then @flag = nil; [@cr,@lf,nil].include?(@char) and return @es
+    when @es then @flag = nil; [@cr,@lf,@es,nil].include?(@char) and return @es
     when @cr then @flag = nil; next_char == @lf and next_char
     when @lf then @flag = nil; next_char
+    else          @flag = nil
     end if @flag
-    if [@sep,@quote,@cr,@lf,nil].include?(@char)
+    # See http://bit.ly/3Y7jIvc
+    if @excel && @char == @eq
+      @flag = @eq
+      next_char
+    end
+    if @tokens.include?(@char)
       case @char
-      when @quote # consume_quoted_cell
+      when @quote # consume quoted cell
         match = ""
         while true
-          getch # consume the quote (optimized by not calling next_char)
+          getch # consume the quote that got us here
           match << (scan_until(/(?=#{@quote})/o) or bomb "unclosed quote")
           case next_char
-          when @sep        then @flag = @es; next_char; break
-          when @quote      then match << @quote
-          when @cr,@lf,nil then break
+          when @sep            then @flag = @es; next_char; break
+          when @quote          then match << @quote
+          when @cr,@lf,@es,nil then break
           else
             if @relax
               match << @quote + @char
@@ -107,14 +122,15 @@ class Censive < StringScanner
           end
         end
         match
-      when @sep then @flag = @es; next_char; @es
-      when @cr  then @flag = @cr; nil
-      when @lf  then @flag = @lf; nil
-      when nil  then nil
+      when @sep    then @flag = @es; next_char; @es
+      when @cr     then @flag = @cr; nil
+      when @lf     then @flag = @lf; nil
+      when @es,nil then              nil
       end
-    else # consume_unquoted_cell
+    else # consume unquoted cell
       match = scan_until(/(?=#{@sep}|#{@cr}|#{@lf}|\z)/o) or bomb "unexpected character"
-      @char = string[pos]
+      match = @eq + match if @flag == @eq # preserve @eq for excel formulas
+      @char = peek(1)
       @char == @sep and @flag = @es and next_char
       match
     end
@@ -146,9 +162,9 @@ class Censive < StringScanner
   # ==[ Helpers ]==
-  # grok returns: 2 for seps and quotes, 1 for seps only, and 0 for neither
+  # grok returns: 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)
   def grok(str)
-    if pos = str.index(/(#{@quote})|#{@sep}/o)
+    if pos = str.index(/(#{@quote})|#{@sep}|#{@cr}|#{@lf}/o)
       $1 ? 2 : str.index(/#{@quote}/o, pos) ? 2 : 1
     else
       0
@@ -158,13 +174,20 @@ class Censive < StringScanner
   def <<(row)
     @out or return super
+    # drop trailing seps, if specified
+    row.pop while row.last.empty? if @drop
     # most compact export format
     s,q = @sep, @quote
     out = case @mode
     when :compact
       case grok(row.join)
-      when 0 then row
-      when 1 then row.map {|col| col.include?(s) ? "#{q}#{col}#{q}" : col }
+      when 0
+        row
+      when 1
+        row.map do |col|
+          col.match?(/#{@sep}|#{@cr}|#{@lf}/o) ? "#{q}#{col}#{q}" : col
+        end
       else
         row.map do |col|
           case grok(col)
@@ -178,9 +201,6 @@ class Censive < StringScanner
       row.map {|col| "#{q}#{col.gsub(q, @esc)}#{q}" }
     end.join(s)
-    # drop trailing seps, if specified
-    out.gsub!(/#{s}+\z/,'') if @drop
     # write output, using desired line endings
     @out << out + @eol
   end
@@ -203,81 +223,25 @@ __END__
 # ==[ Playground... ]==
-STDOUT.sync = true
-require 'fileutils'
-ARGV << "101.csv"
-rand = `LC_ALL=C tr -dc a-zA-Z0-9 < /dev/random | head -c12`
-rows = []
-cols = []
-coun = 0
-full = 0
-ARGV.each do |path|
-  File.file?(path) or next
-  print "Processing #{path.inspect}"
-  rows.clear
-  cols.clear
-  seen = 0
-  coun += 1
-  dest = "#{path}-#{rand}"
-  begin
-    Censive.writer(dest) do |file|
-      Censive.new(File.read(path), relax: true).each do |cols|
-        cols.each {|cell| cell && cell.size >= 3 && cell.sub!(/\A="/, '') && cell.sub!(/"\z/, '') }
-        file << cols
-        seen += 1
-        print "." if (seen % 1e5) == 0
-      end
-    end
-    FileUtils.mv(dest, path)
-    full += (seen - 1)
-    puts " (#{seen - 1} rows of data)"
-  rescue
-    puts " - unable to process (#{$!})"
-    FileUtils.rm_f(dest)
-  end
-end
-puts "Processed #{coun} files with a total of #{full} rows of data" if coun > 1
-__END__
-,"CHUI, LOK HANG "BENNY",   =>   ,"""CHUI, LOK HANG ""BENNY""",
+# STDOUT.sync = true
+#
+# data = File.read('1.csv')
+#
+# Censive.writer('out.csv') do |out|
+#   Censive.new(data, relax: true, excel: true).each do |row|
+#     out << row
+#   end
+# end
+#
+# __END__
-,"..............."B
+ARGV << "z.csv" if ARGV.empty?
-__END__
+path = ARGV.first
+data = File.read(path)
+csv = Censive.new(data)
-data = File.read('1.csv')
+data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
-Censive.writer('out.csv', sep: ',', quote: "'") do |out|
-  Censive.new(data).each do |row|
-    out << row
-  end
-end
-# ARGV << "z.csv" if ARGV.empty?
-#
-# case 1
-# when 1
-#   path = ARGV.first
-#   data = File.read(path)
-# when 2
-#   data = DATA.gets("\n\n").rstrip
-# end
-#
-# STDOUT.sync = true
-#
-# csv = Censive.new(data)
-#
-# data.size > 1e6 ? csv.parse : csv.parse.each {|cols| p cols }
-#
-# csv.stats
+csv.stats

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: censive
 version: !ruby/object:Gem::Version
-  version: '0.7'
+  version: '0.9'
 platform: ruby
 authors:
 - Steve Shreeve