csvutils 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.md +3 -3
- data/LICENSE.md +116 -0
- data/Manifest.txt +8 -4
- data/README.md +302 -286
- data/Rakefile +30 -26
- data/bin/csvcut +17 -17
- data/bin/csvhead +17 -17
- data/bin/csvheader +17 -17
- data/bin/csvsplit +17 -17
- data/bin/csvstat +17 -17
- data/{test/data → datasets}/at-austria/AUT.csv +363 -363
- data/{test/data → datasets}/de-deutschland/bundesliga.csv +481 -481
- data/{test/data → datasets}/eng-england/2017-18/E0.csv +381 -381
- data/lib/csvutils.rb +32 -31
- data/lib/csvutils/commands/cut.rb +43 -43
- data/lib/csvutils/commands/head.rb +40 -40
- data/lib/csvutils/commands/header.rb +35 -35
- data/lib/csvutils/commands/split.rb +41 -41
- data/lib/csvutils/commands/stat.rb +41 -41
- data/lib/csvutils/cut.rb +43 -50
- data/lib/csvutils/head.rb +22 -25
- data/lib/csvutils/header.rb +16 -28
- data/lib/csvutils/split.rb +106 -107
- data/lib/csvutils/stat.rb +81 -86
- data/lib/csvutils/test.rb +19 -22
- data/lib/csvutils/utils.rb +29 -13
- data/lib/csvutils/version.rb +24 -24
- data/test/helper.rb +16 -16
- data/test/test_cut.rb +31 -0
- data/test/test_head.rb +30 -0
- data/test/{test_headers.rb → test_header.rb} +50 -50
- data/test/test_misc.rb +44 -44
- data/test/test_split.rb +31 -0
- data/test/test_version.rb +20 -20
- metadata +28 -9
data/lib/csvutils/cut.rb
CHANGED
@@ -1,50 +1,43 @@
-# encoding: utf-8
-
-## check/use class or module ???
-
-
-class CsvUtils
-
-  def self.cut( path, *columns, output: path, sep: ',' )
-
-    inpath  = path
-    outpath = output   # note: output defaults to inpath (overwrites datafile in-place!!!)
-
-    puts "cvscut in: >#{inpath}< out: >#{outpath}<"
-
-    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
-    puts "columns:"
-    pp columns
-      end
-    end
-
-    puts 'Done.'
-  end ## method self.cut
-
-end # class CsvUtils
+# encoding: utf-8
+
+## check/use class or module ???
+
+
+class CsvUtils
+
+  def self.cut( path, *columns, output: path, sep: ',' )
+
+    inpath  = path
+    outpath = output   # note: output defaults to inpath (overwrites datafile in-place!!!)
+
+    puts "cvscut in: >#{inpath}< out: >#{outpath}<"
+
+    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
+    puts "columns:"
+    pp columns
+
+    csv_options = { sep: sep }
+
+    recs = CsvHash.read( inpath, csv_options )
+
+
+    ## for convenience - make sure parent folders/directories exist
+    FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath ))
+
+
+    ## note:
+    ## todo/fix: add two trailing spaces for pretty printing - why? why not?
+    File.open( outpath, 'w:utf-8' ) do |out|
+      out << csv_row( *columns, sep: sep ).join( sep )   ## for row add headers/columns
+      out << "\n"
+      recs.each do |rec|
+        values = columns.map { |col| rec[col] }   ## find data for column
+        out << csv_row( *values, sep: sep ).join( sep )
+        out << "\n"
+      end
+    end
+
+    puts 'Done.'
+  end ## method self.cut
+
+end # class CsvUtils
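For orientation, a minimal usage sketch of the new cut (not part of the diff; the file path, column names and output path are illustrative, and it assumes the CsvHash / csv_row helpers from the csvreader gem are loaded by csvutils):

    require 'csvutils'

    ## copy selected columns into a new file;
    ## note: without output: the source file is rewritten in-place
    CsvUtils.cut( './E0.csv',
                  'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG',
                  output: './E0_cut.csv' )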
data/lib/csvutils/head.rb
CHANGED
@@ -1,25 +1,22 @@
-# encoding: utf-8
-
-
-class CsvUtils
-
-  ## test or dry run to check if rows can get read/scanned
-  def self.head( path, sep: ',', n: 4 )
-    i = 0
-    csv_options = {
-  end
-
-end # class CsvUtils
+# encoding: utf-8
+
+
+class CsvUtils
+
+  ## test or dry run to check if rows can get read/scanned
+  def self.head( path, sep: ',', n: 4 )
+    i = 0
+    csv_options = { sep: sep }
+
+    CsvHash.foreach( path, csv_options ) do |rec|
+      i += 1
+
+      pp rec
+
+      break if i >= n
+    end
+
+    puts "  #{i} records"
+  end
+
+end # class CsvUtils
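A matching usage sketch for head (illustrative path; n defaults to 4):

    require 'csvutils'

    ## dry run - pretty print the first three hash records keyed by header
    CsvUtils.head( './E0.csv', n: 3 )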
data/lib/csvutils/header.rb
CHANGED
@@ -1,28 +1,16 @@
-# encoding: utf-8
-
-
-class CsvUtils
-
-  def self.header( path, sep: ',', debug: false )   ## use header or headers - or use both (with alias)?
-    #
-    #  "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
-
-    csv_options = { col_sep: sep }
-
-    ## note: do NOT use headers: true to get "plain" data array (no hash records)
-    ##         hash record does NOT work for single line/row
-    rows = CSV.parse( line, csv_options )
-    pp rows if debug
-    rows[0]   ## return first row
-  end # method self.header
-
-end # class CsvUtils
+# encoding: utf-8
+
+
+class CsvUtils
+
+  def self.header( path, sep: ',', debug: false )   ## use header or headers - or use both (with alias)?
+    row = CsvReader.header( path, sep: sep )
+
+    pp row if debug
+    ## e.g.:
+    #  "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
+
+    row
+  end # method self.header
+
+end # class CsvUtils
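The rewritten header delegates to CsvReader.header and returns the first row as an array of column names. A usage sketch (illustrative path; the printed columns depend on the datafile):

    require 'csvutils'

    headers = CsvUtils.header( './E0.csv' )
    pp headers   ## e.g. ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", ...]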
data/lib/csvutils/split.rb
CHANGED
@@ -1,107 +1,106 @@
-# encoding: utf-8
-
-
-
-
-class CsvUtils
-
-  def self.split( path, *columns, sep: ',', &blk )
-
-    puts "cvssplit in: >#{path}<"
-
-    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
-    puts "columns:"
-    pp columns
-    ##
-    header_mapping
-end # class CsvUtils
+# encoding: utf-8
+
+
+
+
+class CsvUtils
+
+  def self.split( path, *columns, sep: ',', &blk )
+
+    puts "cvssplit in: >#{path}<"
+
+    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
+    puts "columns:"
+    pp columns
+
+    ## note: do NOT use headers
+    ##   for easy sorting use "plain" array of array for records
+    csv_options = { sep: sep }
+
+    data = CsvReader.read( path, csv_options )
+
+    ## todo/check: (auto-) strip (remove all leading and trailing spaces)
+    ##               from all values - why? why not?
+    ##   check if CSV.parse has an option for it?
+
+    headers = data.shift   ## remove top array item (that is, row with headers)
+
+    header_mapping = {}
+    headers.each_with_index { | header,i | header_mapping[header]=i }
+    pp header_mapping
+
+    ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
+    column_indices = columns.map { |col| header_mapping[col] }
+    pp column_indices
+
+
+    ###################################################
+    ## note: sort data by columns (before split)
+    data = data.sort do |row1,row2|
+      res = 0
+      column_indices.each do |col|
+        res = row1[col] <=> row2[col]   if res == 0
+      end
+      res
+    end
+
+    chunk = []
+    data.each_with_index do |row,i|
+      chunk << row
+
+      next_row = data[i+1]
+
+      changed = false
+      if next_row.nil?   ## end-of-file
+        changed = true
+      else
+        column_indices.each do |col|
+          if row[col] != next_row[col]
+            changed = true
+            break   ## out of each column_indices loop
+          end
+        end
+      end
+
+      if changed
+        puts "save new chunk:"
+        column_values = column_indices.map {|col| row[col] }
+        pp column_values
+
+        # note: add header(s) row upfront (as first row) to chunk (with unshift)
+        chunk_with_headers = chunk.unshift( headers )
+        if blk
+          yield( column_values, chunk_with_headers )
+        else
+          ## auto-save (write-to-file) by default - why? why not?
+          split_write( path, column_values, chunk_with_headers, sep: sep )
+        end
+
+        chunk = []   ## reset chunk for next batch of records
+      end
+    end
+
+    puts 'Done.'
+  end ## method self.split
+
+
+  def self.split_write( inpath, values, chunk, sep: )
+    basename = File.basename( inpath, '.*' )
+    dirname  = File.dirname( inpath )
+
+    ## check/change invalid filename chars
+    ##   e.g. change 1990/91 to 1990-91
+    extraname = values.map {|value| value.tr('/','-')}.join('~')
+
+    outpath = "#{dirname}/#{basename}_#{extraname}.csv"
+    puts "saving >#{basename}_#{extraname}.csv<..."
+
+    File.open( outpath, 'w:utf-8' ) do |out|
+      chunk.each do |row|
+        out << csv_row( *row, sep: sep ).join( sep )
+        out << "\n"
+      end
+    end
+  end
+
+end # class CsvUtils
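A usage sketch for split (illustrative; the datafile name and the Season column are assumptions, not taken from the diff). By default every chunk is auto-saved next to the input as <basename>_<value>.csv, with / in values changed to -; passing a block hands the chunk over instead of writing a file:

    require 'csvutils'

    ## one file per season value, e.g. bundesliga_2017-18.csv
    CsvUtils.split( './bundesliga.csv', 'Season' )

    ## or handle the chunks yourself (each chunk includes the header row)
    CsvUtils.split( './bundesliga.csv', 'Season' ) do |values, rows|
      puts "#{values.inspect}: #{rows.size - 1} records"
    end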
data/lib/csvutils/stat.rb
CHANGED
@@ -1,86 +1,81 @@
-# encoding: utf-8
-
-
-class CsvUtils
-
-  def self.stat( path, *columns, sep: ',', debug: false )
-
-    csv_options = {
-          nulls[col]
-          nulls[col]
-    puts
-      end
-    end
-  end # method self.stat
-
-end # class CsvUtils
+# encoding: utf-8
+
+
+class CsvUtils
+
+  def self.stat( path, *columns, sep: ',', debug: false )
+
+    csv_options = { sep: sep }
+
+    values = {}
+    nulls  = {}
+    # check 1) nulls/nils (e.g. empty strings ""),
+    #       2) not/appliation or available n/a NA or NaN or ...
+    #       3) missing - e.g. ?
+
+    i=0
+    CsvHash.foreach( path, csv_options ) do |rec|
+      i += 1
+
+      pp rec   if i == 1 && debug
+
+      print '.'  if i % 100 == 0
+
+      ## collect unique values for passed in columns
+      columns.each do |col|
+        value = rec[col]   ## note: value might be nil!!!!!
+
+        values[col] ||= Hash.new(0)
+        values[col][ value ? value : '<nil>' ] +=1
+      end
+
+      ## alway track nulls - why? why not
+      rec.each do |col,value|
+        ## if value.nil?   ## todo/check - nil value possible (not always empty string - why? why not?)
+        ##   puts "[debug] nil value in row:"
+        ##   puts "#{col} = #{value.inspect} : #{value.class.name}"
+        ## end
+
+        if value.nil?
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['nil'] +=1
+        elsif value.empty?
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['empty'] +=1
+        elsif ['na', 'n/a', '-'].include?( value.downcase )
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['na'] +=1
+        elsif value == '?'    ## check for (?) e.g. value.include?( '(?)') - why? why not?
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['?'] +=1
+        else
+          # do nothing; "regular" value
+        end
+      end
+    end
+
+    puts "  #{i} rows"
+    puts
+    puts "  nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):"
+    puts "    #{nulls.inspect}"
+    puts
+
+    ## dump headers first (first row with names of columns)
+    headers = header( path, sep: sep, debug: debug )
+    pp_header( headers )   ## pretty print header columns
+    puts
+
+    if values.any?
+      ## pretty print (pp) / dump unique values for passed in columns
+      values.each do |col,h|
+        puts "  column >#{col}<  #{h.size} unique values:"
+        ## sort by name/value for now (not frequency) - change - why? why not?
+        sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] }
+        sorted_values.each do |rec|
+          puts "    #{rec[1]} x #{rec[0]}"
+        end
+      end
+    end
+  end # method self.stat
+
+end # class CsvUtils
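Finally, a usage sketch for stat (illustrative path and column name): it prints a progress dot every 100 rows, the row count, the per-column nil/empty/na/? counters, the header row, and the unique-value counts for the columns passed in:

    require 'csvutils'

    CsvUtils.stat( './E0.csv', 'FTR' )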