csvutils 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,50 +1,43 @@
1
- # encoding: utf-8
2
-
3
- ## check/use class or module ???
4
-
5
-
6
- class CsvUtils
7
-
8
- def self.cut( path, *columns, output: path, sep: ',' )
9
-
10
- inpath = path
11
- outpath = output # note: output defaults to inpath (overwrites datafile in-place!!!)
12
-
13
- puts "cvscut in: >#{inpath}< out: >#{outpath}<"
14
-
15
- ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
16
- puts "columns:"
17
- pp columns
18
-
19
- text = File.open( inpath, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8
20
-
21
- csv_options = { headers: true,
22
- col_sep: sep }
23
-
24
- table = CSV.parse( text, csv_options )
25
-
26
-
27
- ## for convenience - make sure parent folders/directories exist
28
- FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath ))
29
-
30
- ## use wb mode - why? why not?
31
- ## assumes same encoding as input?
32
- ## fix/todo: better (always) use utf8!!!!
33
- ## CSV.open( out_path, 'wb' ) do |out|
34
-
35
- ## use just "regular" File for output - why? why not?
36
- ## downside will not encode comma (for now) if present ("Beethoven, van")
37
- ## all values will be unquoted etc. - keep it simple?
38
-
39
- CSV.open( outpath, 'w:utf-8' ) do |out|
40
- out << columns ## for row add headers/columns
41
- table.each do |row|
42
- values = columns.map { |col| row[col].strip } ## find data for column
43
- out << values
44
- end
45
- end
46
-
47
- puts 'Done.'
48
- end ## method self.cut
49
-
50
- end # class CsvUtils
1
+ # encoding: utf-8
2
+
3
+ ## check/use class or module ???
4
+
5
+
6
+ class CsvUtils
7
+
8
+ def self.cut( path, *columns, output: path, sep: ',' )
9
+
10
+ inpath = path
11
+ outpath = output # note: output defaults to inpath (overwrites datafile in-place!!!)
12
+
13
+ puts "cvscut in: >#{inpath}< out: >#{outpath}<"
14
+
15
+ ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
16
+ puts "columns:"
17
+ pp columns
18
+
19
+ csv_options = { sep: sep }
20
+
21
+ recs = CsvHash.read( inpath, csv_options )
22
+
23
+
24
+ ## for convenience - make sure parent folders/directories exist
25
+ FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath ))
26
+
27
+
28
+ ## note:
29
+ ## todo/fix: add two trailing spaces for pretty printing - why? why not?
30
+ File.open( outpath, 'w:utf-8' ) do |out|
31
+ out << csv_row( *columns, sep: sep ).join( sep ) ## for row add headers/columns
32
+ out << "\n"
33
+ recs.each do |rec|
34
+ values = columns.map { |col| rec[col] } ## find data for column
35
+ out << csv_row( *values, sep: sep ).join( sep )
36
+ out << "\n"
37
+ end
38
+ end
39
+
40
+ puts 'Done.'
41
+ end ## method self.cut
42
+
43
+ end # class CsvUtils
@@ -1,25 +1,22 @@
1
- # encoding: utf-8
2
-
3
-
4
- class CsvUtils
5
-
6
- ## test or dry run to check if rows can get read/scanned
7
- def self.head( path, sep: ',', n: 4 )
8
- i = 0
9
- csv_options = { headers: true,
10
- col_sep: sep,
11
- external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding!!!
12
- }
13
-
14
- CSV.foreach( path, csv_options ) do |row|
15
- i += 1
16
-
17
- pp row
18
-
19
- break if i >= n
20
- end
21
-
22
- puts " #{i} rows"
23
- end
24
-
25
- end # class CsvUtils
1
+ # encoding: utf-8
2
+
3
+
4
+ class CsvUtils
5
+
6
+ ## test or dry run to check if rows can get read/scanned
7
+ def self.head( path, sep: ',', n: 4 )
8
+ i = 0
9
+ csv_options = { sep: sep }
10
+
11
+ CsvHash.foreach( path, csv_options ) do |rec|
12
+ i += 1
13
+
14
+ pp rec
15
+
16
+ break if i >= n
17
+ end
18
+
19
+ puts " #{i} records"
20
+ end
21
+
22
+ end # class CsvUtils
@@ -1,28 +1,16 @@
1
- # encoding: utf-8
2
-
3
-
4
- class CsvUtils
5
-
6
- def self.header( path, sep: ',', debug: false ) ## use header or headers - or use both (with alias)?
7
-
8
- # read first line (only)
9
- # and parse with csv to get header from csv library itself
10
- #
11
- # check - if there's an easier or built-in way for the csv library
12
-
13
- line = File.open( path, 'r:utf-8' ) { |f| f.readline }
14
-
15
- pp line if debug
16
- ## e.g.:
17
- # "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
18
-
19
- csv_options = { col_sep: sep }
20
-
21
- ## note: do NOT use headers: true to get "plain" data array (no hash records)
22
- ## hash record does NOT work for single line/row
23
- rows = CSV.parse( line, csv_options )
24
- pp rows if debug
25
- rows[0] ## return first row
26
- end # method self.header
27
-
28
- end # class CsvUtils
1
+ # encoding: utf-8
2
+
3
+
4
+ class CsvUtils
5
+
6
+ def self.header( path, sep: ',', debug: false ) ## use header or headers - or use both (with alias)?
7
+ row = CsvReader.header( path, sep: sep )
8
+
9
+ pp row if debug
10
+ ## e.g.:
11
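+ # ["Country", "League", "Season", "Date", "Time", "Home", "Away", "HG", "AG", "Res", "PH", "PD", "PA", "MaxH", "MaxD", "MaxA", "AvgH", "AvgD", "AvgA"] ## note: presumably an array of column names (not the raw header line) - confirm against CsvReader.header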
12
+
13
+ row
14
+ end # method self.header
15
+
16
+ end # class CsvUtils
@@ -1,107 +1,106 @@
1
- # encoding: utf-8
2
-
3
-
4
-
5
-
6
- class CsvUtils
7
-
8
- def self.split( path, *columns, sep: ',', &blk )
9
-
10
- puts "cvssplit in: >#{path}<"
11
-
12
- ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
13
- puts "columns:"
14
- pp columns
15
-
16
- text = File.open( path, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8
17
-
18
- ## note: do NOT use headers
19
- ## for easy sorting use "plain" array of array for records
20
- csv_options = { col_sep: sep }
21
-
22
- data = CSV.parse( text, csv_options )
23
-
24
- ## todo/check: (auto-) strip (remove all leading and trailing spaces)
25
- ## from all values - why? why not?
26
- ## check if CSV.parse has an option for it?
27
-
28
- headers = data.shift ## remove top array item (that is, row with headers)
29
-
30
- header_mapping = {}
31
- headers.each_with_index { | header,i | header_mapping[header]=i }
32
- pp header_mapping
33
-
34
- ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
35
- column_indices = columns.map { |col| header_mapping[col] }
36
- pp column_indices
37
-
38
-
39
- ###################################################
40
- ## note: sort data by columns (before split)
41
- data = data.sort do |row1,row2|
42
- res = 0
43
- column_indices.each do |col|
44
- res = row1[col] <=> row2[col] if res == 0
45
- end
46
- res
47
- end
48
-
49
- chunk = []
50
- data.each_with_index do |row,i|
51
- chunk << row
52
-
53
- next_row = data[i+1]
54
-
55
- changed = false
56
- if next_row.nil? ## end-of-file
57
- changed = true
58
- else
59
- column_indices.each do |col|
60
- if row[col] != next_row[col]
61
- changed = true
62
- break ## out of each column_indices loop
63
- end
64
- end
65
- end
66
-
67
- if changed
68
- puts "save new chunk:"
69
- column_values = column_indices.map {|col| row[col] }
70
- pp column_values
71
-
72
- # note: add header(s) row upfront (as first row) to chunk (with unshift)
73
- chunk_with_headers = chunk.unshift( headers )
74
- if blk
75
- yield( column_values, chunk_with_headers )
76
- else
77
- ## auto-save (write-to-file) by default - why? why not?
78
- split_write( path, column_values, chunk_with_headers )
79
- end
80
-
81
- chunk = [] ## reset chunk for next batch of records
82
- end
83
- end
84
-
85
- puts 'Done.'
86
- end ## method self.split
87
-
88
-
89
- def self.split_write( inpath, values, chunk )
90
- basename = File.basename( inpath, '.*' )
91
- dirname = File.dirname( inpath )
92
-
93
- ## check/change invalid filename chars
94
- ## e.g. change 1990/91 to 1990-91
95
- extraname = values.map {|value| value.tr('/','-')}.join('~')
96
-
97
- outpath = "#{dirname}/#{basename}_#{extraname}.csv"
98
- puts "saving >#{basename}_#{extraname}.csv<..."
99
-
100
- CSV.open( outpath, 'w:utf-8' ) do |out|
101
- chunk.each do |row|
102
- out << row
103
- end
104
- end
105
- end
106
-
107
- end # class CsvUtils
1
+ # encoding: utf-8
2
+
3
+
4
+
5
+
6
+ class CsvUtils
7
+
8
+ def self.split( path, *columns, sep: ',', &blk )
9
+
10
+ puts "cvssplit in: >#{path}<"
11
+
12
+ ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
13
+ puts "columns:"
14
+ pp columns
15
+
16
+ ## note: do NOT use headers
17
+ ## for easy sorting use "plain" array of array for records
18
+ csv_options = { sep: sep }
19
+
20
+ data = CsvReader.read( path, csv_options )
21
+
22
+ ## todo/check: (auto-) strip (remove all leading and trailing spaces)
23
+ ## from all values - why? why not?
24
+ ## check if CSV.parse has an option for it?
25
+
26
+ headers = data.shift ## remove top array item (that is, row with headers)
27
+
28
+ header_mapping = {}
29
+ headers.each_with_index { | header,i | header_mapping[header]=i }
30
+ pp header_mapping
31
+
32
+ ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
33
+ column_indices = columns.map { |col| header_mapping[col] }
34
+ pp column_indices
35
+
36
+
37
+ ###################################################
38
+ ## note: sort data by columns (before split)
39
+ data = data.sort do |row1,row2|
40
+ res = 0
41
+ column_indices.each do |col|
42
+ res = row1[col] <=> row2[col] if res == 0
43
+ end
44
+ res
45
+ end
46
+
47
+ chunk = []
48
+ data.each_with_index do |row,i|
49
+ chunk << row
50
+
51
+ next_row = data[i+1]
52
+
53
+ changed = false
54
+ if next_row.nil? ## end-of-file
55
+ changed = true
56
+ else
57
+ column_indices.each do |col|
58
+ if row[col] != next_row[col]
59
+ changed = true
60
+ break ## out of each column_indices loop
61
+ end
62
+ end
63
+ end
64
+
65
+ if changed
66
+ puts "save new chunk:"
67
+ column_values = column_indices.map {|col| row[col] }
68
+ pp column_values
69
+
70
+ # note: add header(s) row upfront (as first row) to chunk (with unshift)
71
+ chunk_with_headers = chunk.unshift( headers )
72
+ if blk
73
+ yield( column_values, chunk_with_headers )
74
+ else
75
+ ## auto-save (write-to-file) by default - why? why not?
76
+ split_write( path, column_values, chunk_with_headers, sep: sep )
77
+ end
78
+
79
+ chunk = [] ## reset chunk for next batch of records
80
+ end
81
+ end
82
+
83
+ puts 'Done.'
84
+ end ## method self.split
85
+
86
+
87
+ def self.split_write( inpath, values, chunk, sep: )
88
+ basename = File.basename( inpath, '.*' )
89
+ dirname = File.dirname( inpath )
90
+
91
+ ## check/change invalid filename chars
92
+ ## e.g. change 1990/91 to 1990-91
93
+ extraname = values.map {|value| value.tr('/','-')}.join('~')
94
+
95
+ outpath = "#{dirname}/#{basename}_#{extraname}.csv"
96
+ puts "saving >#{basename}_#{extraname}.csv<..."
97
+
98
+ File.open( outpath, 'w:utf-8' ) do |out|
99
+ chunk.each do |row|
100
+ out << csv_row( *row, sep: sep ).join( sep )
101
+ out << "\n"
102
+ end
103
+ end
104
+ end
105
+
106
+ end # class CsvUtils
@@ -1,86 +1,81 @@
1
- # encoding: utf-8
2
-
3
-
4
- class CsvUtils
5
-
6
- def self.stat( path, *columns, sep: ',', debug: false )
7
-
8
- csv_options = { headers: true,
9
- col_sep: sep,
10
- external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding!!!
11
- }
12
-
13
- values = {}
14
- nulls = {}
15
- # check 1) nulls/nils (e.g. empty strings ""),
16
- # 2) not applicable or not available - e.g. n/a, NA, NaN or ...
17
- # 3) missing - e.g. ?
18
-
19
- i=0
20
- CSV.foreach( path, csv_options ) do |row|
21
- i += 1
22
-
23
- pp row if i == 1 && debug
24
-
25
- print '.' if i % 100 == 0
26
-
27
- ## collect unique values for passed in columns
28
- columns.each do |col|
29
- value = row[col] ## note: value might be nil!!!!!
30
- value = value.strip if value ## use strip - why? why not? report/track trailing spaces?
31
-
32
- values[col] ||= Hash.new(0)
33
- values[col][ value ? value : '<nil>' ] +=1
34
- end
35
-
36
- ## always track nulls - why? why not?
37
- row.each do |col,value|
38
- ## if value.nil? ## todo/check - nil value possible (not always empty string - why? why not?)
39
- ## puts "[debug] nil value in row:"
40
- ## puts "#{col} = #{value.inspect} : #{value.class.name}"
41
- ## end
42
-
43
- value = value.strip if value ## use strip - why? why not? report/track trailing spaces?
44
- if value.nil?
45
- nulls[col] ||= Hash.new(0)
46
- nulls[col]['nil'] +=1
47
- elsif value.empty?
48
- nulls[col] ||= Hash.new(0)
49
- nulls[col]['empty'] +=1
50
- elsif ['na', 'n/a', '-'].include?( value.downcase )
51
- nulls[col] ||= Hash.new(0)
52
- nulls[col]['na'] +=1
53
- elsif value == '?' ## check for (?) e.g. value.include?( '(?)') - why? why not?
54
- nulls[col] ||= Hash.new(0)
55
- nulls[col]['?'] +=1
56
- else
57
- # do nothing; "regular" value
58
- end
59
- end
60
- end
61
-
62
- puts " #{i} rows"
63
- puts
64
- puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):"
65
- puts " #{nulls.inspect}"
66
- puts
67
-
68
- ## dump headers first (first row with names of columns)
69
- headers = header( path, sep: sep, debug: debug )
70
- pp_header( headers ) ## pretty print header columns
71
- puts
72
-
73
- if values.any?
74
- ## pretty print (pp) / dump unique values for passed in columns
75
- values.each do |col,h|
76
- puts " column >#{col}< #{h.size} unique values:"
77
- ## sort by name/value for now (not frequency) - change - why? why not?
78
- sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] }
79
- sorted_values.each do |rec|
80
- puts " #{rec[1]} x #{rec[0]}"
81
- end
82
- end
83
- end
84
- end # method self.stat
85
-
86
- end # class CsvUtils
1
+ # encoding: utf-8
2
+
3
+
4
+ class CsvUtils
5
+
6
+ def self.stat( path, *columns, sep: ',', debug: false )
7
+
8
+ csv_options = { sep: sep }
9
+
10
+ values = {}
11
+ nulls = {}
12
+ # check 1) nulls/nils (e.g. empty strings ""),
13
+ # 2) not applicable or not available - e.g. n/a, NA, NaN or ...
14
+ # 3) missing - e.g. ?
15
+
16
+ i=0
17
+ CsvHash.foreach( path, csv_options ) do |rec|
18
+ i += 1
19
+
20
+ pp rec if i == 1 && debug
21
+
22
+ print '.' if i % 100 == 0
23
+
24
+ ## collect unique values for passed in columns
25
+ columns.each do |col|
26
+ value = rec[col] ## note: value might be nil!!!!!
27
+
28
+ values[col] ||= Hash.new(0)
29
+ values[col][ value ? value : '<nil>' ] +=1
30
+ end
31
+
32
+ ## always track nulls - why? why not?
33
+ rec.each do |col,value|
34
+ ## if value.nil? ## todo/check - nil value possible (not always empty string - why? why not?)
35
+ ## puts "[debug] nil value in row:"
36
+ ## puts "#{col} = #{value.inspect} : #{value.class.name}"
37
+ ## end
38
+
39
+ if value.nil?
40
+ nulls[col] ||= Hash.new(0)
41
+ nulls[col]['nil'] +=1
42
+ elsif value.empty?
43
+ nulls[col] ||= Hash.new(0)
44
+ nulls[col]['empty'] +=1
45
+ elsif ['na', 'n/a', '-'].include?( value.downcase )
46
+ nulls[col] ||= Hash.new(0)
47
+ nulls[col]['na'] +=1
48
+ elsif value == '?' ## check for (?) e.g. value.include?( '(?)') - why? why not?
49
+ nulls[col] ||= Hash.new(0)
50
+ nulls[col]['?'] +=1
51
+ else
52
+ # do nothing; "regular" value
53
+ end
54
+ end
55
+ end
56
+
57
+ puts " #{i} rows"
58
+ puts
59
+ puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):"
60
+ puts " #{nulls.inspect}"
61
+ puts
62
+
63
+ ## dump headers first (first row with names of columns)
64
+ headers = header( path, sep: sep, debug: debug )
65
+ pp_header( headers ) ## pretty print header columns
66
+ puts
67
+
68
+ if values.any?
69
+ ## pretty print (pp) / dump unique values for passed in columns
70
+ values.each do |col,h|
71
+ puts " column >#{col}< #{h.size} unique values:"
72
+ ## sort by name/value for now (not frequency) - change - why? why not?
73
+ sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] }
74
+ sorted_values.each do |rec|
75
+ puts " #{rec[1]} x #{rec[0]}"
76
+ end
77
+ end
78
+ end
79
+ end # method self.stat
80
+
81
+ end # class CsvUtils