csvutils 0.2.2 → 0.3.0

@@ -1,50 +1,43 @@
- # encoding: utf-8
-
- ## check/use class or module ???
-
-
- class CsvUtils
-
-   def self.cut( path, *columns, output: path, sep: ',' )
-
-     inpath = path
-     outpath = output # note: output defaults to inpath (overwrites datafile in-place!!!)
-
-     puts "cvscut in: >#{inpath}< out: >#{outpath}<"
-
-     ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
-     puts "columns:"
-     pp columns
-
-     text = File.open( inpath, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8
-
-     csv_options = { headers: true,
-                     col_sep: sep }
-
-     table = CSV.parse( text, csv_options )
-
-
-     ## for convenience - make sure parent folders/directories exist
-     FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath ))
-
-     ## use wb mode - why? why not?
-     ##  assumes same encoding as input?
-     ##  fix/todo: better (always) use utf8!!!!
-     ## CSV.open( out_path, 'wb' ) do |out|
-
-     ## use just "regular" File for output - why? why not?
-     ##  downside will not encode comma (for now) if present ("Beethoven, van")
-     ##  all values will be unquoted etc. - keep it simple?
-
-     CSV.open( outpath, 'w:utf-8' ) do |out|
-       out << columns ## for row add headers/columns
-       table.each do |row|
-         values = columns.map { |col| row[col].strip } ## find data for column
-         out << values
-       end
-     end
-
-     puts 'Done.'
-   end ## method self.cut
-
- end # class CsvUtils
+ # encoding: utf-8
+
+ ## check/use class or module ???
+
+
+ class CsvUtils
+
+   def self.cut( path, *columns, output: path, sep: ',' )
+
+     inpath = path
+     outpath = output # note: output defaults to inpath (overwrites datafile in-place!!!)
+
+     puts "cvscut in: >#{inpath}< out: >#{outpath}<"
+
+     ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
+     puts "columns:"
+     pp columns
+
+     csv_options = { sep: sep }
+
+     recs = CsvHash.read( inpath, csv_options )
+
+
+     ## for convenience - make sure parent folders/directories exist
+     FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath ))
+
+
+     ## note:
+     ##  todo/fix: add two trailing spaces for pretty printing - why? why not?
+     File.open( outpath, 'w:utf-8' ) do |out|
+       out << csv_row( *columns, sep: sep ).join( sep ) ## for row add headers/columns
+       out << "\n"
+       recs.each do |rec|
+         values = columns.map { |col| rec[col] } ## find data for column
+         out << csv_row( *values, sep: sep ).join( sep )
+         out << "\n"
+       end
+     end
+
+     puts 'Done.'
+   end ## method self.cut
+
+ end # class CsvUtils
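
Note: a minimal usage sketch of the reworked CsvUtils.cut (not part of the diff) - the file paths are made up for illustration, it assumes the gem gets loaded with require 'csvutils', and the column names come from the example comment above:

    require 'csvutils'

    ## keep only the named columns and write the result to a new file
    ##   (without output: the input file gets overwritten in-place)
    CsvUtils.cut( './stage/E0.csv',
                  'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG',
                  output: './out/E0_scores.csv' )
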
@@ -1,25 +1,22 @@
- # encoding: utf-8
-
-
- class CsvUtils
-
-   ## test or dry run to check if rows can get read/scanned
-   def self.head( path, sep: ',', n: 4 )
-     i = 0
-     csv_options = { headers: true,
-                     col_sep: sep,
-                     external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding!!!
-                   }
-
-     CSV.foreach( path, csv_options ) do |row|
-       i += 1
-
-       pp row
-
-       break if i >= n
-     end
-
-     puts " #{i} rows"
-   end
-
- end # class CsvUtils
+ # encoding: utf-8
+
+
+ class CsvUtils
+
+   ## test or dry run to check if rows can get read/scanned
+   def self.head( path, sep: ',', n: 4 )
+     i = 0
+     csv_options = { sep: sep }
+
+     CsvHash.foreach( path, csv_options ) do |rec|
+       i += 1
+
+       pp rec
+
+       break if i >= n
+     end
+
+     puts " #{i} records"
+   end
+
+ end # class CsvUtils
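
Note: a quick usage sketch for CsvUtils.head (not part of the diff; path made up, assumes require 'csvutils') - a dry run that pretty prints the first n records as hash records:

    require 'csvutils'

    CsvUtils.head( './stage/E0.csv' )                      ## peek at the first 4 records
    CsvUtils.head( './stage/E0.csv', sep: ';', n: 10 )     ## semicolon-separated, first 10
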
@@ -1,28 +1,16 @@
- # encoding: utf-8
-
-
- class CsvUtils
-
-   def self.header( path, sep: ',', debug: false ) ## use header or headers - or use both (with alias)?
-
-     # read first line (only)
-     #  and parse with csv to get header from csv library itself
-     #
-     #  check - if there's an easier or built-in way for the csv library
-
-     line = File.open( path, 'r:utf-8' ) { |f| f.readline }
-
-     pp line if debug
-     ## e.g.:
-     # "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
-
-     csv_options = { col_sep: sep }
-
-     ## note: do NOT use headers: true to get "plain" data array (no hash records)
-     ##  hash record does NOT work for single line/row
-     rows = CSV.parse( line, csv_options )
-     pp rows if debug
-     rows[0] ## return first row
-   end # method self.header
-
- end # class CsvUtils
+ # encoding: utf-8
+
+
+ class CsvUtils
+
+   def self.header( path, sep: ',', debug: false ) ## use header or headers - or use both (with alias)?
+     row = CsvReader.header( path, sep: sep )
+
+     pp row if debug
+     ## e.g.:
+     # "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
+
+     row
+   end # method self.header
+
+ end # class CsvUtils
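
Note: usage sketch for CsvUtils.header (not part of the diff; path made up, assumes require 'csvutils') - returns the first row as a plain array of column names:

    require 'csvutils'

    pp CsvUtils.header( './stage/BRA.csv' )
    ## e.g. ["Country", "League", "Season", "Date", ...]
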
@@ -1,107 +1,106 @@
- # encoding: utf-8
-
-
-
-
- class CsvUtils
-
-   def self.split( path, *columns, sep: ',', &blk )
-
-     puts "cvssplit in: >#{path}<"
-
-     ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
-     puts "columns:"
-     pp columns
-
-     text = File.open( path, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8
-
-     ## note: do NOT use headers
-     ##  for easy sorting use "plain" array of array for records
-     csv_options = { col_sep: sep }
-
-     data = CSV.parse( text, csv_options )
-
-     ## todo/check: (auto-) strip (remove all leading and trailing spaces)
-     ##  from all values - why? why not?
-     ##  check if CSV.parse has an option for it?
-
-     headers = data.shift ## remove top array item (that is, row with headers)
-
-     header_mapping = {}
-     headers.each_with_index { | header,i | header_mapping[header]=i }
-     pp header_mapping
-
-     ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
-     column_indices = columns.map { |col| header_mapping[col] }
-     pp column_indices
-
-
-     ###################################################
-     ## note: sort data by columns (before split)
-     data = data.sort do |row1,row2|
-       res = 0
-       column_indices.each do |col|
-         res = row1[col] <=> row2[col] if res == 0
-       end
-       res
-     end
-
-     chunk = []
-     data.each_with_index do |row,i|
-       chunk << row
-
-       next_row = data[i+1]
-
-       changed = false
-       if next_row.nil? ## end-of-file
-         changed = true
-       else
-         column_indices.each do |col|
-           if row[col] != next_row[col]
-             changed = true
-             break ## out of each column_indices loop
-           end
-         end
-       end
-
-       if changed
-         puts "save new chunk:"
-         column_values = column_indices.map {|col| row[col] }
-         pp column_values
-
-         # note: add header(s) row upfront (as first row) to chunk (with unshift)
-         chunk_with_headers = chunk.unshift( headers )
-         if blk
-           yield( column_values, chunk_with_headers )
-         else
-           ## auto-save (write-to-file) by default - why? why not?
-           split_write( path, column_values, chunk_with_headers )
-         end
-
-         chunk = [] ## reset chunk for next batch of records
-       end
-     end
-
-     puts 'Done.'
-   end ## method self.split
-
-
-   def self.split_write( inpath, values, chunk )
-     basename = File.basename( inpath, '.*' )
-     dirname = File.dirname( inpath )
-
-     ## check/change invalid filename chars
-     ##  e.g. change 1990/91 to 1990-91
-     extraname = values.map {|value| value.tr('/','-')}.join('~')
-
-     outpath = "#{dirname}/#{basename}_#{extraname}.csv"
-     puts "saving >#{basename}_#{extraname}.csv<..."
-
-     CSV.open( outpath, 'w:utf-8' ) do |out|
-       chunk.each do |row|
-         out << row
-       end
-     end
-   end
-
- end # class CsvUtils
+ # encoding: utf-8
+
+
+
+
+ class CsvUtils
+
+   def self.split( path, *columns, sep: ',', &blk )
+
+     puts "cvssplit in: >#{path}<"
+
+     ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
+     puts "columns:"
+     pp columns
+
+     ## note: do NOT use headers
+     ##  for easy sorting use "plain" array of array for records
+     csv_options = { sep: sep }
+
+     data = CsvReader.read( path, csv_options )
+
+     ## todo/check: (auto-) strip (remove all leading and trailing spaces)
+     ##  from all values - why? why not?
+     ##  check if CSV.parse has an option for it?
+
+     headers = data.shift ## remove top array item (that is, row with headers)
+
+     header_mapping = {}
+     headers.each_with_index { | header,i | header_mapping[header]=i }
+     pp header_mapping
+
+     ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
+     column_indices = columns.map { |col| header_mapping[col] }
+     pp column_indices
+
+
+     ###################################################
+     ## note: sort data by columns (before split)
+     data = data.sort do |row1,row2|
+       res = 0
+       column_indices.each do |col|
+         res = row1[col] <=> row2[col] if res == 0
+       end
+       res
+     end
+
+     chunk = []
+     data.each_with_index do |row,i|
+       chunk << row
+
+       next_row = data[i+1]
+
+       changed = false
+       if next_row.nil? ## end-of-file
+         changed = true
+       else
+         column_indices.each do |col|
+           if row[col] != next_row[col]
+             changed = true
+             break ## out of each column_indices loop
+           end
+         end
+       end
+
+       if changed
+         puts "save new chunk:"
+         column_values = column_indices.map {|col| row[col] }
+         pp column_values
+
+         # note: add header(s) row upfront (as first row) to chunk (with unshift)
+         chunk_with_headers = chunk.unshift( headers )
+         if blk
+           yield( column_values, chunk_with_headers )
+         else
+           ## auto-save (write-to-file) by default - why? why not?
+           split_write( path, column_values, chunk_with_headers, sep: sep )
+         end
+
+         chunk = [] ## reset chunk for next batch of records
+       end
+     end
+
+     puts 'Done.'
+   end ## method self.split
+
+
+   def self.split_write( inpath, values, chunk, sep: )
+     basename = File.basename( inpath, '.*' )
+     dirname = File.dirname( inpath )
+
+     ## check/change invalid filename chars
+     ##  e.g. change 1990/91 to 1990-91
+     extraname = values.map {|value| value.tr('/','-')}.join('~')
+
+     outpath = "#{dirname}/#{basename}_#{extraname}.csv"
+     puts "saving >#{basename}_#{extraname}.csv<..."
+
+     File.open( outpath, 'w:utf-8' ) do |out|
+       chunk.each do |row|
+         out << csv_row( *row, sep: sep ).join( sep )
+         out << "\n"
+       end
+     end
+   end
+
+ end # class CsvUtils
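
Note: usage sketch for CsvUtils.split (not part of the diff; path and split columns made up, assumes require 'csvutils') - sorts the records by the passed-in columns and writes one file per value chunk next to the input file (e.g. BRA_2012.csv, BRA_2013.csv, ...), or yields each chunk if a block gets passed in:

    require 'csvutils'

    ## auto-save one file per season
    CsvUtils.split( './stage/BRA.csv', 'Season' )

    ## or handle each chunk yourself (first row is the headers row)
    CsvUtils.split( './stage/BRA.csv', 'Season' ) do |values, rows|
      puts "#{values.join('~')} - #{rows.size-1} records"
    end
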
@@ -1,86 +1,81 @@
- # encoding: utf-8
-
-
- class CsvUtils
-
-   def self.stat( path, *columns, sep: ',', debug: false )
-
-     csv_options = { headers: true,
-                     col_sep: sep,
-                     external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding!!!
-                   }
-
-     values = {}
-     nulls = {}
-     # check 1) nulls/nils (e.g. empty strings ""),
-     #       2) not/appliation or available n/a NA or NaN or ...
-     #       3) missing - e.g. ?
-
-     i=0
-     CSV.foreach( path, csv_options ) do |row|
-       i += 1
-
-       pp row if i == 1 && debug
-
-       print '.' if i % 100 == 0
-
-       ## collect unique values for passed in columns
-       columns.each do |col|
-         value = row[col] ## note: value might be nil!!!!!
-         value = value.strip if value ## use strip - why? why not? report/track trailing spaces?
-
-         values[col] ||= Hash.new(0)
-         values[col][ value ? value : '<nil>' ] +=1
-       end
-
-       ## alway track nulls - why? why not
-       row.each do |col,value|
-         ## if value.nil? ## todo/check - nil value possible (not always empty string - why? why not?)
-         ##   puts "[debug] nil value in row:"
-         ##   puts "#{col} = #{value.inspect} : #{value.class.name}"
-         ## end
-
-         value = value.strip if value ## use strip - why? why not? report/track trailing spaces?
-         if value.nil?
-           nulls[col] ||= Hash.new(0)
-           nulls[col]['nil'] +=1
-         elsif value.empty?
-           nulls[col] ||= Hash.new(0)
-           nulls[col]['empty'] +=1
-         elsif ['na', 'n/a', '-'].include?( value.downcase )
-           nulls[col] ||= Hash.new(0)
-           nulls[col]['na'] +=1
-         elsif value == '?' ## check for (?) e.g. value.include?( '(?)') - why? why not?
-           nulls[col] ||= Hash.new(0)
-           nulls[col]['?'] +=1
-         else
-           # do nothing; "regular" value
-         end
-       end
-     end
-
-     puts " #{i} rows"
-     puts
-     puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):"
-     puts "   #{nulls.inspect}"
-     puts
-
-     ## dump headers first (first row with names of columns)
-     headers = header( path, sep: sep, debug: debug )
-     pp_header( headers ) ## pretty print header columns
-     puts
-
-     if values.any?
-       ## pretty print (pp) / dump unique values for passed in columns
-       values.each do |col,h|
-         puts " column >#{col}< #{h.size} unique values:"
-         ## sort by name/value for now (not frequency) - change - why? why not?
-         sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] }
-         sorted_values.each do |rec|
-           puts "   #{rec[1]} x #{rec[0]}"
-         end
-       end
-     end
-   end # method self.stat
-
- end # class CsvUtils
+ # encoding: utf-8
+
+
+ class CsvUtils
+
+   def self.stat( path, *columns, sep: ',', debug: false )
+
+     csv_options = { sep: sep }
+
+     values = {}
+     nulls = {}
+     # check 1) nulls/nils (e.g. empty strings ""),
+     #       2) not/appliation or available n/a NA or NaN or ...
+     #       3) missing - e.g. ?
+
+     i=0
+     CsvHash.foreach( path, csv_options ) do |rec|
+       i += 1
+
+       pp rec if i == 1 && debug
+
+       print '.' if i % 100 == 0
+
+       ## collect unique values for passed in columns
+       columns.each do |col|
+         value = rec[col] ## note: value might be nil!!!!!
+
+         values[col] ||= Hash.new(0)
+         values[col][ value ? value : '<nil>' ] +=1
+       end
+
+       ## alway track nulls - why? why not
+       rec.each do |col,value|
+         ## if value.nil? ## todo/check - nil value possible (not always empty string - why? why not?)
+         ##   puts "[debug] nil value in row:"
+         ##   puts "#{col} = #{value.inspect} : #{value.class.name}"
+         ## end
+
+         if value.nil?
+           nulls[col] ||= Hash.new(0)
+           nulls[col]['nil'] +=1
+         elsif value.empty?
+           nulls[col] ||= Hash.new(0)
+           nulls[col]['empty'] +=1
+         elsif ['na', 'n/a', '-'].include?( value.downcase )
+           nulls[col] ||= Hash.new(0)
+           nulls[col]['na'] +=1
+         elsif value == '?' ## check for (?) e.g. value.include?( '(?)') - why? why not?
+           nulls[col] ||= Hash.new(0)
+           nulls[col]['?'] +=1
+         else
+           # do nothing; "regular" value
+         end
+       end
+     end
+
+     puts " #{i} rows"
+     puts
+     puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):"
+     puts "   #{nulls.inspect}"
+     puts
+
+     ## dump headers first (first row with names of columns)
+     headers = header( path, sep: sep, debug: debug )
+     pp_header( headers ) ## pretty print header columns
+     puts
+
+     if values.any?
+       ## pretty print (pp) / dump unique values for passed in columns
+       values.each do |col,h|
+         puts " column >#{col}< #{h.size} unique values:"
+         ## sort by name/value for now (not frequency) - change - why? why not?
+         sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] }
+         sorted_values.each do |rec|
+           puts "   #{rec[1]} x #{rec[0]}"
+         end
+       end
+     end
+   end # method self.stat
+
+ end # class CsvUtils
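
Note: usage sketch for CsvUtils.stat (not part of the diff; path and columns made up, assumes require 'csvutils') - counts nils, empty strings, n/a markers and missing (?) values per column and dumps the unique values (with frequency) for the passed-in columns:

    require 'csvutils'

    CsvUtils.stat( './stage/BRA.csv', 'Season', 'Res' )
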