csvutils 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/HISTORY.md +3 -3
- data/LICENSE.md +116 -0
- data/Manifest.txt +8 -4
- data/README.md +302 -286
- data/Rakefile +30 -26
- data/bin/csvcut +17 -17
- data/bin/csvhead +17 -17
- data/bin/csvheader +17 -17
- data/bin/csvsplit +17 -17
- data/bin/csvstat +17 -17
- data/{test/data → datasets}/at-austria/AUT.csv +363 -363
- data/{test/data → datasets}/de-deutschland/bundesliga.csv +481 -481
- data/{test/data → datasets}/eng-england/2017-18/E0.csv +381 -381
- data/lib/csvutils.rb +32 -31
- data/lib/csvutils/commands/cut.rb +43 -43
- data/lib/csvutils/commands/head.rb +40 -40
- data/lib/csvutils/commands/header.rb +35 -35
- data/lib/csvutils/commands/split.rb +41 -41
- data/lib/csvutils/commands/stat.rb +41 -41
- data/lib/csvutils/cut.rb +43 -50
- data/lib/csvutils/head.rb +22 -25
- data/lib/csvutils/header.rb +16 -28
- data/lib/csvutils/split.rb +106 -107
- data/lib/csvutils/stat.rb +81 -86
- data/lib/csvutils/test.rb +19 -22
- data/lib/csvutils/utils.rb +29 -13
- data/lib/csvutils/version.rb +24 -24
- data/test/helper.rb +16 -16
- data/test/test_cut.rb +31 -0
- data/test/test_head.rb +30 -0
- data/test/{test_headers.rb → test_header.rb} +50 -50
- data/test/test_misc.rb +44 -44
- data/test/test_split.rb +31 -0
- data/test/test_version.rb +20 -20
- metadata +28 -9
data/lib/csvutils/cut.rb
CHANGED
@@ -1,50 +1,43 @@
-# encoding: utf-8
-
-## check/use class or module ???
-
-
-class CsvUtils
-
-  def self.cut( path, *columns, output: path, sep: ',' )
-
-    inpath  = path
-    outpath = output   # note: output defaults to inpath (overwrites datafile in-place!!!)
-
-    puts "cvscut in: >#{inpath}< out: >#{outpath}<"
-
-    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
-    puts "columns:"
-    pp columns
-      end
-    end
-
-    puts 'Done.'
-  end ## method self.cut
-
-end # class CsvUtils
+# encoding: utf-8
+
+## check/use class or module ???
+
+
+class CsvUtils
+
+  def self.cut( path, *columns, output: path, sep: ',' )
+
+    inpath  = path
+    outpath = output   # note: output defaults to inpath (overwrites datafile in-place!!!)
+
+    puts "cvscut in: >#{inpath}< out: >#{outpath}<"
+
+    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
+    puts "columns:"
+    pp columns
+
+    csv_options = { sep: sep }
+
+    recs = CsvHash.read( inpath, csv_options )
+
+
+    ## for convenience - make sure parent folders/directories exist
+    FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath ))
+
+
+    ## note:
+    ## todo/fix: add two trailing spaces for pretty printing - why? why not?
+    File.open( outpath, 'w:utf-8' ) do |out|
+      out << csv_row( *columns, sep: sep ).join( sep )   ## for row add headers/columns
+      out << "\n"
+      recs.each do |rec|
+        values = columns.map { |col| rec[col] }   ## find data for column
+        out << csv_row( *values, sep: sep ).join( sep )
+        out << "\n"
+      end
+    end
+
+    puts 'Done.'
+  end ## method self.cut
+
+end # class CsvUtils
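For orientation, a minimal usage sketch of the new cut (not part of the diff; the file path, column names and output path are illustrative, and it assumes the CsvHash / csv_row helpers from the csvreader gem are loaded by csvutils):

    require 'csvutils'

    ## copy selected columns into a new file;
    ## note: without output: the source file is rewritten in-place
    CsvUtils.cut( './E0.csv',
                  'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG',
                  output: './E0_cut.csv' )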
data/lib/csvutils/head.rb
CHANGED
@@ -1,25 +1,22 @@
-# encoding: utf-8
-
-
-class CsvUtils
-
-  ## test or dry run to check if rows can get read/scanned
-  def self.head( path, sep: ',', n: 4 )
-    i = 0
-    csv_options = {
-  end
-
-end # class CsvUtils
+# encoding: utf-8
+
+
+class CsvUtils
+
+  ## test or dry run to check if rows can get read/scanned
+  def self.head( path, sep: ',', n: 4 )
+    i = 0
+    csv_options = { sep: sep }
+
+    CsvHash.foreach( path, csv_options ) do |rec|
+      i += 1
+
+      pp rec
+
+      break if i >= n
+    end
+
+    puts "  #{i} records"
+  end
+
+end # class CsvUtils
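A matching usage sketch for head (illustrative path; n defaults to 4):

    require 'csvutils'

    ## dry run - pretty print the first three hash records keyed by header
    CsvUtils.head( './E0.csv', n: 3 )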
data/lib/csvutils/header.rb
CHANGED
@@ -1,28 +1,16 @@
-# encoding: utf-8
-
-
-class CsvUtils
-
-  def self.header( path, sep: ',', debug: false )   ## use header or headers - or use both (with alias)?
-    #
-    #  "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
-
-    csv_options = { col_sep: sep }
-
-    ## note: do NOT use headers: true to get "plain" data array (no hash records)
-    ##         hash record does NOT work for single line/row
-    rows = CSV.parse( line, csv_options )
-    pp rows if debug
-    rows[0]   ## return first row
-  end # method self.header
-
-end # class CsvUtils
+# encoding: utf-8
+
+
+class CsvUtils
+
+  def self.header( path, sep: ',', debug: false )   ## use header or headers - or use both (with alias)?
+    row = CsvReader.header( path, sep: sep )
+
+    pp row if debug
+    ## e.g.:
+    #  "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
+
+    row
+  end # method self.header
+
+end # class CsvUtils
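The rewritten header delegates to CsvReader.header and returns the first row as an array of column names. A usage sketch (illustrative path; the printed columns depend on the datafile):

    require 'csvutils'

    headers = CsvUtils.header( './E0.csv' )
    pp headers   ## e.g. ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", ...]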
data/lib/csvutils/split.rb
CHANGED
@@ -1,107 +1,106 @@
-# encoding: utf-8
-
-
-
-
-class CsvUtils
-
-  def self.split( path, *columns, sep: ',', &blk )
-
-    puts "cvssplit in: >#{path}<"
-
-    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
-    puts "columns:"
-    pp columns
-    ##
-    header_mapping
-end # class CsvUtils
+# encoding: utf-8
+
+
+
+
+class CsvUtils
+
+  def self.split( path, *columns, sep: ',', &blk )
+
+    puts "cvssplit in: >#{path}<"
+
+    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
+    puts "columns:"
+    pp columns
+
+    ## note: do NOT use headers
+    ##   for easy sorting use "plain" array of array for records
+    csv_options = { sep: sep }
+
+    data = CsvReader.read( path, csv_options )
+
+    ## todo/check: (auto-) strip (remove all leading and trailing spaces)
+    ##               from all values - why? why not?
+    ##   check if CSV.parse has an option for it?
+
+    headers = data.shift   ## remove top array item (that is, row with headers)
+
+    header_mapping = {}
+    headers.each_with_index { | header,i | header_mapping[header]=i }
+    pp header_mapping
+
+    ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
+    column_indices = columns.map { |col| header_mapping[col] }
+    pp column_indices
+
+
+    ###################################################
+    ## note: sort data by columns (before split)
+    data = data.sort do |row1,row2|
+      res = 0
+      column_indices.each do |col|
+        res = row1[col] <=> row2[col]   if res == 0
+      end
+      res
+    end
+
+    chunk = []
+    data.each_with_index do |row,i|
+      chunk << row
+
+      next_row = data[i+1]
+
+      changed = false
+      if next_row.nil?   ## end-of-file
+        changed = true
+      else
+        column_indices.each do |col|
+          if row[col] != next_row[col]
+            changed = true
+            break   ## out of each column_indices loop
+          end
+        end
+      end
+
+      if changed
+        puts "save new chunk:"
+        column_values = column_indices.map {|col| row[col] }
+        pp column_values
+
+        # note: add header(s) row upfront (as first row) to chunk (with unshift)
+        chunk_with_headers = chunk.unshift( headers )
+        if blk
+          yield( column_values, chunk_with_headers )
+        else
+          ## auto-save (write-to-file) by default - why? why not?
+          split_write( path, column_values, chunk_with_headers, sep: sep )
+        end
+
+        chunk = []   ## reset chunk for next batch of records
+      end
+    end
+
+    puts 'Done.'
+  end ## method self.split
+
+
+  def self.split_write( inpath, values, chunk, sep: )
+    basename = File.basename( inpath, '.*' )
+    dirname  = File.dirname( inpath )
+
+    ## check/change invalid filename chars
+    ##   e.g. change 1990/91 to 1990-91
+    extraname = values.map {|value| value.tr('/','-')}.join('~')
+
+    outpath = "#{dirname}/#{basename}_#{extraname}.csv"
+    puts "saving >#{basename}_#{extraname}.csv<..."
+
+    File.open( outpath, 'w:utf-8' ) do |out|
+      chunk.each do |row|
+        out << csv_row( *row, sep: sep ).join( sep )
+        out << "\n"
+      end
+    end
+  end
+
+end # class CsvUtils
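A usage sketch for split (illustrative; the datafile name and the Season column are assumptions, not taken from the diff). By default every chunk is auto-saved next to the input as <basename>_<value>.csv, with / in values changed to -; passing a block hands the chunk over instead of writing a file:

    require 'csvutils'

    ## one file per season value, e.g. bundesliga_2017-18.csv
    CsvUtils.split( './bundesliga.csv', 'Season' )

    ## or handle the chunks yourself (each chunk includes the header row)
    CsvUtils.split( './bundesliga.csv', 'Season' ) do |values, rows|
      puts "#{values.inspect}: #{rows.size - 1} records"
    end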
data/lib/csvutils/stat.rb
CHANGED
@@ -1,86 +1,81 @@
-# encoding: utf-8
-
-
-class CsvUtils
-
-  def self.stat( path, *columns, sep: ',', debug: false )
-
-    csv_options = {
-          nulls[col]
-          nulls[col]
-    puts
-      end
-    end
-  end # method self.stat
-
-end # class CsvUtils
+# encoding: utf-8
+
+
+class CsvUtils
+
+  def self.stat( path, *columns, sep: ',', debug: false )
+
+    csv_options = { sep: sep }
+
+    values = {}
+    nulls  = {}
+    # check 1) nulls/nils (e.g. empty strings ""),
+    #       2) not/appliation or available n/a NA or NaN or ...
+    #       3) missing - e.g. ?
+
+    i=0
+    CsvHash.foreach( path, csv_options ) do |rec|
+      i += 1
+
+      pp rec   if i == 1 && debug
+
+      print '.'  if i % 100 == 0
+
+      ## collect unique values for passed in columns
+      columns.each do |col|
+        value = rec[col]   ## note: value might be nil!!!!!
+
+        values[col] ||= Hash.new(0)
+        values[col][ value ? value : '<nil>' ] +=1
+      end
+
+      ## alway track nulls - why? why not
+      rec.each do |col,value|
+        ## if value.nil?   ## todo/check - nil value possible (not always empty string - why? why not?)
+        ##   puts "[debug] nil value in row:"
+        ##   puts "#{col} = #{value.inspect} : #{value.class.name}"
+        ## end
+
+        if value.nil?
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['nil'] +=1
+        elsif value.empty?
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['empty'] +=1
+        elsif ['na', 'n/a', '-'].include?( value.downcase )
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['na'] +=1
+        elsif value == '?'    ## check for (?) e.g. value.include?( '(?)') - why? why not?
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['?'] +=1
+        else
+          # do nothing; "regular" value
+        end
+      end
+    end
+
+    puts "  #{i} rows"
+    puts
+    puts "  nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):"
+    puts "    #{nulls.inspect}"
+    puts
+
+    ## dump headers first (first row with names of columns)
+    headers = header( path, sep: sep, debug: debug )
+    pp_header( headers )   ## pretty print header columns
+    puts
+
+    if values.any?
+      ## pretty print (pp) / dump unique values for passed in columns
+      values.each do |col,h|
+        puts "  column >#{col}<  #{h.size} unique values:"
+        ## sort by name/value for now (not frequency) - change - why? why not?
+        sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] }
+        sorted_values.each do |rec|
+          puts "    #{rec[1]} x #{rec[0]}"
+        end
+      end
+    end
+  end # method self.stat
+
+end # class CsvUtils
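Finally, a usage sketch for stat (illustrative path and column name): it prints a progress dot every 100 rows, the row count, the per-column nil/empty/na/? counters, the header row, and the unique-value counts for the columns passed in:

    require 'csvutils'

    CsvUtils.stat( './E0.csv', 'FTR' )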