csvutils 0.2.2 → 0.3.0
- checksums.yaml +4 -4
- data/HISTORY.md +3 -3
- data/LICENSE.md +116 -0
- data/Manifest.txt +8 -4
- data/README.md +302 -286
- data/Rakefile +30 -26
- data/bin/csvcut +17 -17
- data/bin/csvhead +17 -17
- data/bin/csvheader +17 -17
- data/bin/csvsplit +17 -17
- data/bin/csvstat +17 -17
- data/{test/data → datasets}/at-austria/AUT.csv +363 -363
- data/{test/data → datasets}/de-deutschland/bundesliga.csv +481 -481
- data/{test/data → datasets}/eng-england/2017-18/E0.csv +381 -381
- data/lib/csvutils.rb +32 -31
- data/lib/csvutils/commands/cut.rb +43 -43
- data/lib/csvutils/commands/head.rb +40 -40
- data/lib/csvutils/commands/header.rb +35 -35
- data/lib/csvutils/commands/split.rb +41 -41
- data/lib/csvutils/commands/stat.rb +41 -41
- data/lib/csvutils/cut.rb +43 -50
- data/lib/csvutils/head.rb +22 -25
- data/lib/csvutils/header.rb +16 -28
- data/lib/csvutils/split.rb +106 -107
- data/lib/csvutils/stat.rb +81 -86
- data/lib/csvutils/test.rb +19 -22
- data/lib/csvutils/utils.rb +29 -13
- data/lib/csvutils/version.rb +24 -24
- data/test/helper.rb +16 -16
- data/test/test_cut.rb +31 -0
- data/test/test_head.rb +30 -0
- data/test/{test_headers.rb → test_header.rb} +50 -50
- data/test/test_misc.rb +44 -44
- data/test/test_split.rb +31 -0
- data/test/test_version.rb +20 -20
- metadata +28 -9
data/lib/csvutils/cut.rb
CHANGED
@@ -1,50 +1,43 @@
-# encoding: utf-8
-
-## check/use class or module ???
-
-
-class CsvUtils
-
-  def self.cut( path, *columns, output: path, sep: ',' )
-
-    inpath  = path
-    outpath = output   # note: output defaults to inpath (overwrites datafile in-place!!!)
-
-    puts "cvscut in: >#{inpath}< out: >#{outpath}<"
-
-    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
-    puts "columns:"
-    pp columns
[… old lines not captured in this view …]
-      end
-    end
-
-    puts 'Done.'
-  end  ## method self.cut
-
-end # class CsvUtils
+# encoding: utf-8
+
+## check/use class or module ???
+
+
+class CsvUtils
+
+  def self.cut( path, *columns, output: path, sep: ',' )
+
+    inpath  = path
+    outpath = output   # note: output defaults to inpath (overwrites datafile in-place!!!)
+
+    puts "cvscut in: >#{inpath}< out: >#{outpath}<"
+
+    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
+    puts "columns:"
+    pp columns
+
+    csv_options = { sep: sep }
+
+    recs = CsvHash.read( inpath, csv_options )
+
+
+    ## for convenience - make sure parent folders/directories exist
+    FileUtils.mkdir_p( File.dirname( outpath ))  unless Dir.exists?( File.dirname( outpath ))
+
+
+    ## note:
+    ##  todo/fix: add two trailing spaces for pretty printing - why? why not?
+    File.open( outpath, 'w:utf-8' ) do |out|
+      out << csv_row( *columns, sep: sep ).join( sep )   ## for row add headers/columns
+      out << "\n"
+      recs.each do |rec|
+        values = columns.map { |col| rec[col] }   ## find data for column
+        out << csv_row( *values, sep: sep ).join( sep )
+        out << "\n"
+      end
+    end
+
+    puts 'Done.'
+  end  ## method self.cut
+
+end # class CsvUtils
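
For orientation, a minimal usage sketch of the new CsvUtils.cut (not part of the diff; the file paths are illustrative and the column names come from the comment in the code above). Note that output: defaults to the input path, so without it the source file gets overwritten in place:

    require 'csvutils'

    ## keep only three columns and write the result to a new file
    CsvUtils.cut( './E0.csv', 'Date', 'HomeTeam', 'AwayTeam',
                  output: './E0_cut.csv' )
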
data/lib/csvutils/head.rb
CHANGED
@@ -1,25 +1,22 @@
-# encoding: utf-8
-
-
-class CsvUtils
-
-  ## test or dry run to check if rows can get read/scanned
-  def self.head( path, sep: ',', n: 4 )
-    i = 0
-    csv_options = {
[… old lines not captured in this view …]
-  end
-
-end # class CsvUtils
+# encoding: utf-8
+
+
+class CsvUtils
+
+  ## test or dry run to check if rows can get read/scanned
+  def self.head( path, sep: ',', n: 4 )
+    i = 0
+    csv_options = { sep: sep }
+
+    CsvHash.foreach( path, csv_options ) do |rec|
+      i += 1
+
+      pp rec
+
+      break if i >= n
+    end
+
+    puts " #{i} records"
+  end
+
+end # class CsvUtils
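
A minimal usage sketch of the new CsvUtils.head (not part of the diff; the path is illustrative): it pretty-prints the first n records (4 by default) as a quick dry run to check that the file parses:

    require 'csvutils'

    CsvUtils.head( './bundesliga.csv' )          ## first 4 records
    CsvUtils.head( './bundesliga.csv', n: 10 )   ## or the first 10
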
data/lib/csvutils/header.rb
CHANGED
@@ -1,28 +1,16 @@
-# encoding: utf-8
-
-
-class CsvUtils
-
-  def self.header( path, sep: ',', debug: false )   ## use header or headers - or use both (with alias)?
-
[… old lines not captured in this view …]
-    #
[… old lines not captured in this view …]
-    # "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
-
-    csv_options = { col_sep: sep }
-
-    ## note: do NOT use headers: true to get "plain" data array (no hash records)
-    ##   hash record does NOT work for single line/row
-    rows = CSV.parse( line, csv_options )
-    pp rows if debug
-    rows[0]   ## return first row
-  end # method self.header
-
-end # class CsvUtils
+# encoding: utf-8
+
+
+class CsvUtils
+
+  def self.header( path, sep: ',', debug: false )   ## use header or headers - or use both (with alias)?
+    row = CsvReader.header( path, sep: sep )
+
+    pp row if debug
+    ## e.g.:
+    #  "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
+
+    row
+  end # method self.header
+
+end # class CsvUtils
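
A usage sketch for the rewritten CsvUtils.header (illustrative path; the sample output follows the header line quoted in the code comment above): the method now delegates to CsvReader.header and simply returns the first row, that is, the column names:

    require 'csvutils'

    columns = CsvUtils.header( './league.csv' )
    pp columns
    ## e.g. ["Country", "League", "Season", "Date", "Time", "Home", "Away", ...]
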
data/lib/csvutils/split.rb
CHANGED
@@ -1,107 +1,106 @@
-# encoding: utf-8
-
-
-
-
-class CsvUtils
-
-  def self.split( path, *columns, sep: ',', &blk )
-
-    puts "cvssplit in: >#{path}<"
-
-    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
-    puts "columns:"
-    pp columns
-
[… old lines not captured in this view …]
-    ##
[… old lines not captured in this view …]
-    header_mapping
[… old lines not captured in this view …]
-end # class CsvUtils
+# encoding: utf-8
+
+
+
+
+class CsvUtils
+
+  def self.split( path, *columns, sep: ',', &blk )
+
+    puts "cvssplit in: >#{path}<"
+
+    ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
+    puts "columns:"
+    pp columns
+
+    ## note: do NOT use headers
+    ##   for easy sorting use "plain" array of array for records
+    csv_options = { sep: sep }
+
+    data = CsvReader.read( path, csv_options )
+
+    ## todo/check: (auto-) strip (remove all leading and trailing spaces)
+    ##   from all values - why? why not?
+    ##   check if CSV.parse has an option for it?
+
+    headers = data.shift    ## remove top array item (that is, row with headers)
+
+    header_mapping = {}
+    headers.each_with_index { | header,i | header_mapping[header]=i }
+    pp header_mapping
+
+    ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
+    column_indices = columns.map { |col| header_mapping[col] }
+    pp column_indices
+
+
+    ###################################################
+    ## note: sort data by columns (before split)
+    data = data.sort do |row1,row2|
+      res = 0
+      column_indices.each do |col|
+        res = row1[col] <=> row2[col]   if res == 0
+      end
+      res
+    end
+
+    chunk = []
+    data.each_with_index do |row,i|
+      chunk << row
+
+      next_row = data[i+1]
+
+      changed = false
+      if next_row.nil?   ## end-of-file
+        changed = true
+      else
+        column_indices.each do |col|
+          if row[col] != next_row[col]
+            changed = true
+            break   ## out of each column_indices loop
+          end
+        end
+      end
+
+      if changed
+        puts "save new chunk:"
+        column_values = column_indices.map {|col| row[col] }
+        pp column_values
+
+        # note: add header(s) row upfront (as first row) to chunk (with unshift)
+        chunk_with_headers = chunk.unshift( headers )
+        if blk
+          yield( column_values, chunk_with_headers )
+        else
+          ## auto-save (write-to-file) by default - why? why not?
+          split_write( path, column_values, chunk_with_headers, sep: sep )
+        end
+
+        chunk = []   ## reset chunk for next batch of records
+      end
+    end
+
+    puts 'Done.'
+  end ## method self.split
+
+
+  def self.split_write( inpath, values, chunk, sep: )
+    basename = File.basename( inpath, '.*' )
+    dirname  = File.dirname( inpath )
+
+    ## check/change invalid filename chars
+    ##   e.g. change 1990/91 to 1990-91
+    extraname = values.map {|value| value.tr('/','-')}.join('~')
+
+    outpath = "#{dirname}/#{basename}_#{extraname}.csv"
+    puts "saving >#{basename}_#{extraname}.csv<..."
+
+    File.open( outpath, 'w:utf-8' ) do |out|
+      chunk.each do |row|
+        out << csv_row( *row, sep: sep ).join( sep )
+        out << "\n"
+      end
+    end
+  end
+
+end # class CsvUtils
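
A minimal usage sketch of the new CsvUtils.split (not part of the diff; the path is illustrative and the 'Season'/'Div' column names are taken from the comment in the code above). Without a block, every chunk is auto-saved next to the input file as <basename>_<column values joined by ~>.csv (with '/' in values changed to '-', e.g. 1990/91 becomes 1990-91); with a block, the chunks are yielded instead:

    require 'csvutils'

    ## auto-save - writes one file per unique (Season, Div) combination
    ##   next to the input file
    CsvUtils.split( './bundesliga.csv', 'Season', 'Div' )

    ## or handle every chunk yourself; the first row of rows is the header row
    CsvUtils.split( './bundesliga.csv', 'Season' ) do |values, rows|
      puts "#{values.inspect} - #{rows.size-1} records"
    end
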
data/lib/csvutils/stat.rb
CHANGED
@@ -1,86 +1,81 @@
-# encoding: utf-8
-
-
-class CsvUtils
-
-  def self.stat( path, *columns, sep: ',', debug: false )
-
-    csv_options = {
[… old lines not captured in this view …]
-          nulls[col]
[… old lines not captured in this view …]
-          nulls[col]
[… old lines not captured in this view …]
-    puts
[… old lines not captured in this view …]
-      end
-    end
-  end # method self.stat
-
-end # class CsvUtils
+# encoding: utf-8
+
+
+class CsvUtils
+
+  def self.stat( path, *columns, sep: ',', debug: false )
+
+    csv_options = { sep: sep }
+
+    values = {}
+    nulls  = {}
+    # check 1) nulls/nils (e.g. empty strings ""),
+    #       2) not/appliation or available n/a NA or NaN or ...
+    #       3) missing - e.g. ?
+
+    i=0
+    CsvHash.foreach( path, csv_options ) do |rec|
+      i += 1
+
+      pp rec   if i == 1 && debug
+
+      print '.'  if i % 100 == 0
+
+      ## collect unique values for passed in columns
+      columns.each do |col|
+        value = rec[col]   ## note: value might be nil!!!!!
+
+        values[col] ||= Hash.new(0)
+        values[col][ value ? value : '<nil>' ] +=1
+      end
+
+      ## alway track nulls - why? why not
+      rec.each do |col,value|
+        ## if value.nil?   ## todo/check - nil value possible (not always empty string - why? why not?)
+        ##   puts "[debug] nil value in row:"
+        ##   puts "#{col} = #{value.inspect} : #{value.class.name}"
+        ## end
+
+        if value.nil?
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['nil'] +=1
+        elsif value.empty?
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['empty'] +=1
+        elsif ['na', 'n/a', '-'].include?( value.downcase )
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['na'] +=1
+        elsif value == '?'   ## check for (?) e.g. value.include?( '(?)') - why? why not?
+          nulls[col] ||= Hash.new(0)
+          nulls[col]['?'] +=1
+        else
+          # do nothing; "regular" value
+        end
+      end
+    end
+
+    puts " #{i} rows"
+    puts
+    puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):"
+    puts "   #{nulls.inspect}"
+    puts
+
+    ## dump headers first (first row with names of columns)
+    headers = header( path, sep: sep, debug: debug )
+    pp_header( headers )   ## pretty print header columns
+    puts
+
+    if values.any?
+      ## pretty print (pp) / dump unique values for passed in columns
+      values.each do |col,h|
+        puts " column >#{col}< #{h.size} unique values:"
+        ## sort by name/value for now (not frequency) - change - why? why not?
+        sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] }
+        sorted_values.each do |rec|
+          puts "   #{rec[1]} x #{rec[0]}"
+        end
+      end
+    end
+  end # method self.stat
+
+end # class CsvUtils
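
And a usage sketch for the new CsvUtils.stat (illustrative path and column names): it scans all rows (printing a progress dot every 100 records), reports per-column counts of nil / empty / n-a / '?' values, echoes the header, and dumps the unique values with frequencies for any columns passed in:

    require 'csvutils'

    CsvUtils.stat( './bundesliga.csv', 'Season', 'Div' )
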