csvutils 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +3 -0
- data/README.md +1 -1
- data/Rakefile +1 -1
- data/lib/csvutils.rb +109 -0
- data/lib/csvutils/cut.rb +47 -47
- data/lib/csvutils/head.rb +25 -0
- data/lib/csvutils/header.rb +28 -0
- data/lib/csvutils/split.rb +107 -107
- data/lib/csvutils/stat.rb +86 -0
- data/lib/csvutils/test.rb +1 -1
- data/lib/csvutils/utils.rb +13 -29
- data/lib/csvutils/version.rb +1 -1
- data/test/data/at-austria/AUT.csv +363 -363
- data/test/data/eng-england/2017-18/E0.csv +381 -381
- data/test/test_headers.rb +50 -41
- data/test/test_misc.rb +12 -1
- data/test/test_version.rb +20 -20
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 435468b42345511466981b9f470e39e1dc78bfea
|
4
|
+
data.tar.gz: f2b4c42ec30da76fe929942d6d0706420dd63f63
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b1edc99a8b3e7257a899df34ed2b144467c1268f21877676194b138e59113d69733990ba7a0c70b1a4c51cb2861dff6e3c429cbc9f6be4a0a83c9f08a52fd22
|
7
|
+
data.tar.gz: b57408a1a5d2743649538c79a269480ce2a20d21df483c8133772bbb46589d7dbff6d1c9b22d4ec329bf331f70bd5a207650bedfd83821ba2cdeb7787595621b
|
data/Manifest.txt
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# csvutils - tools 'n' scripts for working with comma-separated values (csv) datafiles - the world's most popular tabular data interchange format in text
|
1
|
+
# csvutils - tools 'n' scripts for working with comma-separated values (csv) datafiles - the world's most popular tabular data interchange format in text
|
2
2
|
|
3
3
|
|
4
4
|
* home :: [github.com/csv11/csvutils](https://github.com/csv11/csvutils)
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Hoe.spec 'csvutils' do
|
|
11
11
|
self.urls = ['https://github.com/csv11/csvutils']
|
12
12
|
|
13
13
|
self.author = 'Gerald Bauer'
|
14
|
-
self.email = '
|
14
|
+
self.email = 'wwwmake@googlegroups.com'
|
15
15
|
|
16
16
|
# switch extension to .markdown for github formatting
|
17
17
|
self.readme_file = 'README.md'
|
data/lib/csvutils.rb
CHANGED
@@ -4,6 +4,8 @@ require 'pp'
|
|
4
4
|
require 'csv'
|
5
5
|
require 'date'
|
6
6
|
require 'fileutils'
|
7
|
+
require 'optparse'
|
8
|
+
|
7
9
|
|
8
10
|
|
9
11
|
###
|
@@ -13,7 +15,114 @@ require 'csvutils/utils'
|
|
13
15
|
require 'csvutils/split'
|
14
16
|
require 'csvutils/cut'
|
15
17
|
require 'csvutils/test'
|
18
|
+
require 'csvutils/stat'
|
19
|
+
require 'csvutils/header'
|
20
|
+
require 'csvutils/head'
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
class CsvTool
|
26
|
+
|
27
|
+
## command line tools
|
28
|
+
def self.header( args )
|
29
|
+
|
30
|
+
config = {}
|
31
|
+
|
32
|
+
parser = OptionParser.new do |opts|
|
33
|
+
opts.banner = "Usage: csvheader [OPTS] datafile ..."
|
34
|
+
|
35
|
+
opts.on("-h", "--help", "Prints this help") do
|
36
|
+
puts opts
|
37
|
+
exit
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
parser.parse!( args )
|
42
|
+
|
43
|
+
## pp config
|
44
|
+
## pp args
|
45
|
+
|
46
|
+
args.each do |arg|
|
47
|
+
path = arg
|
48
|
+
|
49
|
+
puts "== #{File.basename(path)} (#{File.dirname(path)}) =="
|
50
|
+
puts
|
51
|
+
CsvUtils.pp_header( CsvUtils.header( path ) )
|
52
|
+
puts
|
53
|
+
end # each arg
|
54
|
+
end
|
55
|
+
|
56
|
+
|
57
|
+
def self.stat( args )
|
58
|
+
|
59
|
+
config = { columns: [] }
|
60
|
+
|
61
|
+
parser = OptionParser.new do |opts|
|
62
|
+
opts.banner = "Usage: csvstat [OPTS] datafile ..."
|
63
|
+
|
64
|
+
opts.on("-c", "--columns=COLUMNS", "Name of header columns" ) do |columns|
|
65
|
+
config[:columns] = columns.split(/[,|;]/) ## allow different separators
|
66
|
+
end
|
67
|
+
|
68
|
+
opts.on("-h", "--help", "Prints this help") do
|
69
|
+
puts opts
|
70
|
+
exit
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
parser.parse!( args )
|
75
|
+
|
76
|
+
## pp config
|
77
|
+
## pp args
|
78
|
+
|
79
|
+
args.each do |arg|
|
80
|
+
path = arg
|
81
|
+
columns = config[:columns]
|
82
|
+
|
83
|
+
puts "== #{File.basename(path)} (#{File.dirname(path)}) =="
|
84
|
+
puts
|
85
|
+
CsvUtils.stat( path, *columns )
|
86
|
+
puts
|
87
|
+
end # each arg
|
88
|
+
end
|
89
|
+
|
90
|
+
|
91
|
+
def self.head( args )
|
92
|
+
|
93
|
+
config = { n: 4 }
|
94
|
+
|
95
|
+
parser = OptionParser.new do |opts|
|
96
|
+
opts.banner = "Usage: csvhead [OPTS] datafile ..."
|
97
|
+
|
98
|
+
opts.on("-n", "--num=NUM", "Number of rows" ) do |num|
|
99
|
+
config[:n] = num.to_i
|
100
|
+
end
|
101
|
+
|
102
|
+
opts.on("-h", "--help", "Prints this help") do
|
103
|
+
puts opts
|
104
|
+
exit
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
parser.parse!( args )
|
109
|
+
|
110
|
+
## pp config
|
111
|
+
## pp args
|
112
|
+
|
113
|
+
args.each do |arg|
|
114
|
+
path = arg
|
115
|
+
n = config[:n]
|
116
|
+
|
117
|
+
puts "== #{File.basename(path)} (#{File.dirname(path)}) =="
|
118
|
+
puts
|
119
|
+
CsvUtils.head( path, n: n )
|
120
|
+
puts
|
121
|
+
end # each arg
|
122
|
+
end
|
123
|
+
|
16
124
|
|
125
|
+
end # class CsvTool
|
17
126
|
|
18
127
|
|
19
128
|
puts CsvUtils.banner # say hello
|
data/lib/csvutils/cut.rb
CHANGED
@@ -1,47 +1,47 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
## check/use class or module ???
|
4
|
-
|
5
|
-
|
6
|
-
class CsvUtils
|
7
|
-
|
8
|
-
def self.cut( inpath, outpath, *columns, sep: ',' )
|
9
|
-
|
10
|
-
puts "cvscut in: >#{inpath}< out: >#{outpath}<"
|
11
|
-
|
12
|
-
## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
|
13
|
-
puts "columns:"
|
14
|
-
pp columns
|
15
|
-
|
16
|
-
text = File.open( inpath, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8
|
17
|
-
|
18
|
-
csv_options = { headers: true,
|
19
|
-
col_sep: sep }
|
20
|
-
|
21
|
-
table = CSV.parse( text, csv_options )
|
22
|
-
|
23
|
-
|
24
|
-
## for convenience - make sure parent folders/directories exist
|
25
|
-
FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath ))
|
26
|
-
|
27
|
-
## use wb mode - why? why not?
|
28
|
-
## assumes same encoding as input?
|
29
|
-
## fix/todo: better (always) use utf8!!!!
|
30
|
-
## CSV.open( out_path, 'wb' ) do |out|
|
31
|
-
|
32
|
-
## use just "regular" File for output - why? why not?
|
33
|
-
## downside will not encode comma (for now) if present ("Beethoven, van")
|
34
|
-
## all values will be unquoted etc. - keep it simple?
|
35
|
-
|
36
|
-
CSV.open( outpath, 'w:utf-8' ) do |out|
|
37
|
-
out << columns ## for row add headers/columns
|
38
|
-
table.each do |row|
|
39
|
-
values = columns.map { |col| row[col].strip } ## find data for column
|
40
|
-
out << values
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
puts 'Done.'
|
45
|
-
end ## method self.cut
|
46
|
-
|
47
|
-
end # class CsvUtils
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
## check/use class or module ???
|
4
|
+
|
5
|
+
|
6
|
+
class CsvUtils
|
7
|
+
|
8
|
+
def self.cut( inpath, outpath, *columns, sep: ',' )
|
9
|
+
|
10
|
+
puts "cvscut in: >#{inpath}< out: >#{outpath}<"
|
11
|
+
|
12
|
+
## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
|
13
|
+
puts "columns:"
|
14
|
+
pp columns
|
15
|
+
|
16
|
+
text = File.open( inpath, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8
|
17
|
+
|
18
|
+
csv_options = { headers: true,
|
19
|
+
col_sep: sep }
|
20
|
+
|
21
|
+
table = CSV.parse( text, csv_options )
|
22
|
+
|
23
|
+
|
24
|
+
## for convenience - make sure parent folders/directories exist
|
25
|
+
FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath ))
|
26
|
+
|
27
|
+
## use wb mode - why? why not?
|
28
|
+
## assumes same encoding as input?
|
29
|
+
## fix/todo: better (always) use utf8!!!!
|
30
|
+
## CSV.open( out_path, 'wb' ) do |out|
|
31
|
+
|
32
|
+
## use just "regular" File for output - why? why not?
|
33
|
+
## downside will not encode comma (for now) if present ("Beethoven, van")
|
34
|
+
## all values will be unquoted etc. - keep it simple?
|
35
|
+
|
36
|
+
CSV.open( outpath, 'w:utf-8' ) do |out|
|
37
|
+
out << columns ## for row add headers/columns
|
38
|
+
table.each do |row|
|
39
|
+
values = columns.map { |col| row[col].strip } ## find data for column
|
40
|
+
out << values
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
puts 'Done.'
|
45
|
+
end ## method self.cut
|
46
|
+
|
47
|
+
end # class CsvUtils
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
class CsvUtils
|
5
|
+
|
6
|
+
## test or dry run to check if rows can get read/scanned
|
7
|
+
def self.head( path, sep: ',', n: 4 )
|
8
|
+
i = 0
|
9
|
+
csv_options = { headers: true,
|
10
|
+
col_sep: sep,
|
11
|
+
external_encoding: 'utf-8' ## note: always (auto-)add utf-8 external encoding!!!
|
12
|
+
}
|
13
|
+
|
14
|
+
CSV.foreach( path, csv_options ) do |row|
|
15
|
+
i += 1
|
16
|
+
|
17
|
+
pp row
|
18
|
+
|
19
|
+
break if i >= n
|
20
|
+
end
|
21
|
+
|
22
|
+
puts " #{i} rows"
|
23
|
+
end
|
24
|
+
|
25
|
+
end # class CsvUtils
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
class CsvUtils
|
5
|
+
|
6
|
+
def self.header( path, sep: ',', debug: false ) ## use header or headers - or use both (with alias)?
|
7
|
+
|
8
|
+
# read first line (only)
|
9
|
+
# and parse with csv to get header from csv library itself
|
10
|
+
#
|
11
|
+
# check - if there's an easier or built-in way for the csv library
|
12
|
+
|
13
|
+
line = File.open( path, 'r:utf-8' ) { |f| f.readline }
|
14
|
+
|
15
|
+
pp line if debug
|
16
|
+
## e.g.:
|
17
|
+
# "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"
|
18
|
+
|
19
|
+
csv_options = { col_sep: sep }
|
20
|
+
|
21
|
+
## note: do NOT use headers: true to get "plain" data array (no hash records)
|
22
|
+
## hash record does NOT work for single line/row
|
23
|
+
rows = CSV.parse( line, csv_options )
|
24
|
+
pp rows if debug
|
25
|
+
rows[0] ## return first row
|
26
|
+
end # method self.header
|
27
|
+
|
28
|
+
end # class CsvUtils
|
data/lib/csvutils/split.rb
CHANGED
@@ -1,107 +1,107 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
class CsvUtils
|
7
|
-
|
8
|
-
def self.split( path, *columns, sep: ',', &blk )
|
9
|
-
|
10
|
-
puts "cvssplit in: >#{path}<"
|
11
|
-
|
12
|
-
## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
|
13
|
-
puts "columns:"
|
14
|
-
pp columns
|
15
|
-
|
16
|
-
text = File.open( path, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8
|
17
|
-
|
18
|
-
## note: do NOT use headers
|
19
|
-
## for easy sorting use "plain" array of array for records
|
20
|
-
csv_options = { col_sep: sep }
|
21
|
-
|
22
|
-
data = CSV.parse( text, csv_options )
|
23
|
-
|
24
|
-
## todo/check: (auto-) strip (remove all leading and trailing spaces)
|
25
|
-
## from all values - why? why not?
|
26
|
-
## check if CSV.parse has an option for it?
|
27
|
-
|
28
|
-
headers = data.shift ## remove top array item (that is, row with headers)
|
29
|
-
|
30
|
-
header_mapping = {}
|
31
|
-
headers.each_with_index { | header,i | header_mapping[header]=i }
|
32
|
-
pp header_mapping
|
33
|
-
|
34
|
-
## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
|
35
|
-
column_indices = columns.map { |col| header_mapping[col] }
|
36
|
-
pp column_indices
|
37
|
-
|
38
|
-
|
39
|
-
###################################################
|
40
|
-
## note: sort data by columns (before split)
|
41
|
-
data = data.sort do |row1,row2|
|
42
|
-
res = 0
|
43
|
-
column_indices.each do |col|
|
44
|
-
res = row1[col] <=> row2[col] if res == 0
|
45
|
-
end
|
46
|
-
res
|
47
|
-
end
|
48
|
-
|
49
|
-
chunk = []
|
50
|
-
data.each_with_index do |row,i|
|
51
|
-
chunk << row
|
52
|
-
|
53
|
-
next_row = data[i+1]
|
54
|
-
|
55
|
-
changed = false
|
56
|
-
if next_row.nil? ## end-of-file
|
57
|
-
changed = true
|
58
|
-
else
|
59
|
-
column_indices.each do |col|
|
60
|
-
if row[col] != next_row[col]
|
61
|
-
changed = true
|
62
|
-
break ## out of each column_indices loop
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
if changed
|
68
|
-
puts "save new chunk:"
|
69
|
-
column_values = column_indices.map {|col| row[col] }
|
70
|
-
pp column_values
|
71
|
-
|
72
|
-
# note: add header(s) row upfront (as first row) to chunk (with unshift)
|
73
|
-
chunk_with_headers = chunk.unshift( headers )
|
74
|
-
if blk
|
75
|
-
yield( column_values, chunk_with_headers )
|
76
|
-
else
|
77
|
-
## auto-save (write-to-file) by default - why? why not?
|
78
|
-
split_write( path, column_values, chunk_with_headers )
|
79
|
-
end
|
80
|
-
|
81
|
-
chunk = [] ## reset chunk for next batch of records
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
puts 'Done.'
|
86
|
-
end ## method self.split
|
87
|
-
|
88
|
-
|
89
|
-
def self.split_write( inpath, values, chunk )
|
90
|
-
basename = File.basename( inpath, '.*' )
|
91
|
-
dirname = File.dirname( inpath )
|
92
|
-
|
93
|
-
## check/change invalid filename chars
|
94
|
-
## e.g. change 1990/91 to 1990-91
|
95
|
-
extraname = values.map {|value| value.tr('/','-')}.join('~')
|
96
|
-
|
97
|
-
outpath = "#{dirname}/#{basename}_#{extraname}.csv"
|
98
|
-
puts "saving >#{basename}_#{extraname}.csv<..."
|
99
|
-
|
100
|
-
CSV.open( outpath, 'w:utf-8' ) do |out|
|
101
|
-
chunk.each do |row|
|
102
|
-
out << row
|
103
|
-
end
|
104
|
-
end
|
105
|
-
end
|
106
|
-
|
107
|
-
end # class CsvUtils
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
class CsvUtils
|
7
|
+
|
8
|
+
def self.split( path, *columns, sep: ',', &blk )
|
9
|
+
|
10
|
+
puts "cvssplit in: >#{path}<"
|
11
|
+
|
12
|
+
## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
|
13
|
+
puts "columns:"
|
14
|
+
pp columns
|
15
|
+
|
16
|
+
text = File.open( path, 'r:utf-8' ).read ## note: make sure to use (assume) utf-8
|
17
|
+
|
18
|
+
## note: do NOT use headers
|
19
|
+
## for easy sorting use "plain" array of array for records
|
20
|
+
csv_options = { col_sep: sep }
|
21
|
+
|
22
|
+
data = CSV.parse( text, csv_options )
|
23
|
+
|
24
|
+
## todo/check: (auto-) strip (remove all leading and trailing spaces)
|
25
|
+
## from all values - why? why not?
|
26
|
+
## check if CSV.parse has an option for it?
|
27
|
+
|
28
|
+
headers = data.shift ## remove top array item (that is, row with headers)
|
29
|
+
|
30
|
+
header_mapping = {}
|
31
|
+
headers.each_with_index { | header,i | header_mapping[header]=i }
|
32
|
+
pp header_mapping
|
33
|
+
|
34
|
+
## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
|
35
|
+
column_indices = columns.map { |col| header_mapping[col] }
|
36
|
+
pp column_indices
|
37
|
+
|
38
|
+
|
39
|
+
###################################################
|
40
|
+
## note: sort data by columns (before split)
|
41
|
+
data = data.sort do |row1,row2|
|
42
|
+
res = 0
|
43
|
+
column_indices.each do |col|
|
44
|
+
res = row1[col] <=> row2[col] if res == 0
|
45
|
+
end
|
46
|
+
res
|
47
|
+
end
|
48
|
+
|
49
|
+
chunk = []
|
50
|
+
data.each_with_index do |row,i|
|
51
|
+
chunk << row
|
52
|
+
|
53
|
+
next_row = data[i+1]
|
54
|
+
|
55
|
+
changed = false
|
56
|
+
if next_row.nil? ## end-of-file
|
57
|
+
changed = true
|
58
|
+
else
|
59
|
+
column_indices.each do |col|
|
60
|
+
if row[col] != next_row[col]
|
61
|
+
changed = true
|
62
|
+
break ## out of each column_indices loop
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
if changed
|
68
|
+
puts "save new chunk:"
|
69
|
+
column_values = column_indices.map {|col| row[col] }
|
70
|
+
pp column_values
|
71
|
+
|
72
|
+
# note: add header(s) row upfront (as first row) to chunk (with unshift)
|
73
|
+
chunk_with_headers = chunk.unshift( headers )
|
74
|
+
if blk
|
75
|
+
yield( column_values, chunk_with_headers )
|
76
|
+
else
|
77
|
+
## auto-save (write-to-file) by default - why? why not?
|
78
|
+
split_write( path, column_values, chunk_with_headers )
|
79
|
+
end
|
80
|
+
|
81
|
+
chunk = [] ## reset chunk for next batch of records
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
puts 'Done.'
|
86
|
+
end ## method self.split
|
87
|
+
|
88
|
+
|
89
|
+
def self.split_write( inpath, values, chunk )
|
90
|
+
basename = File.basename( inpath, '.*' )
|
91
|
+
dirname = File.dirname( inpath )
|
92
|
+
|
93
|
+
## check/change invalid filename chars
|
94
|
+
## e.g. change 1990/91 to 1990-91
|
95
|
+
extraname = values.map {|value| value.tr('/','-')}.join('~')
|
96
|
+
|
97
|
+
outpath = "#{dirname}/#{basename}_#{extraname}.csv"
|
98
|
+
puts "saving >#{basename}_#{extraname}.csv<..."
|
99
|
+
|
100
|
+
CSV.open( outpath, 'w:utf-8' ) do |out|
|
101
|
+
chunk.each do |row|
|
102
|
+
out << row
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
end # class CsvUtils
|