csvutils 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +3 -0
- data/README.md +1 -1
- data/Rakefile +1 -1
- data/lib/csvutils.rb +109 -0
- data/lib/csvutils/cut.rb +47 -47
- data/lib/csvutils/head.rb +25 -0
- data/lib/csvutils/header.rb +28 -0
- data/lib/csvutils/split.rb +107 -107
- data/lib/csvutils/stat.rb +86 -0
- data/lib/csvutils/test.rb +1 -1
- data/lib/csvutils/utils.rb +13 -29
- data/lib/csvutils/version.rb +1 -1
- data/test/data/at-austria/AUT.csv +363 -363
- data/test/data/eng-england/2017-18/E0.csv +381 -381
- data/test/test_headers.rb +50 -41
- data/test/test_misc.rb +12 -1
- data/test/test_version.rb +20 -20
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 435468b42345511466981b9f470e39e1dc78bfea
|
4
|
+
data.tar.gz: f2b4c42ec30da76fe929942d6d0706420dd63f63
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b1edc99a8b3e7257a899df34ed2b144467c1268f21877676194b138e59113d69733990ba7a0c70b1a4c51cb2861dff6e3c429cbc9f6be4a0a83c9f08a52fd22
|
7
|
+
data.tar.gz: b57408a1a5d2743649538c79a269480ce2a20d21df483c8133772bbb46589d7dbff6d1c9b22d4ec329bf331f70bd5a207650bedfd83821ba2cdeb7787595621b
|
data/Manifest.txt
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# csvutils - tools 'n' scripts for working with comma-separated values (csv) datafiles - the world's most popular tabular data interchange format in text
|
1
|
+
# csvutils - tools 'n' scripts for working with comma-separated values (csv) datafiles - the world's most popular tabular data interchange format in text
|
2
2
|
|
3
3
|
|
4
4
|
* home :: [github.com/csv11/csvutils](https://github.com/csv11/csvutils)
|
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Hoe.spec 'csvutils' do
|
|
11
11
|
self.urls = ['https://github.com/csv11/csvutils']
|
12
12
|
|
13
13
|
self.author = 'Gerald Bauer'
|
14
|
-
self.email = '
|
14
|
+
self.email = 'wwwmake@googlegroups.com'
|
15
15
|
|
16
16
|
# switch extension to .markdown for github formatting
|
17
17
|
self.readme_file = 'README.md'
|
data/lib/csvutils.rb
CHANGED
@@ -4,6 +4,8 @@ require 'pp'
|
|
4
4
|
require 'csv'
|
5
5
|
require 'date'
|
6
6
|
require 'fileutils'
|
7
|
+
require 'optparse'
|
8
|
+
|
7
9
|
|
8
10
|
|
9
11
|
###
|
@@ -13,7 +15,114 @@ require 'csvutils/utils'
|
|
13
15
|
require 'csvutils/split'
|
14
16
|
require 'csvutils/cut'
|
15
17
|
require 'csvutils/test'
|
18
|
+
require 'csvutils/stat'
|
19
|
+
require 'csvutils/header'
|
20
|
+
require 'csvutils/head'
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
class CsvTool

  ## command line tools
  ##
  ##  every tool reads its option switches from args
  ##    (note: args gets changed in-place by OptionParser#parse! -
  ##           all option switches get removed)
  ##  and handles all remaining args as datafile paths (one by one)


  ## csvheader - calls CsvUtils.header and CsvUtils.pp_header for every datafile
  ##
  ##  args - array of command line arguments (e.g. ARGV)
  def self.header( args )
    parser = OptionParser.new do |opts|
      opts.banner = "Usage: csvheader [OPTS] datafile ..."

      add_help_option( opts )
    end

    parser.parse!( args )

    each_datafile( args ) do |path|
      CsvUtils.pp_header( CsvUtils.header( path ) )
    end
  end


  ## csvstat - calls CsvUtils.stat for every datafile
  ##
  ##  args - array of command line arguments (e.g. ARGV)
  ##          -c/--columns=COLUMNS  header column names; separated by comma (,),
  ##                                pipe (|) or semicolon (;)
  def self.stat( args )
    config = { columns: [] }

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: csvstat [OPTS] datafile ..."

      opts.on("-c", "--columns=COLUMNS", "Name of header columns" ) do |columns|
        config[:columns] = columns.split(/[,|;]/)   ## allow different separators
      end

      add_help_option( opts )
    end

    parser.parse!( args )

    each_datafile( args ) do |path|
      CsvUtils.stat( path, *config[:columns] )
    end
  end


  ## csvhead - calls CsvUtils.head for every datafile
  ##
  ##  args - array of command line arguments (e.g. ARGV)
  ##          -n/--num=NUM  number of rows (defaults to 4)
  ##
  ##  raises OptionParser::InvalidArgument if NUM is not a decimal integer number
  def self.head( args )
    config = { n: 4 }

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: csvhead [OPTS] datafile ..."

      ## note: let the option parser check and convert the number;
      ##         a plain String#to_i silently turns garbage (e.g. "abc") into 0
      opts.on("-n", "--num=NUM", OptionParser::DecimalInteger, "Number of rows" ) do |num|
        config[:n] = num
      end

      add_help_option( opts )
    end

    parser.parse!( args )

    each_datafile( args ) do |path|
      CsvUtils.head( path, n: config[:n] )
    end
  end


  ## helper: add the -h/--help switch (shared by all tools);
  ##   prints the usage help and exits
  def self.add_help_option( opts )
    opts.on("-h", "--help", "Prints this help") do
      puts opts
      exit
    end
  end

  ## helper: print a title line for every datafile path and
  ##   pass the path on to the block (surrounded by blank lines)
  def self.each_datafile( paths )
    paths.each do |path|
      puts "== #{File.basename(path)} (#{File.dirname(path)}) =="
      puts
      yield( path )
      puts
    end
  end

  private_class_method :add_help_option, :each_datafile

end # class CsvTool
|
17
126
|
|
18
127
|
|
19
128
|
puts CsvUtils.banner # say hello
|
data/lib/csvutils/cut.rb
CHANGED
@@ -1,47 +1,47 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
## check/use class or module ???
|
4
|
-
|
5
|
-
|
6
|
-
## check/use class or module ???

class CsvUtils

  ## Copy (cut out) the selected columns of a csv datafile into a new csv datafile.
  ##
  ##  inpath  - path of the datafile to read (always read as utf-8);
  ##              the first row must hold the headers (column names)
  ##  outpath - path of the datafile to write (in utf-8);
  ##              for convenience missing parent folders/directories get created
  ##  columns - header names of the columns to keep (in the order of the output)
  ##  sep     - column separator of the input datafile (defaults to comma)
  ##
  ##  note: all values get stripped (leading and trailing spaces removed);
  ##        empty cells (and cells of columns missing in the datafile) stay empty
  def self.cut( inpath, outpath, *columns, sep: ',' )

    puts "cvscut in: >#{inpath}< out: >#{outpath}<"

    ##  ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
    puts "columns:"
    pp columns

    ## note: make sure to use (assume) utf-8;
    ##        File.read (auto-)closes the file (File.open without block does not)
    text = File.read( inpath, encoding: 'utf-8' )

    ## note: pass options as keywords - a positional options hash
    ##        no longer gets converted to keywords in ruby 3+
    table = CSV.parse( text, headers: true, col_sep: sep )


    ## for convenience - make sure parent folders/directories exist
    ##   note: mkdir_p is a no-op if the directory already exists
    FileUtils.mkdir_p( File.dirname( outpath ))

    ## use wb mode - why? why not?
    ##   assumes same encoding as input?
    ##   fix/todo: better (always) use utf8!!!!
    ## CSV.open( out_path, 'wb' ) do |out|

    CSV.open( outpath, 'w:utf-8' ) do |out|
      out << columns   ## for row add headers/columns
      table.each do |row|
        ## find data for column; note: empty cells are nil - do NOT call strip on nil
        values = columns.map do |col|
          value = row[col]
          value && value.strip
        end
        out << values
      end
    end

    puts 'Done.'
  end  ## method self.cut

end # class CsvUtils
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
class CsvUtils

  ## test or dry run to check if rows can get read/scanned
  ##
  ##  pretty prints the first n records of the datafile
  ##   followed by a line with the number of records printed
  ##
  ##  path - path of the datafile (always read as utf-8);
  ##           the first row must hold the headers (column names)
  ##  sep  - column separator (defaults to comma)
  ##  n    - max. number of records to print (defaults to 4);
  ##           for zero (or less) no records get read or printed
  def self.head( path, sep: ',', n: 4 )
    i = 0
    csv_options = { headers: true,
                    col_sep: sep,
                    external_encoding: 'utf-8'  ## note: always (auto-)add utf-8 external encoding!!!
                  }

    ## note: check upfront - the loop always prints a record before its break check
    if n > 0
      ## note: pass options as keywords (**) - in ruby 3+ a positional hash
      ##        gets taken for the (file open) mode argument
      CSV.foreach( path, **csv_options ) do |row|
        i += 1

        pp row

        break if i >= n
      end
    end

    puts " #{i} rows"
  end

end # class CsvUtils
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
class CsvUtils

  ## Get the header (column names) of a csv datafile.
  ##
  ##  path  - path of the datafile (always read as utf-8)
  ##  sep   - column separator (defaults to comma)
  ##  debug - pretty print the raw first line and the parsed rows if true
  ##
  ##  returns an array with the values of the first line e.g.
  ##    ["Country", "League", "Season", "Date", "Time", "Home", "Away"]
  ##   or an empty array if the datafile is empty
  def self.header( path, sep: ',', debug: false )   ## use header or headers - or use both (with alias)?

    # read first line (only)
    #  and parse with csv to get header from csv library itself
    #
    #  check - if there's an easier or built-in way for the csv library

    ## note: use gets (and NOT readline) - returns nil (and does not raise EOFError) for empty files
    line = File.open( path, 'r:utf-8' ) { |f| f.gets }

    pp line   if debug
    ## e.g.:
    #  "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n"

    return []  if line.nil?   ## empty datafile - no header

    ## note: do NOT use headers: true to get "plain" data array (no hash records)
    ##          hash record does NOT work for single line/row
    ## note: pass options as keywords - a positional options hash
    ##          no longer gets converted to keywords in ruby 3+
    rows = CSV.parse( line, col_sep: sep )
    pp rows   if debug
    rows[0]  ## return first row
  end  # method self.header

end # class CsvUtils
|
data/lib/csvutils/split.rb
CHANGED
@@ -1,107 +1,107 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
class CsvUtils

  ## Split a csv datafile into chunks of records that share
  ##  the same values in the passed-in columns.
  ##
  ##  path    - path of the datafile to read (always read as utf-8);
  ##              the first row must hold the headers (column names)
  ##  columns - header names of the columns to split by e.g. 'Season', 'Div'
  ##  sep     - column separator (defaults to comma)
  ##  blk     - optional block; gets called for every chunk with
  ##              the column values of the chunk and the records of the chunk
  ##              (incl. the header row upfront as the first row);
  ##              if no block is given, every chunk gets saved with split_write
  ##
  ##  raises ArgumentError if a column is missing in the header row
  ##
  ##  note: the records get sorted by the columns before the split;
  ##         records with the same column values keep the order of the datafile;
  ##         empty cells get handled like empty strings for sorting and splitting
  def self.split( path, *columns, sep: ',', &blk )

    puts "cvssplit in: >#{path}<"

    ##  ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"]
    puts "columns:"
    pp columns

    ## note: make sure to use (assume) utf-8;
    ##        File.read (auto-)closes the file (File.open without block does not)
    text = File.read( path, encoding: 'utf-8' )

    ## note: do NOT use headers
    ##   for easy sorting use "plain" array of array for records
    ## note: pass options as keywords - a positional options hash
    ##   no longer gets converted to keywords in ruby 3+
    data = CSV.parse( text, col_sep: sep )

    ## todo/check: (auto-) strip (remove all leading and trailing spaces)
    ##     from all values  - why? why not?
    ##   check if CSV.parse has an option for it?

    headers = data.shift || []   ## remove top array item (that is, row with headers); nil if datafile is empty

    header_mapping = {}
    headers.each_with_index  { |header,i| header_mapping[header]=i }
    pp header_mapping

    ## fail early (with a readable message) on unknown columns;
    ##   otherwise a nil index results in a TypeError later on
    missing = columns.reject { |col| header_mapping.key?( col ) }
    unless missing.empty?
      raise ArgumentError, "unknown column(s) #{missing.inspect} in >#{path}<; headers are #{headers.inspect}"
    end

    ## map columns to array indices e.g. ['Season', 'Div'] => [1,2]
    column_indices = columns.map { |col| header_mapping[col] }
    pp column_indices

    ## split key of a record e.g. ['1990/91', 'E0']
    ##   note: use to_s - empty cells are nil and nil does not compare with strings
    split_key = ->(row) { column_indices.map { |col| row[col].to_s } }

    ###################################################
    ## note: sort data by columns (before split)
    ##   and add the record index as last sort key to keep the
    ##   order of the datafile for records with the same split key
    data = data.each_with_index
               .sort_by { |row,i| [split_key.call( row ), i] }
               .map     { |row,_| row }

    chunks = data.slice_when { |row,next_row| split_key.call( row ) != split_key.call( next_row ) }

    chunks.each do |chunk|
      puts "save new chunk:"
      column_values = column_indices.map { |col| chunk.last[col] }
      pp column_values

      # note: add header(s) row upfront (as first row) to chunk
      chunk_with_headers = [headers] + chunk
      if blk
        yield( column_values, chunk_with_headers )
      else
        ## auto-save (write-to-file) by default - why? why not?
        split_write( path, column_values, chunk_with_headers )
      end
    end

    puts 'Done.'
  end  ## method self.split


  ## Save a chunk of records next to the input datafile.
  ##
  ##  inpath - path of the (input) datafile; used for the directory and
  ##             the basename of the output datafile
  ##  values - column values of the chunk; get added to the name of the output
  ##             datafile e.g. E0.csv with ['1990/91'] => E0_1990-91.csv
  ##  chunk  - records (array of arrays) to save
  def self.split_write( inpath, values, chunk )
    basename = File.basename( inpath, '.*' )
    dirname  = File.dirname( inpath )

    ## check/change invalid filename chars
    ##  e.g. change 1990/91 to 1990-91
    ##  note: use to_s - values of empty cells are nil
    extraname = values.map {|value| value.to_s.tr('/','-')}.join('~')

    outpath = "#{dirname}/#{basename}_#{extraname}.csv"
    puts "saving >#{basename}_#{extraname}.csv<..."

    CSV.open( outpath, 'w:utf-8' ) do |out|
      chunk.each do |row|
        out << row
      end
    end
  end

end # class CsvUtils
|