csvreader 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -2
- data/README.md +682 -682
- data/Rakefile +33 -32
- data/datasets/cars11.csv +10 -10
- data/datasets/cities11.csv +12 -12
- data/datasets/customers11.csv +13 -13
- data/datasets/iris.attrib.csv +25 -25
- data/datasets/iris11.csv +163 -163
- data/datasets/lcc.attrib.csv +14 -14
- data/datasets/shakespeare.csv +9 -9
- data/lib/csvreader/base.rb +6 -2
- data/lib/csvreader/buffer.rb +0 -1
- data/lib/csvreader/builder.rb +0 -1
- data/lib/csvreader/converter.rb +0 -1
- data/lib/csvreader/parser.rb +32 -33
- data/lib/csvreader/parser_fixed.rb +105 -106
- data/lib/csvreader/parser_json.rb +23 -24
- data/lib/csvreader/parser_std.rb +582 -583
- data/lib/csvreader/parser_strict.rb +290 -291
- data/lib/csvreader/parser_tab.rb +22 -23
- data/lib/csvreader/parser_table.rb +122 -123
- data/lib/csvreader/parser_yaml.rb +23 -24
- data/lib/csvreader/reader.rb +2 -3
- data/lib/csvreader/reader_hash.rb +1 -2
- data/lib/csvreader/version.rb +30 -32
- data/lib/csvreader.rb +0 -1
- data/test/test_parser_formats.rb +66 -66
- data/test/test_parser_java.rb +208 -208
- metadata +18 -15
- data/LICENSE.md +0 -116
@@ -1,123 +1,122 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
l =
|
16
|
-
l
|
17
|
-
|
18
|
-
end
|
19
|
-
def
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
##
|
28
|
-
##
|
29
|
-
##
|
30
|
-
##
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
##
|
36
|
-
##
|
37
|
-
##
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
## note:
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
##
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
logger.debug
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
##
|
88
|
-
##
|
89
|
-
|
90
|
-
line = line.
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
#
|
105
|
-
|
106
|
-
|
107
|
-
values
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
##
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
end # class
|
123
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
class CsvReader

class ParserTable

  ###################################
  ## simple logger shared by all ParserTable instances
  #
  #  the logger starts at level :info; to see the per-line debug output use
  #    CsvReader::ParserTable.logger.level = :debug
  #
  #  todo/fix: use logutils instead of std logger - why? why not?

  ## Build the logger used by this parser (writes to STDOUT).
  def self.build_logger()
    l = Logger.new( STDOUT )
    l.level = :info    ## set to :info on start; note: is 0 (debug) by default
    l
  end

  ## Lazily created, class-wide logger.
  def self.logger() @@logger ||= build_logger; end

  ## Instance-level shortcut for the class-wide logger.
  def logger()  self.class.logger; end


  attr_reader :config   ## todo/fix: change config to proper dialect class/struct - why? why not?

  ##
  ##  todo/check:
  ##    null values - include NA - why? why not?
  ##    make null values case sensitive or add an option for case sensitive
  ##    or better allow a proc as option for checking too!!!

  ## Create a parser.
  ##
  ##   space - optional string of characters that get converted to a space
  ##           in every value, e.g. "_-+" turns Man_Utd into Man Utd;
  ##           every character is taken literally; nil (default) turns
  ##           the conversion off
  def initialize( space: nil )
    @config = {}   ## todo/fix: change config to proper dialect class/struct - why? why not?

    ## e.g. treat/convert char to space e.g. _-+• etc
    ##   Man_Utd  => Man Utd
    ##  or use it for leading and trailing spaces without quotes
    ##  todo/check: only use for unquoted values? why? why not?
    @config[:space] = space
  end


  #########################################
  ## config convenience helpers

  ## Change the characters that get converted to a space (see initialize).
  def space=( value ) @config[:space]=value; end


  ## Parse whitespace-separated "table" text into records (arrays of strings).
  ##
  ##   str_or_readable - anything responding to each_line (string or io/file for example)
  ##   kwargs          - NOT used for now (but required for "protocol/interface" by other parsers)
  ##
  ## With a block every record is passed to the block;
  ## without a block all records are collected and returned as an array.
  def parse( str_or_readable, **kwargs, &block )
    input = str_or_readable   ## assume it's a string or io/file handle

    if block_given?
      parse_lines( input, &block )
    else
      records = []
      parse_lines( input ) { |record| records << record }
      records
    end
  end ## method parse



private

  ## Escape the characters String#tr treats as syntax (\ ^ -) so that every
  ## configured character is matched literally; without the escaping
  ## "_-+" is read as a (reversed) range and tr raises an ArgumentError.
  def tr_literal( chars )
    chars.to_s.gsub( /[\\^-]/ ) { |ch| "\\#{ch}" }
  end

  ## Walk the input line by line, skip blank and comment (#) lines,
  ## split the rest on runs of spaces/tabs and pass each record to the block.
  def parse_lines( input, &block )

    space       = config[:space]
    space_chars = tr_literal( space )  if space   ## loop-invariant; prepare once

    ## note: each_line only works with \n (unix) or \r\n (windows)
    ##   will NOT work with \r (old mac, any others?) only!!!!
    input.each_line do |line|

      logger.debug  "line:"              if logger.debug?
      logger.debug line.pretty_inspect   if logger.debug?

      ## note: strip removes leading and trailing whitespace (space/tab)
      ##   including the trailing newline(s) - no separate chomp required
      line = line.strip
      logger.debug line.pretty_inspect   if logger.debug?

      if line.empty?             ## skip blank lines
        logger.debug "skip blank line"    if logger.debug?
        next
      end

      if line.start_with?( "#" )  ## skip comment lines
        logger.debug "skip comment line"   if logger.debug?
        next
      end

      # note: string.split defaults to split by space (e.g. /\s+/) :-)
      #    for now just make it "explicit" with /[ \t]+/
      values = line.split( /[ \t]+/ )
      logger.debug values.pretty_inspect   if logger.debug?

      if space
        ## e.g. translate _-+ etc. if configured to space
        ##   Man_Utd => Man Utd etc.
        values = values.map { |value| value.tr( space_chars, ' ' ) }
      end

      ## note: requires block - enforce? how? why? why not?
      block.call( values )
    end
  end # method parse_lines


end # class ParserTable
end # class CsvReader
|
@@ -1,24 +1,23 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
##
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
end # class
|
24
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
|
3
|
+
class CsvReader

class ParserYaml

  ## Parse +data+ by wrapping it in a CsvYaml reader.
  ##
  ##   with a block    - hands the block on to CsvYaml#each
  ##   without a block - returns the result of CsvYaml#to_a
  ##
  ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
  def parse( data, **kwargs, &block )
    ## note: input: required each_line (string or io/file for example)
    ##   assume data is a string or io/file handle
    reader = CsvYaml.new( data )
    return reader.to_a   unless block_given?

    reader.each( &block )
  end ## method parse


end # class ParserYaml
end # class CsvReader
|
data/lib/csvreader/reader.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
class CsvReader
|
4
3
|
|
@@ -155,11 +154,11 @@ class CsvReader
|
|
155
154
|
|
156
155
|
## check array / pipeline of converters is empty (size=0 e.g. is [])
|
157
156
|
if @converters.empty?
|
158
|
-
@parser.parse( @io, kwargs, &block )
|
157
|
+
@parser.parse( @io, **kwargs, &block )
|
159
158
|
else
|
160
159
|
## add "post"-processing with converters pipeline
|
161
160
|
## that is, convert all strings to integer, float, date, ... if wanted
|
162
|
-
@parser.parse( @io, kwargs ) do |raw_record|
|
161
|
+
@parser.parse( @io, **kwargs ) do |raw_record|
|
163
162
|
record = []
|
164
163
|
raw_record.each_with_index do | value, i |
|
165
164
|
record << @converters.convert( value, i )
|
@@ -1,4 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
class CsvHashReader
|
4
3
|
|
@@ -169,7 +168,7 @@ def_delegators :@io,
|
|
169
168
|
kwargs[:width] = @kwargs[:width] if @parser.is_a?( ParserFixed )
|
170
169
|
|
171
170
|
|
172
|
-
@parser.parse( @io, kwargs ) do |raw_values| # sep: sep
|
171
|
+
@parser.parse( @io, **kwargs ) do |raw_values| # sep: sep
|
173
172
|
if @names.nil? ## check for (first) headers row
|
174
173
|
if @header_converters.empty?
|
175
174
|
@names = raw_values ## store header row / a.k.a. field/column names
|
data/lib/csvreader/version.rb
CHANGED
@@ -1,32 +1,30 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
class CsvReader    ## note: uses a class for now - change to module - why? why not?

  ## The three parts of the version number.
  module Version
    MAJOR = 1    ## todo: namespace inside version or something - why? why not??
    MINOR = 2
    PATCH = 5

    ## self.to_s - why? why not?
  end

  ## Version as a dotted string built from the Version parts;
  ##   frozen so callers cannot change the constant's value in place.
  VERSION = [Version::MAJOR,
             Version::MINOR,
             Version::PATCH].join('.').freeze

  ## Returns the version string (same object as VERSION).
  def self.version   ## keep (as an alternative to VERSION) - why? why not?
    VERSION
  end


  ## Returns a one-line banner with the library version, the ruby version,
  ## release date and platform and the root directory.
  def self.banner
    "csvreader/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}] in (#{root})"
  end

  ## Returns the absolute path three directory levels up from this file.
  def self.root
    File.expand_path( File.dirname(File.dirname(File.dirname(__FILE__))) )
  end

end # class CsvReader
|
data/lib/csvreader.rb
CHANGED
data/test/test_parser_formats.rb
CHANGED
@@ -1,66 +1,66 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
###
|
4
|
-
# to run use
|
5
|
-
# ruby -I ./lib -I ./test test/test_parser_formats.rb
|
6
|
-
|
7
|
-
|
8
|
-
require 'helper'
|
9
|
-
|
10
|
-
class TestParserFormats < MiniTest::Test
|
11
|
-
|
12
|
-
|
13
|
-
def parser
|
14
|
-
CsvReader::Parser
|
15
|
-
end
|
16
|
-
|
17
|
-
|
18
|
-
def test_parse_whitespace
|
19
|
-
records = [["a", "b", "c"],
|
20
|
-
["1", "2", "3"]]
|
21
|
-
|
22
|
-
## don't care about newlines (\r\n) ??? - fix? why? why not?
|
23
|
-
assert_equal records, parser.default.parse( "a,b,c\n1,2,3" )
|
24
|
-
assert_equal records, parser.default.parse( "a,b,c\n1,2,3\n" )
|
25
|
-
assert_equal records, parser.default.parse( " a, b ,c \n\n1,2,3\n" )
|
26
|
-
assert_equal records, parser.default.parse( " a, b ,c \n \n1,2,3\n" )
|
27
|
-
|
28
|
-
assert_equal [["a", "b", "c"],
|
29
|
-
[""],
|
30
|
-
["1", "2", "3"]], parser.default.parse( %Q{a,b,c\n""\n1,2,3\n} )
|
31
|
-
assert_equal [["", ""],
|
32
|
-
[""],
|
33
|
-
["", "", ""]], parser.default.parse( %Q{,\n""\n"","",""\n} )
|
34
|
-
|
35
|
-
|
36
|
-
## strict rfc4180 - no trim leading or trailing spaces or blank lines
|
37
|
-
assert_equal records, parser.strict.parse( "a,b,c\n1,2,3" )
|
38
|
-
assert_equal [["a", "b", "c"],
|
39
|
-
[""],
|
40
|
-
["1", "2", "3"]], parser.strict.parse( "a,b,c\n\n1,2,3" )
|
41
|
-
assert_equal [[" a", " b ", "c "],
|
42
|
-
[""],
|
43
|
-
["1", "2", "3"]], parser.strict.parse( " a, b ,c \n\n1,2,3" )
|
44
|
-
assert_equal [[" a", " b ", "c "],
|
45
|
-
[" "],
|
46
|
-
["",""],
|
47
|
-
["1", "2", "3"]], parser.strict.parse( " a, b ,c \n \n,\n1,2,3" )
|
48
|
-
end
|
49
|
-
|
50
|
-
|
51
|
-
def test_parse_empties
|
52
|
-
assert_equal [], parser.default.parse( "\n \n \n" )
|
53
|
-
|
54
|
-
## strict rfc4180 - no trim leading or trailing spaces or blank lines
|
55
|
-
assert_equal [[""],
|
56
|
-
[" "],
|
57
|
-
[" "]], parser.strict.parse( "\n \n \n" )
|
58
|
-
assert_equal [[""],
|
59
|
-
[" "],
|
60
|
-
[" "]], parser.strict.parse( "\n \n " )
|
61
|
-
|
62
|
-
assert_equal [[""]], parser.strict.parse( "\n" )
|
63
|
-
assert_equal [], parser.strict.parse( "" )
|
64
|
-
end
|
65
|
-
|
66
|
-
end # class TestParserFormats
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_parser_formats.rb
|
6
|
+
|
7
|
+
|
8
|
+
require 'helper'
|
9
|
+
|
10
|
+
## Checks how the default and the strict (rfc4180) parser treat
## leading/trailing spaces, blank lines and empty values.
##
## note: inherits from Minitest::Test; the older MiniTest spelling is only
##   a legacy alias that newer minitest releases no longer define by default.
class TestParserFormats < Minitest::Test


  ## Shortcut for the parser namespace used by all tests.
  def parser
    CsvReader::Parser
  end


  def test_parse_whitespace
    records = [["a", "b", "c"],
               ["1", "2", "3"]]

    ## don't care about newlines (\r\n) ??? - fix? why? why not?
    assert_equal records, parser.default.parse( "a,b,c\n1,2,3" )
    assert_equal records, parser.default.parse( "a,b,c\n1,2,3\n" )
    assert_equal records, parser.default.parse( " a, b ,c \n\n1,2,3\n" )
    assert_equal records, parser.default.parse( " a, b ,c \n \n1,2,3\n" )

    ## quoted empty values are kept (as empty strings)
    assert_equal [["a", "b", "c"],
                  [""],
                  ["1", "2", "3"]], parser.default.parse( %Q{a,b,c\n""\n1,2,3\n} )
    assert_equal [["", ""],
                  [""],
                  ["", "", ""]], parser.default.parse( %Q{,\n""\n"","",""\n} )


    ## strict rfc4180 - no trim leading or trailing spaces or blank lines
    assert_equal records, parser.strict.parse( "a,b,c\n1,2,3" )
    assert_equal [["a", "b", "c"],
                  [""],
                  ["1", "2", "3"]], parser.strict.parse( "a,b,c\n\n1,2,3" )
    assert_equal [[" a", " b ", "c "],
                  [""],
                  ["1", "2", "3"]], parser.strict.parse( " a, b ,c \n\n1,2,3" )
    assert_equal [[" a", " b ", "c "],
                  [" "],
                  ["",""],
                  ["1", "2", "3"]], parser.strict.parse( " a, b ,c \n \n,\n1,2,3" )
  end


  def test_parse_empties
    assert_equal [], parser.default.parse( "\n \n \n" )

    ## strict rfc4180 - no trim leading or trailing spaces or blank lines
    assert_equal [[""],
                  [" "],
                  [" "]], parser.strict.parse( "\n \n \n" )
    assert_equal [[""],
                  [" "],
                  [" "]], parser.strict.parse( "\n \n " )

    assert_equal [[""]], parser.strict.parse( "\n" )
    assert_equal [], parser.strict.parse( "" )
  end

end # class TestParserFormats
|