csvreader 1.2.1 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +13 -12
- data/README.md +682 -677
- data/Rakefile +33 -26
- data/{test/data → datasets}/beer.csv +0 -0
- data/{test/data → datasets}/beer11.csv +0 -0
- data/{test/data → datasets}/cars11.csv +10 -10
- data/{test/data → datasets}/cities11.csv +12 -12
- data/{test/data → datasets}/customers11.csv +13 -13
- data/{test/data → datasets}/iris.attrib.csv +25 -25
- data/{test/data → datasets}/iris11.csv +163 -163
- data/{test/data → datasets}/lcc.attrib.csv +14 -14
- data/{test/data → datasets}/shakespeare.csv +9 -9
- data/{test/data → datasets}/test.csv +0 -0
- data/lib/csvreader/base.rb +36 -2
- data/lib/csvreader/buffer.rb +0 -1
- data/lib/csvreader/builder.rb +0 -1
- data/lib/csvreader/converter.rb +0 -1
- data/lib/csvreader/parser.rb +32 -33
- data/lib/csvreader/parser_fixed.rb +105 -106
- data/lib/csvreader/parser_json.rb +23 -5
- data/lib/csvreader/parser_std.rb +582 -534
- data/lib/csvreader/parser_strict.rb +290 -291
- data/lib/csvreader/parser_tab.rb +22 -62
- data/lib/csvreader/parser_table.rb +122 -123
- data/lib/csvreader/parser_yaml.rb +23 -0
- data/lib/csvreader/reader.rb +2 -3
- data/lib/csvreader/reader_hash.rb +3 -2
- data/lib/csvreader/version.rb +30 -32
- data/lib/csvreader.rb +0 -1
- data/test/helper.rb +1 -1
- data/test/test_parser_autofix.rb +28 -0
- data/test/test_parser_formats.rb +66 -66
- data/test/test_parser_java.rb +208 -208
- metadata +72 -25
- data/LICENSE.md +0 -116
data/lib/csvreader/base.rb
CHANGED
@@ -1,13 +1,21 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
|
3
2
|
|
4
3
|
require 'pp'
|
5
|
-
require 'logger'
|
4
|
+
require 'logger' ## todo/fix: check why logger is required - use logutils!!!???
|
6
5
|
require 'forwardable'
|
7
6
|
require 'stringio'
|
7
|
+
require 'fileutils'
|
8
|
+
|
9
|
+
require 'time'
|
8
10
|
require 'date' ## use for Date.parse and DateTime.parse
|
9
11
|
require 'yaml' ## used for (optional) meta data blocks
|
12
|
+
require 'json'
|
13
|
+
|
10
14
|
|
15
|
+
## our own parser libs
|
16
|
+
require 'tabreader'
|
17
|
+
require 'csvjson'
|
18
|
+
require 'csvyaml'
|
11
19
|
|
12
20
|
|
13
21
|
###
|
@@ -19,6 +27,7 @@ require 'csvreader/parser_strict' # flexible (strict - no leading/trailing spa
|
|
19
27
|
require 'csvreader/parser_tab'
|
20
28
|
require 'csvreader/parser_fixed'
|
21
29
|
require 'csvreader/parser_json'
|
30
|
+
require 'csvreader/parser_yaml'
|
22
31
|
require 'csvreader/parser_table'
|
23
32
|
require 'csvreader/parser'
|
24
33
|
require 'csvreader/converter'
|
@@ -68,6 +77,8 @@ class Parser
|
|
68
77
|
TABLE = ParserTable.new ## space-separated e.g /[ \t]+/
|
69
78
|
FIXED = ParserFixed.new
|
70
79
|
|
80
|
+
JSON = ParserJson.new
|
81
|
+
YAML = ParserYaml.new
|
71
82
|
|
72
83
|
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
73
84
|
def self.numeric() NUMERIC; end
|
@@ -85,11 +96,17 @@ class Parser
|
|
85
96
|
def self.postgres() postgresql; end
|
86
97
|
def self.postgresql_text() POSTGRESQL_TEXT; end
|
87
98
|
def self.postgres_text() postgresql_text; end
|
99
|
+
|
88
100
|
def self.tab() TAB; end
|
89
101
|
def self.table() TABLE; end
|
90
102
|
def self.fixed() FIXED; end
|
91
103
|
def self.fix() fixed; end
|
92
104
|
def self.f() fixed; end
|
105
|
+
|
106
|
+
def self.json() JSON; end
|
107
|
+
def self.j() json; end
|
108
|
+
def self.yaml() YAML; end
|
109
|
+
def self.y() yaml; end
|
93
110
|
end # class Parser
|
94
111
|
end # class CsvReader
|
95
112
|
|
@@ -114,6 +131,8 @@ class CsvReader
|
|
114
131
|
TABLE = Builder.new( Parser::TABLE )
|
115
132
|
FIXED = Builder.new( Parser::FIXED )
|
116
133
|
|
134
|
+
JSON = Builder.new( Parser::JSON )
|
135
|
+
YAML = Builder.new( Parser::YAML )
|
117
136
|
|
118
137
|
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
119
138
|
def self.numeric() NUMERIC; end
|
@@ -131,11 +150,17 @@ class CsvReader
|
|
131
150
|
def self.postgres() postgresql; end
|
132
151
|
def self.postgresql_text() POSTGRESQL_TEXT; end
|
133
152
|
def self.postgres_text() postgresql_text; end
|
153
|
+
|
134
154
|
def self.tab() TAB; end
|
135
155
|
def self.table() TABLE; end
|
136
156
|
def self.fixed() FIXED; end
|
137
157
|
def self.fix() fixed; end
|
138
158
|
def self.f() fixed; end
|
159
|
+
|
160
|
+
def self.json() JSON; end
|
161
|
+
def self.j() json; end
|
162
|
+
def self.yaml() YAML; end
|
163
|
+
def self.y() yaml; end
|
139
164
|
end # class CsvReader
|
140
165
|
|
141
166
|
|
@@ -158,6 +183,9 @@ class CsvHashReader
|
|
158
183
|
TABLE = Builder.new( Parser::TABLE )
|
159
184
|
FIXED = Builder.new( Parser::FIXED )
|
160
185
|
|
186
|
+
JSON = Builder.new( Parser::JSON )
|
187
|
+
YAML = Builder.new( Parser::YAML )
|
188
|
+
|
161
189
|
|
162
190
|
def self.default() DEFAULT; end ## alternative alias for DEFAULT
|
163
191
|
def self.numeric() NUMERIC; end
|
@@ -171,11 +199,17 @@ class CsvHashReader
|
|
171
199
|
def self.postgres() postgresql; end
|
172
200
|
def self.postgresql_text() POSTGRESQL_TEXT; end
|
173
201
|
def self.postgres_text() postgresql_text; end
|
202
|
+
|
174
203
|
def self.tab() TAB; end
|
175
204
|
def self.table() TABLE; end
|
176
205
|
def self.fixed() FIXED; end
|
177
206
|
def self.fix() fixed; end
|
178
207
|
def self.f() fixed; end
|
208
|
+
|
209
|
+
def self.json() JSON; end
|
210
|
+
def self.j() json; end
|
211
|
+
def self.yaml() YAML; end
|
212
|
+
def self.y() yaml; end
|
179
213
|
end # class CsvHashReader
|
180
214
|
|
181
215
|
|
data/lib/csvreader/buffer.rb
CHANGED
data/lib/csvreader/builder.rb
CHANGED
data/lib/csvreader/converter.rb
CHANGED
data/lib/csvreader/parser.rb
CHANGED
@@ -1,33 +1,32 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
##
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
#
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
#
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end # class
|
33
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
class CsvReader

  class Parser
    ## "forward" reference,
    ##   see base.rb for more
  end


  ####################################
  # define errors / exceptions
  #   for all parsers for (re)use

  ## common base class for the errors defined here;
  ##   rescue CsvReader::Error to catch all of them
  class Error < StandardError
  end

  ####
  #  todo/check:
  #    use "common" error class - why? why not?

  ## parse error (presumably raised by the parsers on malformed input - confirm against callers)
  ##
  ##   message  - returns the plain message as passed in
  ##   to_s     - returns the message prefixed with "*** csv parse error: "
  class ParseError < Error
    attr_reader :message

    ## message is optional - lets a bare `raise ParseError` work
    ##   (raise calls new without arguments in that case);
    ##   falls back to the class name like ruby's built-in exceptions
    def initialize( message = nil )
      super     ## note: zero-arg super passes message along to StandardError
      @message = message || self.class.name
    end

    def to_s
      "*** csv parse error: #{@message}"
    end
  end # class ParseError
end # class CsvReader
|
data/lib/csvreader/parser_fixed.rb
CHANGED
@@ -1,106 +1,105 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
l =
|
16
|
-
l
|
17
|
-
|
18
|
-
end
|
19
|
-
def
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
##
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
##
|
53
|
-
##
|
54
|
-
|
55
|
-
line
|
56
|
-
logger.debug
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
## skip
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
##
|
74
|
-
##
|
75
|
-
##
|
76
|
-
##
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
value =
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
end # class
|
106
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
class CsvReader

## parser for fixed-width records;
##   the width of every field gets passed in - either as
##   a String#unpack format string or as an array of integers
class ParserFixed

  ###################################
  ## add simple logger with debug flag/switch
  #
  #  use ParserFixed.logger.level = :debug    # to turn on
  #
  #  todo/fix: use logutils instead of std logger - why? why not?

  def self.build_logger()
    l = Logger.new( STDOUT )
    l.level = :info    ## set to :info on start; note: is 0 (debug) by default
    l
  end

  ## note: memoized in a class-level instance variable (not a @@class variable);
  ##   the logger gets built on first use - a subclass builds its own on first use
  def self.logger() @logger ||= build_logger; end
  def logger()  self.class.logger; end


  ## parse fixed-width records
  ##
  ##   data   - anything that responds to each_line (string or io/file for example)
  ##   width  - String => used as String#unpack format (values returned as is)
  ##            otherwise assumed to be an array of integers
  ##               (negative width => skip column; values get stripped)
  ##
  ##   with a block    - calls the block with every record (array of values)
  ##   without a block - returns an array of all records
  def parse( data, width:, &block )
    input = data   ## assume it's a string or io/file handle

    return parse_lines( input, width: width, &block )   if block_given?

    records = []
    parse_lines( input, width: width ) { |record| records << record }
    records
  end ## method parse



private

  ## walk the input line-by-line; skips blank and comment lines and
  ##   passes the values of every other line on to the block
  def parse_lines( input, width:, &block )

    ## note: each line only works with \n (unix) or \r\n (windows)
    ##          will NOT work with \r (old mac, any others?) only!!!!
    input.each_line do |line|

      ## note: chomp('') if is an empty string,
      ##    it will remove all trailing newlines from the string.
      ##    use line.sub(/[\n\r]*$/, '') or similar instead - why? why not?
      line = line.chomp( '' )
      logger.debug "line:" if logger.debug?
      logger.debug line.pretty_inspect if logger.debug?


      ## skip empty lines and comments
      if line =~ /^[ \t]*$/   ## skip blank lines (with whitespace only)
        logger.debug "skip blank line" if logger.debug?
        next
      end

      if line =~ /^[ \t]*#/   # start_with?( "#" ) -- skip comment lines (note: allow leading whitespaces)
        logger.debug "skip comment line" if logger.debug?
        next
      end


      values = if width.is_a?( String )
                  ## assume it's String#unpack format e.g.
                  ##   "209231-231992395 MoreData".unpack('aa5A1A9a4Z*')
                  ##     returns an array as follows :
                  ##   ["2", "09231", "-", "231992395", "    ", "MoreData"]
                  ##  see String#unpack
                  line.unpack( width )
               else  ## assume array with integers
                  slice_values( line, width )
               end

      ## note: requires block - enforce? how? why? why not?
      block.call( values )
    end
  end # method parse_lines


  ## cut the line into values using an array of (integer) widths;
  ##   note: a value is nil if the line is too short to reach the column
  def slice_values( line, widths )
    values = []
    offset = 0  # start position / offset
    widths.each_with_index do |w,i|
      logger.debug "[#{i}] start: #{offset}, width: #{w}" if logger.debug?

      if w < 0   ## convention - if width negative, skip column
        # note: minus (-) and minus (-) equal plus (+)
        ##   e.g. 2 - -2 = 4
        offset -= w
      else
        value = line[offset, w]
        value = value.strip  if value   ## note: if not nil strip; only use rstrip (for trailing only) - why? why not?
        values << value
        offset += w
      end
    end
    values
  end # method slice_values


end # class ParserFixed
end # class CsvReader
|
data/lib/csvreader/parser_json.rb
CHANGED
@@ -1,5 +1,23 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
1
|
+
|
2
|
+
|
3
|
+
class CsvReader

## thin adapter - hands the data over to CsvJson
##   (see require 'csvjson' in base.rb)
class ParserJson

  ## parse the data via CsvJson
  ##
  ##   data    - passed on as is to CsvJson.new
  ##               (assumed to be a string or io/file handle)
  ##   kwargs  - NOT used for now; accepted for the
  ##               "protocol/interface" shared with the other parsers
  ##
  ##   with a block    - returns the result of CsvJson#each called with the block
  ##   without a block - returns the result of CsvJson#to_a
  def parse( data, **kwargs, &block )
    reader = CsvJson.new( data )

    return reader.to_a   unless block_given?

    reader.each( &block )
  end ## method parse


end # class ParserJson
end # class CsvReader
|