csvreader 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+
3
+ class CsvReader
4
+
5
+ class ParserTab
6
+
7
+ def parse( data, **kwargs, &block )
8
+
9
+ ## note: input is required to respond to each_line (a string or io/file handle for example)
10
+ ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
11
+
12
+ input = data ## assume it's a string or io/file handle
13
+
14
+ if block_given?
15
+ parse_lines( input, &block )
16
+ else
17
+ records = []
18
+
19
+ parse_lines( input ) do |record|
20
+ records << record
21
+ end
22
+
23
+ records
24
+ end
25
+ end ## method parse
26
+
27
+
28
+
29
+ private
30
+
31
+ def parse_lines( input, &block )
32
+
33
+ ## note: each_line only works with \n (unix) or \r\n (windows)
34
+ ## will NOT work with \r only (old mac, any others?)!!!!
35
+ input.each_line do |line|
36
+
37
+ ## puts "line:"
38
+ ## pp line
39
+
40
+ ## note: chomp('') - if the argument is an empty string,
41
+ ## it will remove all trailing newlines from the string.
42
+ ## use line.sub(/[\n\r]*$/, '') or similar instead - why? why not?
43
+ line = line.chomp( '' )
44
+ ## pp line
45
+
46
+ # note: trailing empty fields get (auto-)trimmed by split !!!!!!!
47
+ values = line.split( "\t" )
48
+ ## pp values
49
+
50
+ ## note: requires block - enforce? how? why? why not?
51
+ block.call( values )
52
+ end
53
+ end # method parse_lines
54
+
55
+
56
+ end # class ParserTab
57
+ end # class CsvReader
@@ -9,35 +9,28 @@ class CsvReader
9
9
  end
10
10
 
11
11
  DEFAULT = new( Parser::DEFAULT )
12
+ STRICT = new( Parser::STRICT )
12
13
  RFC4180 = new( Parser::RFC4180 )
13
14
  EXCEL = new( Parser::EXCEL )
15
+ TAB = new( Parser::TAB )
14
16
 
15
17
  def self.default() DEFAULT; end ## alternative alias for DEFAULT
18
+ def self.strict() STRICT; end ## alternative alias for STRICT
16
19
  def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
17
20
  def self.excel() EXCEL; end ## alternative alias for EXCEL
21
+ def self.tab() TAB; end ## alternative alias for TAB
18
22
 
19
23
 
20
24
  #####################
21
25
  ## convenience helpers defaulting to default csv dialect/format reader
22
26
  ##
23
- ## CsvReader.parse_line is the same as
24
- ## CsvReader::DEFAULT.parse_line or CsvReader.default.parse_line
27
+ ## CsvReader.parse is the same as
28
+ ## CsvReader::DEFAULT.parse or CsvReader.default.parse
25
29
  ##
26
30
 
27
- def self.parse_line( data, sep: nil,
28
- converters: nil )
29
- DEFAULT.parse_line( data, sep: sep, converters: converters )
30
- end
31
-
32
31
  def self.parse( data, sep: nil,
33
- converters: nil )
34
- DEFAULT.parse( data, sep: sep, converters: converters )
35
- end
36
-
37
- #### fix!!! remove - replace with parse with (optional) block!!!!!
38
- def self.parse_lines( data, sep: nil,
39
- converters: nil, &block )
40
- DEFAULT.parse_lines( data, sep: sep, converters: nil, &block )
32
+ converters: nil, &block )
33
+ DEFAULT.parse( data, sep: sep, converters: converters, &block )
41
34
  end
42
35
 
43
36
  def self.read( path, sep: nil,
@@ -55,6 +48,20 @@ class CsvReader
55
48
  end
56
49
 
57
50
 
51
+ ############################
52
+ ## note: only add parse_line convenience helper for default
53
+ ## always use parse (do NOT use parse_line) - why? why not?
54
+ def self.parse_line( data, sep: nil,
55
+ converters: nil )
56
+ records = []
57
+ DEFAULT.parse( data, sep: sep, converters: converters ) do |record|
58
+ records << record
59
+ break # only parse first record
60
+ end
61
+ records.size == 0 ? nil : records.first
62
+ end
63
+
64
+
58
65
 
59
66
  #############################
60
67
  ## all "high-level" reader methods
@@ -62,33 +69,15 @@ class CsvReader
62
69
  ## note: allow "overriding" of separator
63
70
  ## if sep is not nil otherwise use default dialect/format separator
64
71
 
72
+ def parse( data, sep: nil,
73
+ converters: nil, &block )
74
+ kwargs = {
75
+ ## converters: converters ## todo: add converters
76
+ }
77
+ ## note: only add separator if present/defined (not nil)
78
+ kwargs[:sep] = sep if sep && @parser.respond_to?( :'sep=' )
65
79
 
66
- ##
67
- ## todo/fix: "unify" parse and parse_lines !!!
68
- ## check for block_given? - why? why not?
69
-
70
- def parse( data, sep: nil, limit: nil,
71
- converters: nil )
72
- sep = @parser.config[:sep] if sep.nil?
73
- @parser.parse( data, sep: sep, limit: limit )
74
- end
75
-
76
- #### fix!!! remove - replace with parse with (optional) block!!!!!
77
- def parse_lines( data, sep: nil,
78
- converters: nil, &block )
79
- sep = @parser.config[:sep] if sep.nil?
80
- @parser.parse_lines( data, sep: sep, &block )
81
- end
82
-
83
-
84
-
85
- def parse_line( data, sep: nil,
86
- converters: nil )
87
- records = parse( data, sep: sep, limit: 1 )
88
-
89
- ## unwrap record if empty return nil - why? why not?
90
- ## return empty record e.g. [] - why? why not?
91
- records.size == 0 ? nil : records.first
80
+ @parser.parse( data, kwargs, &block )
92
81
  end
93
82
 
94
83
  def read( path, sep: nil,
@@ -103,75 +92,26 @@ class CsvReader
103
92
  def foreach( path, sep: nil,
104
93
  converters: nil, &block )
105
94
  File.open( path, 'r:bom|utf-8' ) do |file|
106
- parse_lines( file, sep: sep, &block )
95
+ parse( file, sep: sep, &block )
107
96
  end
108
97
  end
109
98
 
110
-
111
-
112
99
  def header( path, sep: nil ) ## use header or headers - or use both (with alias)?
113
100
  # read first lines (only)
114
101
  # and parse with csv to get header from csv library itself
115
102
 
116
- record = nil
103
+ records = []
117
104
  File.open( path, 'r:bom|utf-8' ) do |file|
118
- record = parse_line( file, sep: sep )
105
+ parse( file, sep: sep ) do |record|
106
+ records << record
107
+ break ## only parse/read first record
108
+ end
119
109
  end
120
110
 
121
- record ## todo/fix: returns nil for empty - why? why not?
111
+ ## unwrap record if empty return nil - why? why not?
112
+ ## return empty record e.g. [] - why? why not?
113
+ ## returns nil for empty (for now) - why? why not?
114
+ records.size == 0 ? nil : records.first
122
115
  end # method self.header
123
116
 
124
117
  end # class CsvReader
125
-
126
-
127
-
128
-
129
- class CsvHashReader
130
-
131
-
132
- def self.parse( data, sep: nil, headers: nil )
133
-
134
- ## pass in headers as array e.g. ['A', 'B', 'C']
135
- names = headers ? headers : nil
136
-
137
- records = []
138
- CsvReader.parse_lines( data ) do |values| # sep: sep
139
- if names.nil?
140
- names = values ## store header row / a.k.a. field/column names
141
- else
142
- record = names.zip( values ).to_h ## todo/fix: check for more values than names/headers!!!
143
- records << record
144
- end
145
- end
146
- records
147
- end
148
-
149
-
150
- def self.read( path, sep: nil, headers: nil )
151
- txt = File.open( path, 'r:bom|utf-8' ).read
152
- parse( txt, sep: sep, headers: headers )
153
- end
154
-
155
-
156
- def self.foreach( path, sep: nil, headers: nil, &block )
157
-
158
- ## pass in headers as array e.g. ['A', 'B', 'C']
159
- names = headers ? headers : nil
160
-
161
- CsvReader.foreach( path ) do |values| # sep: sep
162
- if names.nil?
163
- names = values ## store header row / a.k.a. field/column names
164
- else
165
- record = names.zip( values ).to_h ## todo/fix: check for more values than names/headers!!!
166
- block.call( record )
167
- end
168
- end
169
- end
170
-
171
-
172
- def self.header( path, sep: nil ) ## add header too? why? why not?
173
- ## same as "classic" header method - delegate/reuse :-)
174
- CsvReader.header( path, sep: sep )
175
- end
176
-
177
- end # class CsvHashReader
@@ -0,0 +1,88 @@
1
+ # encoding: utf-8
2
+
3
+ class CsvHashReader
4
+
5
+ def initialize( parser )
6
+ @parser = parser
7
+ end
8
+
9
+ DEFAULT = new( CsvReader::Parser::DEFAULT )
10
+ STRICT = new( CsvReader::Parser::STRICT )
11
+ RFC4180 = new( CsvReader::Parser::RFC4180 )
12
+
13
+ def self.default() DEFAULT; end ## alternative alias for DEFAULT
14
+ def self.strict() STRICT; end ## alternative alias for STRICT
15
+ def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
16
+
17
+
18
+
19
+ def self.parse( data, sep: nil, headers: nil, &block )
20
+ DEFAULT.parse( data, sep: sep, headers: headers, &block )
21
+ end
22
+
23
+ def self.read( path, sep: nil, headers: nil )
24
+ DEFAULT.read( path, sep: sep, headers: headers )
25
+ end
26
+
27
+ def self.foreach( path, sep: nil, headers: nil, &block )
28
+ DEFAULT.foreach( path,sep: sep, headers: headers, &block )
29
+ end
30
+
31
+
32
+
33
+ #############################
34
+ ## all "high-level" reader methods
35
+ ##
36
+
37
+ def parse( data, sep: nil, headers: nil, &block )
38
+ if block_given?
39
+ parse_lines( data, sep: sep, headers: headers, &block )
40
+ else
41
+ records = []
42
+ parse_lines( data, sep: sep, headers: headers ) do |record|
43
+ records << record
44
+ end
45
+ records
46
+ end
47
+ end
48
+
49
+
50
+ def read( path, sep: nil, headers: nil )
51
+ txt = File.open( path, 'r:bom|utf-8' ).read
52
+ parse( txt, sep: sep, headers: headers )
53
+ end
54
+
55
+
56
+ def foreach( path, sep: nil, headers: nil, &block )
57
+ File.open( path, 'r:bom|utf-8' ) do |file|
58
+ parse_lines( file, sep: sep, headers: headers, &block )
59
+ end
60
+ end
61
+
62
+
63
+ private
64
+
65
+ ####################
66
+ ## parse_lines helper method to keep in one (central) place only (for easy editing/changing)
67
+ ## - builds key/value pairs
68
+
69
+ def parse_lines( data, sep: nil, headers: nil, &block)
70
+ ## pass in headers as array e.g. ['A', 'B', 'C']
71
+ names = headers ? headers : nil
72
+
73
+ kwargs = {
74
+ ## converters: converters ## todo: add converters
75
+ }
76
+ kwargs[:sep] = sep if sep && @parser.respond_to?( :'sep=' ) ## note: only add separator if present/defined (not nil)
77
+
78
+ @parser.parse( data, kwargs ) do |values| # sep: sep
79
+ if names.nil?
80
+ names = values ## store header row / a.k.a. field/column names
81
+ else
82
+ record = names.zip( values ).to_h ## todo/fix: check for more values than names/headers!!!
83
+ block.call( record )
84
+ end
85
+ end
86
+ end
87
+
88
+ end # class CsvHashReader
@@ -4,7 +4,7 @@
4
4
  class CsvReader ## note: uses a class for now - change to module - why? why not?
5
5
 
6
6
  MAJOR = 0 ## todo: namespace inside version or something - why? why not??
7
- MINOR = 5
7
+ MINOR = 6
8
8
  PATCH = 0
9
9
  VERSION = [MAJOR,MINOR,PATCH].join('.')
10
10
 
data/test/helper.rb CHANGED
@@ -14,3 +14,7 @@ class CsvReader
14
14
  "#{root}/test/data"
15
15
  end
16
16
  end
17
+
18
+
19
+ ## CsvReader::ParserStd.logger.level = :debug ## turn on "global" logging
20
+ ## CsvReader::ParserStrict.logger.level = :debug ## turn on "global" logging
data/test/test_parser.rb CHANGED
@@ -9,9 +9,6 @@ require 'helper'
9
9
 
10
10
  class TestParser < MiniTest::Test
11
11
 
12
- def setup
13
- CsvReader::Parser.logger.level = :debug ## turn on "global" logging - move to helper - why? why not?
14
- end
15
12
 
16
13
  def parser
17
14
  parser = CsvReader::Parser::DEFAULT
@@ -9,9 +9,6 @@ require 'helper'
9
9
 
10
10
  class TestParserFormats < MiniTest::Test
11
11
 
12
- def setup
13
- CsvReader::Parser.logger.level = :debug ## turn on "global" logging - move to helper - why? why not?
14
- end
15
12
 
16
13
  def parser
17
14
  CsvReader::Parser
@@ -37,17 +34,17 @@ def test_parse_whitespace
37
34
 
38
35
 
39
36
  ## strict rfc4180 - no trim leading or trailing spaces or blank lines
40
- assert_equal records, parser.rfc4180.parse( "a,b,c\n1,2,3" )
37
+ assert_equal records, parser.strict.parse( "a,b,c\n1,2,3" )
41
38
  assert_equal [["a", "b", "c"],
42
39
  [""],
43
- ["1", "2", "3"]], parser.rfc4180.parse( "a,b,c\n\n1,2,3" )
40
+ ["1", "2", "3"]], parser.strict.parse( "a,b,c\n\n1,2,3" )
44
41
  assert_equal [[" a", " b ", "c "],
45
42
  [""],
46
- ["1", "2", "3"]], parser.rfc4180.parse( " a, b ,c \n\n1,2,3" )
43
+ ["1", "2", "3"]], parser.strict.parse( " a, b ,c \n\n1,2,3" )
47
44
  assert_equal [[" a", " b ", "c "],
48
45
  [" "],
49
46
  ["",""],
50
- ["1", "2", "3"]], parser.rfc4180.parse( " a, b ,c \n \n,\n1,2,3" )
47
+ ["1", "2", "3"]], parser.strict.parse( " a, b ,c \n \n,\n1,2,3" )
51
48
  end
52
49
 
53
50
 
@@ -57,13 +54,13 @@ def test_parse_empties
57
54
  ## strict rfc4180 - no trim leading or trailing spaces or blank lines
58
55
  assert_equal [[""],
59
56
  [" "],
60
- [" "]], parser.rfc4180.parse( "\n \n \n" )
57
+ [" "]], parser.strict.parse( "\n \n \n" )
61
58
  assert_equal [[""],
62
59
  [" "],
63
- [" "]], parser.rfc4180.parse( "\n \n " )
60
+ [" "]], parser.strict.parse( "\n \n " )
64
61
 
65
- assert_equal [[""]], parser.rfc4180.parse( "\n" )
66
- assert_equal [], parser.rfc4180.parse( "" )
62
+ assert_equal [[""]], parser.strict.parse( "\n" )
63
+ assert_equal [], parser.strict.parse( "" )
67
64
  end
68
65
 
69
66
  end # class TestParserFormats
@@ -0,0 +1,219 @@
1
+ # encoding: utf-8
2
+
3
+ ###
4
+ # to run use
5
+ # ruby -I ./lib -I ./test test/test_parser_java.rb
6
+
7
+
8
+ require 'helper'
9
+
10
+ ##########################
11
+ # try some tests from apache java (commons) csv reader
12
+ # see https://github.com/apache/commons-csv/blob/master/src/test/java/org/apache/commons/csv/LexerTest.java
13
+
14
+
15
+ class TestParserJava < MiniTest::Test
16
+
17
+
18
+ LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
19
+ CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
20
+
21
+
22
+
23
+ def parser
24
+ CsvReader::Parser
25
+ end
26
+
27
+ def test_surrounding_spaces_are_deleted
28
+ assert_equal [["noSpaces",
29
+ "leadingSpaces",
30
+ "trailingSpaces",
31
+ "surroundingSpaces",
32
+ "",
33
+ "",
34
+ ""]], parser.default.parse( "noSpaces, leadingSpaces,trailingSpaces , surroundingSpaces , ,," )
35
+ end
36
+
37
+
38
+ def test_surrounding_tabs_are_deleted
39
+ assert_equal [["noTabs",
40
+ "leadingTab",
41
+ "trailingTab",
42
+ "surroundingTabs",
43
+ "",
44
+ "",
45
+ ""]], parser.default.parse( "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,," )
46
+ end
47
+
48
+ def test_ignore_empty_lines
49
+ assert_equal [[ "first", "line", "" ],
50
+ [ "second", "line" ],
51
+ [ "third line" ],
52
+ [ "last", "line" ]],
53
+ parser.default.parse( "first,line,\n" + "\n" + "\n" +
54
+ "second,line\n" + "\n" + "\n" +
55
+ "third line \n" + "\n" + "\n" +
56
+ "last, line \n" + "\n" + "\n" + "\n" )
57
+ end
58
+
59
+
60
+ def test_comments
61
+ assert_equal [["first", "line", "" ],
62
+ ["second", "line", "tokenWith#no-comment" ],
63
+ ["third", "line", "#no-comment" ]],
64
+ parser.default.parse( "first,line,\n" +
65
+ "second,line,tokenWith#no-comment\n" +
66
+ "# comment line \n" +
67
+ "third,line,#no-comment\n" +
68
+ "# penultimate comment\n" +
69
+ "# Final comment\n" )
70
+ end
71
+
72
+
73
+
74
+
75
+
76
+ def test_comments_and_empty_lines
77
+ parser.strict.comment = '#'
78
+
79
+ assert_equal [[ "1", "2", "3", "" ], ## 1
80
+ [ "" ], ## 1b
81
+ [ "" ], ## 1c
82
+ [ "a", "b x", "c#no-comment" ], ## 2
83
+ [ "" ], ## 4
84
+ [ "" ], ## 4b
85
+ [ "d", "e", "#no-comment" ], ## 5
86
+ [ "" ], ## 5b
87
+ [ "" ], ## 5c
88
+ [ "" ], ## 6b
89
+ [ "" ] ## 6c
90
+ ],
91
+ parser.strict.parse(
92
+ "1,2,3,\n" + ## 1
93
+ "\n" + ## 1b
94
+ "\n" + ## 1c
95
+ "a,b x,c#no-comment\n" + ## 2
96
+ "#foo\n" + ## 3
97
+ "\n" + ## 4
98
+ "\n" + ## 4b
99
+ "d,e,#no-comment\n" + ## 5
100
+ "\n" + ## 5b
101
+ "\n" + ## 5c
102
+ "# penultimate comment\n" + ## 6
103
+ "\n" + ## 6b
104
+ "\n" + ## 6c
105
+ "# Final comment\n" ## 7
106
+ )
107
+
108
+ parser.strict.comment = false ## reset to defaults
109
+ end
110
+
111
+
112
+ def test_backslash_with_escaping
113
+ ## simple token with escaping enabled
114
+ assert_equal [[ "a", ",", "b\\" ],
115
+ [ ",", "\nc", "d\r" ],
116
+ [ "e" ]], parser.default.parse( "a,\\,,b\\\\\n" +
117
+ "\\,,\\\nc,d\\\r\n" +
118
+ "e" )
119
+
120
+
121
+ parser.strict.escape = "\\"
122
+ assert_equal [[ "a", ",", "b\\" ],
123
+ [ ",", "\nc", "d\r" ],
124
+ [ "e" ]], parser.strict.parse( "a,\\,,b\\\\\n" +
125
+ "\\,,\\\nc,d\\\r\n" +
126
+ "e" )
127
+ parser.strict.escape = false
128
+ end
129
+
130
+
131
+ def test_backslash_without_escaping
132
+ ## simple token with escaping not enabled
133
+ assert_equal [[ "a",
134
+ "\\", ## an unquoted single backslash is not an escape char
135
+ "",
136
+ "b\\" ## an unquoted single backslash is not an escape char
137
+ ],
138
+ [ "\\", "", "" ]], parser.strict.parse( "a,\\,,b\\\n" +
139
+ "\\,," )
140
+
141
+ end
142
+
143
+
144
+
145
+ def test_next_token4
146
+ ## encapsulator tokenizer (single line)
147
+ assert_equal [[ "a", "foo", "b" ],
148
+ [ "a", " foo", "b" ],
149
+ [ "a", "foo ", "b" ],
150
+ [ "a", " foo ", "b" ]],
151
+ parser.default.parse( "a,\"foo\",b\n" +
152
+ "a, \" foo\",b\n" +
153
+ "a,\"foo \" ,b\n" +
154
+ "a, \" foo \" ,b" )
155
+ end
156
+
157
+
158
+ def test_next_token5
159
+ ## encapsulator tokenizer (multi line, delimiter in string)
160
+ assert_equal [[ "a", "foo\n", "b" ],
161
+ [ "foo\n baar ,,," ],
162
+ [ "\n\t \n" ]],
163
+ parser.default.parse( "a,\"foo\n\",b\n" +
164
+ "\"foo\n baar ,,,\"\n" +
165
+ "\"\n\t \n\"" )
166
+ end
167
+
168
+
169
+ def test_separator_is_tab
170
+ parser.strict.sep = "\t"
171
+ assert_equal [["one",
172
+ "two",
173
+ "",
174
+ "four ",
175
+ " five",
176
+ " six" ]], parser.strict.parse( "one\ttwo\t\tfour \t five\t six" )
177
+ parser.strict.sep = "," ## reset back to comma
178
+ end
179
+
180
+
181
+
182
+
183
+ def test_escaped_cr
184
+ assert_equal [[ "character" + CR + "Escaped" ]],
185
+ parser.default.parse( "character\\" + CR + "Escaped" )
186
+ end
187
+
188
+
189
+ def test_cr
190
+ assert_equal [[ "character" ],
191
+ [ "NotEscaped" ]],
192
+ parser.default.parse( "character" + CR + "NotEscaped" )
193
+ end
194
+
195
+
196
+
197
+ def test_escaped_lf
198
+ assert_equal [[ "character" + LF + "Escaped" ]],
199
+ parser.default.parse( "character\\" + LF + "Escaped" )
200
+ end
201
+
202
+ def test_lf
203
+ assert_equal [[ "character" ],
204
+ [ "NotEscaped" ]],
205
+ parser.default.parse( "character" + LF + "NotEscaped" )
206
+ end
207
+
208
+
209
+
210
+ def test_escaped_mysql_null_value
211
+ ## MySQL uses \N to symbolize null values. We have to restore this
212
+
213
+ ## note: unknown escape sequences e.g. \N get passed "through" as-is (unescaped)
214
+ ## only supports \n \r (sep e.g \, or \t) (quote e.g. \") for now - any others?
215
+ assert_equal [[ "character\\NEscaped" ]],
216
+ parser.default.parse( "character\\NEscaped" )
217
+ end
218
+
219
+ end # class TestParserJava