csv 3.0.0 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS.md +377 -0
- data/README.md +1 -0
- data/lib/csv/delete_suffix.rb +18 -0
- data/lib/csv/fields_converter.rb +78 -0
- data/lib/csv/match_p.rb +20 -0
- data/lib/csv/parser.rb +1092 -0
- data/lib/csv/row.rb +6 -6
- data/lib/csv/table.rb +28 -4
- data/lib/csv/version.rb +1 -1
- data/lib/csv/writer.rb +156 -0
- data/lib/csv.rb +330 -632
- metadata +24 -5
- data/news.md +0 -123
data/lib/csv/parser.rb
ADDED
@@ -0,0 +1,1092 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "strscan"
|
4
|
+
|
5
|
+
require_relative "delete_suffix"
|
6
|
+
require_relative "match_p"
|
7
|
+
require_relative "row"
|
8
|
+
require_relative "table"
|
9
|
+
|
10
|
+
using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
|
11
|
+
using CSV::MatchP if CSV.const_defined?(:MatchP)
|
12
|
+
|
13
|
+
class CSV
|
14
|
+
class Parser
|
15
|
+
# Raised by InputsScanner when a chunk of input is not valid in its
# encoding. Parser#parse rescues it and raises MalformedCSVError instead.
InvalidEncoding = Class.new(StandardError)
|
17
|
+
|
18
|
+
# StringScanner for input that is already completely in memory.
#
# On top of StringScanner it offers line iteration and a stack of "keep"
# positions: keep_start marks the current position, and the mark is later
# consumed by keep_end (return the text scanned since the mark), keep_back
# (rewind to the mark) or keep_drop (forget the mark).
class Scanner < StringScanner
  # The whole input is one string, so scanning "all" is plain #scan.
  alias_method :scan_all, :scan

  def initialize(*args)
    super
    # Stack of byte positions pushed by keep_start.
    @keeps = []
  end

  # Yields every line of the not-yet-scanned rest, split on +row_separator+.
  # The scan pointer is moved past a line before that line is yielded.
  def each_line(row_separator)
    position = pos
    rest.each_line(row_separator) do |line|
      position += line.bytesize
      self.pos = position
      yield(line)
    end
  end

  # Remembers the current position.
  def keep_start
    @keeps.push(pos)
  end

  # Pops the latest kept position and returns the text between it and the
  # current position.
  #
  # StringScanner#pos is a byte offset, so the slice must be taken by bytes.
  # String#[] counts characters and returns the wrong text as soon as a
  # multibyte character precedes the current position.
  def keep_end
    start = @keeps.pop
    string.byteslice(start, pos - start)
  end

  # Pops the latest kept position and rewinds the scan pointer to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Pops the latest kept position without using it.
  def keep_drop
    @keeps.pop
  end
end
|
52
|
+
|
53
|
+
# Scanner over a list of inputs that are consumed chunk by chunk.
#
# Each input is either a StringIO (taken as a whole through #string) or an
# object responding to #gets (read in pieces of +chunk_size+). Only one
# chunk is held in a StringScanner at a time; "keep" positions survive a
# chunk switch by buffering the kept text.
class InputsScanner
  def initialize(inputs, encoding, chunk_size: 8192)
    @inputs = inputs.dup
    @encoding = encoding
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    # Stack of [byte position in current chunk, buffered text or nil].
    @keeps = []
    read_chunk
  end

  # Yields each line split on +row_separator+, loading further chunks as
  # needed. A line that is cut off by the end of a chunk is buffered and
  # completed from the next chunk. A two-character separator whose halves
  # sit in two different chunks is handled as a special case.
  def each_line(row_separator)
    buffer = nil
    input = @scanner.rest
    position = @scanner.pos
    offset = 0
    n_row_separator_chars = row_separator.size
    while true
      input.each_line(row_separator) do |line|
        @scanner.pos += line.bytesize
        if buffer
          if n_row_separator_chars == 2 and
            buffer.end_with?(row_separator[0]) and
            line.start_with?(row_separator[1])
            # The separator was split across the chunk boundary: move its
            # second character to the buffered line and emit that line.
            buffer << line[0]
            line = line[1..-1]
            position += buffer.bytesize + offset
            @scanner.pos = position
            offset = 0
            yield(buffer)
            buffer = nil
            next if line.empty?
          else
            buffer << line
            line = buffer
            buffer = nil
          end
        end
        if line.end_with?(row_separator)
          position += line.bytesize + offset
          @scanner.pos = position
          offset = 0
          yield(line)
        else
          # Incomplete line: wait for the next chunk.
          buffer = line
        end
      end
      break unless read_chunk
      input = @scanner.rest
      position = @scanner.pos
      # The buffered bytes belong to the previous chunk, not to this one.
      offset = -buffer.bytesize if buffer
    end
    yield(buffer) if buffer
  end

  # Scans +pattern+ in the current chunk. After a successful match that
  # consumes the chunk completely, the next chunk is loaded.
  def scan(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    if value
      read_chunk if @scanner.eos?
      return value
    else
      nil
    end
  end

  # Like #scan, but keeps matching +pattern+ in the following chunks for as
  # long as a chunk is consumed to its end, and returns the concatenation.
  def scan_all(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    return nil if value.nil?
    while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
      value << sub_value
    end
    value
  end

  def eos?
    @scanner.eos?
  end

  # Remembers the current position. The second slot is filled by
  # read_chunk with kept text from chunks that have been replaced.
  def keep_start
    @keeps.push([@scanner.pos, nil])
  end

  # Pops the latest keep and returns the text scanned since it, including
  # any part buffered from earlier chunks.
  #
  # StringScanner#pos is a byte offset, so the slice is taken with
  # byteslice. String#[] counts characters and returns the wrong text when
  # multibyte characters precede the current position.
  def keep_end
    start, buffer = @keeps.pop
    keep = @scanner.string.byteslice(start, @scanner.pos - start)
    if buffer
      buffer << keep
      keep = buffer
    end
    keep
  end

  # Pops the latest keep and rewinds to it. When kept text was buffered
  # from earlier chunks, the current chunk (from the keep position to its
  # end) is queued again as the next input and the buffered text becomes
  # the current chunk.
  def keep_back
    start, buffer = @keeps.pop
    if buffer
      string = @scanner.string
      keep = string.byteslice(start, string.bytesize - start)
      if keep and not keep.empty?
        @inputs.unshift(StringIO.new(keep))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      @scanner.pos = start
    end
    read_chunk if @scanner.eos?
  end

  # Pops the latest keep without using it.
  def keep_drop
    @keeps.pop
  end

  def rest
    @scanner.rest
  end

  private
  # Replaces the current chunk with the next one.
  #
  # Returns true when a new chunk was loaded and false when no input is
  # left. Raises InvalidEncoding when the new chunk is not valid in its
  # encoding.
  def read_chunk
    return false if @last_scanner

    unless @keeps.empty?
      # Save the kept part of the chunk that is about to be dropped into
      # the latest keep, and let that keep start at the new chunk's head.
      keep = @keeps.last
      keep_start = keep[0]
      string = @scanner.string
      keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
      if keep_data
        keep_buffer = keep[1]
        if keep_buffer
          keep_buffer << keep_data
        else
          keep[1] = keep_data.dup
        end
      end
      keep[0] = 0
    end

    input = @inputs.first
    case input
    when StringIO
      string = input.string
      raise InvalidEncoding unless string.valid_encoding?
      @scanner = StringScanner.new(string)
      @inputs.shift
      @last_scanner = @inputs.empty?
      true
    else
      chunk = input.gets(nil, @chunk_size)
      if chunk
        raise InvalidEncoding unless chunk.valid_encoding?
        @scanner = StringScanner.new(chunk)
        if input.respond_to?(:eof?) and input.eof?
          @inputs.shift
          @last_scanner = @inputs.empty?
        end
        true
      else
        # This input is exhausted: continue with the next one, if any.
        @scanner = StringScanner.new("".encode(@encoding))
        @inputs.shift
        @last_scanner = @inputs.empty?
        if @last_scanner
          false
        else
          read_chunk
        end
      end
    end
  end
end
|
224
|
+
|
225
|
+
# Creates a parser reading from +input+.
#
# +options+ is a Hash; the keys read by the prepare_* methods include
# :encoding, :column_separator, :row_separator, :quote_character,
# :headers and :skip_lines. All derived state is computed eagerly by
# #prepare.
def initialize(input, options)
  @input, @options = input, options
  # Chunks already read from @input while detecting the row separator.
  @samples = []

  prepare
end
|
232
|
+
|
233
|
+
# Column separator, encoded in the parser's encoding.
def column_separator
  @column_separator
end

# Row separator, encoded in the parser's encoding (already resolved when
# the option was :auto).
def row_separator
  @row_separator
end

# Quote character, or nil when quoting is disabled.
def quote_character
  @quote_character
end

# Value of the :field_size_limit option.
def field_size_limit
  @field_size_limit
end

# Normalized value of the :skip_lines option.
def skip_lines
  @skip_lines
end

# Value of the :unconverted_fields option.
def unconverted_fields?
  @unconverted_fields
end

# Converted headers, or nil while they are not known yet.
def headers
  @headers
end

# Whether headers are in use but have not been read yet, i.e. the next
# emitted row is taken as the header row.
def header_row?
  @use_headers && @headers.nil?
end

# Value of the :return_headers option.
def return_headers?
  @return_headers
end

# Value of the :skip_blanks option.
def skip_blanks?
  @skip_blanks
end

# true when the :liberal_parsing option was given, false otherwise.
def liberal_parsing?
  @liberal_parsing
end

# Line counter maintained while parsing.
def lineno
  @lineno
end

# Source text of the most recent row (see #last_line).
def line
  last_line
end
|
284
|
+
|
285
|
+
# Parses the input and yields every row to the block. Without a block an
# Enumerator over the rows is returned.
#
# When headers were supplied through the options and :return_headers is
# set, the header row is yielded first. The row parsing strategy depends
# on whether quoting is enabled and whether robust parsing is required.
#
# Raises MalformedCSVError when the input contains bytes that are not
# valid in the parser's encoding.
def parse(&block)
  return to_enum(__method__) unless block_given?

  if @return_headers and @headers and @raw_headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  begin
    @scanner ||= build_scanner
    if quote_character.nil?
      parse_no_quote(&block)
    elsif @need_robust_parsing
      parse_quotable_robust(&block)
    else
      parse_quotable_loose(&block)
    end
  rescue InvalidEncoding
    # With a scanner the broken line is skipped, which also advances
    # @lineno; without one the failure happened while building it.
    if @scanner
      ignore_broken_line
      error_lineno = @lineno
    else
      error_lineno = @lineno + 1
    end
    message = "Invalid byte sequence in #{@encoding}"
    raise MalformedCSVError.new(message, error_lineno)
  end
end
|
316
|
+
|
317
|
+
# Whether rows are associated with headers (set by prepare_header from the
# :headers option).
def use_headers?
  @use_headers
end
|
320
|
+
|
321
|
+
private
|
322
|
+
# Computes all derived parser state from @options.
#
# The order matters: later steps read instance variables set by earlier
# ones (for example prepare_backslash needs the escaped quote character,
# and prepare_quoted / prepare_unquoted need the escaped strip and
# separator strings).
def prepare
  prepare_variable
  prepare_quote_character
  prepare_backslash
  prepare_skip_lines
  prepare_strip
  prepare_separators
  prepare_quoted
  prepare_unquoted
  prepare_line
  prepare_header
  prepare_parser
end
|
335
|
+
|
336
|
+
# Copies the simple options into instance variables.
#
# :liberal_parsing may be a plain truthy value or a Hash with the
# sub-options :double_quote_outside_quote and :backslash_quote. Any
# truthy value switches the parser to robust parsing.
def prepare_variable
  @encoding = @options[:encoding]

  liberal = @options[:liberal_parsing]
  @liberal_parsing = liberal ? true : false
  @need_robust_parsing = @liberal_parsing
  if liberal.is_a?(Hash)
    @double_quote_outside_quote = liberal[:double_quote_outside_quote]
    @backslash_quote = liberal[:backslash_quote]
  else
    @double_quote_outside_quote = false if liberal
    @backslash_quote = false
  end

  @unconverted_fields = @options[:unconverted_fields]
  @field_size_limit = @options[:field_size_limit]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
|
361
|
+
|
362
|
+
# Normalizes the :quote_character option.
#
# nil disables quoting. Anything else is converted to a String in the
# parser's encoding and must be exactly one character long, otherwise
# ArgumentError is raised. Also prepares the doubled quote and the
# regexp-escaped forms used by the other prepare_* steps.
def prepare_quote_character
  quote = @options[:quote_character]
  if quote.nil?
    @quote_character = nil
    @escaped_quote_character = nil
    @escaped_quote = nil
    return
  end

  @quote_character = quote.to_s.encode(@encoding)
  unless @quote_character.length == 1
    message = ":quote_char has to be nil or a single character String"
    raise ArgumentError, message
  end
  @double_quote_character = @quote_character * 2
  @escaped_quote_character = Regexp.escape(@quote_character)
  @escaped_quote = Regexp.new(@escaped_quote_character)
end
|
378
|
+
|
379
|
+
# Prepares the backslash related strings and regexps. Does nothing unless
# the :backslash_quote liberal parsing sub-option is enabled.
def prepare_backslash
  return unless @backslash_quote

  @backslash_character = "\\".encode(@encoding)
  @escaped_backslash_character = Regexp.escape(@backslash_character)
  @escaped_backslash = Regexp.new(@escaped_backslash_character)

  # A backslash followed by the (escaped) quote character, or nil when
  # quoting is disabled.
  @backslash_quote_character =
    @quote_character.nil? ? nil : @backslash_character + @escaped_quote_character
end
|
393
|
+
|
394
|
+
# Normalizes the :skip_lines option into @skip_lines.
#
# A String is encoded in the parser's encoding, a Regexp or nil is taken
# as is, and any other object must respond to #match, otherwise
# ArgumentError is raised.
def prepare_skip_lines
  matcher = @options[:skip_lines]
  @skip_lines =
    case matcher
    when String
      matcher.encode(@encoding)
    when Regexp, nil
      matcher
    else
      unless matcher.respond_to?(:match)
        message =
          ":skip_lines has to respond to \#match: #{matcher.inspect}"
        raise ArgumentError, message
      end
      matcher
    end
end
|
410
|
+
|
411
|
+
# Normalizes the :strip option.
#
# A String must be exactly one character long (ArgumentError otherwise);
# that character is stripped around values. Any other truthy value strips
# blank characters. Either form switches the parser to robust parsing.
#
# Sets @escaped_strip (used when building the split and unquoted-value
# regexps) and, when quoting is enabled, @strip_value (scanned around each
# column value).
def prepare_strip
  @strip = @options[:strip]
  @escaped_strip = nil
  @strip_value = nil
  if @strip.is_a?(String)
    case @strip.length
    when 0
      raise ArgumentError, ":strip must not be an empty String"
    when 1
      # ok
    else
      raise ArgumentError, ":strip doesn't support 2 or more characters yet"
    end
    @strip = @strip.encode(@encoding)
    @escaped_strip = Regexp.escape(@strip)
    if @quote_character
      @strip_value = Regexp.new(@escaped_strip +
                                "+".encode(@encoding))
    end
    @need_robust_parsing = true
  elsif @strip
    # CR and LF are deliberately not part of the strip set: @strip_value is
    # scanned right before the row end is looked for, so stripping them
    # would swallow row separators.
    strip_values = " \t\f\v"
    @escaped_strip = strip_values.encode(@encoding)
    if @quote_character
      @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
    end
    @need_robust_parsing = true
  end
end
|
440
|
+
|
441
|
+
# Feature probe, run once while the class body is evaluated: does this
# StringScanner#scan accept a String pattern (true) or does it raise
# TypeError (false)?
@@string_scanner_scan_accept_string =
  begin
    StringScanner.new("x").scan("x")
    true
  rescue TypeError
    false
  end
|
448
|
+
|
449
|
+
# Prepares the column and row separators and the patterns derived from
# them.
#
# Raises ArgumentError when the column separator is empty. Without this
# check an empty separator fails further down with an opaque TypeError,
# because <tt>@column_separator[0]</tt> is nil.
#
# For separators longer than one character, @column_ends / @row_ends hold
# one regexp per character; parse_column_end and parse_row_end use them as
# a character-by-character fallback.
def prepare_separators
  column_separator = @options[:column_separator]
  @column_separator = column_separator.to_s.encode(@encoding)
  if @column_separator.empty?
    message = ":col_sep must be 1 or more characters: " +
              column_separator.inspect
    raise ArgumentError, message
  end
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)

  @escaped_column_separator = Regexp.escape(@column_separator)
  @escaped_first_column_separator = Regexp.escape(@column_separator[0])
  if @column_separator.size > 1
    @column_end = Regexp.new(@escaped_column_separator)
    @column_ends = @column_separator.each_char.collect do |char|
      Regexp.new(Regexp.escape(char))
    end
    @first_column_separators = Regexp.new(@escaped_first_column_separator +
                                          "+".encode(@encoding))
  else
    # A single character can be scanned as a plain String when the
    # StringScanner in use supports that.
    if @@string_scanner_scan_accept_string
      @column_end = @column_separator
    else
      @column_end = Regexp.new(@escaped_column_separator)
    end
    @column_ends = nil
    @first_column_separators = nil
  end

  escaped_row_separator = Regexp.escape(@row_separator)
  @row_end = Regexp.new(escaped_row_separator)
  if @row_separator.size > 1
    @row_ends = @row_separator.each_char.collect do |char|
      Regexp.new(Regexp.escape(char))
    end
  else
    @row_ends = nil
  end

  @cr = "\r".encode(@encoding)
  @lf = "\n".encode(@encoding)
  @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
|
488
|
+
|
489
|
+
# Builds the regexps for quoted values and the pattern that the line-based
# parsers use to split a line into columns.
#
# @quotes matches a run of quote characters and @quoted_value matches a
# run of characters that are neither a quote nor (with :backslash_quote) a
# backslash. Both are only set when quoting is enabled.
def prepare_quoted
  if @quote_character
    @quotes = Regexp.new(@escaped_quote_character +
                         "+".encode(@encoding))
    excluded = @escaped_quote_character.dup
    excluded << @escaped_backslash_character if @backslash_quote
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               excluded +
                               "]+".encode(@encoding))
  end

  if @escaped_strip
    # The separator together with strip characters around it.
    @split_column_separator = Regexp.new(@escaped_strip +
                                         "*".encode(@encoding) +
                                         @escaped_column_separator +
                                         @escaped_strip +
                                         "*".encode(@encoding))
  elsif @column_separator == " ".encode(@encoding)
    # A single space is given to String#split as a Regexp instead of a
    # String.
    @split_column_separator = Regexp.new(@escaped_column_separator)
  else
    @split_column_separator = @column_separator
  end
end
|
515
|
+
|
516
|
+
# Builds @unquoted_value, the regexp matching a run of characters that may
# appear in an unquoted value. Does nothing when quoting is disabled.
#
# Excluded are CR, LF, the first character of the column separator, the
# quote character (unless liberal parsing is on) and the strip characters.
def prepare_unquoted
  return if @quote_character.nil?

  excluded = "\r\n".encode(@encoding)
  excluded << @escaped_first_column_separator
  excluded << @escaped_quote_character unless @liberal_parsing
  excluded << @escaped_strip if @escaped_strip
  @unquoted_value = Regexp.new("[^".encode(@encoding) +
                               excluded +
                               "]+".encode(@encoding))
end
|
531
|
+
|
532
|
+
# Returns the row separator as a String in the parser's encoding.
#
# For :auto the separator is detected from the input: from the whole
# string of a StringIO, or from samples read through #gets, which are kept
# in @samples so that they can be parsed later. When nothing can be
# detected, $INPUT_RECORD_SEPARATOR is used.
def resolve_row_separator(separator)
  return separator.to_s.encode(@encoding) unless separator == :auto

  cr = "\r".encode(@encoding)
  lf = "\n".encode(@encoding)
  if @input.is_a?(StringIO)
    separator = detect_row_separator(@input.string, cr, lf)
  elsif @input.respond_to?(:gets)
    chunk_size = @input.is_a?(File) ? 32 * 1024 : 1024
    begin
      while separator == :auto
        # If we run out of data it is probably a single line; the
        # fallback after this branch sets the default.
        break unless sample = @input.gets(nil, chunk_size)

        # Extend the sample when we are unsure of the line ending.
        sample << (@input.gets(nil, 1) || "") if sample.end_with?(cr)

        @samples << sample

        separator = detect_row_separator(sample, cr, lf)
      end
    rescue IOError
      # Do nothing: the fallback below sets the default.
    end
  end
  separator = $INPUT_RECORD_SEPARATOR if separator == :auto
  separator.to_s.encode(@encoding)
end
|
569
|
+
|
570
|
+
# Guesses the row separator used in +sample+.
#
# Only the text up to the first +lf+ is searched for +cr+. Returns
# <tt>cr + lf</tt> when a +cr+ directly precedes that +lf+, +cr+ when a
# +cr+ is found otherwise, +lf+ when only an +lf+ is found, and :auto when
# the sample contains neither.
def detect_row_separator(sample, cr, lf)
  lf_at = sample.index(lf)
  searched = lf_at ? sample[0, lf_at] : sample
  cr_at = searched.index(cr)

  if cr_at && lf_at
    return cr + lf if cr_at + 1 == lf_at
    return cr_at < lf_at ? cr : lf
  end
  return cr if cr_at
  return lf if lf_at
  :auto
end
|
593
|
+
|
594
|
+
# Resets the line bookkeeping: no line counted, no last line recorded and
# no scanner built yet.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
|
599
|
+
|
600
|
+
# Returns the source text of the most recent row.
#
# When a scanner exists and no line has been recorded yet, the text is
# taken from the scanner's latest keep position (which is consumed) and
# cached in @last_line.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
|
607
|
+
|
608
|
+
# Interprets the :headers and :return_headers options.
#
# An Array is used as the headers, a String is parsed as one CSV row of
# headers, nil / false disables headers, and any other value means that
# the headers are taken from the first parsed row.
def prepare_header
  @return_headers = @options[:return_headers]

  given = @options[:headers]
  @raw_headers, @use_headers =
    case given
    when Array
      [given, true]
    when String
      [parse_headers(given), true]
    when nil, false
      [nil, false]
    else
      [nil, true]
    end

  @headers = @raw_headers ? adjust_headers(@raw_headers) : nil
end
|
632
|
+
|
633
|
+
# Parses +row+, a String holding one line of CSV, into an Array of raw
# headers using this parser's separators and quote character.
def parse_headers(row)
  line_options = {
    col_sep: @column_separator,
    row_sep: @row_separator,
    quote_char: @quote_character,
  }
  CSV.parse_line(row, **line_options)
end
|
639
|
+
|
640
|
+
# Runs +headers+ through the header fields converter and freezes every
# String among the converted headers. Returns the converted headers.
def adjust_headers(headers)
  converted = @header_fields_converter.convert(headers, nil, @lineno)
  converted.each do |header|
    header.freeze if header.is_a?(String)
  end
  converted
end
|
645
|
+
|
646
|
+
# Remembers whether the beginning of the input contains a quote character;
# parse_column_value uses this to decide whether to try the quoted or the
# unquoted form first.
def prepare_parser
  @may_quoted = may_quoted?
end
|
649
|
+
|
650
|
+
# Checks whether a quote character occurs within the first 128 characters
# of the available input.
#
# Returns false when quoting is disabled or no sample is available,
# otherwise the index of the first quote character (truthy) or nil.
def may_quoted?
  return false if @quote_character.nil?

  sample =
    if @input.is_a?(StringIO)
      @input.string
    else
      return false if @samples.empty?
      @samples.first
    end
  sample[0, 128].index(@quote_character)
end
|
661
|
+
|
662
|
+
# Set CSV_PARSER_SCANNER_TEST=yes to force every input through
# InputsScanner with a tiny chunk size.
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  # Wraps a String in an IO-like object that is not a StringIO, so that
  # InputsScanner reads it through #gets instead of taking the whole
  # string.
  class UnoptimizedStringIO
    def initialize(string)
      @io = StringIO.new(string)
    end

    def gets(*args)
      @io.gets(*args)
    end

    def each_line(*args, &block)
      @io.each_line(*args, &block)
    end

    def eof?
      @io.eof?
    end
  end

  # Builds an InputsScanner over the samples read so far followed by the
  # input itself. The chunk size comes from
  # CSV_PARSER_SCANNER_TEST_CHUNK_SIZE and defaults to 1.
  def build_scanner
    inputs = @samples.map { |sample| UnoptimizedStringIO.new(sample) }
    inputs <<
      if @input.is_a?(StringIO)
        UnoptimizedStringIO.new(@input.string)
      else
        @input
      end
    chunk_size = ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"
    InputsScanner.new(inputs,
                      @encoding,
                      chunk_size: Integer(chunk_size, 10))
  end
else
  # Builds the scanner for the input.
  #
  # When the complete data is available as one String (a StringIO with no
  # samples taken, or a single sample from an input that is at EOF), a
  # Scanner over that String is used. In that case an invalid byte
  # sequence raises MalformedCSVError naming the first invalid line.
  # Otherwise an InputsScanner over the samples plus the input is built.
  def build_scanner
    string =
      if @samples.empty? and @input.is_a?(StringIO)
        @input.string
      elsif @samples.size == 1 and @input.respond_to?(:eof?) and @input.eof?
        @samples[0]
      end

    unless string
      inputs = @samples.map { |sample| StringIO.new(sample) }
      inputs << @input
      return InputsScanner.new(inputs, @encoding)
    end

    unless string.valid_encoding?
      index = string.lines(@row_separator).index do |line|
        !line.valid_encoding?
      end
      if index
        message = "Invalid byte sequence in #{@encoding}"
        raise MalformedCSVError.new(message, @lineno + index + 1)
      end
    end
    Scanner.new(string)
  end
end
|
724
|
+
|
725
|
+
# Consumes consecutive lines that match :skip_lines, counting each of them
# in @lineno, and stops in front of the first line that does not match.
# Does nothing when :skip_lines is not set.
def skip_needless_lines
  return unless @skip_lines

  # Stop at the end of the input. With an unconditional loop a :skip_lines
  # pattern that matches the empty string would match the empty "line" at
  # EOF forever.
  until @scanner.eos?
    @scanner.keep_start
    line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    line << @row_separator if parse_row_end
    if skip_line?(line)
      @lineno += 1
      @scanner.keep_drop
    else
      # Not a line to skip: rewind to its start.
      @scanner.keep_back
      return
    end
  end
end
|
741
|
+
|
742
|
+
# Tells whether +line+ matches :skip_lines.
#
# A String matches when +line+ includes it, a Regexp through #match?, and
# any other object through its own #match (whose result is returned as
# is).
def skip_line?(line)
  if @skip_lines.is_a?(String)
    line.include?(@skip_lines)
  elsif @skip_lines.is_a?(Regexp)
    @skip_lines.match?(line)
  else
    @skip_lines.match(line)
  end
end
|
752
|
+
|
753
|
+
# Parser used when quoting is disabled: every line is one row and is
# simply split on the column separator. Empty columns become nil.
#
# Lines matching :skip_lines are ignored, and blank lines too when
# :skip_blanks is set.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |line|
    next if @skip_lines and skip_line?(line)
    original_line = line
    line = line.delete_suffix(@row_separator)

    if line.empty?
      next if @skip_blanks
      row = []
    else
      line = strip_value(line)
      row = line.split(@split_column_separator, -1)
      row.map! { |column| column.empty? ? nil : column }
    end
    @last_line = original_line
    emit_row(row, &block)
  end
end
|
776
|
+
|
777
|
+
# Fast path for input with quoting enabled: handles each line as one row
# and splits it on the column separator.
#
# It only copes with columns that contain no quote at all or that are
# wrapped in exactly one pair of quotes. For anything else (a CR or LF
# inside the line, or any other use of quotes) the scanner is rewound to
# the start of the line and parsing continues with parse_quotable_robust.
#
# A keep position is maintained at the start of the line being processed
# so that this rewind is possible.
def parse_quotable_loose(&block)
  @scanner.keep_start
  @scanner.each_line(@row_separator) do |line|
    if @skip_lines and skip_line?(line)
      @scanner.keep_drop
      @scanner.keep_start
      next
    end
    original_line = line
    line = line.delete_suffix(@row_separator)

    if line.empty?
      if @skip_blanks
        @scanner.keep_drop
        @scanner.keep_start
        next
      end
      row = []
    elsif line.include?(@cr) or line.include?(@lf)
      @scanner.keep_back
      @need_robust_parsing = true
      return parse_quotable_robust(&block)
    else
      row = line.split(@split_column_separator, -1)
      row.each_with_index do |column, i|
        if column.empty?
          row[i] = nil
          next
        end

        n_quotes = column.count(@quote_character)
        next if n_quotes.zero?

        simply_quoted = n_quotes == 2 &&
                        column.start_with?(@quote_character) &&
                        column.end_with?(@quote_character)
        unless simply_quoted
          @scanner.keep_back
          @need_robust_parsing = true
          return parse_quotable_robust(&block)
        end
        row[i] = column[1..-2]
      end
    end
    @scanner.keep_drop
    @scanner.keep_start
    @last_line = original_line
    emit_row(row, &block)
  end
  @scanner.keep_drop
end
|
831
|
+
|
832
|
+
# Full parser for input with quoting enabled. Reads the input column by
# column with the scanner, so quoted values may contain column and row
# separators.
#
# After each column value, a column end, a row end or the end of the input
# is expected. Anything else raises MalformedCSVError; before raising, the
# rest of the broken line is consumed by ignore_broken_line.
def parse_quotable_robust(&block)
  row = []
  skip_needless_lines
  start_row
  while true
    # Both flags are set by the column value parsers.
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    value = parse_column_value
    if value
      @scanner.scan_all(@strip_value) if @strip_value
      if @field_size_limit and value.size >= @field_size_limit
        ignore_broken_line
        raise MalformedCSVError.new("Field size exceeded", @lineno)
      end
    end

    if parse_column_end
      row << value
    elsif parse_row_end
      if row.empty? and value.nil?
        # Blank line.
        emit_row([], &block) unless @skip_blanks
      else
        row << value
        emit_row(row, &block)
        row = []
      end
      skip_needless_lines
      start_row
    elsif @scanner.eos?
      # The last row may lack a row separator.
      break if row.empty? and value.nil?
      row << value
      emit_row(row, &block)
      break
    else
      if @quoted_column_value
        ignore_broken_line
        message = "Any value after quoted field isn't allowed"
        raise MalformedCSVError.new(message, @lineno)
      elsif @unquoted_column_value and
            (new_line = @scanner.scan(@cr_or_lf))
        ignore_broken_line
        message = "Unquoted fields do not allow new line " +
                  "<#{new_line.inspect}>"
        raise MalformedCSVError.new(message, @lineno)
      elsif @scanner.rest.start_with?(@quote_character)
        ignore_broken_line
        message = "Illegal quoting"
        raise MalformedCSVError.new(message, @lineno)
      elsif (new_line = @scanner.scan(@cr_or_lf))
        ignore_broken_line
        message = "New line must be <#{@row_separator.inspect}> " +
                  "not <#{new_line.inspect}>"
        raise MalformedCSVError.new(message, @lineno)
      else
        ignore_broken_line
        raise MalformedCSVError.new("TODO: Meaningful message",
                                    @lineno)
      end
    end
  end
end
|
893
|
+
|
894
|
+
# Parses one column value at the scan pointer and returns it, or nil when
# there is none.
#
# Without liberal parsing, the quoted and the unquoted form are tried in
# the order suggested by @may_quoted. With liberal parsing, a quoted value
# may be followed by unquoted text; both parts are then returned together
# with the quotes put back around the quoted part.
def parse_column_value
  unless @liberal_parsing
    if @may_quoted
      return parse_quoted_column_value || parse_unquoted_column_value
    end
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted_value = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted_value

  unquoted_value = parse_unquoted_column_value
  return quoted_value unless unquoted_value

  if @double_quote_outside_quote
    unquoted_value = unquoted_value.gsub(@quote_character * 2,
                                         @quote_character)
    # %Q{""...} case
    return @quote_character + unquoted_value if quoted_value.empty?
  end
  @quote_character + quoted_value + @quote_character + unquoted_value
end
|
922
|
+
|
923
|
+
# Scans an unquoted column value. Returns nil when there is none at the
# scan pointer, and sets @unquoted_column_value otherwise.
#
# @unquoted_value stops at the first character of the column separator.
# With a multi-character separator that character may also be part of the
# value, so scanning continues until the complete separator follows.
def parse_unquoted_column_value
  value = @scanner.scan_all(@unquoted_value)
  return nil unless value

  @unquoted_column_value = true
  if @first_column_separators
    while true
      # Peek: is the complete column separator next?
      @scanner.keep_start
      at_column_end = @column_ends.all? do |column_end|
        @scanner.scan(column_end)
      end
      @scanner.keep_back
      break if at_column_end

      partial_separator = @scanner.scan_all(@first_column_separators)
      break if partial_separator.nil?
      value << partial_separator

      more_value = @scanner.scan_all(@unquoted_value)
      break if more_value.nil?
      value << more_value
    end
  end
  value.gsub!(@backslash_quote_character, @quote_character) if @backslash_quote
  value
end
|
947
|
+
|
948
|
+
# Scans a quoted column value and returns its unescaped content. Returns
# nil when no quote is at the scan pointer, and sets @quoted_column_value
# otherwise.
#
# Runs of quote characters are read as a whole: an even-sized opening run
# is a complete value made of escaped quotes only, an odd-sized one opens
# a value. Inside the value, an even-sized run stands for escaped quotes
# and an odd-sized run closes the value.
#
# Raises MalformedCSVError when the input ends before the closing quote.
def parse_quoted_column_value
  quotes = @scanner.scan_all(@quotes)
  return nil unless quotes

  @quoted_column_value = true
  n_quotes = quotes.size
  return quotes[0, (n_quotes - 2) / 2] if (n_quotes % 2).zero?

  value = quotes[0, (n_quotes - 1) / 2]
  while true
    quoted_value = @scanner.scan_all(@quoted_value)
    value << quoted_value if quoted_value
    if @backslash_quote
      if @scanner.scan(@escaped_backslash)
        # A backslash escapes a following quote; otherwise it is kept.
        if @scanner.scan(@escaped_quote)
          value << @quote_character
        else
          value << @backslash_character
        end
        next
      end
    end

    quotes = @scanner.scan_all(@quotes)
    unless quotes
      ignore_broken_line
      message = "Unclosed quoted field"
      raise MalformedCSVError.new(message, @lineno)
    end
    n_quotes = quotes.size
    if n_quotes == 1
      break
    elsif (n_quotes % 2) == 1
      value << quotes[0, (n_quotes - 1) / 2]
      break
    else
      value << quotes[0, n_quotes / 2]
    end
  end
  value
end
|
991
|
+
|
992
|
+
# Consumes a column separator at the scan pointer. Returns true when one
# was consumed and false otherwise.
#
# For a multi-character separator there is a fallback that matches the
# separator one character at a time (presumably so that a separator split
# across two input chunks is still recognized); a failed attempt leaves
# the scan pointer where it was.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  matched = @column_ends.all? { |column_end| @scanner.scan(column_end) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
|
1005
|
+
|
1006
|
+
# Consumes a row separator at the scan pointer. Returns true when one was
# consumed and false otherwise.
#
# For a multi-character separator there is a fallback that matches the
# separator one character at a time (presumably so that a separator split
# across two input chunks is still recognized); a failed attempt leaves
# the scan pointer where it was.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  matched = @row_ends.all? { |row_end| @scanner.scan(row_end) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
|
1018
|
+
|
1019
|
+
# Strips +value+ according to the :strip option and returns the result.
#
# Without :strip the value is returned untouched. With a String, leading
# and trailing occurrences of that string are removed; with any other
# truthy setting, +value+ is stripped in place with String#strip!.
def strip_value(value)
  return value unless @strip
  return nil if value.nil?

  if @strip.is_a?(String)
    # remaining tracks the size of value while it shrinks one character
    # at a time.
    remaining = value.size
    while value.start_with?(@strip)
      remaining -= 1
      value = value[1, remaining]
    end
    while value.end_with?(@strip)
      remaining -= 1
      value = value[0, remaining]
    end
  else
    value.strip!
  end
  value
end
|
1039
|
+
|
1040
|
+
# Skips the rest of the current line: everything up to the next CR or LF,
# then a single CR or LF character. Counts the line in @lineno.
def ignore_broken_line
  [@not_line_end, @cr_or_lf].each do |pattern|
    @scanner.scan_all(pattern)
  end
  @lineno += 1
end
|
1045
|
+
|
1046
|
+
# Marks the start of a new row in the scanner.
#
# The previous row's keep position is dropped first, unless it has already
# been consumed to fill @last_line; in that case @last_line is cleared
# instead.
def start_row
  @last_line ? @last_line = nil : @scanner.keep_drop
  @scanner.keep_start
end
|
1054
|
+
|
1055
|
+
# Counts +row+ in @lineno, converts its fields and yields the result.
#
# Without headers the converted fields are yielded. With headers a Row is
# yielded, except for the very first row when the headers are still
# unknown: that row becomes the headers and is only yielded (as a header
# row) when :return_headers is set.
#
# When :unconverted_fields is set, the raw fields are attached to the
# yielded object unless it already responds to #unconverted_fields.
def emit_row(row, &block)
  @lineno += 1

  raw_row = row
  row =
    if !@use_headers
      # Convert fields, if needed...
      @fields_converter.convert(raw_row, nil, @lineno)
    elsif @headers.nil?
      @headers = adjust_headers(raw_row)
      return unless @return_headers
      Row.new(@headers, raw_row, true)
    else
      Row.new(@headers,
              @fields_converter.convert(raw_row, @headers, @lineno))
    end

  # Inject unconverted fields and accessor, if requested...
  if @unconverted_fields and not row.respond_to?(:unconverted_fields)
    add_unconverted_fields(row, raw_row)
  end

  yield(row)
end
|
1080
|
+
|
1081
|
+
# Stores +fields+ in the instance variable <tt>@unconverted_fields</tt> of
# +row+ and gives +row+ (and only that object) a reader method called
# unconverted_fields(). Returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.class_eval do
    attr_reader :unconverted_fields
  end
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
|
1091
|
+
end
|
1092
|
+
end
|