smarter_csv 1.5.2 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +8 -4
- data/lib/smarter_csv/smarter_csv.rb +74 -32
- data/lib/smarter_csv/version.rb +1 -1
- data/smarter_csv.gemspec +1 -1
- data/spec/smarter_csv/malformed_spec.rb +15 -7
- data/spec/smarter_csv/parse/column_separator_spec.rb +61 -0
- data/spec/smarter_csv/parse/old_csv_library_spec.rb +74 -0
- data/spec/smarter_csv/parse/rfc4180_and_more_spec.rb +170 -0
- data/spec/smarter_csv/quoted_spec.rb +8 -4
- metadata +23 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd2cf82aafc3b45257fbdfc594ed8e1d3bf2226e59cbee144b3003d8f79ec6cf
|
4
|
+
data.tar.gz: 95df862865e3123cf86194d47107f140f69f2fc91c20aba01d4004e8bffa5d74
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: df32ae9a380fa4fff0932d56e8a0cacadb8d4ebf7d8124e607f2ba389c3b60f875c300a2137fe04aac2b7eda77850b343af5e58c53e310f90460f96223f3228c
|
7
|
+
data.tar.gz: 107e1dbacdc6293a0c044a91cf237f50fbeab59eb5b032167f55d0fe6c2cf07b079c6cfe296368c03d2c84e64ce3c0e6ad744043397cdc217cd3ab51beb3ab09
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
|
2
2
|
# SmarterCSV 1.x Change Log
|
3
3
|
|
4
|
+
## 1.6.0 (2022-05-03)
|
5
|
+
* completely rewrote line parser
|
6
|
+
* added methods `SmarterCSV.raw_headers` and `SmarterCSV.headers` to allow easy examination of how the headers are processed.
|
7
|
+
|
4
8
|
## 1.5.2 (2022-04-29)
|
5
9
|
* added missing keys to the SmarterCSV::KeyMappingError exception message #189 (thanks to John Dell)
|
6
10
|
|
data/README.md
CHANGED
@@ -16,10 +16,12 @@
|
|
16
16
|
|
17
17
|
# SmarterCSV
|
18
18
|
|
19
|
-
[](http://travis-ci.
|
19
|
+
[](http://travis-ci.com/tilo/smarter_csv) [](http://badge.fury.io/rb/smarter_csv)
|
20
20
|
|
21
21
|
#### SmarterCSV 1.x
|
22
22
|
|
23
|
+
`smarter_csv` is now 10 years old, and still kicking! 🎉🎉🎉
|
24
|
+
|
23
25
|
`smarter_csv` is a Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, suitable for direct processing with Mongoid or ActiveRecord,
|
24
26
|
and parallel processing with Resque or Sidekiq.
|
25
27
|
|
@@ -42,11 +44,13 @@ NOTE; This Gem is only for importing CSV files - writing of CSV files is not sup
|
|
42
44
|
|
43
45
|
### Why?
|
44
46
|
|
45
|
-
Ruby's CSV library's API is pretty old, and it's processing of CSV-files returning Arrays of Arrays feels 'very close to the metal'. The output is not easy to use - especially not if you want to create database records
|
47
|
+
Ruby's CSV library's API is pretty old, and its processing of CSV-files returning Arrays of Arrays feels 'very close to the metal'. The output is not easy to use - especially not if you want to create database records or Sidekiq jobs with it. Another shortcoming is that Ruby's CSV library does not have good support for huge CSV-files, e.g. there is no support for 'chunking' and/or parallel processing of the CSV-content (e.g. with Sidekiq).
|
48
|
+
|
49
|
+
As the existing CSV libraries didn't fit my needs, I wrote my own CSV processing - specifically for use in connection with Rails ORMs like Mongoid, MongoMapper and ActiveRecord. In those ORMs you can easily pass a hash with attribute/value pairs to the create() method. The lower-level Mongo driver and Moped also accept larger arrays of such hashes to create a larger number of records quickly with just one call. The same patterns are used when you pass data to Sidekiq jobs.
|
46
50
|
|
47
|
-
|
51
|
+
For processing large CSV files it is essential to process them in chunks, so the memory impact is minimized.
|
48
52
|
|
49
|
-
###
|
53
|
+
### How?
|
50
54
|
|
51
55
|
The two main choices you have in terms of how to call `SmarterCSV.process` are:
|
52
56
|
* calling `process` with or without a block
|
@@ -6,6 +6,7 @@ module SmarterCSV
|
|
6
6
|
class MissingHeaders < SmarterCSVException; end
|
7
7
|
class NoColSepDetected < SmarterCSVException; end
|
8
8
|
class KeyMappingError < SmarterCSVException; end
|
9
|
+
class MalformedCSVError < SmarterCSVException; end
|
9
10
|
|
10
11
|
# first parameter: filename or input object which responds to readline method
|
11
12
|
def SmarterCSV.process(input, options={}, &block)
|
@@ -24,10 +25,6 @@ module SmarterCSV
|
|
24
25
|
options[:row_sep] = SmarterCSV.guess_line_ending(fh, options) if options[:row_sep].to_sym == :auto
|
25
26
|
# attempt to auto-detect column separator
|
26
27
|
options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep].to_sym == :auto
|
27
|
-
# preserve options, in case we need to call the CSV class
|
28
|
-
csv_options = options.select{|k,v| [:col_sep, :row_sep, :quote_char].include?(k)} # options.slice(:col_sep, :row_sep, :quote_char)
|
29
|
-
csv_options.delete(:row_sep) if [nil, :auto].include?( options[:row_sep].to_sym )
|
30
|
-
csv_options.delete(:col_sep) if [nil, :auto].include?( options[:col_sep].to_sym )
|
31
28
|
|
32
29
|
if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && ( fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8') )
|
33
30
|
puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
|
@@ -39,7 +36,7 @@ module SmarterCSV
|
|
39
36
|
end
|
40
37
|
end
|
41
38
|
|
42
|
-
headerA, header_size = process_headers(fh, options
|
39
|
+
headerA, header_size = process_headers(fh, options)
|
43
40
|
|
44
41
|
# in case we use chunking.. we'll need to set it up..
|
45
42
|
if ! options[:chunk_size].nil? && options[:chunk_size].to_i > 0
|
@@ -76,15 +73,8 @@ module SmarterCSV
|
|
76
73
|
|
77
74
|
line.chomp!(options[:row_sep])
|
78
75
|
|
79
|
-
|
80
|
-
|
81
|
-
CSV.parse( line, **csv_options ).flatten.collect!{|x| x.nil? ? '' : x} # to deal with nil values from CSV.parse
|
82
|
-
rescue CSV::MalformedCSVError => e
|
83
|
-
raise $!, "#{$!} [SmarterCSV: csv line #{@csv_line_count}]", $!.backtrace
|
84
|
-
end
|
85
|
-
else
|
86
|
-
dataA = line.split(options[:col_sep], header_size)
|
87
|
-
end
|
76
|
+
dataA, data_size = parse(line, options, header_size)
|
77
|
+
|
88
78
|
dataA.map!{|x| x.sub(/(#{options[:col_sep]})+\z/, '')} # remove any unwanted trailing col_sep characters at the end
|
89
79
|
dataA.map!{|x| x.strip} if options[:strip_whitespace]
|
90
80
|
|
@@ -192,13 +182,6 @@ module SmarterCSV
|
|
192
182
|
|
193
183
|
private
|
194
184
|
|
195
|
-
def self.readline_with_counts(filehandle, options)
|
196
|
-
line = filehandle.readline(options[:row_sep])
|
197
|
-
@file_line_count += 1
|
198
|
-
@csv_line_count += 1
|
199
|
-
line
|
200
|
-
end
|
201
|
-
|
202
185
|
def self.default_options
|
203
186
|
{
|
204
187
|
auto_row_sep_chars: 500,
|
@@ -233,6 +216,62 @@ module SmarterCSV
|
|
233
216
|
}
|
234
217
|
end
|
235
218
|
|
219
|
+
def self.readline_with_counts(filehandle, options)
|
220
|
+
line = filehandle.readline(options[:row_sep])
|
221
|
+
@file_line_count += 1
|
222
|
+
@csv_line_count += 1
|
223
|
+
line
|
224
|
+
end
|
225
|
+
|
226
|
+
# parses a single line: either a CSV header or a body line
|
227
|
+
# - quoting rules compared to RFC-4180 are somewhat relaxed
|
228
|
+
# - we are not assuming that quotes inside a field need to be doubled
|
229
|
+
# - we are not assuming that all fields need to be quoted (an unquoted field has 0 quotes, and 0 is even)
|
230
|
+
# - works with multi-char col_sep
|
231
|
+
# - if header_size is given, only up to header_size fields are parsed
|
232
|
+
#
|
233
|
+
# We use header_size for parsing the body lines to make sure we always match the number of headers
|
234
|
+
# in case there are trailing col_sep characters in line
|
235
|
+
#
|
236
|
+
# Our convention is that empty fields are returned as empty strings, not as nil.
|
237
|
+
#
|
238
|
+
def self.parse(line, options, header_size = nil)
|
239
|
+
return [] if line.nil?
|
240
|
+
|
241
|
+
col_sep = options[:col_sep]
|
242
|
+
quote = options[:quote_char]
|
243
|
+
quote_count = 0
|
244
|
+
elements = []
|
245
|
+
start = 0
|
246
|
+
i = 0
|
247
|
+
|
248
|
+
while i < line.size do
|
249
|
+
if line[i...i+col_sep.size] == col_sep && quote_count.even?
|
250
|
+
break if !header_size.nil? && elements.size >= header_size
|
251
|
+
|
252
|
+
elements << cleanup_quotes(line[start...i], quote)
|
253
|
+
i += col_sep.size
|
254
|
+
start = i
|
255
|
+
else
|
256
|
+
quote_count += 1 if line[i] == quote
|
257
|
+
i += 1
|
258
|
+
end
|
259
|
+
end
|
260
|
+
elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
|
261
|
+
[elements, elements.size]
|
262
|
+
end
|
263
|
+
|
264
|
+
def self.cleanup_quotes(field, quote)
|
265
|
+
return field if field.nil? || field !~ /#{quote}/
|
266
|
+
|
267
|
+
if field.start_with?(quote) && field.end_with?(quote)
|
268
|
+
field.delete_prefix!(quote)
|
269
|
+
field.delete_suffix!(quote)
|
270
|
+
end
|
271
|
+
field.gsub!("#{quote}#{quote}", quote)
|
272
|
+
field
|
273
|
+
end
|
274
|
+
|
236
275
|
def self.blank?(value)
|
237
276
|
case value
|
238
277
|
when Array
|
@@ -319,11 +358,22 @@ module SmarterCSV
|
|
319
358
|
return k # the most frequent one is it
|
320
359
|
end
|
321
360
|
|
322
|
-
def self.
|
361
|
+
def self.raw_hearder
|
362
|
+
@raw_header
|
363
|
+
end
|
364
|
+
|
365
|
+
def self.headers
|
366
|
+
@headers
|
367
|
+
end
|
368
|
+
|
369
|
+
def self.process_headers(filehandle, options)
|
370
|
+
@raw_header = nil
|
371
|
+
@headers = nil
|
323
372
|
if options[:headers_in_file] # extract the header line
|
324
373
|
# process the header line in the CSV file..
|
325
374
|
# the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
|
326
375
|
header = readline_with_counts(filehandle, options)
|
376
|
+
@raw_header = header
|
327
377
|
|
328
378
|
header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
329
379
|
header = header.sub(options[:comment_regexp],'') if options[:comment_regexp]
|
@@ -331,16 +381,7 @@ module SmarterCSV
|
|
331
381
|
|
332
382
|
header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
|
333
383
|
|
334
|
-
|
335
|
-
file_headerA = begin
|
336
|
-
CSV.parse( header, **csv_options ).flatten.collect!{|x| x.nil? ? '' : x} # to deal with nil values from CSV.parse
|
337
|
-
rescue CSV::MalformedCSVError => e
|
338
|
-
raise $!, "#{$!} [SmarterCSV: csv line #{@csv_line_count}]", $!.backtrace
|
339
|
-
end
|
340
|
-
else
|
341
|
-
file_headerA = header.split(options[:col_sep])
|
342
|
-
end
|
343
|
-
file_header_size = file_headerA.size # before mapping, which could delete keys
|
384
|
+
file_headerA, file_header_size = parse(header, options)
|
344
385
|
|
345
386
|
file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/,'') }
|
346
387
|
file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
|
@@ -400,6 +441,7 @@ module SmarterCSV
|
|
400
441
|
raise SmarterCSV::MissingHeaders , "ERROR: missing headers: #{missing_headers.join(',')}" unless missing_headers.empty?
|
401
442
|
end
|
402
443
|
|
444
|
+
@headers = headerA
|
403
445
|
[headerA, header_size]
|
404
446
|
end
|
405
447
|
|
data/lib/smarter_csv/version.rb
CHANGED
data/smarter_csv.gemspec
CHANGED
@@ -16,9 +16,9 @@ Gem::Specification.new do |spec|
|
|
16
16
|
spec.executables = spec.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ["lib"]
|
19
|
-
spec.requirements = ['csv'] # for CSV.parse() only needed in case we have quoted fields
|
20
19
|
spec.add_development_dependency "rspec"
|
21
20
|
spec.add_development_dependency "simplecov"
|
21
|
+
spec.add_development_dependency "awesome_print"
|
22
22
|
# spec.add_development_dependency "guard-rspec"
|
23
23
|
|
24
24
|
spec.metadata["homepage_uri"] = spec.homepage
|
@@ -2,16 +2,24 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
fixture_path = 'spec/fixtures'
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
context "malformed header" do
|
5
|
+
# according to RFC-4180 quotes inside of "words" should be doubled, but our parser is robust against that.
|
6
|
+
describe 'malformed CSV quotes' do
|
7
|
+
context "malformed quotes in header" do
|
9
8
|
let(:csv_path) { "#{fixture_path}/malformed_header.csv" }
|
10
|
-
it
|
9
|
+
it 'should be resilient against single quotes' do
|
10
|
+
data = SmarterCSV.process(csv_path)
|
11
|
+
expect(data[0]).to eq({:name=>"Arnold Schwarzenegger", :dobdob=>"1947-07-30"})
|
12
|
+
expect(data[1]).to eq({:name=>"Jeff Bridges", :dobdob=>"1949-12-04"})
|
13
|
+
end
|
11
14
|
end
|
12
15
|
|
13
|
-
context "malformed content" do
|
16
|
+
context "malformed quotes in content" do
|
14
17
|
let(:csv_path) { "#{fixture_path}/malformed.csv" }
|
15
|
-
|
18
|
+
|
19
|
+
it 'should be resilient against single quotes' do
|
20
|
+
data = SmarterCSV.process(csv_path)
|
21
|
+
expect(data[0]).to eq({:name=>"Arnold Schwarzenegger", :dob=>"1947-07-30"})
|
22
|
+
expect(data[1]).to eq({:name=>"Jeff \"the dude\" Bridges", :dob=>"1949-12-04"})
|
23
|
+
end
|
16
24
|
end
|
17
25
|
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'parse with col_sep' do
|
4
|
+
let(:options) { {quote_char: '"'} }
|
5
|
+
|
6
|
+
it 'parses with comma' do
|
7
|
+
line = "a,b,,d"
|
8
|
+
options.merge!({col_sep: ","})
|
9
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
10
|
+
expect(array).to eq ['a', 'b', '', 'd']
|
11
|
+
expect(array_size).to eq 4
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'parses trailing commas' do
|
15
|
+
line = "a,b,c,,"
|
16
|
+
options.merge!({col_sep: ","})
|
17
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
18
|
+
expect(array).to eq ['a', 'b', 'c', '', '']
|
19
|
+
expect(array_size).to eq 5
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'parses with space' do
|
23
|
+
line = "a b d"
|
24
|
+
options.merge!({col_sep: " "})
|
25
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
26
|
+
expect(array).to eq ['a', 'b', '', 'd']
|
27
|
+
expect(array_size).to eq 4
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'parses with tab' do
|
31
|
+
line = "a\tb\t\td"
|
32
|
+
options.merge!({col_sep: "\t"})
|
33
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
34
|
+
expect(array).to eq ['a', 'b', '', 'd']
|
35
|
+
expect(array_size).to eq 4
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'parses with multiple space separator' do
|
39
|
+
line = "a b d"
|
40
|
+
options.merge!({col_sep: " "})
|
41
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
42
|
+
expect(array).to eq ['a b', '', 'd']
|
43
|
+
expect(array_size).to eq 3
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'parses with multiple char separator' do
|
47
|
+
line = '<=><=>A<=>B<=>C'
|
48
|
+
options.merge!({col_sep: "<=>"})
|
49
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
50
|
+
expect(array).to eq ["", "", "A", "B", "C"]
|
51
|
+
expect(array_size).to eq 5
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'parses trailing multiple char separator' do
|
55
|
+
line = '<=><=>A<=>B<=>C<=><=>'
|
56
|
+
options.merge!({col_sep: "<=>"})
|
57
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
58
|
+
expect(array).to eq ["", "", "A", "B", "C", "", ""]
|
59
|
+
expect(array_size).to eq 7
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'old CSV library parsing tests' do
|
4
|
+
let(:options) { {quote_char: '"', col_sep: ","} }
|
5
|
+
|
6
|
+
[ ["\t", ["\t"]],
|
7
|
+
["foo,\"\"\"\"\"\",baz", ["foo", "\"\"", "baz"]],
|
8
|
+
["foo,\"\"\"bar\"\"\",baz", ["foo", "\"bar\"", "baz"]],
|
9
|
+
["\"\"\"\n\",\"\"\"\n\"", ["\"\n", "\"\n"]],
|
10
|
+
["foo,\"\r\n\",baz", ["foo", "\r\n", "baz"]],
|
11
|
+
["\"\"", [""]],
|
12
|
+
["foo,\"\"\"\",baz", ["foo", "\"", "baz"]],
|
13
|
+
["foo,\"\r.\n\",baz", ["foo", "\r.\n", "baz"]],
|
14
|
+
["foo,\"\r\",baz", ["foo", "\r", "baz"]],
|
15
|
+
["foo,\"\",baz", ["foo", "", "baz"]],
|
16
|
+
["\",\"", [","]],
|
17
|
+
["foo", ["foo"]],
|
18
|
+
[",,", ['', '', '']],
|
19
|
+
[",", ['', '']],
|
20
|
+
["foo,\"\n\",baz", ["foo", "\n", "baz"]],
|
21
|
+
["foo,,baz", ["foo", '', "baz"]],
|
22
|
+
["\"\"\"\r\",\"\"\"\r\"", ["\"\r", "\"\r"]],
|
23
|
+
["\",\",\",\"", [",", ","]],
|
24
|
+
["foo,bar,", ["foo", "bar", '']],
|
25
|
+
[",foo,bar", ['', "foo", "bar"]],
|
26
|
+
["foo,bar", ["foo", "bar"]],
|
27
|
+
[";", [";"]],
|
28
|
+
["\t,\t", ["\t", "\t"]],
|
29
|
+
["foo,\"\r\n\r\",baz", ["foo", "\r\n\r", "baz"]],
|
30
|
+
["foo,\"\r\n\n\",baz", ["foo", "\r\n\n", "baz"]],
|
31
|
+
["foo,\"foo,bar\",baz", ["foo", "foo,bar", "baz"]],
|
32
|
+
[";,;", [";", ";"]]
|
33
|
+
].each do |line, result|
|
34
|
+
it "parses #{line}" do
|
35
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
36
|
+
expect(array).to eq result
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
[ ["foo,\"\"\"\"\"\",baz", ["foo", "\"\"", "baz"]],
|
41
|
+
["foo,\"\"\"bar\"\"\",baz", ["foo", "\"bar\"", "baz"]],
|
42
|
+
["foo,\"\r\n\",baz", ["foo", "\r\n", "baz"]],
|
43
|
+
["\"\"", [""]],
|
44
|
+
["foo,\"\"\"\",baz", ["foo", "\"", "baz"]],
|
45
|
+
["foo,\"\r.\n\",baz", ["foo", "\r.\n", "baz"]],
|
46
|
+
["foo,\"\r\",baz", ["foo", "\r", "baz"]],
|
47
|
+
["foo,\"\",baz", ["foo", "", "baz"]],
|
48
|
+
["foo", ["foo"]],
|
49
|
+
[",,", ['', '', '']],
|
50
|
+
[",", ['', '']],
|
51
|
+
["foo,\"\n\",baz", ["foo", "\n", "baz"]],
|
52
|
+
["foo,,baz", ["foo", '', "baz"]],
|
53
|
+
["foo,bar", ["foo", "bar"]],
|
54
|
+
["foo,\"\r\n\n\",baz", ["foo", "\r\n\n", "baz"]],
|
55
|
+
["foo,\"foo,bar\",baz", ["foo", "foo,bar", "baz"]]
|
56
|
+
].each do |line, result|
|
57
|
+
it "parses #{line}" do
|
58
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
59
|
+
expect(array).to eq result
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
it 'mixed quotes' do
|
64
|
+
line = %Q{Ten Thousand,10000, 2710 ,,"10,000","It's ""10 Grand"", baby",10K}
|
65
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
66
|
+
expect(array).to eq ["Ten Thousand", "10000", " 2710 ", "", "10,000", "It's \"10 Grand\", baby", "10K"]
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'single quotes in fields' do
|
70
|
+
line = 'Indoor Chrome,49.2"" L x 49.2"" W x 20.5"" H,Chrome,"Crystal,Metal,Wood",23.12'
|
71
|
+
array, array_size = SmarterCSV.send(:parse, line, options)
|
72
|
+
expect(array).to eq ['Indoor Chrome', '49.2" L x 49.2" W x 20.5" H', 'Chrome', 'Crystal,Metal,Wood', '23.12']
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
fixture_path = 'spec/fixtures'
|
4
|
+
|
5
|
+
describe 'fulfills RFC-4180 and more' do
|
6
|
+
let(:options) { {col_sep: ',', row_sep: $INPUT_RECORD_SEPARATOR, quote_char: '"' } }
|
7
|
+
|
8
|
+
context 'parses simple CSV' do
|
9
|
+
context 'RFC-4180' do
|
10
|
+
it 'separating on col_sep' do
|
11
|
+
line = 'aaa,bbb,ccc'
|
12
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [%w[aaa bbb ccc], 3]
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'preserves whitespace' do
|
16
|
+
line = ' aaa , bbb , ccc '
|
17
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
18
|
+
[' aaa ', ' bbb ', ' ccc '], 3
|
19
|
+
]
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
context 'extending RFC-4180' do
|
24
|
+
it 'with extra col_sep' do
|
25
|
+
line = 'aaa,bbb,ccc,'
|
26
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
27
|
+
['aaa', 'bbb', 'ccc', ''], 4
|
28
|
+
]
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'with extra col_sep with given header_size' do
|
32
|
+
line = 'aaa,bbb,ccc,'
|
33
|
+
expect( SmarterCSV.send(:parse, line, options, 3)).to eq [
|
34
|
+
['aaa', 'bbb', 'ccc'], 3
|
35
|
+
]
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'with multiple extra col_sep' do
|
39
|
+
line = 'aaa,bbb,ccc,,,'
|
40
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
41
|
+
['aaa', 'bbb', 'ccc', '', '', ''], 6
|
42
|
+
]
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'with multiple extra col_sep' do
|
46
|
+
line = 'aaa,bbb,ccc,,,'
|
47
|
+
expect( SmarterCSV.send(:parse, line, options, 3)).to eq [
|
48
|
+
['aaa', 'bbb', 'ccc'], 3
|
49
|
+
]
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'with multiple complex col_sep' do
|
53
|
+
line = 'aaa<=>bbb<=>ccc<=><=><=>'
|
54
|
+
expect( SmarterCSV.send(:parse, line, options.merge({col_sep: '<=>'}))).to eq [
|
55
|
+
['aaa', 'bbb', 'ccc', '', '', ''], 6
|
56
|
+
]
|
57
|
+
end
|
58
|
+
|
59
|
+
it 'with multiple complex col_sep with given header_size' do
|
60
|
+
line = 'aaa<=>bbb<=>ccc<=><=><=>'
|
61
|
+
expect( SmarterCSV.send(:parse, line, options.merge({col_sep: '<=>'}), 3)).to eq [
|
62
|
+
['aaa', 'bbb', 'ccc'], 3
|
63
|
+
]
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
context 'parses quoted CSV' do
|
69
|
+
context 'RFC-4180' do
|
70
|
+
it 'separating on col_sep' do
|
71
|
+
line = '"aaa","bbb","ccc"'
|
72
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [%w[aaa bbb ccc], 3]
|
73
|
+
end
|
74
|
+
|
75
|
+
it 'parses corner case correctly' do
|
76
|
+
line = '"Board 4""","$17.40","10000003427"'
|
77
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
78
|
+
['Board 4"', '$17.40', '10000003427'], 3
|
79
|
+
]
|
80
|
+
end
|
81
|
+
|
82
|
+
it 'quoted parts can contain spaces' do
|
83
|
+
line = '" aaa1 aaa2 "," bbb1 bbb2 "," ccc1 ccc2 "'
|
84
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
85
|
+
[' aaa1 aaa2 ', ' bbb1 bbb2 ', ' ccc1 ccc2 '], 3
|
86
|
+
]
|
87
|
+
end
|
88
|
+
|
89
|
+
it 'quoted parts can contain row_sep' do
|
90
|
+
line = '"aaa1, aaa2","bbb1, bbb2","ccc1, ccc2"'
|
91
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
92
|
+
['aaa1, aaa2', 'bbb1, bbb2', 'ccc1, ccc2'], 3
|
93
|
+
]
|
94
|
+
end
|
95
|
+
|
96
|
+
it 'quoted parts can contain row_sep' do
|
97
|
+
line = '"aaa1, ""aaa2"", aaa3","""bbb1"", bbb2","ccc1, ""ccc2"""'
|
98
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
99
|
+
['aaa1, "aaa2", aaa3', '"bbb1", bbb2', 'ccc1, "ccc2"'], 3
|
100
|
+
]
|
101
|
+
end
|
102
|
+
|
103
|
+
it 'some fields are quoted' do
|
104
|
+
line = '1,"board 4""",12.95'
|
105
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
106
|
+
['1', 'board 4"', '12.95'], 3
|
107
|
+
]
|
108
|
+
end
|
109
|
+
|
110
|
+
it 'separating on col_sep' do
|
111
|
+
line = '"some","thing","""completely"" different"'
|
112
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
113
|
+
['some', 'thing', '"completely" different'], 3
|
114
|
+
]
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
context 'extending RFC-4180' do
|
119
|
+
it 'with extra col_sep, without given header_size' do
|
120
|
+
line = '"aaa","bbb","ccc",'
|
121
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
122
|
+
['aaa', 'bbb', 'ccc', ''], 4
|
123
|
+
]
|
124
|
+
end
|
125
|
+
|
126
|
+
it 'with extra col_sep, with given header_size' do
|
127
|
+
line = '"aaa","bbb","ccc",'
|
128
|
+
expect( SmarterCSV.send(:parse, line, options, 3)).to eq [%w[aaa bbb ccc], 3]
|
129
|
+
end
|
130
|
+
|
131
|
+
it 'with multiple extra col_sep, without given header_size' do
|
132
|
+
line = '"aaa","bbb","ccc",,,'
|
133
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
134
|
+
['aaa', 'bbb', 'ccc', '', '', ''], 6
|
135
|
+
]
|
136
|
+
end
|
137
|
+
|
138
|
+
it 'with multiple extra col_sep, with given header_size' do
|
139
|
+
line = '"aaa","bbb","ccc",,,'
|
140
|
+
expect( SmarterCSV.send(:parse, line, options, 3)).to eq [
|
141
|
+
['aaa', 'bbb', 'ccc'], 3
|
142
|
+
]
|
143
|
+
end
|
144
|
+
|
145
|
+
it 'with multiple complex extra col_sep, without given header_size' do
|
146
|
+
line = '"aaa"<=>"bbb"<=>"ccc"<=><=><=>'
|
147
|
+
expect( SmarterCSV.send(:parse, line, options.merge({col_sep: '<=>'}))).to eq [
|
148
|
+
['aaa', 'bbb', 'ccc', '', '', ''], 6
|
149
|
+
]
|
150
|
+
end
|
151
|
+
|
152
|
+
it 'with multiple complex extra col_sep, with given header_size' do
|
153
|
+
line = '"aaa"<=>"bbb"<=>"ccc"<=><=><=>'
|
154
|
+
expect( SmarterCSV.send(:parse, line, options.merge({col_sep: '<=>'}), 3)).to eq [
|
155
|
+
['aaa', 'bbb', 'ccc'], 3
|
156
|
+
]
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
# relaxed parsing compared to RFC-4180
|
162
|
+
context 'liberal_parsing' do
|
163
|
+
it 'parses corner case correctly' do
|
164
|
+
line = 'is,this "three, or four",fields'
|
165
|
+
expect( SmarterCSV.send(:parse, line, options)).to eq [
|
166
|
+
['is', 'this "three, or four"', 'fields'], 3
|
167
|
+
]
|
168
|
+
end
|
169
|
+
end
|
170
|
+
end
|
@@ -3,7 +3,6 @@ require 'spec_helper'
|
|
3
3
|
fixture_path = 'spec/fixtures'
|
4
4
|
|
5
5
|
describe 'loading file with quoted fields' do
|
6
|
-
|
7
6
|
it 'leaving the quotes in the data' do
|
8
7
|
options = {}
|
9
8
|
data = SmarterCSV.process("#{fixture_path}/quoted.csv", options)
|
@@ -12,6 +11,7 @@ describe 'loading file with quoted fields' do
|
|
12
11
|
data[1][:description].should be_nil
|
13
12
|
data[2][:model].should eq 'Venture "Extended Edition, Very Large"'
|
14
13
|
data[2][:description].should be_nil
|
14
|
+
data[3][:description].should eq 'MUST SELL! air, moon roof, loaded'
|
15
15
|
data.each do |h|
|
16
16
|
h[:year].class.should eq Fixnum
|
17
17
|
h[:make].should_not be_nil
|
@@ -20,17 +20,21 @@ describe 'loading file with quoted fields' do
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
-
|
23
|
+
# quotes inside quoted fields need to be escaped by another double-quote
|
24
24
|
it 'removes quotes around quoted fields, but not inside data' do
|
25
25
|
options = {}
|
26
26
|
data = SmarterCSV.process("#{fixture_path}/quote_char.csv", options)
|
27
27
|
|
28
28
|
data.length.should eq 6
|
29
|
+
data[0][:first_name].should eq "\"John"
|
30
|
+
data[0][:last_name].should eq "Cooke\""
|
29
31
|
data[1][:first_name].should eq "Jam\ne\nson\""
|
30
32
|
data[2][:first_name].should eq "\"Jean"
|
33
|
+
data[4][:first_name].should eq "Bo\"bbie"
|
34
|
+
data[5][:first_name].should eq 'Mica'
|
35
|
+
data[5][:last_name].should eq 'Copeland'
|
31
36
|
end
|
32
37
|
|
33
|
-
|
34
38
|
# NOTE: quotes inside headers need to be escaped by doubling them
|
35
39
|
# e.g. 'correct ""EXAMPLE""'
|
36
40
|
# this escaping is illegal: 'incorrect \"EXAMPLE\"' <-- this caused CSV parsing error
|
@@ -43,6 +47,6 @@ describe 'loading file with quoted fields' do
|
|
43
47
|
data.length.should eq 3
|
44
48
|
data.first.keys[2].should eq :isbn
|
45
49
|
data.first.keys[3].should eq :discounted_price
|
50
|
+
data[1][:author].should eq 'Timothy "The Parser" Campbell'
|
46
51
|
end
|
47
|
-
|
48
52
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: smarter_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tilo Sloboda
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-05-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: awesome_print
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, with
|
42
56
|
optional features for processing large files in parallel, embedded comments, unusual
|
43
57
|
field- and record-separators, flexible mapping of CSV-headers to Hash-keys
|
@@ -126,6 +140,9 @@ files:
|
|
126
140
|
- spec/smarter_csv/malformed_spec.rb
|
127
141
|
- spec/smarter_csv/no_header_spec.rb
|
128
142
|
- spec/smarter_csv/not_downcase_header_spec.rb
|
143
|
+
- spec/smarter_csv/parse/column_separator_spec.rb
|
144
|
+
- spec/smarter_csv/parse/old_csv_library_spec.rb
|
145
|
+
- spec/smarter_csv/parse/rfc4180_and_more_spec.rb
|
129
146
|
- spec/smarter_csv/problematic.rb
|
130
147
|
- spec/smarter_csv/quoted_spec.rb
|
131
148
|
- spec/smarter_csv/remove_empty_values_spec.rb
|
@@ -161,8 +178,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
161
178
|
- - ">="
|
162
179
|
- !ruby/object:Gem::Version
|
163
180
|
version: '0'
|
164
|
-
requirements:
|
165
|
-
- csv
|
181
|
+
requirements: []
|
166
182
|
rubygems_version: 3.1.6
|
167
183
|
signing_key:
|
168
184
|
specification_version: 4
|
@@ -233,6 +249,9 @@ test_files:
|
|
233
249
|
- spec/smarter_csv/malformed_spec.rb
|
234
250
|
- spec/smarter_csv/no_header_spec.rb
|
235
251
|
- spec/smarter_csv/not_downcase_header_spec.rb
|
252
|
+
- spec/smarter_csv/parse/column_separator_spec.rb
|
253
|
+
- spec/smarter_csv/parse/old_csv_library_spec.rb
|
254
|
+
- spec/smarter_csv/parse/rfc4180_and_more_spec.rb
|
236
255
|
- spec/smarter_csv/problematic.rb
|
237
256
|
- spec/smarter_csv/quoted_spec.rb
|
238
257
|
- spec/smarter_csv/remove_empty_values_spec.rb
|