smarter_csv 1.8.5 → 1.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +13 -1
- data/CHANGELOG.md +21 -0
- data/README.md +19 -3
- data/Rakefile +9 -10
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +24 -19
- data/smarter_csv.gemspec +8 -5
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 20db4a75108d2b7934b90e6d0e3fef3053e61ad077b4ec4966a97a6e5cb3aa42
|
4
|
+
data.tar.gz: 44d1c3b995b3f7d53d46768437517379dfe3b93fa1db1384054ac16baadfb8d6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74a2edab893bd9e1b798b03321aea55b566accb92a47d939bdddb616af57fc8525b60b50fb94d1f67face95c83d0a00300f587355dd4934130f5bac9879d5dcd
|
7
|
+
data.tar.gz: 1cb78471b4021dafed4fc1bdd42c2acae934eb1a89f05e9700bfb99f40543c770db58b9b42a70272de9797bdf43fe5d53fa8c352964c1574d658ab2f881a06d6
|
data/.rubocop.yml
CHANGED
@@ -22,6 +22,9 @@ Metrics/BlockLength:
|
|
22
22
|
Metrics/BlockNesting:
|
23
23
|
Enabled: false
|
24
24
|
|
25
|
+
Metrics/ClassLength:
|
26
|
+
Enabled: false
|
27
|
+
|
25
28
|
Metrics/CyclomaticComplexity: # BS rule
|
26
29
|
Enabled: false
|
27
30
|
|
@@ -46,6 +49,9 @@ Naming/VariableNumber:
|
|
46
49
|
Style/ClassEqualityComparison:
|
47
50
|
Enabled: false
|
48
51
|
|
52
|
+
Style/ClassMethods:
|
53
|
+
Enabled: false
|
54
|
+
|
49
55
|
Style/ConditionalAssignment:
|
50
56
|
Enabled: false
|
51
57
|
|
@@ -114,6 +120,9 @@ Style/StringLiteralsInInterpolation:
|
|
114
120
|
Enabled: false
|
115
121
|
EnforcedStyle: double_quotes
|
116
122
|
|
123
|
+
Style/SymbolArray:
|
124
|
+
Enabled: false
|
125
|
+
|
117
126
|
Style/SymbolProc: # old Ruby versions can't do this
|
118
127
|
Enabled: false
|
119
128
|
|
@@ -123,6 +132,9 @@ Style/TrailingCommaInHashLiteral:
|
|
123
132
|
Style/TrailingUnderscoreVariable:
|
124
133
|
Enabled: false
|
125
134
|
|
135
|
+
Style/TrivialAccessors:
|
136
|
+
Enabled: false
|
137
|
+
|
126
138
|
# Style/UnlessModifier:
|
127
139
|
# Enabled: false
|
128
140
|
|
@@ -130,4 +142,4 @@ Style/ZeroLengthPredicate:
|
|
130
142
|
Enabled: false
|
131
143
|
|
132
144
|
Layout/LineLength:
|
133
|
-
Max:
|
145
|
+
Max: 256
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,27 @@
|
|
1
1
|
|
2
2
|
# SmarterCSV 1.x Change Log
|
3
3
|
|
4
|
+
## 1.9.0 (2023-09-04)
|
5
|
+
* fixed issue #139
|
6
|
+
|
7
|
+
* Error `SmarterCSV::MissingHeaders` was renamed to `SmarterCSV::MissingKeys`
|
8
|
+
|
9
|
+
* CHANGED BEHAVIOR:
|
10
|
+
When `key_mapping` option is used. (issue #139)
|
11
|
+
Previous versions just printed an error message when a CSV header was missing during key mapping.
|
12
|
+
Versions >= 1.9 will throw `SmarterCSV::KeyMappingError` listing all headers that were missing during mapping.
|
13
|
+
|
14
|
+
* Notable details for `key_mapping` and `required_headers`:
|
15
|
+
|
16
|
+
* `key_mapping` is applied to the headers early on during `SmarterCSV.process`, and raises an error if a header in the input CSV file is missing, and we can not map that header to its desired name.
|
17
|
+
|
18
|
+
Mapping errors can be suppressed by using:
|
19
|
+
* `silence_missing_keys` set to `true`, which silences all such errors, making all headers for mapping optional.
|
20
|
+
* `silence_missing_keys` given an Array with the specific header keys that are optional
|
21
|
+
The use case is that some header fields are optional, but we still want them renamed if they are present.
|
22
|
+
|
23
|
+
* `required_headers` checks which headers are present **after** `key_mapping` was applied.
|
24
|
+
|
4
25
|
## 1.8.5 (2023-06-25)
|
5
26
|
* fix parsing of escaped quote characters (thanks to JP Camara)
|
6
27
|
|
data/README.md
CHANGED
@@ -161,7 +161,22 @@ and how the `process` method returns the number of chunks when called with a blo
|
|
161
161
|
=> returns number of chunks / rows we processed
|
162
162
|
```
|
163
163
|
|
164
|
-
#### Example 4:
|
164
|
+
#### Example 4: Processing a CSV File, and inserting batch jobs in Sidekiq:
|
165
|
+
```ruby
|
166
|
+
filename = '/tmp/input.csv' # CSV file containing ids or data to process
|
167
|
+
options = { :chunk_size => 100 }
|
168
|
+
n = SmarterCSV.process(filename, options) do |chunk|
|
169
|
+
Sidekiq::Client.push_bulk(
|
170
|
+
'class' => SidekiqIndividualWorkerClass,
|
171
|
+
'args' => chunk,
|
172
|
+
)
|
173
|
+
# OR:
|
174
|
+
# SidekiqBatchWorkerClass.process_async(chunk ) # pass an array of hashes to Sidekiq workers for parallel processing
|
175
|
+
end
|
176
|
+
=> returns number of chunks
|
177
|
+
```
|
178
|
+
|
179
|
+
#### Example 4b: Reading a CSV-like File, and Processing it with Sidekiq:
|
165
180
|
```ruby
|
166
181
|
filename = '/tmp/strange_db_dump' # a file with CTRL-A as col_separator, and with CTRL-B\n as record_separator (hello iTunes!)
|
167
182
|
options = {
|
@@ -173,7 +188,6 @@ and how the `process` method returns the number of chunks when called with a blo
|
|
173
188
|
end
|
174
189
|
=> returns number of chunks
|
175
190
|
```
|
176
|
-
|
177
191
|
#### Example 5: Populate a MongoDB Database in Chunks of 100 records with SmarterCSV:
|
178
192
|
```ruby
|
179
193
|
# using chunks:
|
@@ -282,7 +296,9 @@ And header and data validations will also be supported in 2.x
|
|
282
296
|
| Option | Default | Explanation |
|
283
297
|
---------------------------------------------------------------------------------------------------------------------------------
|
284
298
|
| :key_mapping | nil | a hash which maps headers from the CSV file to keys in the result hash |
|
285
|
-
| :silence_missing_key | false | ignore missing keys in `key_mapping`
|
299
|
+
| :silence_missing_keys | false | ignore missing keys in `key_mapping` |
|
300
|
+
| | | if set to true: makes all mapped keys optional |
|
301
|
+
| | | if given an array, makes only the keys listed in it optional |
|
286
302
|
| :required_keys | nil | An array. Specify the required names AFTER header transformation. |
|
287
303
|
| :required_headers | nil | (DEPRECATED / renamed) Use `required_keys` instead |
|
288
304
|
| | | or an exception is raised. No validation if nil is given. |
|
data/Rakefile
CHANGED
@@ -3,16 +3,15 @@
|
|
3
3
|
require "bundler/gem_tasks"
|
4
4
|
require 'rspec/core/rake_task'
|
5
5
|
|
6
|
-
|
7
|
-
#
|
8
|
-
#
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
### end of tempfix
|
6
|
+
# # temp fix for NoMethodError: undefined method `last_comment'
|
7
|
+
# # remove when fixed in Rake 11.x and higher
|
8
|
+
# module TempFixForRakeLastComment
|
9
|
+
# def last_comment
|
10
|
+
# last_description
|
11
|
+
# end
|
12
|
+
# end
|
13
|
+
# Rake::Application.send :include, TempFixForRakeLastComment
|
14
|
+
# ### end of tempfix
|
16
15
|
|
17
16
|
RSpec::Core::RakeTask.new(:spec)
|
18
17
|
|
data/lib/smarter_csv/version.rb
CHANGED
data/lib/smarter_csv.rb
CHANGED
@@ -12,12 +12,12 @@ module SmarterCSV
|
|
12
12
|
class IncorrectOption < SmarterCSVException; end
|
13
13
|
class ValidationError < SmarterCSVException; end
|
14
14
|
class DuplicateHeaders < SmarterCSVException; end
|
15
|
-
class
|
15
|
+
class MissingKeys < SmarterCSVException; end # previously known as MissingHeaders
|
16
16
|
class NoColSepDetected < SmarterCSVException; end
|
17
|
-
class KeyMappingError < SmarterCSVException; end
|
17
|
+
class KeyMappingError < SmarterCSVException; end
|
18
18
|
|
19
19
|
# first parameter: filename or input object which responds to readline method
|
20
|
-
def SmarterCSV.process(input, options = {}, &block)
|
20
|
+
def SmarterCSV.process(input, options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
|
21
21
|
options = default_options.merge(options)
|
22
22
|
options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
|
23
23
|
puts "SmarterCSV OPTIONS: #{options.inspect}" if options[:verbose]
|
@@ -99,7 +99,7 @@ module SmarterCSV
|
|
99
99
|
hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
|
100
100
|
end
|
101
101
|
|
102
|
-
hash.delete_if{|_k, v| !v.nil? && v =~ /^(
|
102
|
+
hash.delete_if{|_k, v| !v.nil? && v =~ /^(0+|0+\.0+)$/} if options[:remove_zero_values] # values are Strings
|
103
103
|
hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]
|
104
104
|
|
105
105
|
if options[:convert_values_to_numeric]
|
@@ -171,15 +171,15 @@ module SmarterCSV
|
|
171
171
|
result << chunk # not sure yet, why anybody would want to do this without a block
|
172
172
|
end
|
173
173
|
chunk_count += 1
|
174
|
-
chunk = [] # initialize for next chunk of data
|
174
|
+
# chunk = [] # initialize for next chunk of data
|
175
175
|
end
|
176
176
|
ensure
|
177
177
|
fh.close if fh.respond_to?(:close)
|
178
178
|
end
|
179
179
|
if block_given?
|
180
|
-
|
180
|
+
chunk_count # when we do processing through a block we only care how many chunks we processed
|
181
181
|
else
|
182
|
-
|
182
|
+
result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
|
183
183
|
end
|
184
184
|
end
|
185
185
|
|
@@ -285,11 +285,11 @@ module SmarterCSV
|
|
285
285
|
has_quotes = line =~ /#{options[:quote_char]}/
|
286
286
|
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
|
287
287
|
elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
|
288
|
-
|
288
|
+
[elements, elements.size]
|
289
289
|
# :nocov:
|
290
290
|
else
|
291
291
|
# puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
|
292
|
-
|
292
|
+
parse_csv_line_ruby(line, options, header_size)
|
293
293
|
end
|
294
294
|
end
|
295
295
|
|
@@ -402,7 +402,7 @@ module SmarterCSV
|
|
402
402
|
return true unless Array(options[option_name][:only]).include?(key)
|
403
403
|
end
|
404
404
|
end
|
405
|
-
|
405
|
+
false
|
406
406
|
end
|
407
407
|
|
408
408
|
# If file has headers, then guesses column separator from headers.
|
@@ -467,8 +467,8 @@ module SmarterCSV
|
|
467
467
|
|
468
468
|
counts["\r"] += 1 if last_char == "\r"
|
469
469
|
# find the most frequent key/value pair:
|
470
|
-
|
471
|
-
|
470
|
+
most_frequent_key, _count = counts.max_by{|_, v| v}
|
471
|
+
most_frequent_key
|
472
472
|
end
|
473
473
|
|
474
474
|
def process_headers(filehandle, options)
|
@@ -490,6 +490,7 @@ module SmarterCSV
|
|
490
490
|
|
491
491
|
file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
|
492
492
|
file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
|
493
|
+
|
493
494
|
unless options[:keep_original_headers]
|
494
495
|
file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
|
495
496
|
file_headerA.map!{|x| x.downcase} if options[:downcase_header]
|
@@ -523,10 +524,13 @@ module SmarterCSV
|
|
523
524
|
# do some key mapping on the keys in the file header
|
524
525
|
# if you want to completely delete a key, then map it to nil or to ''
|
525
526
|
if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
527
|
+
# if silence_missing_keys are not set, raise error if missing header
|
528
|
+
missing_keys = key_mappingH.keys - headerA
|
529
|
+
# if the user passes a list of specific mapped keys that are optional
|
530
|
+
missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
|
531
|
+
|
532
|
+
unless missing_keys.empty? || options[:silence_missing_keys] == true
|
533
|
+
raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
|
530
534
|
end
|
531
535
|
|
532
536
|
headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
|
@@ -544,8 +548,8 @@ module SmarterCSV
|
|
544
548
|
end
|
545
549
|
|
546
550
|
# deprecate required_headers
|
547
|
-
|
548
|
-
puts "DEPRECATION WARNING: please use 'required_keys' instead of '
|
551
|
+
unless options[:required_headers].nil?
|
552
|
+
puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
|
549
553
|
if options[:required_keys].nil?
|
550
554
|
options[:required_keys] = options[:required_headers]
|
551
555
|
options[:required_headers] = nil
|
@@ -557,7 +561,7 @@ module SmarterCSV
|
|
557
561
|
options[:required_keys].each do |k|
|
558
562
|
missing_keys << k unless headerA.include?(k)
|
559
563
|
end
|
560
|
-
raise SmarterCSV::
|
564
|
+
raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
|
561
565
|
end
|
562
566
|
|
563
567
|
@headers = headerA
|
@@ -611,6 +615,7 @@ module SmarterCSV
|
|
611
615
|
def option_valid?(str)
|
612
616
|
return true if str.is_a?(Symbol) && str == :auto
|
613
617
|
return true if str.is_a?(String) && !str.empty?
|
618
|
+
|
614
619
|
false
|
615
620
|
end
|
616
621
|
end
|
data/smarter_csv.gemspec
CHANGED
@@ -1,5 +1,7 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require File.expand_path('lib/smarter_csv/version', __dir__)
|
3
5
|
|
4
6
|
Gem::Specification.new do |spec|
|
5
7
|
spec.name = "smarter_csv"
|
@@ -7,8 +9,8 @@ Gem::Specification.new do |spec|
|
|
7
9
|
spec.authors = ["Tilo Sloboda"]
|
8
10
|
spec.email = ["tilo.sloboda@gmail.com"]
|
9
11
|
|
10
|
-
spec.summary =
|
11
|
-
spec.description =
|
12
|
+
spec.summary = "Ruby Gem for smarter importing of CSV Files (and CSV-like files), with lots of optional features, e.g. chunked processing for huge CSV files"
|
13
|
+
spec.description = "Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, with optional features for processing large files in parallel, embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers to Hash-keys"
|
12
14
|
spec.homepage = "https://github.com/tilo/smarter_csv"
|
13
15
|
spec.license = 'MIT'
|
14
16
|
|
@@ -16,6 +18,8 @@ Gem::Specification.new do |spec|
|
|
16
18
|
spec.metadata["source_code_uri"] = spec.homepage
|
17
19
|
spec.metadata["changelog_uri"] = "https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md"
|
18
20
|
|
21
|
+
spec.required_ruby_version = ">= 2.5.0"
|
22
|
+
|
19
23
|
# Specify which files should be added to the gem when it is released.
|
20
24
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
21
25
|
spec.files = Dir.chdir(__dir__) do
|
@@ -30,7 +34,6 @@ Gem::Specification.new do |spec|
|
|
30
34
|
spec.require_paths = ["lib"] # add ext here?
|
31
35
|
spec.extensions = ["ext/smarter_csv/extconf.rb"]
|
32
36
|
|
33
|
-
|
34
37
|
spec.add_development_dependency "awesome_print"
|
35
38
|
spec.add_development_dependency "codecov"
|
36
39
|
spec.add_development_dependency "pry"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: smarter_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tilo Sloboda
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-09-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: awesome_print
|
@@ -134,7 +134,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
134
134
|
requirements:
|
135
135
|
- - ">="
|
136
136
|
- !ruby/object:Gem::Version
|
137
|
-
version:
|
137
|
+
version: 2.5.0
|
138
138
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
139
|
requirements:
|
140
140
|
- - ">="
|