smarter_csv 1.8.5 → 1.9.2.pre01
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +13 -1
- data/CHANGELOG.md +34 -0
- data/Gemfile +6 -3
- data/README.md +24 -4
- data/Rakefile +29 -15
- data/ext/smarter_csv/extconf.rb +2 -2
- data/ext/smarter_csv/smarter_csv.c +8 -2
- data/lib/smarter_csv/options_processing.rb +84 -0
- data/lib/smarter_csv/smarter_csv.rb +556 -0
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +27 -605
- data/smarter_csv.gemspec +10 -6
- metadata +9 -6
- /data/lib/{extensions → core_ext}/hash.rb +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 75d9d441d771c2fbe0861e0bc5f84dbf05d010b4844a867fc4679df002822d07
|
4
|
+
data.tar.gz: 0d5d37d4f2654fd354a2adac23019b2955540c1356c57f72052c01220598ffa2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b24e2b09ea919994da347eb52b781868a19e0f28dc367bfeb43b8a254619ab8dd882d3035f0546683c2ebc893fd600ce05a3abb800e4b124c7369d314607ee3f
|
7
|
+
data.tar.gz: 3a1115ac4937c2fedf469d1f45e3aa1cf7ed03f1f55d66f6cb310c767a3b5e8cb4966a19ba968f065c5d1de2d7074f479b5eeb0686dbab8012e5e6b8ed0f2628
|
data/.rubocop.yml
CHANGED
@@ -22,6 +22,9 @@ Metrics/BlockLength:
|
|
22
22
|
Metrics/BlockNesting:
|
23
23
|
Enabled: false
|
24
24
|
|
25
|
+
Metrics/ClassLength:
|
26
|
+
Enabled: false
|
27
|
+
|
25
28
|
Metrics/CyclomaticComplexity: # BS rule
|
26
29
|
Enabled: false
|
27
30
|
|
@@ -46,6 +49,9 @@ Naming/VariableNumber:
|
|
46
49
|
Style/ClassEqualityComparison:
|
47
50
|
Enabled: false
|
48
51
|
|
52
|
+
Style/ClassMethods:
|
53
|
+
Enabled: false
|
54
|
+
|
49
55
|
Style/ConditionalAssignment:
|
50
56
|
Enabled: false
|
51
57
|
|
@@ -114,6 +120,9 @@ Style/StringLiteralsInInterpolation:
|
|
114
120
|
Enabled: false
|
115
121
|
EnforcedStyle: double_quotes
|
116
122
|
|
123
|
+
Style/SymbolArray:
|
124
|
+
Enabled: false
|
125
|
+
|
117
126
|
Style/SymbolProc: # old Ruby versions can't do this
|
118
127
|
Enabled: false
|
119
128
|
|
@@ -123,6 +132,9 @@ Style/TrailingCommaInHashLiteral:
|
|
123
132
|
Style/TrailingUnderscoreVariable:
|
124
133
|
Enabled: false
|
125
134
|
|
135
|
+
Style/TrivialAccessors:
|
136
|
+
Enabled: false
|
137
|
+
|
126
138
|
# Style/UnlessModifier:
|
127
139
|
# Enabled: false
|
128
140
|
|
@@ -130,4 +142,4 @@ Style/ZeroLengthPredicate:
|
|
130
142
|
Enabled: false
|
131
143
|
|
132
144
|
Layout/LineLength:
|
133
|
-
Max:
|
145
|
+
Max: 256
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,40 @@
|
|
1
1
|
|
2
2
|
# SmarterCSV 1.x Change Log
|
3
3
|
|
4
|
+
## 1.9.2.pre01 (2023-11-11)
|
5
|
+
* fixed bug with '\\' at end of line (issue #252)
|
6
|
+
* fixed require statements
|
7
|
+
|
8
|
+
## 1.9.1 (2023-10-30) (YANKED)
|
9
|
+
* yanked
|
10
|
+
* no functional changes
|
11
|
+
* refactored directory structure
|
12
|
+
* re-added JRuby and TruffleRuby to CI tests
|
13
|
+
* no C-acceleration for JRuby
|
14
|
+
* refactored options parsing
|
15
|
+
* code coverage / rubocop
|
16
|
+
|
17
|
+
## 1.9.0 (2023-09-04)
|
18
|
+
* fixed issue #139
|
19
|
+
|
20
|
+
* Error `SmarterCSV::MissingHeaders` was renamed to `SmarterCSV::MissingKeys`
|
21
|
+
|
22
|
+
* CHANGED BEHAVIOR:
|
23
|
+
When `key_mapping` option is used. (issue #139)
|
24
|
+
Previous versions just printed an error message when a CSV header was missing during key mapping.
|
25
|
+
Versions >= 1.9 will throw `SmarterCSV::MissingHeaders` listing all headers that were missing during mapping.
|
26
|
+
|
27
|
+
* Notable details for `key_mapping` and `required_headers`:
|
28
|
+
|
29
|
+
* `key_mapping` is applied to the headers early on during `SmarterCSV.process`, and raises an error if a header in the input CSV file is missing, and we can not map that header to its desired name.
|
30
|
+
|
31
|
+
Mapping errors can be suppressed by using:
|
32
|
+
* `silence_missing_keys` set to `true`, which silences all such errors, making all headers for mapping optional.
|
33
|
+
* `silence_missing_keys` given an Array with the specific header keys that are optional
|
34
|
+
The use case is that some header fields are optional, but we still want them renamed if they are present.
|
35
|
+
|
36
|
+
* `required_headers` checks which headers are present **after** `key_mapping` was applied.
|
37
|
+
|
4
38
|
## 1.8.5 (2023-06-25)
|
5
39
|
* fix parsing of escaped quote characters (thanks to JP Camara)
|
6
40
|
|
data/Gemfile
CHANGED
@@ -5,10 +5,13 @@ source 'https://rubygems.org'
|
|
5
5
|
# Specify your gem's dependencies in smarter_csv.gemspec
|
6
6
|
gemspec
|
7
7
|
|
8
|
-
gem "rake"
|
8
|
+
gem "rake"
|
9
9
|
gem "rake-compiler"
|
10
10
|
|
11
11
|
gem 'pry'
|
12
|
-
|
13
|
-
gem "rspec"
|
14
12
|
gem "rubocop"
|
13
|
+
|
14
|
+
group :test do
|
15
|
+
gem "rspec"
|
16
|
+
gem "simplecov"
|
17
|
+
end
|
data/README.md
CHANGED
@@ -46,7 +46,11 @@ One `smarter_csv` user wrote:
|
|
46
46
|
* able to ignore "columns" in the input (delete columns)
|
47
47
|
* able to eliminate nil or empty fields from the result hashes (default)
|
48
48
|
|
49
|
-
|
49
|
+
#### Assumptions / Limitations
|
50
|
+
* It is assumed that the escape character is `\`, as on UNIX and Windows systems.
|
51
|
+
* It is assumed that quote characters around fields are balanced, e.g. valid: `"field"`, invalid: `"field\"`
|
52
|
+
i.e. an escaped `quote_char` does not denote the end of a field.
|
53
|
+
* This Gem is only for importing CSV files - writing of CSV files is not supported at this time.
|
50
54
|
|
51
55
|
### Why?
|
52
56
|
|
@@ -161,7 +165,22 @@ and how the `process` method returns the number of chunks when called with a blo
|
|
161
165
|
=> returns number of chunks / rows we processed
|
162
166
|
```
|
163
167
|
|
164
|
-
#### Example 4:
|
168
|
+
#### Example 4: Processing a CSV File, and inserting batch jobs in Sidekiq:
|
169
|
+
```ruby
|
170
|
+
filename = '/tmp/input.csv' # CSV file containing ids or data to process
|
171
|
+
options = { :chunk_size => 100 }
|
172
|
+
n = SmarterCSV.process(filename, options) do |chunk|
|
173
|
+
Sidekiq::Client.push_bulk(
|
174
|
+
'class' => SidekiqIndividualWorkerClass,
|
175
|
+
'args' => chunk,
|
176
|
+
)
|
177
|
+
# OR:
|
178
|
+
# SidekiqBatchWorkerClass.process_async(chunk ) # pass an array of hashes to Sidekiq workers for parallel processing
|
179
|
+
end
|
180
|
+
=> returns number of chunks
|
181
|
+
```
|
182
|
+
|
183
|
+
#### Example 4b: Reading a CSV-like File, and Processing it with Sidekiq:
|
165
184
|
```ruby
|
166
185
|
filename = '/tmp/strange_db_dump' # a file with CTRL-A as col_separator, and with CTRL-B\n as record_separator (hello iTunes!)
|
167
186
|
options = {
|
@@ -173,7 +192,6 @@ and how the `process` method returns the number of chunks when called with a blo
|
|
173
192
|
end
|
174
193
|
=> returns number of chunks
|
175
194
|
```
|
176
|
-
|
177
195
|
#### Example 5: Populate a MongoDB Database in Chunks of 100 records with SmarterCSV:
|
178
196
|
```ruby
|
179
197
|
# using chunks:
|
@@ -282,7 +300,9 @@ And header and data validations will also be supported in 2.x
|
|
282
300
|
| Option | Default | Explanation |
|
283
301
|
---------------------------------------------------------------------------------------------------------------------------------
|
284
302
|
| :key_mapping | nil | a hash which maps headers from the CSV file to keys in the result hash |
|
285
|
-
| :silence_missing_key | false | ignore missing keys in `key_mapping`
|
303
|
+
| :silence_missing_keys | false | ignore missing keys in `key_mapping` |
|
304
|
+
| | | if set to true: makes all mapped keys optional |
|
305
|
+
| | | if given an array, makes only the keys listed in it optional |
|
286
306
|
| :required_keys | nil | An array. Specify the required names AFTER header transformation. |
|
287
307
|
| :required_headers | nil | (DEPRECATED / renamed) Use `required_keys` instead |
|
288
308
|
| | | or an exception is raised No validation if nil is given. |
|
data/Rakefile
CHANGED
@@ -3,16 +3,15 @@
|
|
3
3
|
require "bundler/gem_tasks"
|
4
4
|
require 'rspec/core/rake_task'
|
5
5
|
|
6
|
-
|
7
|
-
#
|
8
|
-
#
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
### end of tempfix
|
6
|
+
# # temp fix for NoMethodError: undefined method `last_comment'
|
7
|
+
# # remove when fixed in Rake 11.x and higher
|
8
|
+
# module TempFixForRakeLastComment
|
9
|
+
# def last_comment
|
10
|
+
# last_description
|
11
|
+
# end
|
12
|
+
# end
|
13
|
+
# Rake::Application.send :include, TempFixForRakeLastComment
|
14
|
+
# ### end of tempfix
|
16
15
|
|
17
16
|
RSpec::Core::RakeTask.new(:spec)
|
18
17
|
|
@@ -22,11 +21,26 @@ RuboCop::RakeTask.new
|
|
22
21
|
|
23
22
|
require "rake/extensiontask"
|
24
23
|
|
25
|
-
|
24
|
+
if RUBY_ENGINE == 'jruby'
|
25
|
+
|
26
|
+
task default: %i[spec]
|
26
27
|
|
27
|
-
|
28
|
-
|
28
|
+
else
|
29
|
+
task build: :compile
|
30
|
+
|
31
|
+
Rake::ExtensionTask.new("smarter_csv") do |ext|
|
32
|
+
ext.lib_dir = "lib/smarter_csv"
|
33
|
+
ext.ext_dir = "ext/smarter_csv"
|
34
|
+
ext.source_pattern = "*.{c,h}"
|
35
|
+
end
|
36
|
+
|
37
|
+
# task default: %i[clobber compile spec rubocop]
|
38
|
+
task default: %i[clobber compile spec]
|
29
39
|
end
|
30
40
|
|
31
|
-
|
32
|
-
task
|
41
|
+
desc 'Run spec with coverage'
|
42
|
+
task :coverage do
|
43
|
+
ENV['COVERAGE'] = 'true'
|
44
|
+
Rake::Task['spec'].execute
|
45
|
+
`open coverage/index.html`
|
46
|
+
end
|
data/ext/smarter_csv/extconf.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'mkmf'
|
4
|
-
|
5
4
|
require "rbconfig"
|
5
|
+
|
6
6
|
if RbConfig::MAKEFILE_CONFIG["CFLAGS"].include?("-g -O3")
|
7
|
-
fixed_CFLAGS = RbConfig::MAKEFILE_CONFIG["CFLAGS"].sub("-g -O3", "$(cflags)")
|
7
|
+
fixed_CFLAGS = RbConfig::MAKEFILE_CONFIG["CFLAGS"].sub("-g -O3", "-O3 $(cflags)")
|
8
8
|
puts("Fix CFLAGS: #{RbConfig::MAKEFILE_CONFIG["CFLAGS"]} -> #{fixed_CFLAGS}")
|
9
9
|
RbConfig::MAKEFILE_CONFIG["CFLAGS"] = fixed_CFLAGS
|
10
10
|
end
|
@@ -40,6 +40,7 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
40
40
|
long i;
|
41
41
|
|
42
42
|
char prev_char = '\0'; // Store the previous character for comparison against an escape character
|
43
|
+
long backslash_count = 0; // to count consecutive backslash characters
|
43
44
|
|
44
45
|
while (p < endP) {
|
45
46
|
/* does the remaining string start with col_sep ? */
|
@@ -61,8 +62,13 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
61
62
|
startP = p;
|
62
63
|
}
|
63
64
|
} else {
|
64
|
-
if (*p ==
|
65
|
-
|
65
|
+
if (*p == '\\') {
|
66
|
+
backslash_count++;
|
67
|
+
} else {
|
68
|
+
if (*p == *quoteP && (backslash_count % 2 == 0)) {
|
69
|
+
quote_count++;
|
70
|
+
}
|
71
|
+
backslash_count = 0; // no more consecutive backslash characters
|
66
72
|
}
|
67
73
|
p++;
|
68
74
|
}
|
@@ -0,0 +1,84 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
|
4
|
+
DEFAULT_OPTIONS = {
|
5
|
+
acceleration: true,
|
6
|
+
auto_row_sep_chars: 500,
|
7
|
+
chunk_size: nil,
|
8
|
+
col_sep: :auto, # was: ',',
|
9
|
+
comment_regexp: nil, # was: /\A#/,
|
10
|
+
convert_values_to_numeric: true,
|
11
|
+
downcase_header: true,
|
12
|
+
duplicate_header_suffix: nil,
|
13
|
+
file_encoding: 'utf-8',
|
14
|
+
force_simple_split: false,
|
15
|
+
force_utf8: false,
|
16
|
+
headers_in_file: true,
|
17
|
+
invalid_byte_sequence: '',
|
18
|
+
keep_original_headers: false,
|
19
|
+
key_mapping: nil,
|
20
|
+
quote_char: '"',
|
21
|
+
remove_empty_hashes: true,
|
22
|
+
remove_empty_values: true,
|
23
|
+
remove_unmapped_keys: false,
|
24
|
+
remove_values_matching: nil,
|
25
|
+
remove_zero_values: false,
|
26
|
+
required_headers: nil,
|
27
|
+
required_keys: nil,
|
28
|
+
row_sep: :auto, # was: $/,
|
29
|
+
silence_missing_keys: false,
|
30
|
+
skip_lines: nil,
|
31
|
+
strings_as_keys: false,
|
32
|
+
strip_chars_from_headers: nil,
|
33
|
+
strip_whitespace: true,
|
34
|
+
user_provided_headers: nil,
|
35
|
+
value_converters: nil,
|
36
|
+
verbose: false,
|
37
|
+
with_line_numbers: false,
|
38
|
+
}.freeze
|
39
|
+
|
40
|
+
class << self
|
41
|
+
# NOTE: this is not called when "parse" methods are tested by themselves
|
42
|
+
def process_options(given_options = {})
|
43
|
+
puts "User provided options:\n#{pp(given_options)}\n" if given_options[:verbose]
|
44
|
+
|
45
|
+
# fix invalid input
|
46
|
+
given_options[:invalid_byte_sequence] = '' if given_options[:invalid_byte_sequence].nil?
|
47
|
+
|
48
|
+
@options = DEFAULT_OPTIONS.dup.merge!(given_options)
|
49
|
+
puts "Computed options:\n#{pp(@options)}\n" if given_options[:verbose]
|
50
|
+
|
51
|
+
validate_options!(@options)
|
52
|
+
@options
|
53
|
+
end
|
54
|
+
|
55
|
+
# NOTE: this is not called when "parse" methods are tested by themselves
|
56
|
+
#
|
57
|
+
# ONLY FOR BACKWARDS-COMPATIBILITY
|
58
|
+
def default_options
|
59
|
+
DEFAULT_OPTIONS
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
def validate_options!(options)
|
65
|
+
keys = options.keys
|
66
|
+
errors = []
|
67
|
+
errors << "invalid row_sep" if keys.include?(:row_sep) && !option_valid?(options[:row_sep])
|
68
|
+
errors << "invalid col_sep" if keys.include?(:col_sep) && !option_valid?(options[:col_sep])
|
69
|
+
errors << "invalid quote_char" if keys.include?(:quote_char) && !option_valid?(options[:quote_char])
|
70
|
+
raise SmarterCSV::ValidationError, errors.inspect if errors.any?
|
71
|
+
end
|
72
|
+
|
73
|
+
def option_valid?(str)
|
74
|
+
return true if str.is_a?(Symbol) && str == :auto
|
75
|
+
return true if str.is_a?(String) && !str.empty?
|
76
|
+
|
77
|
+
false
|
78
|
+
end
|
79
|
+
|
80
|
+
def pp(value)
|
81
|
+
defined?(AwesomePrint) ? value.awesome_inspect(index: nil) : value.inspect
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|