smarter_csv 1.1.5 → 1.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.rspec +1 -2
- data/.rubocop.yml +154 -0
- data/CHANGELOG.md +364 -0
- data/CONTRIBUTORS.md +56 -0
- data/Gemfile +7 -2
- data/LICENSE.txt +21 -0
- data/README.md +44 -441
- data/Rakefile +39 -19
- data/TO_DO_v2.md +14 -0
- data/docs/_introduction.md +56 -0
- data/docs/basic_api.md +157 -0
- data/docs/batch_processing.md +68 -0
- data/docs/data_transformations.md +50 -0
- data/docs/examples.md +75 -0
- data/docs/header_transformations.md +113 -0
- data/docs/header_validations.md +36 -0
- data/docs/options.md +98 -0
- data/docs/row_col_sep.md +104 -0
- data/docs/value_converters.md +68 -0
- data/ext/smarter_csv/extconf.rb +14 -0
- data/ext/smarter_csv/smarter_csv.c +97 -0
- data/lib/smarter_csv/auto_detection.rb +78 -0
- data/lib/smarter_csv/errors.rb +16 -0
- data/lib/smarter_csv/file_io.rb +50 -0
- data/lib/smarter_csv/hash_transformations.rb +91 -0
- data/lib/smarter_csv/header_transformations.rb +63 -0
- data/lib/smarter_csv/header_validations.rb +34 -0
- data/lib/smarter_csv/headers.rb +68 -0
- data/lib/smarter_csv/options.rb +95 -0
- data/lib/smarter_csv/parser.rb +90 -0
- data/lib/smarter_csv/reader.rb +243 -0
- data/lib/smarter_csv/version.rb +3 -1
- data/lib/smarter_csv/writer.rb +116 -0
- data/lib/smarter_csv.rb +91 -3
- data/smarter_csv.gemspec +43 -20
- metadata +122 -137
- data/.gitignore +0 -8
- data/.travis.yml +0 -19
- data/lib/extensions/hash.rb +0 -7
- data/lib/smarter_csv/smarter_csv.rb +0 -281
- data/spec/fixtures/basic.csv +0 -8
- data/spec/fixtures/binary.csv +0 -1
- data/spec/fixtures/carriage_returns_n.csv +0 -18
- data/spec/fixtures/carriage_returns_quoted.csv +0 -3
- data/spec/fixtures/carriage_returns_r.csv +0 -1
- data/spec/fixtures/carriage_returns_rn.csv +0 -18
- data/spec/fixtures/chunk_cornercase.csv +0 -10
- data/spec/fixtures/empty.csv +0 -5
- data/spec/fixtures/line_endings_n.csv +0 -4
- data/spec/fixtures/line_endings_r.csv +0 -1
- data/spec/fixtures/line_endings_rn.csv +0 -4
- data/spec/fixtures/lots_of_columns.csv +0 -2
- data/spec/fixtures/malformed.csv +0 -3
- data/spec/fixtures/malformed_header.csv +0 -3
- data/spec/fixtures/money.csv +0 -3
- data/spec/fixtures/no_header.csv +0 -7
- data/spec/fixtures/numeric.csv +0 -5
- data/spec/fixtures/pets.csv +0 -5
- data/spec/fixtures/quoted.csv +0 -5
- data/spec/fixtures/separator.csv +0 -4
- data/spec/fixtures/skip_lines.csv +0 -8
- data/spec/fixtures/valid_unicode.csv +0 -5
- data/spec/fixtures/with_dashes.csv +0 -8
- data/spec/fixtures/with_dates.csv +0 -4
- data/spec/smarter_csv/binary_file2_spec.rb +0 -24
- data/spec/smarter_csv/binary_file_spec.rb +0 -22
- data/spec/smarter_csv/carriage_return_spec.rb +0 -170
- data/spec/smarter_csv/chunked_reading_spec.rb +0 -14
- data/spec/smarter_csv/close_file_spec.rb +0 -15
- data/spec/smarter_csv/column_separator_spec.rb +0 -11
- data/spec/smarter_csv/convert_values_to_numeric_spec.rb +0 -48
- data/spec/smarter_csv/extenstions_spec.rb +0 -17
- data/spec/smarter_csv/header_transformation_spec.rb +0 -21
- data/spec/smarter_csv/keep_headers_spec.rb +0 -24
- data/spec/smarter_csv/key_mapping_spec.rb +0 -25
- data/spec/smarter_csv/line_ending_spec.rb +0 -43
- data/spec/smarter_csv/load_basic_spec.rb +0 -20
- data/spec/smarter_csv/malformed_spec.rb +0 -21
- data/spec/smarter_csv/no_header_spec.rb +0 -24
- data/spec/smarter_csv/not_downcase_header_spec.rb +0 -24
- data/spec/smarter_csv/quoted_spec.rb +0 -23
- data/spec/smarter_csv/remove_empty_values_spec.rb +0 -13
- data/spec/smarter_csv/remove_keys_from_hashes_spec.rb +0 -25
- data/spec/smarter_csv/remove_not_mapped_keys_spec.rb +0 -35
- data/spec/smarter_csv/remove_values_matching_spec.rb +0 -26
- data/spec/smarter_csv/remove_zero_values_spec.rb +0 -25
- data/spec/smarter_csv/skip_lines_spec.rb +0 -29
- data/spec/smarter_csv/strings_as_keys_spec.rb +0 -24
- data/spec/smarter_csv/strip_chars_from_headers_spec.rb +0 -24
- data/spec/smarter_csv/valid_unicode_spec.rb +0 -94
- data/spec/smarter_csv/value_converters_spec.rb +0 -52
- data/spec/spec/spec_helper.rb +0 -17
- data/spec/spec.opts +0 -2
- data/spec/spec_helper.rb +0 -21
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  # Backwards-compatibility shim: returns the frozen default options hash.
  #
  # NOTE: this is not called when "parse" methods are tested by themselves.
  def self.default_options
    Options::DEFAULT_OPTIONS
  end

  # Mixin that merges user-supplied options over DEFAULT_OPTIONS, stores the
  # result in @options, and validates the separator / quote settings.
  module Options
    DEFAULT_OPTIONS = {
      acceleration: true, # whether the user wants to use acceleration or not
      auto_row_sep_chars: 500,
      chunk_size: nil,
      col_sep: :auto, # was: ',',
      comment_regexp: nil, # was: /\A#/,
      convert_values_to_numeric: true,
      downcase_header: true,
      duplicate_header_suffix: '', # was: nil,
      file_encoding: 'utf-8',
      force_simple_split: false,
      force_utf8: false,
      headers_in_file: true,
      invalid_byte_sequence: '',
      keep_original_headers: false,
      key_mapping: nil,
      quote_char: '"',
      remove_empty_hashes: true,
      remove_empty_values: true,
      remove_unmapped_keys: false,
      remove_values_matching: nil,
      remove_zero_values: false,
      required_headers: nil,
      required_keys: nil,
      row_sep: :auto, # was: $/,
      silence_missing_keys: false,
      skip_lines: nil,
      strings_as_keys: false,
      strip_chars_from_headers: nil,
      strip_whitespace: true,
      user_provided_headers: nil,
      value_converters: nil,
      verbose: false,
      with_line_numbers: false,
    }.freeze

    # Builds the effective options: defaults overridden by `given_options`.
    # The result is stored in @options, validated, and returned.
    # Raises SmarterCSV::ValidationError when validation fails.
    #
    # NOTE: this is not called when "parse" methods are tested by themselves.
    def process_options(given_options = {})
      puts "User provided options:\n#{pp(given_options)}\n" if given_options[:verbose]

      merged = DEFAULT_OPTIONS.merge(given_options)
      # fix invalid input: a nil/false replacement string becomes the empty string
      merged[:invalid_byte_sequence] = '' unless merged[:invalid_byte_sequence]
      @options = merged

      puts "Computed options:\n#{pp(@options)}\n" if @options[:verbose]

      validate_options!(@options)
      @options
    end

    private

    # Migrates the deprecated :required_headers option to :required_keys and
    # raises SmarterCSV::ValidationError listing every invalid separator/quote option.
    def validate_options!(options)
      deprecated_value = options[:required_headers]
      unless deprecated_value.nil?
        puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
        # only migrate when the caller did not also supply the new option
        if options[:required_keys].nil?
          options[:required_keys] = deprecated_value
          options[:required_headers] = nil
        end
      end

      invalid = %i[row_sep col_sep quote_char].select do |name|
        options.key?(name) && !option_valid?(options[name])
      end
      return if invalid.empty?

      raise SmarterCSV::ValidationError, invalid.map { |name| "invalid #{name}" }.inspect
    end

    # An option value is acceptable when it is the symbol :auto or a non-empty String.
    def option_valid?(str)
      case str
      when Symbol then str == :auto
      when String then !str.empty?
      else false
      end
    end

    # Pretty-prints through AwesomePrint when it is loaded, plain #inspect otherwise.
    def pp(value)
      return value.inspect unless defined?(AwesomePrint)

      value.awesome_inspect(index: nil)
    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  # Splits one CSV line into its fields, either through the optional
  # C-extension or through the pure-Ruby implementation below.
  #
  # Every parsing method returns a two-element array: [fields, fields.size].
  module Parser
    protected

    ###
    ### Thin wrapper around C-extension
    ###
    # Uses the C-extension when options[:acceleration] is set and the including
    # object reports `has_acceleration`; otherwise falls back to Ruby.
    #
    # line        - String holding one CSV line (may be nil)
    # options     - Hash providing :acceleration, :col_sep and :quote_char
    # header_size - optional Integer; at most this many fields are returned
    def parse(line, options, header_size = nil)
      # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]

      if options[:acceleration] && has_acceleration
        # :nocov:
        quote = options[:quote_char]
        # Plain substring test: interpolating the quote char into a Regexp breaks
        # for regexp meta-characters such as '*', '.', '|' or '('.
        has_quotes = !line.nil? && line.include?(quote.to_s)
        elements = parse_csv_line_c(line, options[:col_sep], quote, header_size)
        elements.map! { |x| cleanup_quotes(x, quote) } if has_quotes
        [elements, elements.size]
        # :nocov:
      else
        # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
        parse_csv_line_ruby(line, options, header_size)
      end
    end

    # ------------------------------------------------------------------
    # Ruby equivalent of the C-extension for parse_line
    #
    # parses a single line: either a CSV header or a body line
    # - quoting rules compared to RFC-4180 are somewhat relaxed
    # - we are not assuming that quotes inside a field need to be doubled
    # - we are not assuming that all fields need to be quoted (0 is even)
    # - works with multi-char col_sep
    # - if header_size is given, only up to header_size fields are parsed
    #
    # We use header_size for parsing the body lines to make sure we always match
    # the number of headers in case there are trailing col_sep characters in line.
    # This also covers the corner case where a CSV line contains more fields than
    # the header: the remaining fields in the line are ignored.
    #
    # Our convention is that empty fields are returned as empty strings, not as nil.
    #
    # Returns [fields, fields.size]; a nil line yields [[], 0].
    def parse_csv_line_ruby(line, options, header_size = nil)
      # keep the [fields, size] shape so callers can always destructure the result
      return [[], 0] if line.nil?

      line_size = line.size
      col_sep = options[:col_sep]
      col_sep_size = col_sep.size
      quote = options[:quote_char]
      quote_count = 0
      elements = []
      start = 0
      i = 0

      previous_char = ''
      while i < line_size
        # a separator only splits fields when we are outside of a quoted section
        if line[i...i + col_sep_size] == col_sep && quote_count.even?
          break if !header_size.nil? && elements.size >= header_size

          elements << cleanup_quotes(line[start...i], quote)
          previous_char = line[i]
          i += col_sep_size
          start = i
        else
          # a quote preceded by a backslash is treated as escaped and not counted
          quote_count += 1 if line[i] == quote && previous_char != '\\'
          previous_char = line[i]
          i += 1
        end
      end
      # the remainder after the last separator is the final field
      elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
      [elements, elements.size]
    end

    # Removes one pair of enclosing quote characters (when the field both starts
    # and ends with `quote`) and collapses doubled quotes into single ones.
    # Mutates and returns `field`; nil is returned unchanged.
    def cleanup_quotes(field, quote)
      return field if field.nil?

      # return if field !~ /#{quote}/ # this check can probably eliminated

      if field.start_with?(quote) && field.end_with?(quote)
        field.delete_prefix!(quote)
        field.delete_suffix!(quote)
      end
      field.gsub!("#{quote}#{quote}", quote)
      field
    end
  end
end
|
@@ -0,0 +1,243 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  # Reads CSV input line by line and turns every data line into a Hash keyed
  # by the processed headers.
  #
  # Usage:
  #   reader = SmarterCSV::Reader.new(input, options)
  #   reader.process # with or without block
  #
  # Without a block, #process returns the collected result; with a block it
  # yields the hashes (in arrays) and returns the number of chunks processed.
  class Reader
    include ::SmarterCSV::Options
    include ::SmarterCSV::FileIO
    include ::SmarterCSV::AutoDetection
    include ::SmarterCSV::Headers
    include ::SmarterCSV::HeaderTransformations
    include ::SmarterCSV::HeaderValidations
    include ::SmarterCSV::HashTransformations
    include ::SmarterCSV::Parser

    attr_reader :input, :options
    attr_reader :csv_line_count, :chunk_count, :file_line_count
    attr_reader :enforce_utf8, :has_rails, :has_acceleration
    attr_reader :errors, :warnings, :headers, :raw_header, :result

    # :nocov:
    # rubocop:disable Naming/MethodName
    # Deprecated alias for #headers; emits a warning on every call.
    def headerA
      warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
      @headerA
    end
    # rubocop:enable Naming/MethodName
    # :nocov:

    # first parameter: filename or input object which responds to readline method
    # second parameter: Hash of options, merged over the defaults by process_options
    def initialize(input, given_options = {})
      @input = input
      @has_rails = !!defined?(Rails)
      @csv_line_count = 0
      @chunk_count = 0
      @errors = {}
      @file_line_count = 0
      @headerA = []
      @headers = nil
      @raw_header = nil # header as it appears in the file
      @result = []
      @warnings = {}
      @enforce_utf8 = false # only set to true if needed (after options parsing)
      @options = process_options(given_options)
      # true if it is compiled with acceleration
      @has_acceleration = !!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)
    end

    # Processes the whole input.
    #
    # With a block: yields an Array of Hashes per chunk (or a one-element Array
    # per line when :chunk_size is not set) and returns the chunk count.
    # Without a block: returns an Array of Hashes, or an Array of Arrays of
    # Hashes when :chunk_size is set.
    #
    # The input is closed when processing ends, including when an error is raised.
    def process(&block) # rubocop:disable Lint/UnusedMethodArgument
      @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
      @verbose = options[:verbose]

      begin
        fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

        if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
          puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
        end

        # auto-detect the row separator
        options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
        # attempt to auto-detect column separator
        options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

        skip_lines(fh, options)

        @headers, header_size = process_headers(fh, options)
        @headerA = @headers # @headerA is deprecated, use @headers

        puts "Effective headers:\n#{pp(@headers)}\n" if @verbose

        header_validations(@headers, options)

        # in case we use chunking.. we'll need to set it up..
        if options[:chunk_size].to_i > 0
          use_chunks = true
          chunk_size = options[:chunk_size].to_i
          @chunk_count = 0
          chunk = []
        else
          use_chunks = false
        end

        # now on to processing all the rest of the lines in the CSV file:
        # fh.each_line |line|
        until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
          line = readline_with_counts(fh, options)

          # replace invalid byte sequence in UTF-8 with question mark to avoid errors
          line = enforce_utf8_encoding(line, options) if @enforce_utf8

          print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose

          next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

          # cater for the quoted csv data containing the row separator carriage return character
          # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
          # by detecting the existence of an uneven number of quote characters
          multiline = count_quote_chars(line, options[:quote_char]).odd?

          while multiline
            # Unbalanced quotes on the very last line: nothing is left to append,
            # and calling readline here would raise EOFError.
            break if fh.eof?

            next_line = fh.readline(options[:row_sep])
            next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
            line += next_line
            @file_line_count += 1

            break if fh.eof? # Exit loop if end of file is reached

            multiline = count_quote_chars(line, options[:quote_char]).odd?
          end

          # :nocov:
          if multiline && @verbose
            print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
          end
          # :nocov:

          line.chomp!(options[:row_sep])

          # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
          dataA, _data_size = parse(line, options, header_size)

          dataA.map!{|x| x.strip} if options[:strip_whitespace]

          # if all values are blank, then ignore this line
          next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

          # --- HASH TRANSFORMATIONS ------------------------------------------------------------
          hash = @headers.zip(dataA).to_h

          hash = hash_transformations(hash, options)

          # --- HASH VALIDATIONS ----------------------------------------------------------------
          # will go here, and be able to:
          #  - validate correct format of the values for fields
          #  - required fields to be non-empty
          #  - ...
          # -------------------------------------------------------------------------------------

          next if options[:remove_empty_hashes] && hash.empty?

          # NOTE(review): compares against the String '2', not the Integer 2 - confirm intended
          puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
          # optional adding of csv_line_number to the hash to help debugging
          hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

          # process the chunks or the resulting hash
          if use_chunks
            chunk << hash # append temp result to chunk

            if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
              # do something with the chunk
              if block_given?
                yield chunk # do something with the hashes in the chunk in the block
              else
                @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
              end
              @chunk_count += 1
              chunk.clear # re-initialize for next chunk of data
            end
            # the last chunk may contain partial data, which is handled below;
            # while a chunk is being filled up we don't need to do anything else here

          else # no chunk handling
            if block_given?
              yield [hash] # do something with the hash in the block (better to use chunking here)
            else
              @result << hash
            end
          end
        end

        # print new line to retain last processing line message
        print "\n" if @verbose

        # handling of last chunk:
        if !chunk.nil? && chunk.size > 0
          # do something with the chunk
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
          end
          @chunk_count += 1
          # chunk = [] # initialize for next chunk of data
        end
      ensure
        fh.close if fh.respond_to?(:close)
      end

      if block_given?
        @chunk_count # when we do processing through a block we only care how many chunks we processed
      else
        @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
      end
    end

    # Counts the occurrences of quote_char in line, skipping any character
    # that directly follows an unescaped backslash.
    # Returns 0 when line or quote_char is nil, or quote_char is empty.
    def count_quote_chars(line, quote_char)
      return 0 if line.nil? || quote_char.nil? || quote_char.empty?

      count = 0
      escaped = false

      line.each_char do |char|
        if char == '\\' && !escaped
          escaped = true
        else
          count += 1 if char == quote_char && !escaped
          escaped = false
        end
      end

      count
    end

    protected

    # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
    # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
    BLANK_RE = /\A\s*\z/.freeze

    # True for nil, whitespace-only Strings, and Arrays / Hashes whose
    # elements / values are all blank; false for anything else.
    def blank?(value)
      case value
      when String
        BLANK_RE.match?(value)
      when NilClass
        true
      when Array
        value.all? { |elem| blank?(elem) }
      when Hash
        value.values.all? { |elem| blank?(elem) } # Focus on values only
      else
        false
      end
    end

    private

    # Tags the line as UTF-8 and replaces invalid or undefined byte sequences
    # with options[:invalid_byte_sequence].
    def enforce_utf8_encoding(line, options)
      # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

      line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
    end
  end
end
|
data/lib/smarter_csv/version.rb
CHANGED
@@ -0,0 +1,116 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'tempfile'

module SmarterCSV
  #
  # Generate CSV files
  #
  # Create an instance of the Writer class with the filename and options.
  # call `<<` one or multiple times to append data to the file.
  # call `finalize` to save the file.
  #
  # The `<<` method can take different arguments:
  #  * a single Hash
  #  * an array of Hashes
  #  * nested arrays of arrays of Hashes
  #
  # By default SmarterCSV::Writer automatically discovers all headers that are present
  # in the data on-the-fly. This can be disabled, then only given headers are used.
  # Disabling can be useful when you want to select attributes from hashes, or ActiveRecord instances.
  #
  # If `discover_headers` is enabled, and headers are given, any new headers that are found in the data will still be appended.
  #
  # The Writer automatically quotes fields containing the col_sep, row_sep, or the quote_char.
  # Quote characters inside a quoted field are doubled.
  #
  # Options:
  #   col_sep : defaults to , but can be set to any other character
  #   row_sep : defaults to $/ (the record separator, normally LF \n), but can be set to \r\n or \r or anything else
  #   quote_char : defaults to "
  #   discover_headers : defaults to true, or to false when :headers or :map_headers is given
  #   headers : defaults to []
  #   force_quotes: defaults to false
  #   map_headers: defaults to {}, can be a hash of key -> value mappings
  #
  # IMPORTANT NOTES:
  #  * Data hashes could contain strings or symbols as keys.
  #    Make sure to use the correct form when specifying headers manually,
  #    in combination with the :discover_headers option
  #  * Header names are written as-is; they are not quoted or escaped.
  #
  class Writer
    attr_reader :options, :row_sep, :col_sep, :quote_char, :force_quotes, :discover_headers, :headers, :map_headers, :output_file

    # file_path - path of the CSV file to create (opened with mode 'w+')
    # options   - Hash of the options described on the class
    def initialize(file_path, options = {})
      @options = options

      @row_sep = options[:row_sep] || $/ # Defaults to the record separator. RFC4180 specifies "\r\n"
      @col_sep = options[:col_sep] || ','
      @quote_char = options[:quote_char] || '"'
      @force_quotes = options[:force_quotes] == true
      @discover_headers =
        if options.key?(:discover_headers)
          # passing in the option overrides the default behavior
          options[:discover_headers] == true
        else
          # disable discover_headers when headers are given explicitly
          !(options.key?(:map_headers) || options.key?(:headers))
        end
      @headers = [] # start with empty headers, unless explicitly given
      # copy the given headers: header discovery appends to @headers and must not
      # modify the array the caller passed in
      @headers = (options[:headers] || []).dup if options.key?(:headers)
      @headers = options[:map_headers].keys if options.key?(:map_headers) && !options.key?(:headers)
      @map_headers = options[:map_headers] || {}

      @output_file = File.open(file_path, 'w+')
      # hidden state:
      # rows are buffered in a temp file because the header line is only known at the end
      @temp_file = Tempfile.new('tempfile')
      @quote_regex = Regexp.union(@col_sep, @row_sep, @quote_char)
    end

    # this can be called many times in order to append lines to the csv file
    #
    # data - a Hash, an (arbitrarily nested) Array of Hashes, or nil (ignored)
    # Raises InvalidInputData for any other type.
    def <<(data)
      case data
      when Hash
        process_hash(data)
      when Array
        data.each { |item| self << item }
      when NilClass
        # ignore
      else
        raise InvalidInputData, "Invalid data type: #{data.class}. Must be a Hash or an Array."
      end
    end

    # Writes the header line followed by all buffered rows to the output file,
    # then closes the output file and removes the temp file.
    def finalize
      # Map headers if :map_headers option is provided
      mapped_headers = @headers.map { |header| @map_headers[header] || header }

      @temp_file.rewind
      @output_file.write(mapped_headers.join(@col_sep) + @row_sep)
      @output_file.write(@temp_file.read)
      @output_file.flush
    ensure
      # release both handles even when writing failed
      @output_file.close unless @output_file.closed?
      @temp_file.close! # closes and unlinks the temp file
    end

    private

    # Appends one row for the given hash to the temp file, discovering new
    # headers first when header discovery is enabled.
    def process_hash(hash)
      @headers.concat(hash.keys - @headers) if @discover_headers

      # Reorder the hash to match the current headers order and fill missing fields;
      # only nil becomes an empty field, so that `false` is written as "false"
      ordered_row = @headers.map do |header|
        value = hash[header]
        value.nil? ? '' : value
      end

      @temp_file.write(ordered_row.map { |value| escape_csv_field(value) }.join(@col_sep) + @row_sep)
    end

    # Returns the field as a String, wrapped in the quote character when quoting
    # is forced or the field contains col_sep, row_sep or the quote character.
    # Quote characters inside a quoted field are doubled.
    def escape_csv_field(field)
      str = field.to_s
      return str unless @force_quotes || str.match?(@quote_regex)

      # block form keeps the replacement literal (no backslash interpretation)
      doubled = str.gsub(@quote_char) { @quote_char * 2 }
      "#{@quote_char}#{doubled}#{@quote_char}"
    end
  end
end
|
data/lib/smarter_csv.rb
CHANGED
@@ -1,4 +1,92 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require "smarter_csv/version"
|
3
|
-
require "
|
4
|
-
|
4
|
+
require "smarter_csv/errors"
|
5
|
+
|
6
|
+
require "smarter_csv/file_io"
|
7
|
+
require "smarter_csv/options"
|
8
|
+
require "smarter_csv/auto_detection"
|
9
|
+
require 'smarter_csv/header_transformations'
|
10
|
+
require 'smarter_csv/header_validations'
|
11
|
+
require "smarter_csv/headers"
|
12
|
+
require "smarter_csv/hash_transformations"
|
13
|
+
|
14
|
+
require "smarter_csv/parser"
|
15
|
+
require "smarter_csv/writer"
|
16
|
+
require "smarter_csv/reader"
|
17
|
+
|
18
|
+
# load the C-extension:
|
19
|
+
# Try to load the optional C-extension. Failures are deliberately ignored here:
# without the extension the pure-Ruby parser is used instead.
case RUBY_ENGINE
when 'ruby'
  begin
    # RUBY_PLATFORM is used instead of shelling out to `uname -s`: it needs no
    # subprocess at require time and also works where `uname` is unavailable.
    if RUBY_PLATFORM.include?('darwin')
      #
      # Please report if you see cases where the rake-compiler is building x86_64 code on arm64 cpus:
      # https://github.com/rake-compiler/rake-compiler/issues/231
      #
      require 'smarter_csv/smarter_csv.bundle'
    else
      # :nocov:
      require_relative "smarter_csv/smarter_csv"
      # :nocov:
    end
  rescue StandardError, LoadError
    # Best effort only. Signals, SystemExit and NoMemoryError are no longer swallowed.
    # require_relative 'smarter_csv/smarter_csv'
  end
  # :nocov:
  # when 'truffleruby'
  #   puts "\n\n truffleruby case in the load path | RUBY_ENGINE: #{RUBY_ENGINE} , #{RUBY_VERSION}\n\n"
  #   # this might not work - if you encounter problems, please contribute and create a PR
  #   # require 'truffleruby/smarter_csv'
else
  puts <<~BLOCK_COMMENT

    -------------------------------------------------------------------------
    RUBY_ENGINE: #{RUBY_ENGINE} , #{RUBY_VERSION}

    Acceleration via C-Extension is currently not supported for #{RUBY_ENGINE}

    Please contribute and create a pull request if you need this
    -------------------------------------------------------------------------

  BLOCK_COMMENT
end
|
54
|
+
# :nocov:
|
55
|
+
|
56
|
+
module SmarterCSV
  # For backwards compatibility:
  #
  # while `SmarterCSV.process` works for simple cases, you can't get access to the internal state any longer.
  # e.g. you need the instance of the Reader to access the original headers
  #
  # Please use this instead:
  #
  #  reader = SmarterCSV::Reader.new(input, options)
  #  reader.process # with or without block
  #
  # Returns whatever Reader#process returns.
  def self.process(input, given_options = {}, &block)
    reader = Reader.new(input, given_options)
    reader.process(&block)
  end

  # Convenience method for generating CSV files:
  #
  #   SmarterCSV.generate(filename, options) do |csv_writer|
  #     MyModel.find_in_batches(batch_size: 100) do |batch|
  #       batch.pluck(:name, :description, :instructor).each do |record|
  #         csv_writer << record
  #       end
  #     end
  #   end
  #
  # Yields a Writer for `filename` and finalizes it afterwards, also when the
  # block raises. Returns the value of the block.
  # Raises ArgumentError when no block is given.
  #
  # rubocop:disable Lint/UnusedMethodArgument
  def self.generate(filename, options = {}, &block)
    raise ArgumentError, 'SmarterCSV.generate requires a block' unless block_given?

    writer = Writer.new(filename, options)
    yield writer
  ensure
    # writer is still nil when no block was given or Writer.new raised;
    # calling finalize on nil would mask the original error
    writer&.finalize
  end
  # rubocop:enable Lint/UnusedMethodArgument
end
|