smarter_csv 1.11.2 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -2
- data/CHANGELOG.md +29 -1
- data/README.md +31 -396
- data/docs/_introduction.md +56 -0
- data/docs/basic_api.md +157 -0
- data/docs/batch_processing.md +68 -0
- data/docs/data_transformations.md +50 -0
- data/docs/examples.md +75 -0
- data/docs/header_transformations.md +113 -0
- data/docs/header_validations.md +36 -0
- data/docs/options.md +98 -0
- data/docs/row_col_sep.md +104 -0
- data/docs/value_converters.md +68 -0
- data/ext/smarter_csv/smarter_csv.c +4 -2
- data/lib/smarter_csv/auto_detection.rb +1 -1
- data/lib/smarter_csv/file_io.rb +1 -1
- data/lib/smarter_csv/hash_transformations.rb +1 -1
- data/lib/smarter_csv/header_transformations.rb +1 -1
- data/lib/smarter_csv/header_validations.rb +2 -2
- data/lib/smarter_csv/headers.rb +1 -1
- data/lib/smarter_csv/{options_processing.rb → options.rb} +44 -43
- data/lib/smarter_csv/{parse.rb → parser.rb} +2 -2
- data/lib/smarter_csv/reader.rb +243 -0
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv/writer.rb +2 -1
- data/lib/smarter_csv.rb +20 -4
- data/smarter_csv.gemspec +2 -2
- metadata +21 -11
- data/lib/smarter_csv/smarter_csv.rb +0 -210
- data/lib/smarter_csv/variables.rb +0 -30
data/lib/smarter_csv.rb
CHANGED
@@ -4,17 +4,16 @@ require "smarter_csv/version"
|
|
4
4
|
require "smarter_csv/errors"
|
5
5
|
|
6
6
|
require "smarter_csv/file_io"
|
7
|
-
require "smarter_csv/options_processing"
|
7
|
+
require "smarter_csv/options"
|
8
8
|
require "smarter_csv/auto_detection"
|
9
|
-
require "smarter_csv/variables"
|
10
9
|
require 'smarter_csv/header_transformations'
|
11
10
|
require 'smarter_csv/header_validations'
|
12
11
|
require "smarter_csv/headers"
|
13
12
|
require "smarter_csv/hash_transformations"
|
14
13
|
|
15
|
-
require "smarter_csv/parse"
|
14
|
+
require "smarter_csv/parser"
|
16
15
|
require "smarter_csv/writer"
|
17
|
-
require "smarter_csv/smarter_csv"
|
16
|
+
require "smarter_csv/reader"
|
18
17
|
|
19
18
|
# load the C-extension:
|
20
19
|
case RUBY_ENGINE
|
@@ -55,6 +54,23 @@ end
|
|
55
54
|
# :nocov:
|
56
55
|
|
57
56
|
module SmarterCSV
|
57
|
+
# For backwards compatibility:
|
58
|
+
#
|
59
|
+
# while `SmarterCSV.process` works for simple cases, you can't get access to the internal state any longer.
|
60
|
+
# e.g. you need the instance of the Reader to access the original headers
|
61
|
+
#
|
62
|
+
# Please use this instead:
|
63
|
+
#
|
64
|
+
# reader = SmarterCSV::Reader.new(input, options)
|
65
|
+
# reader.process # with or without block
|
66
|
+
#
|
67
|
+
def self.process(input, given_options = {}, &block)
|
68
|
+
reader = Reader.new(input, given_options)
|
69
|
+
reader.process(&block)
|
70
|
+
end
|
71
|
+
|
72
|
+
# Convenience method for generating CSV files:
|
73
|
+
#
|
58
74
|
# SmarterCSV.generate(filename, options) do |csv_writer|
|
59
75
|
# MyModel.find_in_batches(batch_size: 100) do |batch|
|
60
76
|
# batch.pluck(:name, :description, :instructor).each do |record|
|
data/smarter_csv.gemspec
CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Tilo Sloboda"]
|
10
10
|
spec.email = ["tilo.sloboda@gmail.com"]
|
11
11
|
|
12
|
-
spec.summary = "CSV Reading and Writing"
|
13
|
-
spec.description = "Ruby Gem for convenient reading and writing
|
12
|
+
spec.summary = "Convenient CSV Reading and Writing"
|
13
|
+
spec.description = "Ruby Gem for convenient reading and writing of CSV files. It has intelligent defaults, and auto-discovery of column and row separators. It imports CSV Files as Array(s) of Hashes, suitable for direct processing with ActiveRecord, kicking-off batch jobs with Sidekiq, parallel processing, or uploading data to S3. Similarly, writing CSV files takes Hashes, or Arrays of Hashes to create a CSV file."
|
14
14
|
spec.homepage = "https://github.com/tilo/smarter_csv"
|
15
15
|
spec.license = 'MIT'
|
16
16
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: smarter_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.11.2
|
4
|
+
version: 1.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tilo Sloboda
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-07-
|
11
|
+
date: 2024-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: awesome_print
|
@@ -94,10 +94,11 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
-
description:
|
98
|
-
|
99
|
-
|
100
|
-
to
|
97
|
+
description: Ruby Gem for convenient reading and writing of CSV files. It has intelligent
|
98
|
+
defaults, and auto-discovery of column and row separators. It imports CSV Files
|
99
|
+
as Array(s) of Hashes, suitable for direct processing with ActiveRecord, kicking-off
|
100
|
+
batch jobs with Sidekiq, parallel processing, or uploading data to S3. Similarly,
|
101
|
+
writing CSV files takes Hashes, or Arrays of Hashes to create a CSV file.
|
101
102
|
email:
|
102
103
|
- tilo.sloboda@gmail.com
|
103
104
|
executables: []
|
@@ -115,6 +116,16 @@ files:
|
|
115
116
|
- README.md
|
116
117
|
- Rakefile
|
117
118
|
- TO_DO_v2.md
|
119
|
+
- docs/_introduction.md
|
120
|
+
- docs/basic_api.md
|
121
|
+
- docs/batch_processing.md
|
122
|
+
- docs/data_transformations.md
|
123
|
+
- docs/examples.md
|
124
|
+
- docs/header_transformations.md
|
125
|
+
- docs/header_validations.md
|
126
|
+
- docs/options.md
|
127
|
+
- docs/row_col_sep.md
|
128
|
+
- docs/value_converters.md
|
118
129
|
- ext/smarter_csv/extconf.rb
|
119
130
|
- ext/smarter_csv/smarter_csv.c
|
120
131
|
- lib/smarter_csv.rb
|
@@ -125,10 +136,9 @@ files:
|
|
125
136
|
- lib/smarter_csv/header_transformations.rb
|
126
137
|
- lib/smarter_csv/header_validations.rb
|
127
138
|
- lib/smarter_csv/headers.rb
|
128
|
-
- lib/smarter_csv/options_processing.rb
|
129
|
-
- lib/smarter_csv/parse.rb
|
130
|
-
- lib/smarter_csv/smarter_csv.rb
|
131
|
-
- lib/smarter_csv/variables.rb
|
139
|
+
- lib/smarter_csv/options.rb
|
140
|
+
- lib/smarter_csv/parser.rb
|
141
|
+
- lib/smarter_csv/reader.rb
|
132
142
|
- lib/smarter_csv/version.rb
|
133
143
|
- lib/smarter_csv/writer.rb
|
134
144
|
- smarter_csv.gemspec
|
@@ -158,5 +168,5 @@ requirements: []
|
|
158
168
|
rubygems_version: 3.2.3
|
159
169
|
signing_key:
|
160
170
|
specification_version: 4
|
161
|
-
summary: CSV Reading and Writing
|
171
|
+
summary: Convenient CSV Reading and Writing
|
162
172
|
test_files: []
|
@@ -1,210 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module SmarterCSV
|
4
|
-
# first parameter: filename or input object which responds to readline method
|
5
|
-
def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
|
6
|
-
initialize_variables
|
7
|
-
|
8
|
-
options = process_options(given_options)
|
9
|
-
|
10
|
-
@enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
11
|
-
@verbose = options[:verbose]
|
12
|
-
|
13
|
-
begin
|
14
|
-
fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
|
15
|
-
|
16
|
-
if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
|
17
|
-
puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
|
18
|
-
end
|
19
|
-
|
20
|
-
# auto-detect the row separator
|
21
|
-
options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
|
22
|
-
# attempt to auto-detect column separator
|
23
|
-
options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
|
24
|
-
|
25
|
-
skip_lines(fh, options)
|
26
|
-
|
27
|
-
@headers, header_size = process_headers(fh, options)
|
28
|
-
@headerA = @headers # @headerA is deprecated, use @headers
|
29
|
-
|
30
|
-
puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
|
31
|
-
|
32
|
-
header_validations(@headers, options)
|
33
|
-
|
34
|
-
# in case we use chunking.. we'll need to set it up..
|
35
|
-
if options[:chunk_size].to_i > 0
|
36
|
-
use_chunks = true
|
37
|
-
chunk_size = options[:chunk_size].to_i
|
38
|
-
@chunk_count = 0
|
39
|
-
chunk = []
|
40
|
-
else
|
41
|
-
use_chunks = false
|
42
|
-
end
|
43
|
-
|
44
|
-
# now on to processing all the rest of the lines in the CSV file:
|
45
|
-
# fh.each_line |line|
|
46
|
-
until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
|
47
|
-
line = readline_with_counts(fh, options)
|
48
|
-
|
49
|
-
# replace invalid byte sequence in UTF-8 with question mark to avoid errors
|
50
|
-
line = enforce_utf8_encoding(line, options) if @enforce_utf8
|
51
|
-
|
52
|
-
print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
|
53
|
-
|
54
|
-
next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
|
55
|
-
|
56
|
-
# cater for the quoted csv data containing the row separator carriage return character
|
57
|
-
# in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
|
58
|
-
# by detecting the existence of an uneven number of quote characters
|
59
|
-
multiline = count_quote_chars(line, options[:quote_char]).odd?
|
60
|
-
|
61
|
-
while multiline
|
62
|
-
next_line = fh.readline(options[:row_sep])
|
63
|
-
next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
|
64
|
-
line += next_line
|
65
|
-
@file_line_count += 1
|
66
|
-
|
67
|
-
break if fh.eof? # Exit loop if end of file is reached
|
68
|
-
|
69
|
-
multiline = count_quote_chars(line, options[:quote_char]).odd?
|
70
|
-
end
|
71
|
-
|
72
|
-
# :nocov:
|
73
|
-
if multiline && @verbose
|
74
|
-
print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
|
75
|
-
end
|
76
|
-
# :nocov:
|
77
|
-
|
78
|
-
line.chomp!(options[:row_sep])
|
79
|
-
|
80
|
-
# --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
|
81
|
-
dataA, _data_size = parse(line, options, header_size)
|
82
|
-
|
83
|
-
dataA.map!{|x| x.strip} if options[:strip_whitespace]
|
84
|
-
|
85
|
-
# if all values are blank, then ignore this line
|
86
|
-
next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
|
87
|
-
|
88
|
-
# --- HASH TRANSFORMATIONS ------------------------------------------------------------
|
89
|
-
hash = @headers.zip(dataA).to_h
|
90
|
-
|
91
|
-
hash = hash_transformations(hash, options)
|
92
|
-
|
93
|
-
# --- HASH VALIDATIONS ----------------------------------------------------------------
|
94
|
-
# will go here, and be able to:
|
95
|
-
# - validate correct format of the values for fields
|
96
|
-
# - required fields to be non-empty
|
97
|
-
# - ...
|
98
|
-
# -------------------------------------------------------------------------------------
|
99
|
-
|
100
|
-
next if options[:remove_empty_hashes] && hash.empty?
|
101
|
-
|
102
|
-
puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
|
103
|
-
# optional adding of csv_line_number to the hash to help debugging
|
104
|
-
hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
|
105
|
-
|
106
|
-
# process the chunks or the resulting hash
|
107
|
-
if use_chunks
|
108
|
-
chunk << hash # append temp result to chunk
|
109
|
-
|
110
|
-
if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
|
111
|
-
# do something with the chunk
|
112
|
-
if block_given?
|
113
|
-
yield chunk # do something with the hashes in the chunk in the block
|
114
|
-
else
|
115
|
-
@result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
|
116
|
-
end
|
117
|
-
@chunk_count += 1
|
118
|
-
chunk.clear # re-initialize for next chunk of data
|
119
|
-
else
|
120
|
-
# the last chunk may contain partial data, which is handled below
|
121
|
-
end
|
122
|
-
# while a chunk is being filled up we don't need to do anything else here
|
123
|
-
|
124
|
-
else # no chunk handling
|
125
|
-
if block_given?
|
126
|
-
yield [hash] # do something with the hash in the block (better to use chunking here)
|
127
|
-
else
|
128
|
-
@result << hash
|
129
|
-
end
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
# print new line to retain last processing line message
|
134
|
-
print "\n" if @verbose
|
135
|
-
|
136
|
-
# handling of last chunk:
|
137
|
-
if !chunk.nil? && chunk.size > 0
|
138
|
-
# do something with the chunk
|
139
|
-
if block_given?
|
140
|
-
yield chunk # do something with the hashes in the chunk in the block
|
141
|
-
else
|
142
|
-
@result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
|
143
|
-
end
|
144
|
-
@chunk_count += 1
|
145
|
-
# chunk = [] # initialize for next chunk of data
|
146
|
-
end
|
147
|
-
ensure
|
148
|
-
fh.close if fh.respond_to?(:close)
|
149
|
-
end
|
150
|
-
|
151
|
-
if block_given?
|
152
|
-
@chunk_count # when we do processing through a block we only care how many chunks we processed
|
153
|
-
else
|
154
|
-
@result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
|
155
|
-
end
|
156
|
-
end
|
157
|
-
|
158
|
-
class << self
|
159
|
-
def count_quote_chars(line, quote_char)
|
160
|
-
return 0 if line.nil? || quote_char.nil? || quote_char.empty?
|
161
|
-
|
162
|
-
count = 0
|
163
|
-
escaped = false
|
164
|
-
|
165
|
-
line.each_char do |char|
|
166
|
-
if char == '\\' && !escaped
|
167
|
-
escaped = true
|
168
|
-
else
|
169
|
-
count += 1 if char == quote_char && !escaped
|
170
|
-
escaped = false
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
count
|
175
|
-
end
|
176
|
-
|
177
|
-
def has_acceleration?
|
178
|
-
@has_acceleration ||= !!defined?(parse_csv_line_c)
|
179
|
-
end
|
180
|
-
|
181
|
-
protected
|
182
|
-
|
183
|
-
# SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
|
184
|
-
# and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
|
185
|
-
BLANK_RE = /\A\s*\z/.freeze
|
186
|
-
|
187
|
-
def blank?(value)
|
188
|
-
case value
|
189
|
-
when String
|
190
|
-
BLANK_RE.match?(value)
|
191
|
-
when NilClass
|
192
|
-
true
|
193
|
-
when Array
|
194
|
-
value.all? { |elem| blank?(elem) }
|
195
|
-
when Hash
|
196
|
-
value.values.all? { |elem| blank?(elem) } # Focus on values only
|
197
|
-
else
|
198
|
-
false
|
199
|
-
end
|
200
|
-
end
|
201
|
-
|
202
|
-
private
|
203
|
-
|
204
|
-
def enforce_utf8_encoding(line, options)
|
205
|
-
# return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
206
|
-
|
207
|
-
line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
|
208
|
-
end
|
209
|
-
end
|
210
|
-
end
|
@@ -1,30 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module SmarterCSV
|
4
|
-
class << self
|
5
|
-
attr_reader :has_rails, :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
|
6
|
-
|
7
|
-
def initialize_variables
|
8
|
-
@has_rails = !!defined?(Rails)
|
9
|
-
@csv_line_count = 0
|
10
|
-
@chunk_count = 0
|
11
|
-
@errors = {}
|
12
|
-
@file_line_count = 0
|
13
|
-
@headerA = []
|
14
|
-
@headers = nil
|
15
|
-
@raw_header = nil # header as it appears in the file
|
16
|
-
@result = []
|
17
|
-
@warnings = {}
|
18
|
-
@enforce_utf8 = false # only set to true if needed (after options parsing)
|
19
|
-
end
|
20
|
-
|
21
|
-
# :nocov:
|
22
|
-
# rubocop:disable Naming/MethodName
|
23
|
-
def headerA
|
24
|
-
warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
|
25
|
-
@headerA
|
26
|
-
end
|
27
|
-
# rubocop:enable Naming/MethodName
|
28
|
-
# :nocov:
|
29
|
-
end
|
30
|
-
end
|