smarter_csv 1.11.0 → 1.12.0.pre1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.0
4
+ version: 1.12.0.pre1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-07-02 00:00:00.000000000 Z
11
+ date: 2024-07-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -94,9 +94,10 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
- description: Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, with
98
- lots of features for processing large files in parallel, embedded comments, unusual
99
- field- and record-separators, flexible mapping of CSV-headers to Hash-keys
97
+ description: 'Ruby Gem for convenient reading and writing: importing of CSV Files
98
+ as Array(s) of Hashes, with lots of features for processing large files in parallel,
99
+ embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers
100
+ to Hash-keys'
100
101
  email:
101
102
  - tilo.sloboda@gmail.com
102
103
  executables: []
@@ -114,19 +115,30 @@ files:
114
115
  - README.md
115
116
  - Rakefile
116
117
  - TO_DO_v2.md
118
+ - docs/_introduction.md
119
+ - docs/basic_api.md
120
+ - docs/batch_processing.md
121
+ - docs/data_transformations.md
122
+ - docs/examples.md
123
+ - docs/header_transformations.md
124
+ - docs/header_validations.md
125
+ - docs/notes.md
126
+ - docs/options.md
127
+ - docs/row_col_sep.md
128
+ - docs/value_converters.md
117
129
  - ext/smarter_csv/extconf.rb
118
130
  - ext/smarter_csv/smarter_csv.c
119
131
  - lib/smarter_csv.rb
120
132
  - lib/smarter_csv/auto_detection.rb
133
+ - lib/smarter_csv/errors.rb
121
134
  - lib/smarter_csv/file_io.rb
122
135
  - lib/smarter_csv/hash_transformations.rb
123
136
  - lib/smarter_csv/header_transformations.rb
124
137
  - lib/smarter_csv/header_validations.rb
125
138
  - lib/smarter_csv/headers.rb
126
- - lib/smarter_csv/options_processing.rb
127
- - lib/smarter_csv/parse.rb
128
- - lib/smarter_csv/smarter_csv.rb
129
- - lib/smarter_csv/variables.rb
139
+ - lib/smarter_csv/options.rb
140
+ - lib/smarter_csv/parser.rb
141
+ - lib/smarter_csv/reader.rb
130
142
  - lib/smarter_csv/version.rb
131
143
  - lib/smarter_csv/writer.rb
132
144
  - smarter_csv.gemspec
@@ -149,12 +161,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
149
161
  version: 2.5.0
150
162
  required_rubygems_version: !ruby/object:Gem::Requirement
151
163
  requirements:
152
- - - ">="
164
+ - - ">"
153
165
  - !ruby/object:Gem::Version
154
- version: '0'
166
+ version: 1.3.1
155
167
  requirements: []
156
168
  rubygems_version: 3.2.3
157
169
  signing_key:
158
170
  specification_version: 4
159
- summary: CSV Reading and Writing
171
+ summary: Convenient CSV Reading and Writing
160
172
  test_files: []
@@ -1,219 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SmarterCSV
4
- class SmarterCSVException < StandardError; end
5
- class HeaderSizeMismatch < SmarterCSVException; end
6
- class IncorrectOption < SmarterCSVException; end
7
- class ValidationError < SmarterCSVException; end
8
- class DuplicateHeaders < SmarterCSVException; end
9
- class MissingKeys < SmarterCSVException; end # previously known as MissingHeaders
10
- class NoColSepDetected < SmarterCSVException; end
11
- class KeyMappingError < SmarterCSVException; end
12
-
13
- # first parameter: filename or input object which responds to readline method
14
- def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
15
- initialize_variables
16
-
17
- options = process_options(given_options)
18
-
19
- @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
20
- @verbose = options[:verbose]
21
-
22
- begin
23
- fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
24
-
25
- if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
26
- puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
27
- end
28
-
29
- # auto-detect the row separator
30
- options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
31
- # attempt to auto-detect column separator
32
- options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
33
-
34
- skip_lines(fh, options)
35
-
36
- @headers, header_size = process_headers(fh, options)
37
- @headerA = @headers # @headerA is deprecated, use @headers
38
-
39
- puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
40
-
41
- header_validations(@headers, options)
42
-
43
- # in case we use chunking.. we'll need to set it up..
44
- if options[:chunk_size].to_i > 0
45
- use_chunks = true
46
- chunk_size = options[:chunk_size].to_i
47
- @chunk_count = 0
48
- chunk = []
49
- else
50
- use_chunks = false
51
- end
52
-
53
- # now on to processing all the rest of the lines in the CSV file:
54
- # fh.each_line |line|
55
- until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
56
- line = readline_with_counts(fh, options)
57
-
58
- # replace invalid byte sequence in UTF-8 with question mark to avoid errors
59
- line = enforce_utf8_encoding(line, options) if @enforce_utf8
60
-
61
- print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
62
-
63
- next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
64
-
65
- # cater for the quoted csv data containing the row separator carriage return character
66
- # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
67
- # by detecting the existence of an uneven number of quote characters
68
- multiline = count_quote_chars(line, options[:quote_char]).odd?
69
-
70
- while multiline
71
- next_line = fh.readline(options[:row_sep])
72
- next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
73
- line += next_line
74
- @file_line_count += 1
75
-
76
- break if fh.eof? # Exit loop if end of file is reached
77
-
78
- multiline = count_quote_chars(line, options[:quote_char]).odd?
79
- end
80
-
81
- # :nocov:
82
- if multiline && @verbose
83
- print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
84
- end
85
- # :nocov:
86
-
87
- line.chomp!(options[:row_sep])
88
-
89
- # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
90
- dataA, _data_size = parse(line, options, header_size)
91
-
92
- dataA.map!{|x| x.strip} if options[:strip_whitespace]
93
-
94
- # if all values are blank, then ignore this line
95
- next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
96
-
97
- # --- HASH TRANSFORMATIONS ------------------------------------------------------------
98
- hash = @headers.zip(dataA).to_h
99
-
100
- hash = hash_transformations(hash, options)
101
-
102
- # --- HASH VALIDATIONS ----------------------------------------------------------------
103
- # will go here, and be able to:
104
- # - validate correct format of the values for fields
105
- # - required fields to be non-empty
106
- # - ...
107
- # -------------------------------------------------------------------------------------
108
-
109
- next if options[:remove_empty_hashes] && hash.empty?
110
-
111
- puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
112
- # optional adding of csv_line_number to the hash to help debugging
113
- hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
114
-
115
- # process the chunks or the resulting hash
116
- if use_chunks
117
- chunk << hash # append temp result to chunk
118
-
119
- if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
120
- # do something with the chunk
121
- if block_given?
122
- yield chunk # do something with the hashes in the chunk in the block
123
- else
124
- @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
125
- end
126
- @chunk_count += 1
127
- chunk.clear # re-initialize for next chunk of data
128
- else
129
- # the last chunk may contain partial data, which is handled below
130
- end
131
- # while a chunk is being filled up we don't need to do anything else here
132
-
133
- else # no chunk handling
134
- if block_given?
135
- yield [hash] # do something with the hash in the block (better to use chunking here)
136
- else
137
- @result << hash
138
- end
139
- end
140
- end
141
-
142
- # print new line to retain last processing line message
143
- print "\n" if @verbose
144
-
145
- # handling of last chunk:
146
- if !chunk.nil? && chunk.size > 0
147
- # do something with the chunk
148
- if block_given?
149
- yield chunk # do something with the hashes in the chunk in the block
150
- else
151
- @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
152
- end
153
- @chunk_count += 1
154
- # chunk = [] # initialize for next chunk of data
155
- end
156
- ensure
157
- fh.close if fh.respond_to?(:close)
158
- end
159
-
160
- if block_given?
161
- @chunk_count # when we do processing through a block we only care how many chunks we processed
162
- else
163
- @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
164
- end
165
- end
166
-
167
- class << self
168
- def count_quote_chars(line, quote_char)
169
- return 0 if line.nil? || quote_char.nil? || quote_char.empty?
170
-
171
- count = 0
172
- escaped = false
173
-
174
- line.each_char do |char|
175
- if char == '\\' && !escaped
176
- escaped = true
177
- else
178
- count += 1 if char == quote_char && !escaped
179
- escaped = false
180
- end
181
- end
182
-
183
- count
184
- end
185
-
186
- def has_acceleration?
187
- @has_acceleration ||= !!defined?(parse_csv_line_c)
188
- end
189
-
190
- protected
191
-
192
- # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
193
- # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
194
- BLANK_RE = /\A\s*\z/.freeze
195
-
196
- def blank?(value)
197
- case value
198
- when String
199
- BLANK_RE.match?(value)
200
- when NilClass
201
- true
202
- when Array
203
- value.all? { |elem| blank?(elem) }
204
- when Hash
205
- value.values.all? { |elem| blank?(elem) } # Focus on values only
206
- else
207
- false
208
- end
209
- end
210
-
211
- private
212
-
213
- def enforce_utf8_encoding(line, options)
214
- # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
215
-
216
- line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
217
- end
218
- end
219
- end
@@ -1,30 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SmarterCSV
4
- class << self
5
- attr_reader :has_rails, :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
6
-
7
- def initialize_variables
8
- @has_rails = !!defined?(Rails)
9
- @csv_line_count = 0
10
- @chunk_count = 0
11
- @errors = {}
12
- @file_line_count = 0
13
- @headerA = []
14
- @headers = nil
15
- @raw_header = nil # header as it appears in the file
16
- @result = []
17
- @warnings = {}
18
- @enforce_utf8 = false # only set to true if needed (after options parsing)
19
- end
20
-
21
- # :nocov:
22
- # rubocop:disable Naming/MethodName
23
- def headerA
24
- warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
25
- @headerA
26
- end
27
- # rubocop:enable Naming/MethodName
28
- # :nocov:
29
- end
30
- end