smarter_csv 1.11.0 → 1.12.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.0
4
+ version: 1.12.0.pre1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-07-02 00:00:00.000000000 Z
11
+ date: 2024-07-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -94,9 +94,10 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
- description: Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, with
98
- lots of features for processing large files in parallel, embedded comments, unusual
99
- field- and record-separators, flexible mapping of CSV-headers to Hash-keys
97
+ description: 'Ruby Gem for convenient reading and writing: importing of CSV Files
98
+ as Array(s) of Hashes, with lots of features for processing large files in parallel,
99
+ embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers
100
+ to Hash-keys'
100
101
  email:
101
102
  - tilo.sloboda@gmail.com
102
103
  executables: []
@@ -114,19 +115,30 @@ files:
114
115
  - README.md
115
116
  - Rakefile
116
117
  - TO_DO_v2.md
118
+ - docs/_introduction.md
119
+ - docs/basic_api.md
120
+ - docs/batch_processing.md
121
+ - docs/data_transformations.md
122
+ - docs/examples.md
123
+ - docs/header_transformations.md
124
+ - docs/header_validations.md
125
+ - docs/notes.md
126
+ - docs/options.md
127
+ - docs/row_col_sep.md
128
+ - docs/value_converters.md
117
129
  - ext/smarter_csv/extconf.rb
118
130
  - ext/smarter_csv/smarter_csv.c
119
131
  - lib/smarter_csv.rb
120
132
  - lib/smarter_csv/auto_detection.rb
133
+ - lib/smarter_csv/errors.rb
121
134
  - lib/smarter_csv/file_io.rb
122
135
  - lib/smarter_csv/hash_transformations.rb
123
136
  - lib/smarter_csv/header_transformations.rb
124
137
  - lib/smarter_csv/header_validations.rb
125
138
  - lib/smarter_csv/headers.rb
126
- - lib/smarter_csv/options_processing.rb
127
- - lib/smarter_csv/parse.rb
128
- - lib/smarter_csv/smarter_csv.rb
129
- - lib/smarter_csv/variables.rb
139
+ - lib/smarter_csv/options.rb
140
+ - lib/smarter_csv/parser.rb
141
+ - lib/smarter_csv/reader.rb
130
142
  - lib/smarter_csv/version.rb
131
143
  - lib/smarter_csv/writer.rb
132
144
  - smarter_csv.gemspec
@@ -149,12 +161,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
149
161
  version: 2.5.0
150
162
  required_rubygems_version: !ruby/object:Gem::Requirement
151
163
  requirements:
152
- - - ">="
164
+ - - ">"
153
165
  - !ruby/object:Gem::Version
154
- version: '0'
166
+ version: 1.3.1
155
167
  requirements: []
156
168
  rubygems_version: 3.2.3
157
169
  signing_key:
158
170
  specification_version: 4
159
- summary: CSV Reading and Writing
171
+ summary: Convenient CSV Reading and Writing
160
172
  test_files: []
@@ -1,219 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SmarterCSV
4
- class SmarterCSVException < StandardError; end
5
- class HeaderSizeMismatch < SmarterCSVException; end
6
- class IncorrectOption < SmarterCSVException; end
7
- class ValidationError < SmarterCSVException; end
8
- class DuplicateHeaders < SmarterCSVException; end
9
- class MissingKeys < SmarterCSVException; end # previously known as MissingHeaders
10
- class NoColSepDetected < SmarterCSVException; end
11
- class KeyMappingError < SmarterCSVException; end
12
-
13
- # first parameter: filename or input object which responds to readline method
14
- def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
15
- initialize_variables
16
-
17
- options = process_options(given_options)
18
-
19
- @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
20
- @verbose = options[:verbose]
21
-
22
- begin
23
- fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
24
-
25
- if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
26
- puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
27
- end
28
-
29
- # auto-detect the row separator
30
- options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
31
- # attempt to auto-detect column separator
32
- options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
33
-
34
- skip_lines(fh, options)
35
-
36
- @headers, header_size = process_headers(fh, options)
37
- @headerA = @headers # @headerA is deprecated, use @headers
38
-
39
- puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
40
-
41
- header_validations(@headers, options)
42
-
43
- # in case we use chunking.. we'll need to set it up..
44
- if options[:chunk_size].to_i > 0
45
- use_chunks = true
46
- chunk_size = options[:chunk_size].to_i
47
- @chunk_count = 0
48
- chunk = []
49
- else
50
- use_chunks = false
51
- end
52
-
53
- # now on to processing all the rest of the lines in the CSV file:
54
- # fh.each_line |line|
55
- until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
56
- line = readline_with_counts(fh, options)
57
-
58
- # replace invalid byte sequence in UTF-8 with question mark to avoid errors
59
- line = enforce_utf8_encoding(line, options) if @enforce_utf8
60
-
61
- print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
62
-
63
- next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
64
-
65
- # cater for the quoted csv data containing the row separator carriage return character
66
- # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
67
- # by detecting the existence of an uneven number of quote characters
68
- multiline = count_quote_chars(line, options[:quote_char]).odd?
69
-
70
- while multiline
71
- next_line = fh.readline(options[:row_sep])
72
- next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
73
- line += next_line
74
- @file_line_count += 1
75
-
76
- break if fh.eof? # Exit loop if end of file is reached
77
-
78
- multiline = count_quote_chars(line, options[:quote_char]).odd?
79
- end
80
-
81
- # :nocov:
82
- if multiline && @verbose
83
- print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
84
- end
85
- # :nocov:
86
-
87
- line.chomp!(options[:row_sep])
88
-
89
- # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
90
- dataA, _data_size = parse(line, options, header_size)
91
-
92
- dataA.map!{|x| x.strip} if options[:strip_whitespace]
93
-
94
- # if all values are blank, then ignore this line
95
- next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
96
-
97
- # --- HASH TRANSFORMATIONS ------------------------------------------------------------
98
- hash = @headers.zip(dataA).to_h
99
-
100
- hash = hash_transformations(hash, options)
101
-
102
- # --- HASH VALIDATIONS ----------------------------------------------------------------
103
- # will go here, and be able to:
104
- # - validate correct format of the values for fields
105
- # - required fields to be non-empty
106
- # - ...
107
- # -------------------------------------------------------------------------------------
108
-
109
- next if options[:remove_empty_hashes] && hash.empty?
110
-
111
- puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
112
- # optional adding of csv_line_number to the hash to help debugging
113
- hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
114
-
115
- # process the chunks or the resulting hash
116
- if use_chunks
117
- chunk << hash # append temp result to chunk
118
-
119
- if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
120
- # do something with the chunk
121
- if block_given?
122
- yield chunk # do something with the hashes in the chunk in the block
123
- else
124
- @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
125
- end
126
- @chunk_count += 1
127
- chunk.clear # re-initialize for next chunk of data
128
- else
129
- # the last chunk may contain partial data, which is handled below
130
- end
131
- # while a chunk is being filled up we don't need to do anything else here
132
-
133
- else # no chunk handling
134
- if block_given?
135
- yield [hash] # do something with the hash in the block (better to use chunking here)
136
- else
137
- @result << hash
138
- end
139
- end
140
- end
141
-
142
- # print new line to retain last processing line message
143
- print "\n" if @verbose
144
-
145
- # handling of last chunk:
146
- if !chunk.nil? && chunk.size > 0
147
- # do something with the chunk
148
- if block_given?
149
- yield chunk # do something with the hashes in the chunk in the block
150
- else
151
- @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
152
- end
153
- @chunk_count += 1
154
- # chunk = [] # initialize for next chunk of data
155
- end
156
- ensure
157
- fh.close if fh.respond_to?(:close)
158
- end
159
-
160
- if block_given?
161
- @chunk_count # when we do processing through a block we only care how many chunks we processed
162
- else
163
- @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
164
- end
165
- end
166
-
167
- class << self
168
- def count_quote_chars(line, quote_char)
169
- return 0 if line.nil? || quote_char.nil? || quote_char.empty?
170
-
171
- count = 0
172
- escaped = false
173
-
174
- line.each_char do |char|
175
- if char == '\\' && !escaped
176
- escaped = true
177
- else
178
- count += 1 if char == quote_char && !escaped
179
- escaped = false
180
- end
181
- end
182
-
183
- count
184
- end
185
-
186
- def has_acceleration?
187
- @has_acceleration ||= !!defined?(parse_csv_line_c)
188
- end
189
-
190
- protected
191
-
192
- # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
193
- # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
194
- BLANK_RE = /\A\s*\z/.freeze
195
-
196
- def blank?(value)
197
- case value
198
- when String
199
- BLANK_RE.match?(value)
200
- when NilClass
201
- true
202
- when Array
203
- value.all? { |elem| blank?(elem) }
204
- when Hash
205
- value.values.all? { |elem| blank?(elem) } # Focus on values only
206
- else
207
- false
208
- end
209
- end
210
-
211
- private
212
-
213
- def enforce_utf8_encoding(line, options)
214
- # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
215
-
216
- line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
217
- end
218
- end
219
- end
@@ -1,30 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SmarterCSV
4
- class << self
5
- attr_reader :has_rails, :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
6
-
7
- def initialize_variables
8
- @has_rails = !!defined?(Rails)
9
- @csv_line_count = 0
10
- @chunk_count = 0
11
- @errors = {}
12
- @file_line_count = 0
13
- @headerA = []
14
- @headers = nil
15
- @raw_header = nil # header as it appears in the file
16
- @result = []
17
- @warnings = {}
18
- @enforce_utf8 = false # only set to true if needed (after options parsing)
19
- end
20
-
21
- # :nocov:
22
- # rubocop:disable Naming/MethodName
23
- def headerA
24
- warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
25
- @headerA
26
- end
27
- # rubocop:enable Naming/MethodName
28
- # :nocov:
29
- end
30
- end