smarter_csv 1.11.2 → 1.12.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -87,9 +87,11 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
87
87
  }
88
88
 
89
89
  VALUE SmarterCSV = Qnil;
90
+ VALUE Parser = Qnil;
90
91
 
91
92
  void Init_smarter_csv(void) {
92
- VALUE SmarterCSV = rb_define_module("SmarterCSV");
93
+ SmarterCSV = rb_define_module("SmarterCSV");
94
+ Parser = rb_define_module_under(SmarterCSV, "Parser");
93
95
 
94
- rb_define_module_function(SmarterCSV, "parse_csv_line_c", rb_parse_csv_line, 4);
96
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 4);
95
97
  }
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module AutoDetection
5
5
  protected
6
6
 
7
7
  # If file has headers, then guesses column separator from headers.
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module FileIO
5
5
  protected
6
6
 
7
7
  def readline_with_counts(filehandle, options)
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module HashTransformations
5
5
  def hash_transformations(hash, options)
6
6
  # there may be unmapped keys, or keys purposely mapped to nil or an empty key..
7
7
  # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module HeaderTransformations
5
5
  # transform the headers that were in the file:
6
6
  def header_transformations(header_array, options)
7
7
  header_array.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module HeaderValidations
5
5
  def header_validations(headers, options)
6
6
  check_duplicate_headers(headers, options)
7
7
  check_required_headers(headers, options)
@@ -26,7 +26,7 @@ module SmarterCSV
26
26
  missing_keys = options[:required_keys].select { |k| !headers_set.include?(k) }
27
27
 
28
28
  unless missing_keys.empty?
29
- raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}. Check `SmarterCSV.headers` for original headers."
29
+ raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}. Check `reader.headers` for original headers."
30
30
  end
31
31
  end
32
32
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module Headers
5
5
  def process_headers(filehandle, options)
6
6
  @raw_header = nil # header as it appears in the file
7
7
  @headers = nil # the processed headers
@@ -1,43 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- DEFAULT_OPTIONS = {
5
- acceleration: true,
6
- auto_row_sep_chars: 500,
7
- chunk_size: nil,
8
- col_sep: :auto, # was: ',',
9
- comment_regexp: nil, # was: /\A#/,
10
- convert_values_to_numeric: true,
11
- downcase_header: true,
12
- duplicate_header_suffix: '', # was: nil,
13
- file_encoding: 'utf-8',
14
- force_simple_split: false,
15
- force_utf8: false,
16
- headers_in_file: true,
17
- invalid_byte_sequence: '',
18
- keep_original_headers: false,
19
- key_mapping: nil,
20
- quote_char: '"',
21
- remove_empty_hashes: true,
22
- remove_empty_values: true,
23
- remove_unmapped_keys: false,
24
- remove_values_matching: nil,
25
- remove_zero_values: false,
26
- required_headers: nil,
27
- required_keys: nil,
28
- row_sep: :auto, # was: $/,
29
- silence_missing_keys: false,
30
- skip_lines: nil,
31
- strings_as_keys: false,
32
- strip_chars_from_headers: nil,
33
- strip_whitespace: true,
34
- user_provided_headers: nil,
35
- value_converters: nil,
36
- verbose: false,
37
- with_line_numbers: false,
38
- }.freeze
4
+ #
5
+ # NOTE: this is not called when "parse" methods are tested by themselves
6
+ #
7
+ # ONLY FOR BACKWARDS-COMPATIBILITY
8
+ def self.default_options
9
+ Options::DEFAULT_OPTIONS
10
+ end
11
+
12
+ module Options
13
+ DEFAULT_OPTIONS = {
14
+ acceleration: true, # if user wants to use acceleration or not
15
+ auto_row_sep_chars: 500,
16
+ chunk_size: nil,
17
+ col_sep: :auto, # was: ',',
18
+ comment_regexp: nil, # was: /\A#/,
19
+ convert_values_to_numeric: true,
20
+ downcase_header: true,
21
+ duplicate_header_suffix: '', # was: nil,
22
+ file_encoding: 'utf-8',
23
+ force_simple_split: false,
24
+ force_utf8: false,
25
+ headers_in_file: true,
26
+ invalid_byte_sequence: '',
27
+ keep_original_headers: false,
28
+ key_mapping: nil,
29
+ quote_char: '"',
30
+ remove_empty_hashes: true,
31
+ remove_empty_values: true,
32
+ remove_unmapped_keys: false,
33
+ remove_values_matching: nil,
34
+ remove_zero_values: false,
35
+ required_headers: nil,
36
+ required_keys: nil,
37
+ row_sep: :auto, # was: $/,
38
+ silence_missing_keys: false,
39
+ skip_lines: nil,
40
+ strings_as_keys: false,
41
+ strip_chars_from_headers: nil,
42
+ strip_whitespace: true,
43
+ user_provided_headers: nil,
44
+ value_converters: nil,
45
+ verbose: false,
46
+ with_line_numbers: false,
47
+ }.freeze
39
48
 
40
- class << self
41
49
  # NOTE: this is not called when "parse" methods are tested by themselves
42
50
  def process_options(given_options = {})
43
51
  puts "User provided options:\n#{pp(given_options)}\n" if given_options[:verbose]
@@ -53,13 +61,6 @@ module SmarterCSV
53
61
  @options
54
62
  end
55
63
 
56
- # NOTE: this is not called when "parse" methods are tested by themselves
57
- #
58
- # ONLY FOR BACKWARDS-COMPATIBILITY
59
- def default_options
60
- DEFAULT_OPTIONS
61
- end
62
-
63
64
  private
64
65
 
65
66
  def validate_options!(options)
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module Parser
5
5
  protected
6
6
 
7
7
  ###
@@ -10,7 +10,7 @@ module SmarterCSV
10
10
  def parse(line, options, header_size = nil)
11
11
  # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
12
12
 
13
- if options[:acceleration] && has_acceleration?
13
+ if options[:acceleration] && has_acceleration
14
14
  # :nocov:
15
15
  has_quotes = line =~ /#{options[:quote_char]}/
16
16
  elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
@@ -0,0 +1,243 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class Reader
5
+ include ::SmarterCSV::Options
6
+ include ::SmarterCSV::FileIO
7
+ include ::SmarterCSV::AutoDetection
8
+ include ::SmarterCSV::Headers
9
+ include ::SmarterCSV::HeaderTransformations
10
+ include ::SmarterCSV::HeaderValidations
11
+ include ::SmarterCSV::HashTransformations
12
+ include ::SmarterCSV::Parser
13
+
14
+ attr_reader :input, :options
15
+ attr_reader :csv_line_count, :chunk_count, :file_line_count
16
+ attr_reader :enforce_utf8, :has_rails, :has_acceleration
17
+ attr_reader :errors, :warnings, :headers, :raw_header, :result
18
+
19
+ # :nocov:
20
+ # rubocop:disable Naming/MethodName
21
# Deprecated reader kept for callers that still use the old `headerA` name.
# Prints a deprecation warning on every call, then returns @headerA.
#
# The warning points at `headers`, the attr_reader this class exposes for
# the same data.
#
# @deprecated Use `headers` instead.
# @return [Object] whatever is currently stored in @headerA
def headerA
  warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
  @headerA
end
25
+ # rubocop:enable Naming/MethodName
26
+ # :nocov:
27
+
28
+ # first parameter: filename or input object which responds to readline method
29
+ def initialize(input, given_options = {})
30
+ @input = input
31
+ @has_rails = !!defined?(Rails)
32
+ @csv_line_count = 0
33
+ @chunk_count = 0
34
+ @errors = {}
35
+ @file_line_count = 0
36
+ @headerA = []
37
+ @headers = nil
38
+ @raw_header = nil # header as it appears in the file
39
+ @result = []
40
+ @warnings = {}
41
+ @enforce_utf8 = false # only set to true if needed (after options parsing)
42
+ @options = process_options(given_options)
43
+ # true if it is compiled with acceleration
44
+ @has_acceleration = !!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)
45
+ end
46
+
47
+ def process(&block) # rubocop:disable Lint/UnusedMethodArgument
48
+ @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
49
+ @verbose = options[:verbose]
50
+
51
+ begin
52
+ fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
53
+
54
+ if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
55
+ puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
56
+ end
57
+
58
+ # auto-detect the row separator
59
+ options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
60
+ # attempt to auto-detect column separator
61
+ options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
62
+
63
+ skip_lines(fh, options)
64
+
65
+ @headers, header_size = process_headers(fh, options)
66
+ @headerA = @headers # @headerA is deprecated, use @headers
67
+
68
+ puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
69
+
70
+ header_validations(@headers, options)
71
+
72
+ # in case we use chunking.. we'll need to set it up..
73
+ if options[:chunk_size].to_i > 0
74
+ use_chunks = true
75
+ chunk_size = options[:chunk_size].to_i
76
+ @chunk_count = 0
77
+ chunk = []
78
+ else
79
+ use_chunks = false
80
+ end
81
+
82
+ # now on to processing all the rest of the lines in the CSV file:
83
+ # fh.each_line |line|
84
+ until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
85
+ line = readline_with_counts(fh, options)
86
+
87
+ # replace invalid byte sequence in UTF-8 with question mark to avoid errors
88
+ line = enforce_utf8_encoding(line, options) if @enforce_utf8
89
+
90
+ print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
91
+
92
+ next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
93
+
94
+ # cater for the quoted csv data containing the row separator carriage return character
95
+ # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
96
+ # by detecting the existence of an uneven number of quote characters
97
+ multiline = count_quote_chars(line, options[:quote_char]).odd?
98
+
99
+ while multiline
100
+ next_line = fh.readline(options[:row_sep])
101
+ next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
102
+ line += next_line
103
+ @file_line_count += 1
104
+
105
+ break if fh.eof? # Exit loop if end of file is reached
106
+
107
+ multiline = count_quote_chars(line, options[:quote_char]).odd?
108
+ end
109
+
110
+ # :nocov:
111
+ if multiline && @verbose
112
+ print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
113
+ end
114
+ # :nocov:
115
+
116
+ line.chomp!(options[:row_sep])
117
+
118
+ # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
119
+ dataA, _data_size = parse(line, options, header_size)
120
+
121
+ dataA.map!{|x| x.strip} if options[:strip_whitespace]
122
+
123
+ # if all values are blank, then ignore this line
124
+ next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
125
+
126
+ # --- HASH TRANSFORMATIONS ------------------------------------------------------------
127
+ hash = @headers.zip(dataA).to_h
128
+
129
+ hash = hash_transformations(hash, options)
130
+
131
+ # --- HASH VALIDATIONS ----------------------------------------------------------------
132
+ # will go here, and be able to:
133
+ # - validate correct format of the values for fields
134
+ # - required fields to be non-empty
135
+ # - ...
136
+ # -------------------------------------------------------------------------------------
137
+
138
+ next if options[:remove_empty_hashes] && hash.empty?
139
+
140
+ puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
141
+ # optional adding of csv_line_number to the hash to help debugging
142
+ hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
143
+
144
+ # process the chunks or the resulting hash
145
+ if use_chunks
146
+ chunk << hash # append temp result to chunk
147
+
148
+ if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
149
+ # do something with the chunk
150
+ if block_given?
151
+ yield chunk # do something with the hashes in the chunk in the block
152
+ else
153
+ @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
154
+ end
155
+ @chunk_count += 1
156
+ chunk.clear # re-initialize for next chunk of data
157
+ else
158
+ # the last chunk may contain partial data, which is handled below
159
+ end
160
+ # while a chunk is being filled up we don't need to do anything else here
161
+
162
+ else # no chunk handling
163
+ if block_given?
164
+ yield [hash] # do something with the hash in the block (better to use chunking here)
165
+ else
166
+ @result << hash
167
+ end
168
+ end
169
+ end
170
+
171
+ # print new line to retain last processing line message
172
+ print "\n" if @verbose
173
+
174
+ # handling of last chunk:
175
+ if !chunk.nil? && chunk.size > 0
176
+ # do something with the chunk
177
+ if block_given?
178
+ yield chunk # do something with the hashes in the chunk in the block
179
+ else
180
+ @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
181
+ end
182
+ @chunk_count += 1
183
+ # chunk = [] # initialize for next chunk of data
184
+ end
185
+ ensure
186
+ fh.close if fh.respond_to?(:close)
187
+ end
188
+
189
+ if block_given?
190
+ @chunk_count # when we do processing through a block we only care how many chunks we processed
191
+ else
192
+ @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
193
+ end
194
+ end
195
+
196
+ def count_quote_chars(line, quote_char)
197
+ return 0 if line.nil? || quote_char.nil? || quote_char.empty?
198
+
199
+ count = 0
200
+ escaped = false
201
+
202
+ line.each_char do |char|
203
+ if char == '\\' && !escaped
204
+ escaped = true
205
+ else
206
+ count += 1 if char == quote_char && !escaped
207
+ escaped = false
208
+ end
209
+ end
210
+
211
+ count
212
+ end
213
+
214
+ protected
215
+
216
+ # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
217
+ # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
218
+ BLANK_RE = /\A\s*\z/.freeze
219
+
220
+ def blank?(value)
221
+ case value
222
+ when String
223
+ BLANK_RE.match?(value)
224
+ when NilClass
225
+ true
226
+ when Array
227
+ value.all? { |elem| blank?(elem) }
228
+ when Hash
229
+ value.values.all? { |elem| blank?(elem) } # Focus on values only
230
+ else
231
+ false
232
+ end
233
+ end
234
+
235
+ private
236
+
237
+ def enforce_utf8_encoding(line, options)
238
+ # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
239
+
240
+ line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
241
+ end
242
+ end
243
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.11.2"
4
+ VERSION = "1.12.0.pre1"
5
5
  end
@@ -40,7 +40,8 @@ module SmarterCSV
40
40
  class Writer
41
41
  def initialize(file_path, options = {})
42
42
  @options = options
43
- @row_sep = options[:row_sep] || "\n" # RFC4180 "\r\n"
43
+
44
+ @row_sep = options[:row_sep] || $/ # Defaults to Ruby's input record separator ($/, "\n" unless reassigned). RFC4180 specifies "\r\n"
44
45
  @col_sep = options[:col_sep] || ','
45
46
  @quote_char = options[:quote_char] || '"'
46
47
  @force_quotes = options[:force_quotes] == true
data/lib/smarter_csv.rb CHANGED
@@ -4,17 +4,16 @@ require "smarter_csv/version"
4
4
  require "smarter_csv/errors"
5
5
 
6
6
  require "smarter_csv/file_io"
7
- require "smarter_csv/options_processing"
7
+ require "smarter_csv/options"
8
8
  require "smarter_csv/auto_detection"
9
- require "smarter_csv/variables"
10
9
  require 'smarter_csv/header_transformations'
11
10
  require 'smarter_csv/header_validations'
12
11
  require "smarter_csv/headers"
13
12
  require "smarter_csv/hash_transformations"
14
13
 
15
- require "smarter_csv/parse"
14
+ require "smarter_csv/parser"
16
15
  require "smarter_csv/writer"
17
- require "smarter_csv/smarter_csv"
16
+ require "smarter_csv/reader"
18
17
 
19
18
  # load the C-extension:
20
19
  case RUBY_ENGINE
@@ -55,6 +54,23 @@ end
55
54
  # :nocov:
56
55
 
57
56
  module SmarterCSV
57
+ # For backwards compatibility:
58
+ #
59
+ # while `SmarterCSV.process` works for simple cases, you can't get access to the internal state any longer.
60
+ # e.g. you need the instance of the Reader to access the original headers
61
+ #
62
+ # Please use this instead:
63
+ #
64
+ # reader = SmarterCSV::Reader.new(input, options)
65
+ # reader.process # with or without block
66
+ #
67
+ def self.process(input, given_options = {}, &block)
68
+ reader = Reader.new(input, given_options)
69
+ reader.process(&block)
70
+ end
71
+
72
+ # Convenience method for generating CSV files:
73
+ #
58
74
  # SmarterCSV.generate(filename, options) do |csv_writer|
59
75
  # MyModel.find_in_batches(batch_size: 100) do |batch|
60
76
  # batch.pluck(:name, :description, :instructor).each do |record|
data/smarter_csv.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Tilo Sloboda"]
10
10
  spec.email = ["tilo.sloboda@gmail.com"]
11
11
 
12
- spec.summary = "CSV Reading and Writing"
12
+ spec.summary = "Convenient CSV Reading and Writing"
13
13
  spec.description = "Ruby Gem for convenient reading and writing: importing of CSV Files as Array(s) of Hashes, with lots of features for processing large files in parallel, embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers to Hash-keys"
14
14
  spec.homepage = "https://github.com/tilo/smarter_csv"
15
15
  spec.license = 'MIT'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.2
4
+ version: 1.12.0.pre1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-07-06 00:00:00.000000000 Z
11
+ date: 2024-07-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -115,6 +115,17 @@ files:
115
115
  - README.md
116
116
  - Rakefile
117
117
  - TO_DO_v2.md
118
+ - docs/_introduction.md
119
+ - docs/basic_api.md
120
+ - docs/batch_processing.md
121
+ - docs/data_transformations.md
122
+ - docs/examples.md
123
+ - docs/header_transformations.md
124
+ - docs/header_validations.md
125
+ - docs/notes.md
126
+ - docs/options.md
127
+ - docs/row_col_sep.md
128
+ - docs/value_converters.md
118
129
  - ext/smarter_csv/extconf.rb
119
130
  - ext/smarter_csv/smarter_csv.c
120
131
  - lib/smarter_csv.rb
@@ -125,10 +136,9 @@ files:
125
136
  - lib/smarter_csv/header_transformations.rb
126
137
  - lib/smarter_csv/header_validations.rb
127
138
  - lib/smarter_csv/headers.rb
128
- - lib/smarter_csv/options_processing.rb
129
- - lib/smarter_csv/parse.rb
130
- - lib/smarter_csv/smarter_csv.rb
131
- - lib/smarter_csv/variables.rb
139
+ - lib/smarter_csv/options.rb
140
+ - lib/smarter_csv/parser.rb
141
+ - lib/smarter_csv/reader.rb
132
142
  - lib/smarter_csv/version.rb
133
143
  - lib/smarter_csv/writer.rb
134
144
  - smarter_csv.gemspec
@@ -151,12 +161,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
151
161
  version: 2.5.0
152
162
  required_rubygems_version: !ruby/object:Gem::Requirement
153
163
  requirements:
154
- - - ">="
164
+ - - ">"
155
165
  - !ruby/object:Gem::Version
156
- version: '0'
166
+ version: 1.3.1
157
167
  requirements: []
158
168
  rubygems_version: 3.2.3
159
169
  signing_key:
160
170
  specification_version: 4
161
- summary: CSV Reading and Writing
171
+ summary: Convenient CSV Reading and Writing
162
172
  test_files: []