smarter_csv 1.11.2 → 1.12.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
@@ -87,9 +87,11 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
87
87
  }
88
88
 
89
89
  VALUE SmarterCSV = Qnil;
90
+ VALUE Parser = Qnil;
90
91
 
91
92
  void Init_smarter_csv(void) {
92
- VALUE SmarterCSV = rb_define_module("SmarterCSV");
93
+ SmarterCSV = rb_define_module("SmarterCSV");
94
+ Parser = rb_define_module_under(SmarterCSV, "Parser");
93
95
 
94
- rb_define_module_function(SmarterCSV, "parse_csv_line_c", rb_parse_csv_line, 4);
96
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 4);
95
97
  }
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module AutoDetection
5
5
  protected
6
6
 
7
7
  # If file has headers, then guesses column separator from headers.
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module FileIO
5
5
  protected
6
6
 
7
7
  def readline_with_counts(filehandle, options)
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module HashTransformations
5
5
  def hash_transformations(hash, options)
6
6
  # there may be unmapped keys, or keys purposely mapped to nil or an empty key..
7
7
  # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module HeaderTransformations
5
5
  # transform the headers that were in the file:
6
6
  def header_transformations(header_array, options)
7
7
  header_array.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module HeaderValidations
5
5
  def header_validations(headers, options)
6
6
  check_duplicate_headers(headers, options)
7
7
  check_required_headers(headers, options)
@@ -26,7 +26,7 @@ module SmarterCSV
26
26
  missing_keys = options[:required_keys].select { |k| !headers_set.include?(k) }
27
27
 
28
28
  unless missing_keys.empty?
29
- raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}. Check `SmarterCSV.headers` for original headers."
29
+ raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}. Check `reader.headers` for original headers."
30
30
  end
31
31
  end
32
32
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module Headers
5
5
  def process_headers(filehandle, options)
6
6
  @raw_header = nil # header as it appears in the file
7
7
  @headers = nil # the processed headers
@@ -1,43 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- DEFAULT_OPTIONS = {
5
- acceleration: true,
6
- auto_row_sep_chars: 500,
7
- chunk_size: nil,
8
- col_sep: :auto, # was: ',',
9
- comment_regexp: nil, # was: /\A#/,
10
- convert_values_to_numeric: true,
11
- downcase_header: true,
12
- duplicate_header_suffix: '', # was: nil,
13
- file_encoding: 'utf-8',
14
- force_simple_split: false,
15
- force_utf8: false,
16
- headers_in_file: true,
17
- invalid_byte_sequence: '',
18
- keep_original_headers: false,
19
- key_mapping: nil,
20
- quote_char: '"',
21
- remove_empty_hashes: true,
22
- remove_empty_values: true,
23
- remove_unmapped_keys: false,
24
- remove_values_matching: nil,
25
- remove_zero_values: false,
26
- required_headers: nil,
27
- required_keys: nil,
28
- row_sep: :auto, # was: $/,
29
- silence_missing_keys: false,
30
- skip_lines: nil,
31
- strings_as_keys: false,
32
- strip_chars_from_headers: nil,
33
- strip_whitespace: true,
34
- user_provided_headers: nil,
35
- value_converters: nil,
36
- verbose: false,
37
- with_line_numbers: false,
38
- }.freeze
4
+ #
5
+ # NOTE: this is not called when "parse" methods are tested by themselves
6
+ #
7
+ # ONLY FOR BACKWARDS-COMPATIBILITY
8
# Backwards-compatibility accessor for the default option set.
#
# Returns the DEFAULT_OPTIONS constant of the Options module (the very same
# object, not a copy).
def self.default_options
  Options::DEFAULT_OPTIONS
end
11
+
12
+ module Options
13
+ DEFAULT_OPTIONS = {
14
+ acceleration: true, # if user wants to use accelleration or not
15
+ auto_row_sep_chars: 500,
16
+ chunk_size: nil,
17
+ col_sep: :auto, # was: ',',
18
+ comment_regexp: nil, # was: /\A#/,
19
+ convert_values_to_numeric: true,
20
+ downcase_header: true,
21
+ duplicate_header_suffix: '', # was: nil,
22
+ file_encoding: 'utf-8',
23
+ force_simple_split: false,
24
+ force_utf8: false,
25
+ headers_in_file: true,
26
+ invalid_byte_sequence: '',
27
+ keep_original_headers: false,
28
+ key_mapping: nil,
29
+ quote_char: '"',
30
+ remove_empty_hashes: true,
31
+ remove_empty_values: true,
32
+ remove_unmapped_keys: false,
33
+ remove_values_matching: nil,
34
+ remove_zero_values: false,
35
+ required_headers: nil,
36
+ required_keys: nil,
37
+ row_sep: :auto, # was: $/,
38
+ silence_missing_keys: false,
39
+ skip_lines: nil,
40
+ strings_as_keys: false,
41
+ strip_chars_from_headers: nil,
42
+ strip_whitespace: true,
43
+ user_provided_headers: nil,
44
+ value_converters: nil,
45
+ verbose: false,
46
+ with_line_numbers: false,
47
+ }.freeze
39
48
 
40
- class << self
41
49
  # NOTE: this is not called when "parse" methods are tested by themselves
42
50
  def process_options(given_options = {})
43
51
  puts "User provided options:\n#{pp(given_options)}\n" if given_options[:verbose]
@@ -53,13 +61,6 @@ module SmarterCSV
53
61
  @options
54
62
  end
55
63
 
56
- # NOTE: this is not called when "parse" methods are tested by themselves
57
- #
58
- # ONLY FOR BACKWARDS-COMPATIBILITY
59
- def default_options
60
- DEFAULT_OPTIONS
61
- end
62
-
63
64
  private
64
65
 
65
66
  def validate_options!(options)
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module Parser
5
5
  protected
6
6
 
7
7
  ###
@@ -10,7 +10,7 @@ module SmarterCSV
10
10
  def parse(line, options, header_size = nil)
11
11
  # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
12
12
 
13
- if options[:acceleration] && has_acceleration?
13
+ if options[:acceleration] && has_acceleration
14
14
  # :nocov:
15
15
  has_quotes = line =~ /#{options[:quote_char]}/
16
16
  elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
@@ -0,0 +1,243 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class Reader
5
+ include ::SmarterCSV::Options
6
+ include ::SmarterCSV::FileIO
7
+ include ::SmarterCSV::AutoDetection
8
+ include ::SmarterCSV::Headers
9
+ include ::SmarterCSV::HeaderTransformations
10
+ include ::SmarterCSV::HeaderValidations
11
+ include ::SmarterCSV::HashTransformations
12
+ include ::SmarterCSV::Parser
13
+
14
+ attr_reader :input, :options
15
+ attr_reader :csv_line_count, :chunk_count, :file_line_count
16
+ attr_reader :enforce_utf8, :has_rails, :has_acceleration
17
+ attr_reader :errors, :warnings, :headers, :raw_header, :result
18
+
19
+ # :nocov:
20
+ # rubocop:disable Naming/MethodName
21
# Deprecated reader for the processed headers.
#
# Emits a deprecation warning via Kernel#warn and returns @headerA.
# The warning points callers at the `headers` reader, which replaces it.
def headerA
  warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
  @headerA
end
25
+ # rubocop:enable Naming/MethodName
26
+ # :nocov:
27
+
28
+ # first parameter: filename or input object which responds to readline method
29
# Sets up a reader for one CSV input.
#
# input         - a filename, or an input object which responds to readline
# given_options - Hash of user supplied options; passed to process_options,
#                 whose return value becomes @options
#
# All counters start at zero and all collections start empty.
def initialize(input, given_options = {})
  @input = input
  @has_rails = defined?(Rails) ? true : false

  # counters
  @csv_line_count = @chunk_count = @file_line_count = 0

  # collected state
  @errors = {}
  @warnings = {}
  @result = []

  # headers
  @headerA = []
  @headers = nil
  @raw_header = nil # header as it appears in the file

  @enforce_utf8 = false # only set to true if needed (after options parsing)
  @options = process_options(given_options)

  # true if the C extension was compiled in, i.e. acceleration is available
  @has_acceleration = SmarterCSV::Parser.respond_to?(:parse_csv_line_c) ? true : false
end
46
+
47
# Reads the CSV input line by line and turns every data row into a Hash keyed
# by the processed headers.
#
# The input is either used directly (when it responds to readline) or opened
# as a file using options[:file_encoding]. Row and column separators are
# auto-detected when they are set to :auto.
#
# With a block:    every chunk is yielded as an Array of Hashes (a one-element
#                  Array when chunking is off); returns @chunk_count, which is
#                  only incremented in chunk mode.
# Without a block: returns @result, an Array of Hashes, or an Array of Arrays
#                  of Hashes when options[:chunk_size] is positive.
#
# The file handle is closed in all cases, if it responds to close.
def process(&block) # rubocop:disable Lint/UnusedMethodArgument
  @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
  @verbose = options[:verbose]

  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    skip_lines(fh, options)

    @headers, header_size = process_headers(fh, options)
    @headerA = @headers # @headerA is deprecated, use @headers

    puts "Effective headers:\n#{pp(@headers)}\n" if @verbose

    header_validations(@headers, options)

    # in case we use chunking.. we'll need to set it up..
    if options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      @chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # now on to processing all the rest of the lines in the CSV file:
    # we can't use fh.readlines() here, because this would read the whole file into memory at once
    until fh.eof?
      line = readline_with_counts(fh, options)

      # replace invalid byte sequence in UTF-8 to avoid errors
      line = enforce_utf8_encoding(line, options) if @enforce_utf8

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # cater for quoted csv data containing the row separator, in which case the
      # row is split across multiple lines; detected by an uneven number of quote characters
      multiline = count_quote_chars(line, options[:quote_char]).odd?

      # Guard against EOF before reading: an unbalanced quote on the very last
      # line must not call readline at end of stream (that raises EOFError).
      # The line is then parsed as-is, like any other unbalanced record that ends at EOF.
      while multiline && !fh.eof?
        next_line = fh.readline(options[:row_sep])
        next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
        line += next_line
        @file_line_count += 1

        break if fh.eof? # Exit loop if end of file is reached (multiline stays set for the notice below)

        multiline = count_quote_chars(line, options[:quote_char]).odd?
      end

      # :nocov:
      if multiline && @verbose
        print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
      end
      # :nocov:

      line.chomp!(options[:row_sep])

      # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
      dataA, _data_size = parse(line, options, header_size)

      dataA.map!(&:strip) if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      # --- HASH TRANSFORMATIONS ------------------------------------------------------------
      hash = @headers.zip(dataA).to_h

      hash = hash_transformations(hash, options)

      # --- HASH VALIDATIONS ----------------------------------------------------------------
      # will go here, and be able to:
      #  - validate correct format of the values for fields
      #  - required fields to be non-empty
      #  - ...
      # -------------------------------------------------------------------------------------

      next if options[:remove_empty_hashes] && hash.empty?

      puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
      # optional adding of csv_line_number to the hash to help debugging
      hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

      # process the chunks or the resulting hash
      if use_chunks
        chunk << hash # append temp result to chunk

        if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
          end
          @chunk_count += 1
          chunk.clear # re-initialize for next chunk of data
        end
        # while a chunk is being filled up we don't need to do anything else here;
        # the last chunk may contain partial data, which is handled after the loop
      elsif block_given?
        yield [hash] # do something with the hash in the block (better to use chunking here)
      else
        @result << hash
      end
    end

    # print new line to retain last processing line message
    print "\n" if @verbose

    # handling of last chunk (chunk is nil when chunking is off):
    if chunk && !chunk.empty?
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        @result << chunk.dup # Append a copy of the chunk to the result
      end
      @chunk_count += 1
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end

  if block_given?
    @chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end
195
+
196
# Counts the occurrences of quote_char in line which are not preceded by an
# unescaped backslash.
#
# A backslash escapes exactly the one character that follows it (including
# another backslash). Returns 0 when line or quote_char is nil, or when
# quote_char is empty.
def count_quote_chars(line, quote_char)
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?

  after_backslash = false

  line.each_char.reduce(0) do |total, current|
    if after_backslash
      # this character is escaped: never counted, and it ends the escape
      after_backslash = false
      total
    elsif current == '\\'
      after_backslash = true
      total
    elsif current == quote_char
      total + 1
    else
      total
    end
  end
end
213
+
214
+ protected
215
+
216
+ # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
217
+ # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
218
# Matches strings which are empty or consist of whitespace only.
BLANK_RE = /\A\s*\z/.freeze

# Tells whether value carries no data.
#
# Strings are blank when they match BLANK_RE, nil is always blank, Arrays are
# blank when every element is blank, and Hashes are blank when every value is
# blank (keys are ignored). Anything else is never blank.
def blank?(value)
  return BLANK_RE.match?(value) if value.is_a?(String)
  return true if value.is_a?(NilClass)
  return value.all? { |item| blank?(item) } if value.is_a?(Array)
  return value.values.all? { |item| blank?(item) } if value.is_a?(Hash) # Focus on values only

  false
end
234
+
235
+ private
236
+
237
# Re-tags line as UTF-8 and returns a UTF-8 copy in which invalid or undefined
# byte sequences are replaced by options[:invalid_byte_sequence].
#
# NOTE: force_encoding changes the encoding tag of the passed-in string itself.
def enforce_utf8_encoding(line, options)
  # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

  retagged = line.force_encoding('utf-8')
  retagged.encode(
    'utf-8',
    invalid: :replace,
    undef: :replace,
    replace: options[:invalid_byte_sequence]
  )
end
242
+ end
243
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.11.2"
4
+ VERSION = "1.12.0.pre1"
5
5
  end
@@ -40,7 +40,8 @@ module SmarterCSV
40
40
  class Writer
41
41
  def initialize(file_path, options = {})
42
42
  @options = options
43
- @row_sep = options[:row_sep] || "\n" # RFC4180 "\r\n"
43
+
44
+ @row_sep = options[:row_sep] || $/ # Defaults to system's row separator. RFC4180 "\r\n"
44
45
  @col_sep = options[:col_sep] || ','
45
46
  @quote_char = options[:quote_char] || '"'
46
47
  @force_quotes = options[:force_quotes] == true
data/lib/smarter_csv.rb CHANGED
@@ -4,17 +4,16 @@ require "smarter_csv/version"
4
4
  require "smarter_csv/errors"
5
5
 
6
6
  require "smarter_csv/file_io"
7
- require "smarter_csv/options_processing"
7
+ require "smarter_csv/options"
8
8
  require "smarter_csv/auto_detection"
9
- require "smarter_csv/variables"
10
9
  require 'smarter_csv/header_transformations'
11
10
  require 'smarter_csv/header_validations'
12
11
  require "smarter_csv/headers"
13
12
  require "smarter_csv/hash_transformations"
14
13
 
15
- require "smarter_csv/parse"
14
+ require "smarter_csv/parser"
16
15
  require "smarter_csv/writer"
17
- require "smarter_csv/smarter_csv"
16
+ require "smarter_csv/reader"
18
17
 
19
18
  # load the C-extension:
20
19
  case RUBY_ENGINE
@@ -55,6 +54,23 @@ end
55
54
  # :nocov:
56
55
 
57
56
  module SmarterCSV
57
+ # For backwards compatibility:
58
+ #
59
+ # while `SmarterCSV.process` works for simple cases, you can't get access to the internal state any longer.
60
+ # e.g. you need the instance of the Reader to access the original headers
61
+ #
62
+ # Please use this instead:
63
+ #
64
+ # reader = SmarterCSV::Reader.new(input, options)
65
+ # reader.process # with or without block
66
+ #
67
# Backwards-compatible entry point: builds a Reader for the given input and
# options, runs Reader#process on it (forwarding the block, if any), and
# returns whatever Reader#process returns.
#
# The Reader instance is not returned to the caller.
def self.process(input, given_options = {}, &block)
  Reader.new(input, given_options).process(&block)
end
71
+
72
+ # Convenience method for generating CSV files:
73
+ #
58
74
  # SmarterCSV.generate(filename, options) do |csv_writer|
59
75
  # MyModel.find_in_batches(batch_size: 100) do |batch|
60
76
  # batch.pluck(:name, :description, :instructor).each do |record|
data/smarter_csv.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Tilo Sloboda"]
10
10
  spec.email = ["tilo.sloboda@gmail.com"]
11
11
 
12
- spec.summary = "CSV Reading and Writing"
12
+ spec.summary = "Convenient CSV Reading and Writing"
13
13
  spec.description = "Ruby Gem for convenient reading and writing: importing of CSV Files as Array(s) of Hashes, with lots of features for processing large files in parallel, embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers to Hash-keys"
14
14
  spec.homepage = "https://github.com/tilo/smarter_csv"
15
15
  spec.license = 'MIT'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.2
4
+ version: 1.12.0.pre1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-07-06 00:00:00.000000000 Z
11
+ date: 2024-07-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -115,6 +115,17 @@ files:
115
115
  - README.md
116
116
  - Rakefile
117
117
  - TO_DO_v2.md
118
+ - docs/_introduction.md
119
+ - docs/basic_api.md
120
+ - docs/batch_processing.md
121
+ - docs/data_transformations.md
122
+ - docs/examples.md
123
+ - docs/header_transformations.md
124
+ - docs/header_validations.md
125
+ - docs/notes.md
126
+ - docs/options.md
127
+ - docs/row_col_sep.md
128
+ - docs/value_converters.md
118
129
  - ext/smarter_csv/extconf.rb
119
130
  - ext/smarter_csv/smarter_csv.c
120
131
  - lib/smarter_csv.rb
@@ -125,10 +136,9 @@ files:
125
136
  - lib/smarter_csv/header_transformations.rb
126
137
  - lib/smarter_csv/header_validations.rb
127
138
  - lib/smarter_csv/headers.rb
128
- - lib/smarter_csv/options_processing.rb
129
- - lib/smarter_csv/parse.rb
130
- - lib/smarter_csv/smarter_csv.rb
131
- - lib/smarter_csv/variables.rb
139
+ - lib/smarter_csv/options.rb
140
+ - lib/smarter_csv/parser.rb
141
+ - lib/smarter_csv/reader.rb
132
142
  - lib/smarter_csv/version.rb
133
143
  - lib/smarter_csv/writer.rb
134
144
  - smarter_csv.gemspec
@@ -151,12 +161,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
151
161
  version: 2.5.0
152
162
  required_rubygems_version: !ruby/object:Gem::Requirement
153
163
  requirements:
154
- - - ">="
164
+ - - ">"
155
165
  - !ruby/object:Gem::Version
156
- version: '0'
166
+ version: 1.3.1
157
167
  requirements: []
158
168
  rubygems_version: 3.2.3
159
169
  signing_key:
160
170
  specification_version: 4
161
- summary: CSV Reading and Writing
171
+ summary: Convenient CSV Reading and Writing
162
172
  test_files: []