smarter_csv 1.11.0 → 1.12.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
@@ -87,9 +87,11 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
87
87
  }
88
88
 
89
89
  VALUE SmarterCSV = Qnil;
90
+ VALUE Parser = Qnil;
90
91
 
91
92
  void Init_smarter_csv(void) {
92
- VALUE SmarterCSV = rb_define_module("SmarterCSV");
93
+ SmarterCSV = rb_define_module("SmarterCSV");
94
+ Parser = rb_define_module_under(SmarterCSV, "Parser");
93
95
 
94
- rb_define_module_function(SmarterCSV, "parse_csv_line_c", rb_parse_csv_line, 4);
96
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 4);
95
97
  }
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module AutoDetection
5
5
  protected
6
6
 
7
7
  # If file has headers, then guesses column separator from headers.
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class Error < StandardError; end # new code should rescue this instead
5
+ # Reader:
6
+ class SmarterCSVException < Error; end # for backwards compatibility
7
+ class HeaderSizeMismatch < SmarterCSVException; end
8
+ class IncorrectOption < SmarterCSVException; end
9
+ class ValidationError < SmarterCSVException; end
10
+ class DuplicateHeaders < SmarterCSVException; end
11
+ class MissingKeys < SmarterCSVException; end # previously known as MissingHeaders
12
+ class NoColSepDetected < SmarterCSVException; end
13
+ class KeyMappingError < SmarterCSVException; end
14
+ # Writer:
15
+ class InvalidInputData < SmarterCSVException; end
16
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module FileIO
5
5
  protected
6
6
 
7
7
  def readline_with_counts(filehandle, options)
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module HashTransformations
5
5
  def hash_transformations(hash, options)
6
6
  # there may be unmapped keys, or keys purposely mapped to nil or an empty key..
7
7
  # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module HeaderTransformations
5
5
  # transform the headers that were in the file:
6
6
  def header_transformations(header_array, options)
7
7
  header_array.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module HeaderValidations
5
5
  def header_validations(headers, options)
6
6
  check_duplicate_headers(headers, options)
7
7
  check_required_headers(headers, options)
@@ -26,7 +26,7 @@ module SmarterCSV
26
26
  missing_keys = options[:required_keys].select { |k| !headers_set.include?(k) }
27
27
 
28
28
  unless missing_keys.empty?
29
- raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}. Check `SmarterCSV.headers` for original headers."
29
+ raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}. Check `reader.headers` for original headers."
30
30
  end
31
31
  end
32
32
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module Headers
5
5
  def process_headers(filehandle, options)
6
6
  @raw_header = nil # header as it appears in the file
7
7
  @headers = nil # the processed headers
@@ -1,43 +1,51 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- DEFAULT_OPTIONS = {
5
- acceleration: true,
6
- auto_row_sep_chars: 500,
7
- chunk_size: nil,
8
- col_sep: :auto, # was: ',',
9
- comment_regexp: nil, # was: /\A#/,
10
- convert_values_to_numeric: true,
11
- downcase_header: true,
12
- duplicate_header_suffix: '', # was: nil,
13
- file_encoding: 'utf-8',
14
- force_simple_split: false,
15
- force_utf8: false,
16
- headers_in_file: true,
17
- invalid_byte_sequence: '',
18
- keep_original_headers: false,
19
- key_mapping: nil,
20
- quote_char: '"',
21
- remove_empty_hashes: true,
22
- remove_empty_values: true,
23
- remove_unmapped_keys: false,
24
- remove_values_matching: nil,
25
- remove_zero_values: false,
26
- required_headers: nil,
27
- required_keys: nil,
28
- row_sep: :auto, # was: $/,
29
- silence_missing_keys: false,
30
- skip_lines: nil,
31
- strings_as_keys: false,
32
- strip_chars_from_headers: nil,
33
- strip_whitespace: true,
34
- user_provided_headers: nil,
35
- value_converters: nil,
36
- verbose: false,
37
- with_line_numbers: false,
38
- }.freeze
4
+ #
5
+ # NOTE: this is not called when "parse" methods are tested by themselves
6
+ #
7
+ # ONLY FOR BACKWARDS-COMPATIBILITY
8
+ def self.default_options
9
+ Options::DEFAULT_OPTIONS
10
+ end
11
+
12
+ module Options
13
+ DEFAULT_OPTIONS = {
14
+ acceleration: true, # if user wants to use acceleration or not
15
+ auto_row_sep_chars: 500,
16
+ chunk_size: nil,
17
+ col_sep: :auto, # was: ',',
18
+ comment_regexp: nil, # was: /\A#/,
19
+ convert_values_to_numeric: true,
20
+ downcase_header: true,
21
+ duplicate_header_suffix: '', # was: nil,
22
+ file_encoding: 'utf-8',
23
+ force_simple_split: false,
24
+ force_utf8: false,
25
+ headers_in_file: true,
26
+ invalid_byte_sequence: '',
27
+ keep_original_headers: false,
28
+ key_mapping: nil,
29
+ quote_char: '"',
30
+ remove_empty_hashes: true,
31
+ remove_empty_values: true,
32
+ remove_unmapped_keys: false,
33
+ remove_values_matching: nil,
34
+ remove_zero_values: false,
35
+ required_headers: nil,
36
+ required_keys: nil,
37
+ row_sep: :auto, # was: $/,
38
+ silence_missing_keys: false,
39
+ skip_lines: nil,
40
+ strings_as_keys: false,
41
+ strip_chars_from_headers: nil,
42
+ strip_whitespace: true,
43
+ user_provided_headers: nil,
44
+ value_converters: nil,
45
+ verbose: false,
46
+ with_line_numbers: false,
47
+ }.freeze
39
48
 
40
- class << self
41
49
  # NOTE: this is not called when "parse" methods are tested by themselves
42
50
  def process_options(given_options = {})
43
51
  puts "User provided options:\n#{pp(given_options)}\n" if given_options[:verbose]
@@ -53,13 +61,6 @@ module SmarterCSV
53
61
  @options
54
62
  end
55
63
 
56
- # NOTE: this is not called when "parse" methods are tested by themselves
57
- #
58
- # ONLY FOR BACKWARDS-COMPATIBILITY
59
- def default_options
60
- DEFAULT_OPTIONS
61
- end
62
-
63
64
  private
64
65
 
65
66
  def validate_options!(options)
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- class << self
4
+ module Parser
5
5
  protected
6
6
 
7
7
  ###
@@ -10,7 +10,7 @@ module SmarterCSV
10
10
  def parse(line, options, header_size = nil)
11
11
  # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
12
12
 
13
- if options[:acceleration] && has_acceleration?
13
+ if options[:acceleration] && has_acceleration
14
14
  # :nocov:
15
15
  has_quotes = line =~ /#{options[:quote_char]}/
16
16
  elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
@@ -0,0 +1,243 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class Reader
5
+ include ::SmarterCSV::Options
6
+ include ::SmarterCSV::FileIO
7
+ include ::SmarterCSV::AutoDetection
8
+ include ::SmarterCSV::Headers
9
+ include ::SmarterCSV::HeaderTransformations
10
+ include ::SmarterCSV::HeaderValidations
11
+ include ::SmarterCSV::HashTransformations
12
+ include ::SmarterCSV::Parser
13
+
14
+ attr_reader :input, :options
15
+ attr_reader :csv_line_count, :chunk_count, :file_line_count
16
+ attr_reader :enforce_utf8, :has_rails, :has_acceleration
17
+ attr_reader :errors, :warnings, :headers, :raw_header, :result
18
+
19
+ # :nocov:
20
+ # rubocop:disable Naming/MethodName
21
+ def headerA
22
+ warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
23
+ @headerA
24
+ end
25
+ # rubocop:enable Naming/MethodName
26
+ # :nocov:
27
+
28
+ # first parameter: filename or input object which responds to readline method
29
+ def initialize(input, given_options = {})
30
+ @input = input
31
+ @has_rails = !!defined?(Rails)
32
+ @csv_line_count = 0
33
+ @chunk_count = 0
34
+ @errors = {}
35
+ @file_line_count = 0
36
+ @headerA = []
37
+ @headers = nil
38
+ @raw_header = nil # header as it appears in the file
39
+ @result = []
40
+ @warnings = {}
41
+ @enforce_utf8 = false # only set to true if needed (after options parsing)
42
+ @options = process_options(given_options)
43
+ # true if it is compiled with acceleration
44
+ @has_acceleration = !!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)
45
+ end
46
+
47
+ def process(&block) # rubocop:disable Lint/UnusedMethodArgument
48
+ @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
49
+ @verbose = options[:verbose]
50
+
51
+ begin
52
+ fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
53
+
54
+ if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
55
+ puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
56
+ end
57
+
58
+ # auto-detect the row separator
59
+ options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
60
+ # attempt to auto-detect column separator
61
+ options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
62
+
63
+ skip_lines(fh, options)
64
+
65
+ @headers, header_size = process_headers(fh, options)
66
+ @headerA = @headers # @headerA is deprecated, use @headers
67
+
68
+ puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
69
+
70
+ header_validations(@headers, options)
71
+
72
+ # in case we use chunking.. we'll need to set it up..
73
+ if options[:chunk_size].to_i > 0
74
+ use_chunks = true
75
+ chunk_size = options[:chunk_size].to_i
76
+ @chunk_count = 0
77
+ chunk = []
78
+ else
79
+ use_chunks = false
80
+ end
81
+
82
+ # now on to processing all the rest of the lines in the CSV file:
83
+ # fh.each_line |line|
84
+ until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
85
+ line = readline_with_counts(fh, options)
86
+
87
+ # replace invalid byte sequence in UTF-8 with question mark to avoid errors
88
+ line = enforce_utf8_encoding(line, options) if @enforce_utf8
89
+
90
+ print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
91
+
92
+ next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
93
+
94
+ # cater for the quoted csv data containing the row separator carriage return character
95
+ # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
96
+ # by detecting the existence of an uneven number of quote characters
97
+ multiline = count_quote_chars(line, options[:quote_char]).odd?
98
+
99
+ while multiline
100
+ next_line = fh.readline(options[:row_sep])
101
+ next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
102
+ line += next_line
103
+ @file_line_count += 1
104
+
105
+ break if fh.eof? # Exit loop if end of file is reached
106
+
107
+ multiline = count_quote_chars(line, options[:quote_char]).odd?
108
+ end
109
+
110
+ # :nocov:
111
+ if multiline && @verbose
112
+ print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
113
+ end
114
+ # :nocov:
115
+
116
+ line.chomp!(options[:row_sep])
117
+
118
+ # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
119
+ dataA, _data_size = parse(line, options, header_size)
120
+
121
+ dataA.map!{|x| x.strip} if options[:strip_whitespace]
122
+
123
+ # if all values are blank, then ignore this line
124
+ next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
125
+
126
+ # --- HASH TRANSFORMATIONS ------------------------------------------------------------
127
+ hash = @headers.zip(dataA).to_h
128
+
129
+ hash = hash_transformations(hash, options)
130
+
131
+ # --- HASH VALIDATIONS ----------------------------------------------------------------
132
+ # will go here, and be able to:
133
+ # - validate correct format of the values for fields
134
+ # - required fields to be non-empty
135
+ # - ...
136
+ # -------------------------------------------------------------------------------------
137
+
138
+ next if options[:remove_empty_hashes] && hash.empty?
139
+
140
+ puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
141
+ # optional adding of csv_line_number to the hash to help debugging
142
+ hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
143
+
144
+ # process the chunks or the resulting hash
145
+ if use_chunks
146
+ chunk << hash # append temp result to chunk
147
+
148
+ if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
149
+ # do something with the chunk
150
+ if block_given?
151
+ yield chunk # do something with the hashes in the chunk in the block
152
+ else
153
+ @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
154
+ end
155
+ @chunk_count += 1
156
+ chunk.clear # re-initialize for next chunk of data
157
+ else
158
+ # the last chunk may contain partial data, which is handled below
159
+ end
160
+ # while a chunk is being filled up we don't need to do anything else here
161
+
162
+ else # no chunk handling
163
+ if block_given?
164
+ yield [hash] # do something with the hash in the block (better to use chunking here)
165
+ else
166
+ @result << hash
167
+ end
168
+ end
169
+ end
170
+
171
+ # print new line to retain last processing line message
172
+ print "\n" if @verbose
173
+
174
+ # handling of last chunk:
175
+ if !chunk.nil? && chunk.size > 0
176
+ # do something with the chunk
177
+ if block_given?
178
+ yield chunk # do something with the hashes in the chunk in the block
179
+ else
180
+ @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
181
+ end
182
+ @chunk_count += 1
183
+ # chunk = [] # initialize for next chunk of data
184
+ end
185
+ ensure
186
+ fh.close if fh.respond_to?(:close)
187
+ end
188
+
189
+ if block_given?
190
+ @chunk_count # when we do processing through a block we only care how many chunks we processed
191
+ else
192
+ @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
193
+ end
194
+ end
195
+
196
+ def count_quote_chars(line, quote_char)
197
+ return 0 if line.nil? || quote_char.nil? || quote_char.empty?
198
+
199
+ count = 0
200
+ escaped = false
201
+
202
+ line.each_char do |char|
203
+ if char == '\\' && !escaped
204
+ escaped = true
205
+ else
206
+ count += 1 if char == quote_char && !escaped
207
+ escaped = false
208
+ end
209
+ end
210
+
211
+ count
212
+ end
213
+
214
+ protected
215
+
216
+ # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
217
+ # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
218
+ BLANK_RE = /\A\s*\z/.freeze
219
+
220
+ def blank?(value)
221
+ case value
222
+ when String
223
+ BLANK_RE.match?(value)
224
+ when NilClass
225
+ true
226
+ when Array
227
+ value.all? { |elem| blank?(elem) }
228
+ when Hash
229
+ value.values.all? { |elem| blank?(elem) } # Focus on values only
230
+ else
231
+ false
232
+ end
233
+ end
234
+
235
+ private
236
+
237
+ def enforce_utf8_encoding(line, options)
238
+ # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
239
+
240
+ line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
241
+ end
242
+ end
243
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.11.0"
4
+ VERSION = "1.12.0.pre1"
5
5
  end
@@ -35,22 +35,36 @@ module SmarterCSV
35
35
  # Make sure to use the correct form when specifying headers manually,
36
36
  # in combination with the :discover_headers option
37
37
 
38
+ attr_reader :options, :row_sep, :col_sep, :quote_char, :force_quotes, :discover_headers, :headers, :map_headers, :output_file
39
+
38
40
  class Writer
39
41
  def initialize(file_path, options = {})
40
42
  @options = options
41
- @discover_headers = options.has_key?(:discover_headers) ? (options[:discover_headers] == true) : true
42
- @headers = options[:headers] || []
43
- @row_sep = options[:row_sep] || "\n" # RFC4180 "\r\n"
43
+
44
+ @row_sep = options[:row_sep] || $/ # Defaults to system's row separator. RFC4180 "\r\n"
44
45
  @col_sep = options[:col_sep] || ','
45
- @quote_char = '"'
46
+ @quote_char = options[:quote_char] || '"'
46
47
  @force_quotes = options[:force_quotes] == true
48
+ @discover_headers = true # defaults to true
49
+ if options.has_key?(:discover_headers)
50
+ # passing in the option overrides the default behavior
51
+ @discover_headers = options[:discover_headers] == true
52
+ else
53
+ # disable discover_headers when headers are given explicitly
54
+ @discover_headers = !(options.has_key?(:map_headers) || options.has_key?(:headers))
55
+ end
56
+ @headers = [] # start with empty headers
57
+ @headers = options[:headers] if options.has_key?(:headers) # unless explicitly given
58
+ @headers = options[:map_headers].keys if options.has_key?(:map_headers) && !options.has_key?(:headers)
47
59
  @map_headers = options[:map_headers] || {}
60
+
48
61
  @output_file = File.open(file_path, 'w+')
49
62
  # hidden state:
50
63
  @temp_file = Tempfile.new('tempfile', '/tmp')
51
64
  @quote_regex = Regexp.union(@col_sep, @row_sep, @quote_char)
52
65
  end
53
66
 
67
+ # this can be called many times in order to append lines to the csv file
54
68
  def <<(data)
55
69
  case data
56
70
  when Hash
@@ -60,7 +74,7 @@ module SmarterCSV
60
74
  when NilClass
61
75
  # ignore
62
76
  else
63
- raise ArgumentError, "Invalid data type: #{data.class}. Must be a Hash or an Array."
77
+ raise InvalidInputData, "Invalid data type: #{data.class}. Must be a Hash or an Array."
64
78
  end
65
79
  end
66
80
 
data/lib/smarter_csv.rb CHANGED
@@ -1,16 +1,19 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "smarter_csv/version"
4
+ require "smarter_csv/errors"
5
+
4
6
  require "smarter_csv/file_io"
5
- require "smarter_csv/options_processing"
7
+ require "smarter_csv/options"
6
8
  require "smarter_csv/auto_detection"
7
- require "smarter_csv/variables"
8
9
  require 'smarter_csv/header_transformations'
9
10
  require 'smarter_csv/header_validations'
10
11
  require "smarter_csv/headers"
11
12
  require "smarter_csv/hash_transformations"
12
- require "smarter_csv/parse"
13
+
14
+ require "smarter_csv/parser"
13
15
  require "smarter_csv/writer"
16
+ require "smarter_csv/reader"
14
17
 
15
18
  # load the C-extension:
16
19
  case RUBY_ENGINE
@@ -49,4 +52,41 @@ else
49
52
  BLOCK_COMMENT
50
53
  end
51
54
  # :nocov:
52
- require "smarter_csv/smarter_csv"
55
+
56
+ module SmarterCSV
57
+ # For backwards compatibility:
58
+ #
59
+ # while `SmarterCSV.process` works for simple cases, you can't get access to the internal state any longer.
60
+ # e.g. you need the instance of the Reader to access the original headers
61
+ #
62
+ # Please use this instead:
63
+ #
64
+ # reader = SmarterCSV::Reader.new(input, options)
65
+ # reader.process # with or without block
66
+ #
67
+ def self.process(input, given_options = {}, &block)
68
+ reader = Reader.new(input, given_options)
69
+ reader.process(&block)
70
+ end
71
+
72
+ # Convenience method for generating CSV files:
73
+ #
74
+ # SmarterCSV.generate(filename, options) do |csv_writer|
75
+ # MyModel.find_in_batches(batch_size: 100) do |batch|
76
+ # batch.pluck(:name, :description, :instructor).each do |record|
77
+ # csv_writer << record
78
+ # end
79
+ # end
80
+ # end
81
+ #
82
+ # rubocop:disable Lint/UnusedMethodArgument
83
+ def self.generate(filename, options = {}, &block)
84
+ raise unless block_given?
85
+
86
+ writer = Writer.new(filename, options)
87
+ yield writer
88
+ ensure
89
+ writer.finalize
90
+ end
91
+ # rubocop:enable Lint/UnusedMethodArgument
92
+ end
data/smarter_csv.gemspec CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Tilo Sloboda"]
10
10
  spec.email = ["tilo.sloboda@gmail.com"]
11
11
 
12
- spec.summary = "CSV Reading and Writing"
13
- spec.description = "Ruby Gem for smarter importing of CSV Files as Array(s) of Hashes, with lots of features for processing large files in parallel, embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers to Hash-keys"
12
+ spec.summary = "Convenient CSV Reading and Writing"
13
+ spec.description = "Ruby Gem for convenient reading and writing: importing of CSV Files as Array(s) of Hashes, with lots of features for processing large files in parallel, embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers to Hash-keys"
14
14
  spec.homepage = "https://github.com/tilo/smarter_csv"
15
15
  spec.license = 'MIT'
16
16