smarter_csv 1.11.2 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/smarter_csv.rb CHANGED
@@ -4,17 +4,16 @@ require "smarter_csv/version"
4
4
  require "smarter_csv/errors"
5
5
 
6
6
  require "smarter_csv/file_io"
7
- require "smarter_csv/options_processing"
7
+ require "smarter_csv/options"
8
8
  require "smarter_csv/auto_detection"
9
- require "smarter_csv/variables"
10
9
  require 'smarter_csv/header_transformations'
11
10
  require 'smarter_csv/header_validations'
12
11
  require "smarter_csv/headers"
13
12
  require "smarter_csv/hash_transformations"
14
13
 
15
- require "smarter_csv/parse"
14
+ require "smarter_csv/parser"
16
15
  require "smarter_csv/writer"
17
- require "smarter_csv/smarter_csv"
16
+ require "smarter_csv/reader"
18
17
 
19
18
  # load the C-extension:
20
19
  case RUBY_ENGINE
@@ -55,6 +54,23 @@ end
55
54
  # :nocov:
56
55
 
57
56
  module SmarterCSV
57
+ # For backwards compatibility:
58
+ #
59
+ # while `SmarterCSV.process` works for simple cases, you can't get access to the internal state any longer.
60
+ # e.g. you need the instance of the Reader to access the original headers
61
+ #
62
+ # Please use this instead:
63
+ #
64
+ # reader = SmarterCSV::Reader.new(input, options)
65
+ # reader.process # with or without block
66
+ #
67
+ def self.process(input, given_options = {}, &block)
68
+ reader = Reader.new(input, given_options)
69
+ reader.process(&block)
70
+ end
71
+
72
+ # Convenience method for generating CSV files:
73
+ #
58
74
  # SmarterCSV.generate(filename, options) do |csv_writer|
59
75
  # MyModel.find_in_batches(batch_size: 100) do |batch|
60
76
  # batch.pluck(:name, :description, :instructor).each do |record|
data/smarter_csv.gemspec CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Tilo Sloboda"]
10
10
  spec.email = ["tilo.sloboda@gmail.com"]
11
11
 
12
- spec.summary = "CSV Reading and Writing"
13
- spec.description = "Ruby Gem for convenient reading and writing: importing of CSV Files as Array(s) of Hashes, with lots of features for processing large files in parallel, embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers to Hash-keys"
12
+ spec.summary = "Convenient CSV Reading and Writing"
13
+ spec.description = "Ruby Gem for convenient reading and writing of CSV files. It has intelligent defaults, and auto-discovery of column and row separators. It imports CSV Files as Array(s) of Hashes, suitable for direct processing with ActiveRecord, kicking-off batch jobs with Sidekiq, parallel processing, or uploading data to S3. Similarly, writing CSV files takes Hashes, or Arrays of Hashes to create a CSV file."
14
14
  spec.homepage = "https://github.com/tilo/smarter_csv"
15
15
  spec.license = 'MIT'
16
16
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.2
4
+ version: 1.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-07-06 00:00:00.000000000 Z
11
+ date: 2024-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -94,10 +94,11 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
- description: 'Ruby Gem for convenient reading and writing: importing of CSV Files
98
- as Array(s) of Hashes, with lots of features for processing large files in parallel,
99
- embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers
100
- to Hash-keys'
97
+ description: Ruby Gem for convenient reading and writing of CSV files. It has intelligent
98
+ defaults, and auto-discovery of column and row separators. It imports CSV Files
99
+ as Array(s) of Hashes, suitable for direct processing with ActiveRecord, kicking-off
100
+ batch jobs with Sidekiq, parallel processing, or uploading data to S3. Similarly,
101
+ writing CSV files takes Hashes, or Arrays of Hashes to create a CSV file.
101
102
  email:
102
103
  - tilo.sloboda@gmail.com
103
104
  executables: []
@@ -115,6 +116,16 @@ files:
115
116
  - README.md
116
117
  - Rakefile
117
118
  - TO_DO_v2.md
119
+ - docs/_introduction.md
120
+ - docs/basic_api.md
121
+ - docs/batch_processing.md
122
+ - docs/data_transformations.md
123
+ - docs/examples.md
124
+ - docs/header_transformations.md
125
+ - docs/header_validations.md
126
+ - docs/options.md
127
+ - docs/row_col_sep.md
128
+ - docs/value_converters.md
118
129
  - ext/smarter_csv/extconf.rb
119
130
  - ext/smarter_csv/smarter_csv.c
120
131
  - lib/smarter_csv.rb
@@ -125,10 +136,9 @@ files:
125
136
  - lib/smarter_csv/header_transformations.rb
126
137
  - lib/smarter_csv/header_validations.rb
127
138
  - lib/smarter_csv/headers.rb
128
- - lib/smarter_csv/options_processing.rb
129
- - lib/smarter_csv/parse.rb
130
- - lib/smarter_csv/smarter_csv.rb
131
- - lib/smarter_csv/variables.rb
139
+ - lib/smarter_csv/options.rb
140
+ - lib/smarter_csv/parser.rb
141
+ - lib/smarter_csv/reader.rb
132
142
  - lib/smarter_csv/version.rb
133
143
  - lib/smarter_csv/writer.rb
134
144
  - smarter_csv.gemspec
@@ -158,5 +168,5 @@ requirements: []
158
168
  rubygems_version: 3.2.3
159
169
  signing_key:
160
170
  specification_version: 4
161
- summary: CSV Reading and Writing
171
+ summary: Convenient CSV Reading and Writing
162
172
  test_files: []
@@ -1,210 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SmarterCSV
4
- # first parameter: filename or input object which responds to readline method
5
- def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
6
- initialize_variables
7
-
8
- options = process_options(given_options)
9
-
10
- @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
11
- @verbose = options[:verbose]
12
-
13
- begin
14
- fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
15
-
16
- if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
17
- puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
18
- end
19
-
20
- # auto-detect the row separator
21
- options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
22
- # attempt to auto-detect column separator
23
- options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
24
-
25
- skip_lines(fh, options)
26
-
27
- @headers, header_size = process_headers(fh, options)
28
- @headerA = @headers # @headerA is deprecated, use @headers
29
-
30
- puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
31
-
32
- header_validations(@headers, options)
33
-
34
- # in case we use chunking.. we'll need to set it up..
35
- if options[:chunk_size].to_i > 0
36
- use_chunks = true
37
- chunk_size = options[:chunk_size].to_i
38
- @chunk_count = 0
39
- chunk = []
40
- else
41
- use_chunks = false
42
- end
43
-
44
- # now on to processing all the rest of the lines in the CSV file:
45
- # fh.each_line |line|
46
- until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
47
- line = readline_with_counts(fh, options)
48
-
49
- # replace invalid byte sequence in UTF-8 with question mark to avoid errors
50
- line = enforce_utf8_encoding(line, options) if @enforce_utf8
51
-
52
- print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
53
-
54
- next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
55
-
56
- # cater for the quoted csv data containing the row separator carriage return character
57
- # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
58
- # by detecting the existence of an uneven number of quote characters
59
- multiline = count_quote_chars(line, options[:quote_char]).odd?
60
-
61
- while multiline
62
- next_line = fh.readline(options[:row_sep])
63
- next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
64
- line += next_line
65
- @file_line_count += 1
66
-
67
- break if fh.eof? # Exit loop if end of file is reached
68
-
69
- multiline = count_quote_chars(line, options[:quote_char]).odd?
70
- end
71
-
72
- # :nocov:
73
- if multiline && @verbose
74
- print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
75
- end
76
- # :nocov:
77
-
78
- line.chomp!(options[:row_sep])
79
-
80
- # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
81
- dataA, _data_size = parse(line, options, header_size)
82
-
83
- dataA.map!{|x| x.strip} if options[:strip_whitespace]
84
-
85
- # if all values are blank, then ignore this line
86
- next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
87
-
88
- # --- HASH TRANSFORMATIONS ------------------------------------------------------------
89
- hash = @headers.zip(dataA).to_h
90
-
91
- hash = hash_transformations(hash, options)
92
-
93
- # --- HASH VALIDATIONS ----------------------------------------------------------------
94
- # will go here, and be able to:
95
- # - validate correct format of the values for fields
96
- # - required fields to be non-empty
97
- # - ...
98
- # -------------------------------------------------------------------------------------
99
-
100
- next if options[:remove_empty_hashes] && hash.empty?
101
-
102
- puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
103
- # optional adding of csv_line_number to the hash to help debugging
104
- hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
105
-
106
- # process the chunks or the resulting hash
107
- if use_chunks
108
- chunk << hash # append temp result to chunk
109
-
110
- if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
111
- # do something with the chunk
112
- if block_given?
113
- yield chunk # do something with the hashes in the chunk in the block
114
- else
115
- @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
116
- end
117
- @chunk_count += 1
118
- chunk.clear # re-initialize for next chunk of data
119
- else
120
- # the last chunk may contain partial data, which is handled below
121
- end
122
- # while a chunk is being filled up we don't need to do anything else here
123
-
124
- else # no chunk handling
125
- if block_given?
126
- yield [hash] # do something with the hash in the block (better to use chunking here)
127
- else
128
- @result << hash
129
- end
130
- end
131
- end
132
-
133
- # print new line to retain last processing line message
134
- print "\n" if @verbose
135
-
136
- # handling of last chunk:
137
- if !chunk.nil? && chunk.size > 0
138
- # do something with the chunk
139
- if block_given?
140
- yield chunk # do something with the hashes in the chunk in the block
141
- else
142
- @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
143
- end
144
- @chunk_count += 1
145
- # chunk = [] # initialize for next chunk of data
146
- end
147
- ensure
148
- fh.close if fh.respond_to?(:close)
149
- end
150
-
151
- if block_given?
152
- @chunk_count # when we do processing through a block we only care how many chunks we processed
153
- else
154
- @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
155
- end
156
- end
157
-
158
- class << self
159
- def count_quote_chars(line, quote_char)
160
- return 0 if line.nil? || quote_char.nil? || quote_char.empty?
161
-
162
- count = 0
163
- escaped = false
164
-
165
- line.each_char do |char|
166
- if char == '\\' && !escaped
167
- escaped = true
168
- else
169
- count += 1 if char == quote_char && !escaped
170
- escaped = false
171
- end
172
- end
173
-
174
- count
175
- end
176
-
177
- def has_acceleration?
178
- @has_acceleration ||= !!defined?(parse_csv_line_c)
179
- end
180
-
181
- protected
182
-
183
- # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
184
- # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
185
- BLANK_RE = /\A\s*\z/.freeze
186
-
187
- def blank?(value)
188
- case value
189
- when String
190
- BLANK_RE.match?(value)
191
- when NilClass
192
- true
193
- when Array
194
- value.all? { |elem| blank?(elem) }
195
- when Hash
196
- value.values.all? { |elem| blank?(elem) } # Focus on values only
197
- else
198
- false
199
- end
200
- end
201
-
202
- private
203
-
204
- def enforce_utf8_encoding(line, options)
205
- # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
206
-
207
- line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
208
- end
209
- end
210
- end
@@ -1,30 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SmarterCSV
4
- class << self
5
- attr_reader :has_rails, :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
6
-
7
- def initialize_variables
8
- @has_rails = !!defined?(Rails)
9
- @csv_line_count = 0
10
- @chunk_count = 0
11
- @errors = {}
12
- @file_line_count = 0
13
- @headerA = []
14
- @headers = nil
15
- @raw_header = nil # header as it appears in the file
16
- @result = []
17
- @warnings = {}
18
- @enforce_utf8 = false # only set to true if needed (after options parsing)
19
- end
20
-
21
- # :nocov:
22
- # rubocop:disable Naming/MethodName
23
- def headerA
24
- warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
25
- @headerA
26
- end
27
- # rubocop:enable Naming/MethodName
28
- # :nocov:
29
- end
30
- end