smarter_csv 1.11.2 → 1.12.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/smarter_csv.rb CHANGED
@@ -4,17 +4,16 @@ require "smarter_csv/version"
4
4
  require "smarter_csv/errors"
5
5
 
6
6
  require "smarter_csv/file_io"
7
- require "smarter_csv/options_processing"
7
+ require "smarter_csv/options"
8
8
  require "smarter_csv/auto_detection"
9
- require "smarter_csv/variables"
10
9
  require 'smarter_csv/header_transformations'
11
10
  require 'smarter_csv/header_validations'
12
11
  require "smarter_csv/headers"
13
12
  require "smarter_csv/hash_transformations"
14
13
 
15
- require "smarter_csv/parse"
14
+ require "smarter_csv/parser"
16
15
  require "smarter_csv/writer"
17
- require "smarter_csv/smarter_csv"
16
+ require "smarter_csv/reader"
18
17
 
19
18
  # load the C-extension:
20
19
  case RUBY_ENGINE
@@ -55,6 +54,23 @@ end
55
54
  # :nocov:
56
55
 
57
56
  module SmarterCSV
57
+ # For backwards compatibility:
58
+ #
59
+ # while `SmarterCSV.process` works for simple cases, you can't get access to the internal state any longer.
60
+ # e.g. you need the instance of the Reader to access the original headers
61
+ #
62
+ # Please use this instead:
63
+ #
64
+ # reader = SmarterCSV::Reader.new(input, options)
65
+ # reader.process # with or without block
66
+ #
67
+ def self.process(input, given_options = {}, &block)
68
+ reader = Reader.new(input, given_options)
69
+ reader.process(&block)
70
+ end
71
+
72
+ # Convenience method for generating CSV files:
73
+ #
58
74
  # SmarterCSV.generate(filename, options) do |csv_writer|
59
75
  # MyModel.find_in_batches(batch_size: 100) do |batch|
60
76
  # batch.pluck(:name, :description, :instructor).each do |record|
data/smarter_csv.gemspec CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Tilo Sloboda"]
10
10
  spec.email = ["tilo.sloboda@gmail.com"]
11
11
 
12
- spec.summary = "CSV Reading and Writing"
13
- spec.description = "Ruby Gem for convenient reading and writing: importing of CSV Files as Array(s) of Hashes, with lots of features for processing large files in parallel, embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers to Hash-keys"
12
+ spec.summary = "Convenient CSV Reading and Writing"
13
+ spec.description = "Ruby Gem for convenient reading and writing of CSV files. It has intelligent defaults, and auto-discovery of column and row separators. It imports CSV Files as Array(s) of Hashes, suitable for direct processing with ActiveRecord, kicking-off batch jobs with Sidekiq, parallel processing, or uploading data to S3. Similarly, writing CSV files takes Hashes, or Arrays of Hashes to create a CSV file."
14
14
  spec.homepage = "https://github.com/tilo/smarter_csv"
15
15
  spec.license = 'MIT'
16
16
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.2
4
+ version: 1.12.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-07-06 00:00:00.000000000 Z
11
+ date: 2024-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -94,10 +94,11 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
- description: 'Ruby Gem for convenient reading and writing: importing of CSV Files
98
- as Array(s) of Hashes, with lots of features for processing large files in parallel,
99
- embedded comments, unusual field- and record-separators, flexible mapping of CSV-headers
100
- to Hash-keys'
97
+ description: Ruby Gem for convenient reading and writing of CSV files. It has intelligent
98
+ defaults, and auto-discovery of column and row separators. It imports CSV Files
99
+ as Array(s) of Hashes, suitable for direct processing with ActiveRecord, kicking-off
100
+ batch jobs with Sidekiq, parallel processing, or uploading data to S3. Similarly,
101
+ writing CSV files takes Hashes, or Arrays of Hashes to create a CSV file.
101
102
  email:
102
103
  - tilo.sloboda@gmail.com
103
104
  executables: []
@@ -115,6 +116,16 @@ files:
115
116
  - README.md
116
117
  - Rakefile
117
118
  - TO_DO_v2.md
119
+ - docs/_introduction.md
120
+ - docs/basic_api.md
121
+ - docs/batch_processing.md
122
+ - docs/data_transformations.md
123
+ - docs/examples.md
124
+ - docs/header_transformations.md
125
+ - docs/header_validations.md
126
+ - docs/options.md
127
+ - docs/row_col_sep.md
128
+ - docs/value_converters.md
118
129
  - ext/smarter_csv/extconf.rb
119
130
  - ext/smarter_csv/smarter_csv.c
120
131
  - lib/smarter_csv.rb
@@ -125,10 +136,9 @@ files:
125
136
  - lib/smarter_csv/header_transformations.rb
126
137
  - lib/smarter_csv/header_validations.rb
127
138
  - lib/smarter_csv/headers.rb
128
- - lib/smarter_csv/options_processing.rb
129
- - lib/smarter_csv/parse.rb
130
- - lib/smarter_csv/smarter_csv.rb
131
- - lib/smarter_csv/variables.rb
139
+ - lib/smarter_csv/options.rb
140
+ - lib/smarter_csv/parser.rb
141
+ - lib/smarter_csv/reader.rb
132
142
  - lib/smarter_csv/version.rb
133
143
  - lib/smarter_csv/writer.rb
134
144
  - smarter_csv.gemspec
@@ -158,5 +168,5 @@ requirements: []
158
168
  rubygems_version: 3.2.3
159
169
  signing_key:
160
170
  specification_version: 4
161
- summary: CSV Reading and Writing
171
+ summary: Convenient CSV Reading and Writing
162
172
  test_files: []
@@ -1,210 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SmarterCSV
4
- # first parameter: filename or input object which responds to readline method
5
- def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
6
- initialize_variables
7
-
8
- options = process_options(given_options)
9
-
10
- @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
11
- @verbose = options[:verbose]
12
-
13
- begin
14
- fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
15
-
16
- if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
17
- puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
18
- end
19
-
20
- # auto-detect the row separator
21
- options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
22
- # attempt to auto-detect column separator
23
- options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
24
-
25
- skip_lines(fh, options)
26
-
27
- @headers, header_size = process_headers(fh, options)
28
- @headerA = @headers # @headerA is deprecated, use @headers
29
-
30
- puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
31
-
32
- header_validations(@headers, options)
33
-
34
- # in case we use chunking.. we'll need to set it up..
35
- if options[:chunk_size].to_i > 0
36
- use_chunks = true
37
- chunk_size = options[:chunk_size].to_i
38
- @chunk_count = 0
39
- chunk = []
40
- else
41
- use_chunks = false
42
- end
43
-
44
- # now on to processing all the rest of the lines in the CSV file:
45
- # fh.each_line |line|
46
- until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
47
- line = readline_with_counts(fh, options)
48
-
49
- # replace invalid byte sequence in UTF-8 with question mark to avoid errors
50
- line = enforce_utf8_encoding(line, options) if @enforce_utf8
51
-
52
- print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
53
-
54
- next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
55
-
56
- # cater for the quoted csv data containing the row separator carriage return character
57
- # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
58
- # by detecting the existence of an uneven number of quote characters
59
- multiline = count_quote_chars(line, options[:quote_char]).odd?
60
-
61
- while multiline
62
- next_line = fh.readline(options[:row_sep])
63
- next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
64
- line += next_line
65
- @file_line_count += 1
66
-
67
- break if fh.eof? # Exit loop if end of file is reached
68
-
69
- multiline = count_quote_chars(line, options[:quote_char]).odd?
70
- end
71
-
72
- # :nocov:
73
- if multiline && @verbose
74
- print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
75
- end
76
- # :nocov:
77
-
78
- line.chomp!(options[:row_sep])
79
-
80
- # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
81
- dataA, _data_size = parse(line, options, header_size)
82
-
83
- dataA.map!{|x| x.strip} if options[:strip_whitespace]
84
-
85
- # if all values are blank, then ignore this line
86
- next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
87
-
88
- # --- HASH TRANSFORMATIONS ------------------------------------------------------------
89
- hash = @headers.zip(dataA).to_h
90
-
91
- hash = hash_transformations(hash, options)
92
-
93
- # --- HASH VALIDATIONS ----------------------------------------------------------------
94
- # will go here, and be able to:
95
- # - validate correct format of the values for fields
96
- # - required fields to be non-empty
97
- # - ...
98
- # -------------------------------------------------------------------------------------
99
-
100
- next if options[:remove_empty_hashes] && hash.empty?
101
-
102
- puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
103
- # optional adding of csv_line_number to the hash to help debugging
104
- hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
105
-
106
- # process the chunks or the resulting hash
107
- if use_chunks
108
- chunk << hash # append temp result to chunk
109
-
110
- if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
111
- # do something with the chunk
112
- if block_given?
113
- yield chunk # do something with the hashes in the chunk in the block
114
- else
115
- @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
116
- end
117
- @chunk_count += 1
118
- chunk.clear # re-initialize for next chunk of data
119
- else
120
- # the last chunk may contain partial data, which is handled below
121
- end
122
- # while a chunk is being filled up we don't need to do anything else here
123
-
124
- else # no chunk handling
125
- if block_given?
126
- yield [hash] # do something with the hash in the block (better to use chunking here)
127
- else
128
- @result << hash
129
- end
130
- end
131
- end
132
-
133
- # print new line to retain last processing line message
134
- print "\n" if @verbose
135
-
136
- # handling of last chunk:
137
- if !chunk.nil? && chunk.size > 0
138
- # do something with the chunk
139
- if block_given?
140
- yield chunk # do something with the hashes in the chunk in the block
141
- else
142
- @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
143
- end
144
- @chunk_count += 1
145
- # chunk = [] # initialize for next chunk of data
146
- end
147
- ensure
148
- fh.close if fh.respond_to?(:close)
149
- end
150
-
151
- if block_given?
152
- @chunk_count # when we do processing through a block we only care how many chunks we processed
153
- else
154
- @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
155
- end
156
- end
157
-
158
- class << self
159
- def count_quote_chars(line, quote_char)
160
- return 0 if line.nil? || quote_char.nil? || quote_char.empty?
161
-
162
- count = 0
163
- escaped = false
164
-
165
- line.each_char do |char|
166
- if char == '\\' && !escaped
167
- escaped = true
168
- else
169
- count += 1 if char == quote_char && !escaped
170
- escaped = false
171
- end
172
- end
173
-
174
- count
175
- end
176
-
177
- def has_acceleration?
178
- @has_acceleration ||= !!defined?(parse_csv_line_c)
179
- end
180
-
181
- protected
182
-
183
- # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
184
- # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
185
- BLANK_RE = /\A\s*\z/.freeze
186
-
187
- def blank?(value)
188
- case value
189
- when String
190
- BLANK_RE.match?(value)
191
- when NilClass
192
- true
193
- when Array
194
- value.all? { |elem| blank?(elem) }
195
- when Hash
196
- value.values.all? { |elem| blank?(elem) } # Focus on values only
197
- else
198
- false
199
- end
200
- end
201
-
202
- private
203
-
204
- def enforce_utf8_encoding(line, options)
205
- # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
206
-
207
- line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
208
- end
209
- end
210
- end
@@ -1,30 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module SmarterCSV
4
- class << self
5
- attr_reader :has_rails, :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
6
-
7
- def initialize_variables
8
- @has_rails = !!defined?(Rails)
9
- @csv_line_count = 0
10
- @chunk_count = 0
11
- @errors = {}
12
- @file_line_count = 0
13
- @headerA = []
14
- @headers = nil
15
- @raw_header = nil # header as it appears in the file
16
- @result = []
17
- @warnings = {}
18
- @enforce_utf8 = false # only set to true if needed (after options parsing)
19
- end
20
-
21
- # :nocov:
22
- # rubocop:disable Naming/MethodName
23
- def headerA
24
- warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
25
- @headerA
26
- end
27
- # rubocop:enable Naming/MethodName
28
- # :nocov:
29
- end
30
- end