smarter_csv 1.11.2 → 1.12.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,210 +0,0 @@
# frozen_string_literal: true

module SmarterCSV
  # Reads CSV data from +input+ and turns every data row into a Hash keyed by the
  # processed headers.
  #
  # The input is consumed line by line (never slurped), so arbitrarily large files can be
  # processed. When +options[:chunk_size]+ is a positive number the rows are grouped into
  # Arrays of that size.
  #
  # @param input [String, #readline] a file name, or an object which responds to +readline+
  #   (it is also asked for +eof?+, and is closed on exit when it responds to +close+)
  # @param given_options [Hash] user supplied options, passed through +process_options+
  # @yield [Array<Hash>] each full chunk in chunked mode, otherwise a one-element Array per row
  # @return [Integer] the number of chunks processed, when a block is given
  # @return [Array] the collected result when no block is given: an Array of Hashes,
  #   or an Array of Arrays of Hashes in chunked mode
  def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
    initialize_variables

    options = process_options(given_options)

    @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
    @verbose = options[:verbose]

    begin
      fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

      # warn when UTF-8 processing was requested but the handle reports a different encoding
      if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) &&
         ((fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8')) ||
          (fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8')))
        puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
      end

      # auto-detect the row separator
      options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
      # attempt to auto-detect column separator
      options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

      skip_lines(fh, options)

      @headers, header_size = process_headers(fh, options)
      @headerA = @headers # @headerA is deprecated, use @headers

      # NOTE: use #inspect here; Kernel#pp prints its argument AND returns it, which would
      # emit the headers twice (and before the label) when used inside an interpolation.
      puts "Effective headers:\n#{@headers.inspect}\n" if @verbose

      header_validations(@headers, options)

      # in case we use chunking.. we'll need to set it up..
      if options[:chunk_size].to_i > 0
        use_chunks = true
        chunk_size = options[:chunk_size].to_i
        @chunk_count = 0
        chunk = []
      else
        use_chunks = false
      end

      # now on to processing all the rest of the lines in the CSV file:
      # we can't use fh.readlines() here, because this would read the whole file into memory at once
      until fh.eof?
        line = readline_with_counts(fh, options)

        # replace invalid byte sequence in UTF-8 with question mark to avoid errors
        line = enforce_utf8_encoding(line, options) if @enforce_utf8

        print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose

        next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

        # cater for the quoted csv data containing the row separator carriage return character
        # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
        # by detecting the existence of an uneven number of quote characters
        multiline = count_quote_chars(line, options[:quote_char]).odd?

        # Check eof? BEFORE reading: an unbalanced quote on the very last line of the input
        # must not raise EOFError. The quote balance is re-evaluated after every appended
        # line, so `multiline` is only still true afterwards if the record really is uneven.
        while multiline && !fh.eof?
          next_line = fh.readline(options[:row_sep])
          next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
          line += next_line
          @file_line_count += 1

          multiline = count_quote_chars(line, options[:quote_char]).odd?
        end

        # :nocov:
        if multiline && @verbose
          print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
        end
        # :nocov:

        line.chomp!(options[:row_sep])

        # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
        dataA, _data_size = parse(line, options, header_size)

        dataA.map!(&:strip) if options[:strip_whitespace]

        # if all values are blank, then ignore this line
        next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

        # --- HASH TRANSFORMATIONS ------------------------------------------------------------
        hash = @headers.zip(dataA).to_h

        hash = hash_transformations(hash, options)

        # --- HASH VALIDATIONS ----------------------------------------------------------------
        # will go here, and be able to:
        #  - validate correct format of the values for fields
        #  - required fields to be non-empty
        #  - ...
        # -------------------------------------------------------------------------------------

        next if options[:remove_empty_hashes] && hash.empty?

        puts "CSV Line #{@file_line_count}: #{hash.inspect}" if @verbose == '2' # very verbose setting
        # optional adding of csv_line_number to the hash to help debugging
        hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

        # process the chunks or the resulting hash
        if use_chunks
          chunk << hash # append temp result to chunk

          # while a chunk is being filled up we don't need to do anything else here;
          # a trailing partial chunk that was not flushed here is handled after the loop
          if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
            if block_given?
              yield chunk # do something with the hashes in the chunk in the block
            else
              @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
            end
            @chunk_count += 1
            chunk.clear # re-initialize for next chunk of data
          end
        elsif block_given? # no chunk handling
          yield [hash] # do something with the hash in the block (better to use chunking here)
        else
          @result << hash
        end
      end

      # print new line to retain last processing line message
      print "\n" if @verbose

      # handling of last chunk (rows skipped via `next` at EOF can leave a partial chunk behind):
      if !chunk.nil? && chunk.size > 0
        if block_given?
          yield chunk # do something with the hashes in the chunk in the block
        else
          @result << chunk.dup # Append chunk to result
        end
        @chunk_count += 1
      end
    ensure
      # NOTE: this also closes an input object that was handed in by the caller
      fh.close if fh.respond_to?(:close)
    end

    if block_given?
      @chunk_count # when we do processing through a block we only care how many chunks we processed
    else
      @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
    end
  end

  class << self
    # Counts the occurrences of +quote_char+ in +line+ which are not escaped by a backslash.
    # A backslash escapes exactly the one character following it (so `\\"` is an escaped
    # backslash followed by a counted quote).
    #
    # @param line [String, nil] the text to scan
    # @param quote_char [String, nil] the quote character to look for
    # @return [Integer] number of un-escaped quote characters; 0 when either argument is
    #   nil or +quote_char+ is empty
    def count_quote_chars(line, quote_char)
      return 0 if line.nil? || quote_char.nil? || quote_char.empty?

      count = 0
      escaped = false

      line.each_char do |char|
        if char == '\\' && !escaped
          escaped = true
        else
          count += 1 if char == quote_char && !escaped
          escaped = false
        end
      end

      count
    end

    # @return [Boolean] true when a +parse_csv_line_c+ method is defined on this module.
    #   Only a true result is memoized; a false result is re-checked on the next call.
    def has_acceleration?
      @has_acceleration ||= !!defined?(parse_csv_line_c)
    end

    protected

    # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
    # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
    BLANK_RE = /\A\s*\z/.freeze

    # Tells whether +value+ carries no data.
    #
    # @param value [Object] the value to check
    # @return [Boolean] true for nil, for Strings matching BLANK_RE (empty or whitespace
    #   only), and for Arrays / Hashes whose elements / values are all blank (this includes
    #   empty collections); false for every other object
    def blank?(value)
      case value
      when String
        BLANK_RE.match?(value)
      when NilClass
        true
      when Array
        value.all? { |elem| blank?(elem) }
      when Hash
        value.values.all? { |elem| blank?(elem) } # Focus on values only
      else
        false
      end
    end

    private

    # Re-tags +line+ as UTF-8 (in place) and returns a UTF-8 encoded version in which
    # invalid / undefined sequences are replaced by +options[:invalid_byte_sequence]+.
    # Callers decide whether this is needed (see the @enforce_utf8 flag set in process).
    def enforce_utf8_encoding(line, options)
      line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
    end
  end
end
@@ -1,30 +0,0 @@
# frozen_string_literal: true

module SmarterCSV
  class << self
    # Read-only accessors for the module-level state which initialize_variables resets.
    attr_reader :has_rails, :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings

    # Resets all module-level state to its starting values
    # (counters to 0, collections to empty, headers to nil).
    def initialize_variables
      @has_rails = !!defined?(Rails)
      @csv_line_count = 0
      @chunk_count = 0
      @errors = {}
      @file_line_count = 0
      @headerA = []
      @headers = nil
      @raw_header = nil # header as it appears in the file
      @result = []
      @warnings = {}
      @enforce_utf8 = false # only set to true if needed (after options parsing)
    end

    # :nocov:
    # rubocop:disable Naming/MethodName

    # Deprecated reader for @headerA; prints a deprecation warning via Kernel#warn.
    # Use the +headers+ reader instead.
    #
    # @deprecated use {headers}
    # @return [Array, nil] the current value of @headerA
    def headerA
      warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
      @headerA
    end
    # rubocop:enable Naming/MethodName
    # :nocov:
  end
end