smarter_csv 1.14.4 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,76 +2,98 @@
2
2
 
3
3
  module SmarterCSV
4
4
  module HashTransformations
5
+ # Frozen regex constants for performance (avoid recompilation on every value)
6
+ FLOAT_REGEX = /\A[+-]?\d+\.\d+\z/.freeze
7
+ INTEGER_REGEX = /\A[+-]?\d+\z/.freeze
8
+ ZERO_REGEX = /\A0+(?:\.0+)?\z/.freeze
9
+
5
10
  def hash_transformations(hash, options)
6
- # there may be unmapped keys, or keys purposely mapped to nil or an empty key..
7
- # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
11
+ # Modify hash in-place for performance (avoids allocating a second hash per row)
12
+
13
+ # Remove nil/empty keys
14
+ hash.delete(nil)
15
+ hash.delete('')
16
+ hash.delete(:"")
17
+
8
18
  remove_empty_values = options[:remove_empty_values] == true
9
19
  remove_zero_values = options[:remove_zero_values]
10
20
  remove_values_matching = options[:remove_values_matching]
11
21
  convert_to_numeric = options[:convert_values_to_numeric]
12
22
  value_converters = options[:value_converters]
13
23
 
14
- hash.each_with_object({}) do |(k, v), new_hash|
15
- next if k.nil? || k == '' || k == :""
16
- next if remove_empty_values && (has_rails ? v.blank? : blank?(v))
17
- next if remove_zero_values && v.is_a?(String) && v =~ /^(0+|0+\.0+)$/ # values are Strings
18
- next if remove_values_matching && v =~ remove_values_matching
19
-
20
- # deal with the :only / :except options to :convert_values_to_numeric
21
- if convert_to_numeric && !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
22
- if v =~ /^[+-]?\d+\.\d+$/
23
- v = v.to_f
24
- elsif v =~ /^[+-]?\d+$/
25
- v = v.to_i
26
- end
27
- end
24
+ # Early return if no transformations needed
25
+ return hash unless remove_empty_values || remove_zero_values || remove_values_matching || convert_to_numeric || value_converters
28
26
 
29
- converter = value_converters[k] if value_converters
30
- v = converter.convert(v) if converter
27
+ keys_to_delete = []
31
28
 
32
- new_hash[k] = v
33
- end
34
- end
29
+ hash.each do |k, v|
30
+ # Check if this key/value should be removed
31
+ if remove_empty_values && (has_rails ? v.blank? : blank?(v))
32
+ keys_to_delete << k
33
+ next
34
+ end
35
35
 
36
- # def hash_transformations(hash, options)
37
- # # there may be unmapped keys, or keys purposely mapped to nil or an empty key..
38
- # # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
39
- # hash.delete(nil)
40
- # hash.delete('')
41
- # hash.delete(:"")
42
-
43
- # if options[:remove_empty_values] == true
44
- # hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
45
- # end
36
+ if remove_zero_values && v.is_a?(String) && ZERO_REGEX.match?(v)
37
+ keys_to_delete << k
38
+ next
39
+ end
40
+
41
+ if remove_values_matching && v.is_a?(String) && remove_values_matching.match?(v)
42
+ keys_to_delete << k
43
+ next
44
+ end
46
45
 
47
- # hash.delete_if{|_k, v| !v.nil? && v =~ /^(0+|0+\.0+)$/} if options[:remove_zero_values] # values are Strings
48
- # hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]
46
+ # Convert to numeric if requested
47
+ if convert_to_numeric && v.is_a?(String) && !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
48
+ if FLOAT_REGEX.match?(v)
49
+ hash[k] = v.to_f
50
+ elsif INTEGER_REGEX.match?(v)
51
+ hash[k] = v.to_i
52
+ end
53
+ end
49
54
 
50
- # if options[:convert_values_to_numeric]
51
- # hash.each do |k, v|
52
- # # deal with the :only / :except options to :convert_values_to_numeric
53
- # next if limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
55
+ # Apply value converters
56
+ if value_converters
57
+ converter = value_converters[k]
58
+ hash[k] = converter.convert(hash[k]) if converter
59
+ end
60
+ end
54
61
 
55
- # # convert if it's a numeric value:
56
- # case v
57
- # when /^[+-]?\d+\.\d+$/
58
- # hash[k] = v.to_f
59
- # when /^[+-]?\d+$/
60
- # hash[k] = v.to_i
61
- # end
62
- # end
63
- # end
62
+ # Delete marked keys
63
+ keys_to_delete.each { |k| hash.delete(k) }
64
64
 
65
- # if options[:value_converters]
66
- # hash.each do |k, v|
67
- # converter = options[:value_converters][k]
68
- # next unless converter
65
+ hash
66
+ end
69
67
 
70
- # hash[k] = converter.convert(v)
68
+ # ORIGINAL each_with_object implementation (replaced with in-place modification above)
69
+ # def hash_transformations(hash, options)
70
+ # remove_empty_values = options[:remove_empty_values] == true
71
+ # remove_zero_values = options[:remove_zero_values]
72
+ # remove_values_matching = options[:remove_values_matching]
73
+ # convert_to_numeric = options[:convert_values_to_numeric]
74
+ # value_converters = options[:value_converters]
75
+ #
76
+ # hash.each_with_object({}) do |(k, v), new_hash|
77
+ # next if k.nil? || k == '' || k == :""
78
+ # next if remove_empty_values && (has_rails ? v.blank? : blank?(v))
79
+ # next if remove_zero_values && v.is_a?(String) && ZERO_REGEX.match?(v)
80
+ # next if remove_values_matching && remove_values_matching.match?(v)
81
+ #
82
+ # if convert_to_numeric && !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
83
+ # if v.is_a?(String)
84
+ # if FLOAT_REGEX.match?(v)
85
+ # v = v.to_f
86
+ # elsif INTEGER_REGEX.match?(v)
87
+ # v = v.to_i
88
+ # end
89
+ # end
71
90
  # end
91
+ #
92
+ # converter = value_converters[k] if value_converters
93
+ # v = converter.convert(v) if converter
94
+ #
95
+ # new_hash[k] = v
72
96
  # end
73
-
74
- # hash
75
97
  # end
76
98
 
77
99
  protected
@@ -14,7 +14,7 @@ module SmarterCSV
14
14
  duplicates = header_counts.select { |_, count| count > 1 }
15
15
 
16
16
  unless duplicates.empty?
17
- raise(SmarterCSV::DuplicateHeaders, "Duplicate Headers in CSV: #{duplicates.inspect}")
17
+ raise SmarterCSV::DuplicateHeaders.new("Duplicate Headers in CSV: #{duplicates.inspect}", duplicates.keys)
18
18
  end
19
19
  end
20
20
 
@@ -26,7 +26,7 @@ module SmarterCSV
26
26
  missing_keys = options[:required_keys].select { |k| !headers_set.include?(k) }
27
27
 
28
28
  unless missing_keys.empty?
29
- raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}. Check `reader.headers` for original headers."
29
+ raise SmarterCSV::MissingKeys.new("ERROR: missing attributes: #{missing_keys.join(',')}. Check `reader.headers` for original headers.", missing_keys)
30
30
  end
31
31
  end
32
32
  end
@@ -12,7 +12,7 @@ module SmarterCSV
12
12
  if options[:headers_in_file] # extract the header line
13
13
  # process the header line in the CSV file..
14
14
  # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
15
- header_line = @raw_header = readline_with_counts(filehandle, options)
15
+ header_line = @raw_header = next_line_with_counts(filehandle, options)
16
16
  header_line = preprocess_header_line(header_line, options)
17
17
 
18
18
  file_header_array, file_header_size = parse(header_line, options)
@@ -2,7 +2,7 @@
2
2
 
3
3
  module SmarterCSV
4
4
  module Parser
5
- EMPTY_STRING = ''.freeze
5
+ EMPTY_STRING = '' # already frozen
6
6
 
7
7
  protected
8
8
 
@@ -26,6 +26,64 @@ module SmarterCSV
26
26
  end
27
27
  end
28
28
 
29
+ # Parse a CSV line directly into a hash, with support for extra columns.
30
+ # Returns [hash_or_nil, data_size] where hash is nil if options[:remove_empty_hashes] is set and all values are blank.
31
+ def parse_line_to_hash(line, headers, options)
32
+ has_quotes = line.include?(options[:quote_char])
33
+
34
+ if options[:acceleration] && has_acceleration
35
+ # :nocov:
36
+ parse_line_to_hash_c(
37
+ line,
38
+ headers,
39
+ options[:col_sep],
40
+ options[:quote_char],
41
+ options[:missing_header_prefix],
42
+ has_quotes,
43
+ options[:strip_whitespace],
44
+ options[:remove_empty_hashes],
45
+ options[:remove_empty_values]
46
+ )
47
+ # :nocov:
48
+ else
49
+ parse_line_to_hash_ruby(line, headers, options, has_quotes)
50
+ end
51
+ end
52
+
53
+ # Ruby implementation of parse_line_to_hash
54
+ def parse_line_to_hash_ruby(line, headers, options, has_quotes = false)
55
+ return [nil, 0] if line.nil?
56
+
57
+ # Parse the line into values
58
+ elements, data_size = parse_csv_line_ruby(line, options, nil, has_quotes)
59
+
60
+ # Check if all values are blank
61
+ if options[:remove_empty_hashes] && (elements.empty? || elements.all? { |v| v.nil? || v.to_s.strip.empty? })
62
+ return [nil, data_size]
63
+ end
64
+
65
+ # Build the hash - only include keys for values that exist
66
+ hash = {}
67
+ elements.each_with_index do |value, i|
68
+ key = if i < headers.size
69
+ headers[i]
70
+ else
71
+ "#{options[:missing_header_prefix]}#{i + 1}".to_sym
72
+ end
73
+ hash[key] = value
74
+ end
75
+
76
+ # Add nil for missing columns only when remove_empty_values is false
77
+ # (when true, nils would be removed anyway by hash_transformations)
78
+ unless options[:remove_empty_values]
79
+ (elements.size...headers.size).each do |i|
80
+ hash[headers[i]] = nil
81
+ end
82
+ end
83
+
84
+ [hash, data_size]
85
+ end
86
+
29
87
  # ------------------------------------------------------------------
30
88
  # Ruby equivalent of the C-extension for parse_line
31
89
  #
@@ -47,7 +105,7 @@ module SmarterCSV
47
105
  #
48
106
  # Our convention is that empty fields are returned as empty strings, not as nil.
49
107
 
50
- def parse_csv_line_ruby(line, options, header_size = nil, has_quotes = false)
108
+ def parse_csv_line_ruby(line, options, header_size = nil, _has_quotes = false)
51
109
  return [[], 0] if line.nil?
52
110
 
53
111
  line_size = line.size
@@ -81,9 +81,7 @@ module SmarterCSV
81
81
  end
82
82
 
83
83
  # now on to processing all the rest of the lines in the CSV file:
84
- # fh.each_line |line|
85
- until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
86
- line = readline_with_counts(fh, options)
84
+ while (line = next_line_with_counts(fh, options))
87
85
 
88
86
  # replace invalid byte sequence in UTF-8 with question mark to avoid errors
89
87
  line = enforce_utf8_encoding(line, options) if @enforce_utf8
@@ -98,25 +96,22 @@ module SmarterCSV
98
96
  multiline = count_quote_chars(line, options[:quote_char]).odd?
99
97
 
100
98
  while multiline
101
- begin
102
- next_line = fh.readline(options[:row_sep])
103
- next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
104
- line += next_line
105
- @file_line_count += 1
106
-
107
- multiline = count_quote_chars(line, options[:quote_char]).odd?
108
- rescue EOFError
99
+ next_line = fh.gets(options[:row_sep])
100
+ if next_line.nil?
109
101
  # End of file reached. Check if quotes are balanced.
110
102
  total_quotes = count_quote_chars(line, options[:quote_char])
111
103
  if total_quotes.odd?
112
104
  raise MalformedCSV, "Unclosed quoted field detected in multiline data"
113
105
  else
114
106
  # Quotes are balanced; proceed without raising an error.
115
- # :nocov:
116
107
  break
117
- # :nocov:
118
108
  end
119
109
  end
110
+ next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
111
+ line += next_line
112
+ @file_line_count += 1
113
+
114
+ multiline = count_quote_chars(line, options[:quote_char]).odd?
120
115
  end
121
116
 
122
117
  # :nocov:
@@ -129,24 +124,25 @@ module SmarterCSV
129
124
 
130
125
  # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
131
126
  # we are now stripping whitespace inside the parse() methods
132
- dataA, data_size = parse(line, options) # we parse the extra columns
127
+ # we create additional columns on-the-fly when we find more data fields than headers
128
+ hash, data_size = parse_line_to_hash(line, @headers, options)
133
129
 
134
- if options[:strict]
135
- raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}"
136
- else
137
- # we create additional columns on-the-fly
138
- current_size = @headers.size
139
- while current_size < data_size
140
- @headers << "#{options[:missing_header_prefix]}#{current_size + 1}".to_sym
141
- current_size += 1
130
+ # Handle extra columns (more data fields than headers)
131
+ if data_size > @headers.size
132
+ if options[:strict]
133
+ raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}"
134
+ end
135
+
136
+ # Update headers array for subsequent rows
137
+ while @headers.size < data_size
138
+ @headers << "#{options[:missing_header_prefix]}#{@headers.size + 1}".to_sym
142
139
  end
143
140
  end
144
141
 
145
- # if all values are blank, then ignore this line
146
- next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
142
+ # if all values were blank (hash is nil) we ignore this CSV line
143
+ next if hash.nil?
147
144
 
148
145
  # --- HASH TRANSFORMATIONS ------------------------------------------------------------
149
- hash = @headers.zip(dataA).to_h
150
146
 
151
147
  hash = hash_transformations(hash, options)
152
148
 
@@ -170,7 +166,7 @@ module SmarterCSV
170
166
  if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
171
167
  # do something with the chunk
172
168
  if block_given?
173
- yield chunk # do something with the hashes in the chunk in the block
169
+ yield chunk, @chunk_count # do something with the hashes in the chunk in the block
174
170
  else
175
171
  @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
176
172
  end
@@ -183,7 +179,8 @@ module SmarterCSV
183
179
 
184
180
  else # no chunk handling
185
181
  if block_given?
186
- yield [hash] # do something with the hash in the block (better to use chunking here)
182
+ yield [hash], @chunk_count # do something with the hash in the block (better to use chunking here)
183
+ @chunk_count += 1
187
184
  else
188
185
  @result << hash
189
186
  end
@@ -197,7 +194,7 @@ module SmarterCSV
197
194
  if !chunk.nil? && chunk.size > 0
198
195
  # do something with the chunk
199
196
  if block_given?
200
- yield chunk # do something with the hashes in the chunk in the block
197
+ yield chunk, @chunk_count # do something with the hashes in the chunk in the block
201
198
  else
202
199
  @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
203
200
  end
@@ -218,6 +215,12 @@ module SmarterCSV
218
215
  def count_quote_chars(line, quote_char)
219
216
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?
220
217
 
218
+ # Use C extension for performance if available (avoids creating a String object per character)
219
+ if @has_acceleration && SmarterCSV::Parser.respond_to?(:count_quote_chars_c)
220
+ return SmarterCSV::Parser.count_quote_chars_c(line, quote_char)
221
+ end
222
+
223
+ # Fallback to Ruby implementation
221
224
  count = 0
222
225
  escaped = false
223
226
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.14.4"
4
+ VERSION = "1.15.0"
5
5
  end
data/smarter_csv.gemspec CHANGED
@@ -36,7 +36,6 @@ Gem::Specification.new do |spec|
36
36
  spec.files += Dir.glob("ext/smarter_csv/**/*")
37
37
 
38
38
  spec.add_development_dependency "awesome_print"
39
- spec.add_development_dependency "codecov"
40
39
  spec.add_development_dependency "pry"
41
40
  spec.add_development_dependency "rspec"
42
41
  spec.add_development_dependency "rubocop"
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.14.4
4
+ version: 1.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2025-05-29 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: awesome_print
@@ -24,20 +23,6 @@ dependencies:
24
23
  - - ">="
25
24
  - !ruby/object:Gem::Version
26
25
  version: '0'
27
- - !ruby/object:Gem::Dependency
28
- name: codecov
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
26
  - !ruby/object:Gem::Dependency
42
27
  name: pry
43
28
  requirement: !ruby/object:Gem::Requirement
@@ -127,8 +112,14 @@ files:
127
112
  - docs/options.md
128
113
  - docs/row_col_sep.md
129
114
  - docs/value_converters.md
115
+ - ext/smarter_csv/Makefile
130
116
  - ext/smarter_csv/extconf.rb
117
+ - ext/smarter_csv/smarter_csv.bundle
118
+ - ext/smarter_csv/smarter_csv.bundle.dSYM/Contents/Info.plist
119
+ - ext/smarter_csv/smarter_csv.bundle.dSYM/Contents/Resources/DWARF/smarter_csv.bundle
120
+ - ext/smarter_csv/smarter_csv.bundle.dSYM/Contents/Resources/Relocations/aarch64/smarter_csv.bundle.yml
131
121
  - ext/smarter_csv/smarter_csv.c
122
+ - ext/smarter_csv/smarter_csv.o
132
123
  - lib/smarter_csv.rb
133
124
  - lib/smarter_csv/auto_detection.rb
134
125
  - lib/smarter_csv/errors.rb
@@ -150,7 +141,6 @@ metadata:
150
141
  homepage_uri: https://github.com/tilo/smarter_csv
151
142
  source_code_uri: https://github.com/tilo/smarter_csv
152
143
  changelog_uri: https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md
153
- post_install_message:
154
144
  rdoc_options: []
155
145
  require_paths:
156
146
  - lib
@@ -166,8 +156,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
156
  - !ruby/object:Gem::Version
167
157
  version: '0'
168
158
  requirements: []
169
- rubygems_version: 3.5.4
170
- signing_key:
159
+ rubygems_version: 3.6.9
171
160
  specification_version: 4
172
161
  summary: Convenient CSV Reading and Writing
173
162
  test_files: []