smart_csv_import 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.adoc +134 -0
  3. data/README.md +534 -0
  4. data/app/jobs/smart_csv_import/import_job.rb +22 -0
  5. data/app/models/smart_csv_import/import.rb +36 -0
  6. data/app/models/smart_csv_import/import_row_error.rb +17 -0
  7. data/lib/generators/smart_csv_import/import/import_generator.rb +49 -0
  8. data/lib/generators/smart_csv_import/import/templates/import_form.rb.tt +32 -0
  9. data/lib/generators/smart_csv_import/import/templates/import_form_spec.rb.tt +38 -0
  10. data/lib/generators/smart_csv_import/install/install_generator.rb +34 -0
  11. data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_import_row_errors.rb.tt +18 -0
  12. data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_imports.rb.tt +23 -0
  13. data/lib/generators/smart_csv_import/install/templates/initializer.rb.tt +51 -0
  14. data/lib/generators/smart_csv_import/scaffold/scaffold_generator.rb +56 -0
  15. data/lib/generators/smart_csv_import/scaffold/templates/controller.rb.tt +33 -0
  16. data/lib/generators/smart_csv_import/scaffold/templates/new.html.erb.tt +12 -0
  17. data/lib/generators/smart_csv_import/scaffold/templates/show.html.erb.tt +59 -0
  18. data/lib/smart_csv_import/configuration.rb +77 -0
  19. data/lib/smart_csv_import/cosine_similarity.rb +15 -0
  20. data/lib/smart_csv_import/engine.rb +12 -0
  21. data/lib/smart_csv_import/failed_row_exporter.rb +78 -0
  22. data/lib/smart_csv_import/file_storage.rb +34 -0
  23. data/lib/smart_csv_import/header_normalizer.rb +76 -0
  24. data/lib/smart_csv_import/logging.rb +37 -0
  25. data/lib/smart_csv_import/match_result.rb +36 -0
  26. data/lib/smart_csv_import/matchable.rb +76 -0
  27. data/lib/smart_csv_import/matcher.rb +198 -0
  28. data/lib/smart_csv_import/normalizers/boolean_converter.rb +26 -0
  29. data/lib/smart_csv_import/normalizers/date_converter.rb +28 -0
  30. data/lib/smart_csv_import/notifications.rb +16 -0
  31. data/lib/smart_csv_import/processor/csv_preflight_analyzer.rb +74 -0
  32. data/lib/smart_csv_import/processor/import_result_builder.rb +97 -0
  33. data/lib/smart_csv_import/processor/mapping_review_policy.rb +90 -0
  34. data/lib/smart_csv_import/processor/nil_cell_counter.rb +19 -0
  35. data/lib/smart_csv_import/processor/null_progress_callback.rb +11 -0
  36. data/lib/smart_csv_import/processor/row_processor.rb +70 -0
  37. data/lib/smart_csv_import/processor.rb +294 -0
  38. data/lib/smart_csv_import/result.rb +101 -0
  39. data/lib/smart_csv_import/stability_report.rb +104 -0
  40. data/lib/smart_csv_import/strategies/llm.rb +106 -0
  41. data/lib/smart_csv_import/strategies/lookup.rb +41 -0
  42. data/lib/smart_csv_import/strategies/vector.rb +155 -0
  43. data/lib/smart_csv_import/strategy.rb +9 -0
  44. data/lib/smart_csv_import/strategy_failure.rb +13 -0
  45. data/lib/smart_csv_import/version.rb +5 -0
  46. data/lib/smart_csv_import.rb +79 -0
  47. data/smart_csv_import.gemspec +35 -0
  48. metadata +216 -0
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
require "digest"
require "fileutils"
require "securerandom"
5
+
6
module SmartCsvImport
  # File-system helpers for imported CSV files: archiving a copy under the
  # configured storage path and fingerprinting file contents.
  module FileStorage
    module_function

    # Copies +source_path+ into "<storage_path>/<import_type>/" and returns the
    # path of the copy. The directory is created when missing.
    #
    # The stored file is named
    # "<YYYYmmddHHMMSS>_<8 random hex chars>_<original basename><extension>",
    # so repeated uploads of the same file do not overwrite each other.
    #
    # @param source_path [String] path of the file to archive
    # @param import_type [String, Symbol] sub-directory name under the storage path
    # @return [String] path of the stored copy
    # @raise [SmartCsvImport::Error] if +source_path+ is missing or is not a regular file
    def store(source_path:, import_type:)
      # File.file? (rather than File.exist?) also rejects directories, which
      # would otherwise surface as a raw Errno::EISDIR from FileUtils.cp.
      raise SmartCsvImport::Error, "Source file not found: #{source_path}" unless File.file?(source_path)

      # to_s lets callers pass a Symbol; File.join only accepts String-like parts.
      destination_dir = File.join(SmartCsvImport.configuration.storage_path, import_type.to_s)
      FileUtils.mkdir_p(destination_dir)

      extension = File.extname(source_path)
      basename = File.basename(source_path, extension)
      timestamp = Time.current.strftime("%Y%m%d%H%M%S")
      random_suffix = SecureRandom.hex(4)
      destination_path = File.join(destination_dir, "#{timestamp}_#{random_suffix}_#{basename}#{extension}")

      FileUtils.cp(source_path, destination_path)

      destination_path
    end

    # Returns the SHA-256 hex digest of the file's contents.
    #
    # @param file_path [String] path of the file to fingerprint
    # @return [String] 64-character lowercase hex digest
    # @raise [SmartCsvImport::Error] if +file_path+ is missing or is not a regular file
    def compute_hash(file_path:)
      raise SmartCsvImport::Error, "File not found: #{file_path}" unless File.file?(file_path)

      Digest::SHA256.file(file_path).hexdigest
    end
  end
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  # Turns raw CSV header text into a space-separated, abbreviation-expanded
  # phrase (e.g. "emp_addr" → "employee address").
  module HeaderNormalizer
    # Unambiguous abbreviations only — terms that reliably mean one thing in a
    # business CSV context. This list is intentionally small and conservative.
    #
    # DO NOT add entries that could mean two different things depending on
    # domain (e.g. "ext" = file extension or phone extension, "co" = company
    # or county, "apt" = apartment or adjective, "sal" / "val" = names).
    #
    # This list is not meant to be comprehensive. The LLM fallback strategy
    # handles the long tail of ambiguous and domain-specific abbreviations
    # far better than any static dictionary can.
    ABBREVIATIONS = {
      # Personal
      "dob" => "date of birth",
      "dod" => "date of death",
      "ssn" => "social security number",
      "nin" => "national insurance number",
      "dba" => "doing business as",
      # Contact
      "tel" => "telephone",
      # Location
      "addr" => "address",
      "zip" => "zip code",
      "ste" => "suite",
      # Organisation / HR
      "dept" => "department",
      "mgr" => "manager",
      "emp" => "employee",
      "org" => "organization",
      "corp" => "corporation",
      # Quantities / identifiers
      "qty" => "quantity",
      "amt" => "amount",
      "num" => "number",
      "ref" => "reference",
      "acct" => "account",
      # Finance
      "bal" => "balance",
      "pmt" => "payment",
      "inv" => "invoice",
      # Misc
      "desc" => "description",
      "info" => "information",
      "misc" => "miscellaneous",
    }.freeze

    # Normalizes a single header.
    #
    # Steps, in order: split camelCase/PascalCase words, turn separators into
    # spaces, drop punctuation, collapse whitespace, then expand whole-word
    # abbreviations from ABBREVIATIONS (case-insensitively). The casing of
    # words that are not abbreviations is left untouched.
    #
    # @param header [#to_s] raw header; nil is treated as an empty string
    # @return [String] normalized header, words separated by single spaces
    def self.normalize(header)
      text = header.to_s

      # Split camelCase and PascalCase: "CustomerDOB" → "Customer DOB".
      # Unicode case properties are used so accented capitals split as well.
      text = text
        .gsub(/(\p{Ll})(\p{Lu})/, '\1 \2')
        .gsub(/(\p{Lu}{2,})(\p{Lu}\p{Ll})/, '\1 \2')

      # Underscores, dashes, dots, slashes → spaces
      text = text.tr("_./\\-", " ")

      # Strip punctuation (#, *, (, ), etc.) but keep every letter, combining
      # mark and digit. An ASCII-only class here would break non-English
      # headers apart ("Téléphone" → "T l phone").
      text = text.gsub(/[^\p{L}\p{M}\p{N}\s]/, " ")

      # split(" ") already discards leading, trailing and repeated whitespace,
      # and every expansion is itself single-spaced, so no further collapse
      # is needed after the join.
      text.split(" ").map { |word| ABBREVIATIONS.fetch(word.downcase, word) }.join(" ")
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  # Mixin that gives a class private log_info / log_debug / log_error helpers.
  #
  # Every message is prefixed with "[SmartCsvImport::<ClassName>] ", where
  # <ClassName> is the last segment of the including object's class name.
  # Messages go to Rails.logger when Rails is loaded and has a logger,
  # otherwise to SmartCsvImport.configuration.logger.
  module Logging
    private

    # Logs +message+ at INFO level.
    def log_info(message)
      write_tagged_log(:info, message)
    end

    # Logs +message+ at DEBUG level.
    def log_debug(message)
      write_tagged_log(:debug, message)
    end

    # Logs +message+ at ERROR level.
    def log_error(message)
      write_tagged_log(:error, message)
    end

    # Shared implementation behind the three level-specific helpers.
    #
    # @param level [Symbol] logger method to invoke (:info, :debug or :error)
    # @param message [String] text to log after the tag
    def write_tagged_log(level, message)
      # Class#name is nil for anonymous classes (Class.new { ... }); fall back
      # to a fixed label so logging never raises NoMethodError.
      component = (self.class.name || "Anonymous").split("::").last

      logger =
        if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
          Rails.logger
        else
          SmartCsvImport.configuration.logger
        end

      logger.public_send(level, "[SmartCsvImport::#{component}] #{message}")
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  # Outcome for a CSV header that a strategy mapped to a target field.
  # Instances are built through MatchResult.matched; .new is private.
  class MatchResult
    private_class_method :new

    # Builds a successful match.
    #
    # @param target_field [Object] field the header was mapped to
    # @param confidence [Numeric] score reported for the match
    # @param strategy_name [Object] label of the strategy that produced it
    # @return [MatchResult]
    def self.matched(target_field:, confidence:, strategy_name:)
      new(target_field: target_field, confidence: confidence, strategy_name: strategy_name)
    end

    attr_reader :target_field, :confidence, :strategy_name

    # Always true for this class; lets callers branch without type checks.
    def matched?
      true
    end

    # Always false for this class.
    def unmatched?
      false
    end

    private

    def initialize(target_field:, confidence:, strategy_name:)
      @strategy_name = strategy_name
      @confidence = confidence
      @target_field = target_field
    end
  end

  # Outcome for a CSV header that could not be mapped, along with the list of
  # strategies that were given to it.
  class UnmatchedResult
    attr_reader :csv_header, :attempted_strategies

    # @param csv_header [Object] the header that stayed unresolved
    # @param attempted_strategies [Object] strategies recorded for this header
    def initialize(csv_header:, attempted_strategies:)
      @attempted_strategies = attempted_strategies
      @csv_header = csv_header
    end

    # Always false for this class.
    def matched?
      false
    end

    # Always true for this class.
    def unmatched?
      true
    end
  end
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  # Declaration of one importable field: its name, a human-readable
  # description and whether it is required.
  CsvFieldDefinition = Struct.new(:name, :description, :required, keyword_init: true)

  # Mixin that adds the csv_field DSL and related class-level settings to a
  # form class.
  module Matchable
    def self.included(base)
      base.extend(ClassMethods)
    end

    # Class-level API added to every class that includes Matchable.
    module ClassMethods
      # Reader/writer for a strategy object stored on the class itself.
      attr_accessor :matching_strategy

      # Declares +name+ as an importable CSV field.
      #
      # @param name [#to_sym] attribute name; must already exist on the class
      # @param description [#to_s] non-blank description of the field
      # @param required [Boolean] whether the field is reported by required_csv_fields
      # @raise [ConfigurationError] if the attribute is unknown, the field is
      #   already declared, or the description is blank
      def csv_field(name, description:, required: false)
        key = name.to_sym

        raise ConfigurationError, "#{key} is not a declared attribute on #{self}" unless _has_attribute?(key)
        raise ConfigurationError, "#{key} is already declared as a csv_field" if csv_fields.key?(key)
        raise ConfigurationError, "description must be non-empty for #{key}" if description.to_s.strip.empty?

        definition = CsvFieldDefinition.new(name: key, description: description, required: required)
        csv_fields[key] = definition
      end

      # Registry of declared fields, keyed by field name (Symbol).
      def csv_fields
        @csv_fields ||= {}
      end

      # Gives each subclass its own copy of the registry so fields declared on
      # the subclass do not leak back into the parent.
      def inherited(subclass)
        super
        subclass.instance_variable_set(:@csv_fields, csv_fields.dup)
      end

      # Names of the fields declared with a truthy +required+.
      def required_csv_fields
        csv_fields.filter_map { |key, definition| key if definition.required }
      end

      # Optional context that enriches the LLM matching prompt.
      # Both act as getter (no arg) and setter (with arg):
      #
      #   csv_source  "ADP Workforce payroll export"
      #   csv_context "HR platform for staffing agencies"
      #
      # Source describes where the CSV comes from (system/tool).
      # Context describes the business domain of the importing app.
      # Together they let the LLM disambiguate headers like "Cell" or "Sal"
      # that are genuinely ambiguous without domain knowledge.
      def csv_source(value = nil)
        return @csv_source unless value

        @csv_source = value
      end

      def csv_context(value = nil)
        return @csv_context unless value

        @csv_context = value
      end

      private

      # Uses attribute_names when the class provides it; otherwise falls back
      # to checking for a public/protected or private instance method.
      def _has_attribute?(name)
        return attribute_names.map(&:to_sym).include?(name) if respond_to?(:attribute_names)

        method_defined?(name) || private_method_defined?(name)
      end
    end
  end
end
@@ -0,0 +1,198 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "smarter_csv"
4
+
5
module SmartCsvImport
  # Maps the headers of a CSV file onto the csv_fields declared by a form
  # class, running matching strategies tier by tier and nudging confidences
  # with a look at sample cell values.
  class Matcher
    include Logging

    DATE_PATTERNS = [
      /\A\d{4}-\d{2}-\d{2}\z/,
      /\A\d{1,2}\/\d{1,2}\/\d{4}\z/,
      /\A\d{2}-[A-Za-z]{3}-\d{4}\z/
    ].freeze

    PHONE_PATTERN = /\A[\d\s\-\(\)\+\.]{7,}\z/
    EMAIL_PATTERN = /\A[^@\s]+@[^@\s]+\.[^@\s]+\z/

    VALUE_BOOST = 0.05
    VALUE_PENALTY = -0.10

    # Pairs a test on the (downcased) field description with the value
    # patterns expected for that kind of field. The first matching rule wins,
    # so order is significant: date, then email, then phone.
    #
    # "date" must not be preceded by a letter, otherwise descriptions such as
    # "candidate name" or "last update" are mistaken for date fields and their
    # matches get penalised.
    VALUE_HINT_RULES = [
      [/(?<![a-z])date|birth/, DATE_PATTERNS],
      [/email/, [EMAIL_PATTERN].freeze],
      [/phone/, [PHONE_PATTERN].freeze]
    ].freeze

    # @param file_path [String] path of the CSV file to inspect
    # @param form_class [Class] class that includes SmartCsvImport::Matchable
    # @param confidence_threshold [Numeric] minimum confidence for a match to
    #   be accepted; defaults to the configured threshold
    # @raise [ConfigurationError] if +form_class+ does not include Matchable
    def initialize(file_path:, form_class:, confidence_threshold: SmartCsvImport.configuration.confidence_threshold)
      validate_form_class!(form_class)

      @file_path = file_path
      @form_class = form_class
      @confidence_threshold = confidence_threshold
    end

    # Runs the matching pipeline.
    #
    # Headers are taken from the keys of the first non-empty parsed row. Each
    # tier only sees the headers that earlier tiers left unresolved.
    #
    # @return [Hash] header => MatchResult for accepted matches, and
    #   header => UnmatchedResult for everything left over
    # @raise [Error] if the CSV file does not exist
    def call
      validate_file!

      parsed_rows = parse_csv
      csv_headers = parsed_rows.first&.keys || []
      @sample_rows = parsed_rows.first(SmartCsvImport.configuration.value_hint_rows)

      log_info("Starting header matching for #{csv_headers.length} columns: #{csv_headers.join(", ")}")
      log_info("Target fields: #{@form_class.csv_fields.keys.join(", ")}")

      results = {}
      attempted_strategies = []
      remaining = csv_headers.dup

      # Tier 1: Custom strategy from form_class
      custom_strategy = @form_class.matching_strategy
      if custom_strategy
        tier_results = run_strategy(custom_strategy, remaining, attempted_strategies, "custom")
        tier_results = with_value_hints(tier_results, @sample_rows, @form_class)
        results, remaining = accept_matches(results, tier_results, remaining)
      end

      # Tier 2 and 3: Vector and LLM, ordered by SmartCsvImport.configuration.default_strategy
      default_tier_strategies.each do |name, strategy|
        tier_results = run_strategy(strategy, remaining, attempted_strategies, name)
        tier_results = with_value_hints(tier_results, @sample_rows, @form_class)
        results, remaining = accept_matches(results, tier_results, remaining)
      end

      # Remaining unresolved headers become UnmatchedResult
      remaining.each do |header|
        log_info("UNMATCHED: '#{header}' — tried: #{attempted_strategies.join(", ")}")
        results[header] = UnmatchedResult.new(
          csv_header: header,
          attempted_strategies: attempted_strategies.dup
        )
      end

      log_info("Matching complete: #{results.count { |_, r| r.matched? }} matched, #{results.count { |_, r| r.unmatched? }} unmatched")
      results
    end

    private

    def validate_form_class!(form_class)
      return if form_class.respond_to?(:csv_fields) && form_class.ancestors.include?(Matchable)

      raise ConfigurationError, "form_class must include SmartCsvImport::Matchable"
    end

    # Returns [name, strategy] pairs: vector first, unless the configured
    # default strategy is :llm, in which case the order is reversed.
    def default_tier_strategies
      tiers = { "vector" => Strategies::Vector.new, "llm" => Strategies::Llm.new }
      SmartCsvImport.configuration.default_strategy == :llm ? tiers.to_a.reverse : tiers.to_a
    end

    def validate_file!
      raise Error, "CSV file not found: #{@file_path}" unless File.exist?(@file_path)
    end

    # Parses the whole file with SmarterCSV, keeping original header text as
    # String keys, then drops rows in which every cell is nil or empty.
    def parse_csv
      rows = SmarterCSV.process(
        @file_path,
        strings_as_keys: true,
        strip_whitespace: true,
        keep_original_headers: true,
        field_size_limit: SmartCsvImport.configuration.field_size_limit,
        convert_values_to_numeric: false,
        remove_empty_values: false,
        remove_empty_hashes: false,
        invalid_byte_sequence: "",
        force_utf8: true,
        duplicate_header_suffix: "_",
        nil_values_matching: SmartCsvImport.configuration.nil_values_regexp
      )

      rows.reject { |row| row.except(:csv_line_number).values.all? { |v| v.nil? || v.to_s.empty? } }
    end

    # Runs one strategy over the unresolved headers and logs the outcome.
    #
    # Appends +strategy_name+ to +attempted_strategies+ (mutating it). A
    # StrategyFailure return value is logged and turned into an empty result
    # so the next tier still runs.
    #
    # @return [Hash] the strategy's header => result mapping, or {} when there
    #   was nothing to resolve or the strategy reported a failure
    def run_strategy(strategy, unresolved_headers, attempted_strategies, strategy_name)
      return {} if unresolved_headers.empty?

      log_info("Running #{strategy_name} strategy for #{unresolved_headers.length} headers: #{unresolved_headers.join(", ")}")
      attempted_strategies << strategy_name

      start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      tier_results = strategy.match(csv_headers: unresolved_headers, form_class: @form_class, sample_rows: @sample_rows)
      elapsed = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time).round(2)

      if tier_results.is_a?(StrategyFailure)
        log_error("Strategy #{strategy_name} errored (#{tier_results.reason}) after #{elapsed}s — continuing with next tier")
        return {}
      end

      tier_results.each do |header, result|
        if result&.matched?
          log_info("  #{strategy_name}: '#{header}' → :#{result.target_field} (confidence: #{result.confidence})")
        else
          log_debug("  #{strategy_name}: '#{header}' → no match")
        end
      end
      log_info("#{strategy_name} completed in #{elapsed}s — #{tier_results.count { |_, r| r&.matched? }} matches")

      tier_results
    end

    # Merges the matches of one tier into the running result.
    #
    # Works on copies: neither +results+ nor +remaining+ is mutated.
    #
    # @return [Array(Hash, Array)] updated results and the headers that are
    #   still unresolved (below-threshold matches stay unresolved)
    def accept_matches(results, tier_results, remaining)
      accepted = results.dup
      still_remaining = remaining.dup

      tier_results.each do |header, result|
        next unless result&.matched?

        if result.confidence >= @confidence_threshold
          accepted[header] = result
          still_remaining.delete(header)
        else
          log_info("  BELOW THRESHOLD: '#{header}' → :#{result.target_field} (#{result.confidence} < #{@confidence_threshold})")
        end
      end

      [accepted, still_remaining]
    end

    # Re-scores matched results using sample cell values from the column.
    #
    # Unmatched entries, matches whose target field has no definition, and
    # columns with no sample values pass through unchanged. Adjusted
    # confidences are clamped to 0.0..1.0 and rounded to 4 decimals.
    #
    # @return [Hash] header => result, same keys as +tier_results+
    def with_value_hints(tier_results, sample_rows, form_class)
      return tier_results if sample_rows.empty?

      tier_results.each_with_object({}) do |(header, result), adjusted|
        unless result&.matched?
          adjusted[header] = result
          next
        end

        field_def = form_class.csv_fields[result.target_field]
        sample_values = sample_rows.filter_map { |row| row[header] }
        adjustment = field_def && sample_values.any? ? compute_value_hint(sample_values, field_def) : 0

        if adjustment.zero?
          adjusted[header] = result
        else
          adjusted[header] = MatchResult.matched(
            target_field: result.target_field,
            confidence: (result.confidence + adjustment).clamp(0.0, 1.0).round(4),
            strategy_name: result.strategy_name
          )
        end
      end
    end

    # Decides whether the sample values look like what the field description
    # promises.
    #
    # @return [Numeric] VALUE_BOOST when at least one sample value fits the
    #   expected shape, VALUE_PENALTY when none does, 0 when the description
    #   matches no rule in VALUE_HINT_RULES
    def compute_value_hint(sample_values, field_def)
      description = field_def.description.to_s.downcase

      _keyword, patterns = VALUE_HINT_RULES.find { |keyword, _| keyword.match?(description) }
      return 0 unless patterns

      plausible = sample_values.any? { |value| patterns.any? { |pattern| pattern.match?(value.to_s) } }
      plausible ? VALUE_BOOST : VALUE_PENALTY
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  module Normalizers
    # Converts common textual boolean spellings to true/false.
    class BooleanConverter
      TRUTHY = Set.new(%w[true yes 1 t y]).freeze
      FALSEY = Set.new(%w[false no 0 f n]).freeze

      # @param value [Object] raw cell value
      # @return [true, false, nil] true/false for a recognised spelling
      #   (case-insensitive, surrounding whitespace ignored); the value itself
      #   when it is already a boolean; nil for nil, blank or unrecognised input
      def call(value)
        return nil if value.nil?
        return value if value == true || value == false

        token = value.to_s.strip.downcase
        return true if TRUTHY.include?(token)
        return false if FALSEY.include?(token)

        # Blank and unrecognised tokens both end up here.
        nil
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "date"
4
+
5
module SmartCsvImport
  module Normalizers
    # Converts cell values to Date objects.
    class DateConverter
      US_DATE = %r{\A(\d{1,2})/(\d{1,2})/(\d{4})\z}

      # @param value [Object] raw cell value
      # @return [Date, nil] the value itself when it is already a Date; a Date
      #   built month-first from "M/D/YYYY" input; otherwise whatever
      #   Date.parse makes of the text. nil for nil, blank or unparseable input.
      def call(value)
        return value if value.nil? || value.is_a?(Date)

        text = value.to_s.strip
        return nil if text.empty?

        slashed = US_DATE.match(text)
        return Date.parse(text) unless slashed

        # Capture order is month, day, year.
        Date.new(slashed[3].to_i, slashed[1].to_i, slashed[2].to_i)
      rescue Date::Error, ArgumentError
        nil
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/notifications"
4
+
5
module SmartCsvImport
  # Helpers for publishing import events through ActiveSupport::Notifications.
  module Notifications
    # Builds a callable that publishes progress for one import.
    #
    # Each call instruments "smart_csv_import.import.progress" with a payload
    # made of +import_id+ followed by the entries of the hash it is given.
    #
    # @param import_id [Object] identifier added to every payload
    # @return [Proc] lambda taking one hash of progress details
    def self.progress_notifier(import_id:)
      lambda do |info|
        payload = { import_id: import_id, **info }
        ActiveSupport::Notifications.instrument("smart_csv_import.import.progress", payload)
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
module SmartCsvImport
  class Processor
    # Cheap checks run directly against the raw CSV file: duplicate header
    # detection and counting cells that match the configured nil values.
    module CsvPreflightAnalyzer
      module_function

      # Builds one warning per repeated header in the first line of the file.
      #
      # Best effort: any StandardError (missing file, unparseable header line,
      # ...) results in an empty list rather than an exception.
      #
      # @param file_path [String] path of the CSV file
      # @return [Array<RowWarning>] warnings with row 0 and type :duplicate_header
      def duplicate_header_warnings(file_path:)
        headers = read_headers(file_path)
        return [] if headers.nil?

        find_duplicates(headers).map { |dup| build_duplicate_warning(dup) }
      rescue StandardError
        []
      end

      # @param dup [Hash] entry produced by find_duplicates
      #   (keys :original and :renamed_to)
      # @return [RowWarning]
      def build_duplicate_warning(dup)
        RowWarning.new(
          row: 0,
          message: "Duplicate column '#{dup[:original]}' — occurrence renamed to '#{dup[:renamed_to]}'",
          type: :duplicate_header
        )
      end

      # Reads the first physical line of the file and parses it as CSV.
      #
      # @param file_path [String] path of the CSV file
      # @return [Array<String, nil>, nil] stripped header cells, or nil when
      #   the file is empty
      def read_headers(file_path)
        first_line = File.open(file_path, 'r:bom|utf-8:utf-8', invalid: :replace, undef: :replace, &:gets)
        return nil unless first_line

        # External and internal encodings are both UTF-8, so no transcoding
        # takes place and the invalid:/undef: options never get a chance to
        # replace anything. Drop invalid bytes explicitly; otherwise parsing
        # raises and the caller's rescue silently discards every warning.
        # NOTE(review): removing (rather than substituting) invalid bytes is
        # meant to mirror SmarterCSV's `invalid_byte_sequence: ""` — confirm
        # the import itself is parsed with that option.
        CSV.parse_line(first_line.scrub(""))&.map { |h| h&.strip }
      end

      # Counts cells in the data rows (everything after the header row) that
      # NilCellCounter considers a nil value.
      #
      # @param file_path [String] path of the CSV file
      # @param nil_values [Object, nil] matcher handed to NilCellCounter.count_row;
      #   nil or an empty collection short-circuits to 0
      # @return [Integer] number of matching cells; a partial count if the
      #   file turns out to be malformed part-way through
      # @raise [SmartCsvImport::Error] if the file is missing or unreadable
      def count_nil_matches(file_path:, nil_values:)
        return 0 if nil_values.nil? || (nil_values.respond_to?(:empty?) && nil_values.empty?)

        count = 0
        begin
          each_data_row(file_path) { |row| count += NilCellCounter.count_row(row, nil_values) }
        rescue CSV::MalformedCSVError
          # Stop the scan and return the partial count — SmarterCSV surfaces the parse error downstream.
        end
        count
      rescue Errno::ENOENT, Errno::EACCES => e
        raise SmartCsvImport::Error, "Failed to read CSV for nil-value scan in #{File.basename(file_path)}: #{e.message}"
      end

      # Lists every repeated header together with the name it is reported
      # under: the 2nd occurrence of "x" becomes "x_2", the 3rd "x_3", etc.
      # nil headers are ignored.
      #
      # @param headers [Array<String, nil>]
      # @return [Array<Hash>] hashes with :original and :renamed_to
      def find_duplicates(headers)
        seen = {}
        headers.each_with_object([]) do |header, duplicates|
          next unless header

          if seen[header]
            seen[header] += 1
            duplicates << { original: header, renamed_to: "#{header}_#{seen[header]}" }
          else
            seen[header] = 1
          end
        end
      end

      # Yields each parsed row of the file except the first (header) row.
      def each_data_row(file_path)
        first_row = true
        CSV.foreach(file_path, encoding: 'bom|utf-8:utf-8', invalid: :replace, undef: :replace) do |row|
          if first_row
            first_row = false
            next
          end
          yield row
        end
      end
    end
  end
end