smart_csv_import 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.adoc +134 -0
- data/README.md +534 -0
- data/app/jobs/smart_csv_import/import_job.rb +22 -0
- data/app/models/smart_csv_import/import.rb +36 -0
- data/app/models/smart_csv_import/import_row_error.rb +17 -0
- data/lib/generators/smart_csv_import/import/import_generator.rb +49 -0
- data/lib/generators/smart_csv_import/import/templates/import_form.rb.tt +32 -0
- data/lib/generators/smart_csv_import/import/templates/import_form_spec.rb.tt +38 -0
- data/lib/generators/smart_csv_import/install/install_generator.rb +34 -0
- data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_import_row_errors.rb.tt +18 -0
- data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_imports.rb.tt +23 -0
- data/lib/generators/smart_csv_import/install/templates/initializer.rb.tt +51 -0
- data/lib/generators/smart_csv_import/scaffold/scaffold_generator.rb +56 -0
- data/lib/generators/smart_csv_import/scaffold/templates/controller.rb.tt +33 -0
- data/lib/generators/smart_csv_import/scaffold/templates/new.html.erb.tt +12 -0
- data/lib/generators/smart_csv_import/scaffold/templates/show.html.erb.tt +59 -0
- data/lib/smart_csv_import/configuration.rb +77 -0
- data/lib/smart_csv_import/cosine_similarity.rb +15 -0
- data/lib/smart_csv_import/engine.rb +12 -0
- data/lib/smart_csv_import/failed_row_exporter.rb +78 -0
- data/lib/smart_csv_import/file_storage.rb +34 -0
- data/lib/smart_csv_import/header_normalizer.rb +76 -0
- data/lib/smart_csv_import/logging.rb +37 -0
- data/lib/smart_csv_import/match_result.rb +36 -0
- data/lib/smart_csv_import/matchable.rb +76 -0
- data/lib/smart_csv_import/matcher.rb +198 -0
- data/lib/smart_csv_import/normalizers/boolean_converter.rb +26 -0
- data/lib/smart_csv_import/normalizers/date_converter.rb +28 -0
- data/lib/smart_csv_import/notifications.rb +16 -0
- data/lib/smart_csv_import/processor/csv_preflight_analyzer.rb +74 -0
- data/lib/smart_csv_import/processor/import_result_builder.rb +97 -0
- data/lib/smart_csv_import/processor/mapping_review_policy.rb +90 -0
- data/lib/smart_csv_import/processor/nil_cell_counter.rb +19 -0
- data/lib/smart_csv_import/processor/null_progress_callback.rb +11 -0
- data/lib/smart_csv_import/processor/row_processor.rb +70 -0
- data/lib/smart_csv_import/processor.rb +294 -0
- data/lib/smart_csv_import/result.rb +101 -0
- data/lib/smart_csv_import/stability_report.rb +104 -0
- data/lib/smart_csv_import/strategies/llm.rb +106 -0
- data/lib/smart_csv_import/strategies/lookup.rb +41 -0
- data/lib/smart_csv_import/strategies/vector.rb +155 -0
- data/lib/smart_csv_import/strategy.rb +9 -0
- data/lib/smart_csv_import/strategy_failure.rb +13 -0
- data/lib/smart_csv_import/version.rb +5 -0
- data/lib/smart_csv_import.rb +79 -0
- data/smart_csv_import.gemspec +35 -0
- metadata +216 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "digest"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
|
|
6
|
+
require "securerandom"

module SmartCsvImport
  # Filesystem helpers: keep a uniquely named copy of an imported file and
  # compute a content fingerprint for it.
  module FileStorage
    module_function

    # Copies +source_path+ into "<configuration.storage_path>/<import_type>/"
    # (created if missing) under the name
    # "<YYYYmmddHHMMSS>_<8 hex chars>_<original basename><ext>".
    #
    # @param source_path [String] path of the file to copy
    # @param import_type [String, Symbol] sub-directory under the storage path
    # @return [String] path of the stored copy
    # @raise [SmartCsvImport::Error] when +source_path+ does not exist
    def store(source_path:, import_type:)
      raise SmartCsvImport::Error, "Source file not found: #{source_path}" unless File.exist?(source_path)

      # to_s: File.join raises TypeError for a Symbol segment.
      destination_dir = File.join(SmartCsvImport.configuration.storage_path, import_type.to_s)
      FileUtils.mkdir_p(destination_dir)

      extension = File.extname(source_path)
      basename = File.basename(source_path, extension)
      timestamp = Time.current.strftime("%Y%m%d%H%M%S")
      # Random suffix keeps two stores within the same second from colliding.
      random_suffix = SecureRandom.hex(4)
      destination_path = File.join(destination_dir, "#{timestamp}_#{random_suffix}_#{basename}#{extension}")

      FileUtils.cp(source_path, destination_path)

      destination_path
    end

    # Returns the SHA-256 hex digest of the file's contents.
    #
    # @param file_path [String] path of the file to hash
    # @return [String] lowercase hexadecimal digest
    # @raise [SmartCsvImport::Error] when +file_path+ does not exist
    def compute_hash(file_path:)
      raise SmartCsvImport::Error, "File not found: #{file_path}" unless File.exist?(file_path)

      Digest::SHA256.file(file_path).hexdigest
    end
  end
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # Turns a raw CSV header into space-separated words with well-known
  # abbreviations expanded, e.g. "CustomerDOB" => "Customer date of birth".
  module HeaderNormalizer
    # Unambiguous abbreviations only — terms that reliably mean one thing in a
    # business CSV context. This list is intentionally small and conservative.
    #
    # DO NOT add entries that could mean two different things depending on
    # domain (e.g. "ext" = file extension or phone extension, "co" = company
    # or county, "apt" = apartment or adjective, "sal" / "val" = names).
    #
    # This list is not meant to be comprehensive. The LLM fallback strategy
    # handles the long tail of ambiguous and domain-specific abbreviations
    # far better than any static dictionary can.
    ABBREVIATIONS = {
      # Personal
      "dob" => "date of birth",
      "dod" => "date of death",
      "ssn" => "social security number",
      "nin" => "national insurance number",
      "dba" => "doing business as",
      # Contact
      "tel" => "telephone",
      # Location
      "addr" => "address",
      "zip" => "zip code",
      "ste" => "suite",
      # Organisation / HR
      "dept" => "department",
      "mgr" => "manager",
      "emp" => "employee",
      "org" => "organization",
      "corp" => "corporation",
      # Quantities / identifiers
      "qty" => "quantity",
      "amt" => "amount",
      "num" => "number",
      "ref" => "reference",
      "acct" => "account",
      # Finance
      "bal" => "balance",
      "pmt" => "payment",
      "inv" => "invoice",
      # Misc
      "desc" => "description",
      "info" => "information",
      "misc" => "miscellaneous",
    }.freeze

    # Normalizes a header for matching. Original letter case is kept except in
    # expanded abbreviations, which are replaced by their lowercase expansion.
    #
    # POSIX bracket classes are used instead of a-z/A-Z ranges so that letters
    # outside ASCII ("Prénom", "Straße") survive and camelCase boundaries are
    # found for them too; ASCII-only headers normalize exactly as before.
    #
    # @param header [#to_s] raw header (nil becomes "")
    # @return [String] normalized header
    def self.normalize(header)
      text = header.to_s

      # Split camelCase and PascalCase: "CustomerDOB" → "Customer DOB"
      text = text
        .gsub(/([[:lower:]])([[:upper:]])/, '\1 \2')
        .gsub(/([[:upper:]]{2,})([[:upper:]][[:lower:]])/, '\1 \2')

      # Underscores, dashes, dots, slashes → spaces
      text = text.tr("_./\\-", " ")

      # Strip non-alphanumeric characters (removes #, *, (, ), etc.)
      text = text.gsub(/[^[:alnum:]\s]/, " ")

      # Collapse whitespace
      text = text.gsub(/\s+/, " ").strip

      # Expand abbreviations — whole-word, case-insensitive
      text = text.split(" ").map { |word| ABBREVIATIONS[word.downcase] || word }.join(" ")

      # Final collapse in case expansions introduced extra spaces
      text.gsub(/\s+/, " ").strip
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # Mixin providing private, tagged logging helpers. Every message is prefixed
  # with "[SmartCsvImport::<last segment of the including class's name>]".
  #
  # Messages go to Rails.logger when Rails is defined and has a logger set;
  # otherwise to SmartCsvImport.configuration.logger.
  module Logging
    private

    # @param message [String] text logged at INFO level
    def log_info(message)
      emit_log(:info, message)
    end

    # @param message [String] text logged at DEBUG level
    def log_debug(message)
      emit_log(:debug, message)
    end

    # @param message [String] text logged at ERROR level
    def log_error(message)
      emit_log(:error, message)
    end

    # Shared implementation for the level-specific helpers above.
    def emit_log(level, message)
      # Class#to_s equals #name for named classes and is still a String for
      # anonymous ones, where #name is nil and would break the split.
      tag = self.class.to_s.split("::").last
      tagged = "[SmartCsvImport::#{tag}] #{message}"

      logger =
        if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
          Rails.logger
        else
          SmartCsvImport.configuration.logger
        end

      logger.public_send(level, tagged)
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # Outcome for a CSV header that was mapped to a target field. Instances are
  # created through MatchResult.matched; calling .new directly is not allowed.
  class MatchResult
    attr_reader :target_field, :confidence, :strategy_name

    class << self
      # Builds a successful match.
      #
      # @param target_field [Object] field the header was mapped to
      # @param confidence [Object] score reported by the strategy
      # @param strategy_name [Object] identifier of the strategy that matched
      # @return [MatchResult]
      def matched(target_field:, confidence:, strategy_name:)
        new(target_field: target_field, confidence: confidence, strategy_name: strategy_name)
      end

      private :new
    end

    # @return [true]
    def matched?
      true
    end

    # @return [false]
    def unmatched?
      false
    end

    private

    def initialize(target_field:, confidence:, strategy_name:)
      @target_field = target_field
      @confidence = confidence
      @strategy_name = strategy_name
    end
  end

  # Outcome for a CSV header that no strategy could map; answers the same
  # matched?/unmatched? predicates as MatchResult.
  class UnmatchedResult
    attr_reader :csv_header, :attempted_strategies

    # @param csv_header [Object] the header that stayed unmapped
    # @param attempted_strategies [Object] the strategies that were tried
    def initialize(csv_header:, attempted_strategies:)
      @csv_header = csv_header
      @attempted_strategies = attempted_strategies
    end

    # @return [false]
    def matched?
      false
    end

    # @return [true]
    def unmatched?
      true
    end
  end
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # Declaration of one importable field: its attribute name, a human-readable
  # description and whether it is required.
  CsvFieldDefinition = Struct.new(:name, :description, :required, keyword_init: true)

  # Mixin for form classes. Including it extends the class with the
  # declaration DSL in ClassMethods.
  module Matchable
    def self.included(base)
      base.extend(ClassMethods)
    end

    module ClassMethods
      # Declares +name+ as an importable CSV field.
      #
      # @param name [Symbol, String] attribute name (stored as a Symbol)
      # @param description [String] non-blank description of the field
      # @param required [Boolean] whether the field is required
      # @return [CsvFieldDefinition] the stored definition
      # @raise [ConfigurationError] if the attribute is unknown, already
      #   declared, or the description is blank
      def csv_field(name, description:, required: false)
        name = name.to_sym

        raise ConfigurationError, "#{name} is not a declared attribute on #{self}" unless _has_attribute?(name)
        raise ConfigurationError, "#{name} is already declared as a csv_field" if csv_fields.key?(name)
        raise ConfigurationError, "description must be non-empty for #{name}" if description.to_s.strip.empty?

        csv_fields[name] = CsvFieldDefinition.new(
          name: name,
          description: description,
          required: required
        )
      end

      # @return [Hash{Symbol => CsvFieldDefinition}] fields declared so far
      def csv_fields
        @csv_fields ||= {}
      end

      # Gives each subclass its own copy of the parent's declarations at the
      # moment it is defined. The source, context and strategy settings are
      # copied alongside csv_fields so a subclass does not silently lose them;
      # the subclass can still override any of them afterwards.
      def inherited(subclass)
        super
        subclass.instance_variable_set(:@csv_fields, csv_fields.dup)
        subclass.instance_variable_set(:@csv_source, @csv_source)
        subclass.instance_variable_set(:@csv_context, @csv_context)
        subclass.instance_variable_set(:@matching_strategy, @matching_strategy)
      end

      # @return [Array<Symbol>] names of the fields declared with a truthy
      #   +required+
      def required_csv_fields
        csv_fields.select { |_, field| field.required }.keys
      end

      # Optional context that enriches the LLM matching prompt.
      # Both act as getter (no arg) and setter (with arg):
      #
      #   csv_source  "ADP Workforce payroll export"
      #   csv_context "HR platform for staffing agencies"
      #
      # Source describes where the CSV comes from (system/tool).
      # Context describes the business domain of the importing app.
      # Together they let the LLM disambiguate headers like "Cell" or "Sal"
      # that are genuinely ambiguous without domain knowledge.
      def csv_source(value = nil)
        value ? @csv_source = value : @csv_source
      end

      def csv_context(value = nil)
        value ? @csv_context = value : @csv_context
      end

      # @return [Object, nil] custom matching strategy, if one was assigned
      def matching_strategy
        @matching_strategy
      end

      def matching_strategy=(strategy)
        @matching_strategy = strategy
      end

      private

      # True when +name+ is a known attribute: listed in attribute_names if
      # the class responds to that, otherwise defined as an instance method
      # (public, protected or private).
      def _has_attribute?(name)
        if respond_to?(:attribute_names)
          attribute_names.map(&:to_sym).include?(name)
        else
          method_defined?(name) || private_method_defined?(name)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "smarter_csv"
|
|
4
|
+
|
|
5
|
+
module SmartCsvImport
  # Maps the headers of a CSV file onto the csv_fields of a form class by
  # running matching strategies in tiers: the form class's custom strategy
  # (if any) first, then the vector and LLM strategies in the configured
  # order. A match is accepted once its confidence, after a small
  # sample-value adjustment, reaches the confidence threshold.
  class Matcher
    include Logging
    DATE_PATTERNS = [
      /\A\d{4}-\d{2}-\d{2}\z/,
      /\A\d{1,2}\/\d{1,2}\/\d{4}\z/,
      /\A\d{2}-[A-Za-z]{3}-\d{4}\z/
    ].freeze

    PHONE_PATTERN = /\A[\d\s\-\(\)\+\.]{7,}\z/
    EMAIL_PATTERN = /\A[^@\s]+@[^@\s]+\.[^@\s]+\z/

    # Confidence adjustments applied when sample values do / do not look like
    # the kind of data the target field's description suggests.
    VALUE_BOOST = 0.05
    VALUE_PENALTY = -0.10

    # @param file_path [String] CSV file to analyse
    # @param form_class [Class] class including SmartCsvImport::Matchable
    # @param confidence_threshold [Numeric] minimum confidence to accept a match
    # @raise [ConfigurationError] when form_class does not include Matchable
    def initialize(file_path:, form_class:, confidence_threshold: SmartCsvImport.configuration.confidence_threshold)
      validate_form_class!(form_class)

      @file_path = file_path
      @form_class = form_class
      @confidence_threshold = confidence_threshold
    end

    # Runs all tiers and returns one result per CSV header.
    #
    # @return [Hash] header => MatchResult or UnmatchedResult
    # @raise [Error] when the CSV file does not exist
    def call
      validate_file!

      rows = parse_csv
      headers = rows.first&.keys || []
      @sample_rows = rows.first(SmartCsvImport.configuration.value_hint_rows)

      log_info("Starting header matching for #{headers.length} columns: #{headers.join(", ")}")
      log_info("Target fields: #{@form_class.csv_fields.keys.join(", ")}")

      matches = {}
      tried = []
      pending = headers.dup

      # Tier 1: Custom strategy from form_class
      custom = @form_class.matching_strategy
      matches, pending = apply_tier("custom", custom, matches, pending, tried) if custom

      # Tier 2 and 3: Vector and LLM, ordered by SmartCsvImport.configuration.default_strategy
      default_tier_strategies.each do |tier_name, tier_strategy|
        matches, pending = apply_tier(tier_name, tier_strategy, matches, pending, tried)
      end

      # Remaining unresolved headers become UnmatchedResult
      pending.each do |header|
        log_info("UNMATCHED: '#{header}' — tried: #{tried.join(", ")}")
        matches[header] = UnmatchedResult.new(csv_header: header, attempted_strategies: tried.dup)
      end

      matched_total = matches.count { |_, outcome| outcome.matched? }
      unmatched_total = matches.count { |_, outcome| outcome.unmatched? }
      log_info("Matching complete: #{matched_total} matched, #{unmatched_total} unmatched")
      matches
    end

    private

    # Runs one tier: strategy, then value hints, then threshold filtering.
    # Returns the updated [matches, pending] pair.
    def apply_tier(tier_name, strategy, matches, pending, tried)
      raw = run_strategy(strategy, pending, tried, tier_name)
      hinted = with_value_hints(raw, @sample_rows, @form_class)
      accept_matches(matches, hinted, pending)
    end

    def validate_form_class!(form_class)
      unless form_class.respond_to?(:csv_fields) && form_class.ancestors.include?(Matchable)
        raise ConfigurationError, "form_class must include SmartCsvImport::Matchable"
      end
    end

    # Returns [name, strategy] pairs: vector first, unless the configured
    # default strategy is :llm, in which case the order is reversed.
    def default_tier_strategies
      ordered = [["vector", Strategies::Vector.new], ["llm", Strategies::Llm.new]]
      ordered.reverse! if SmartCsvImport.configuration.default_strategy == :llm
      ordered
    end

    def validate_file!
      return if File.exist?(@file_path)

      raise Error, "CSV file not found: #{@file_path}"
    end

    # Parses the file with SmarterCSV, keeping original headers as String keys,
    # and drops rows whose values (ignoring :csv_line_number) are all nil or
    # empty.
    def parse_csv
      config = SmartCsvImport.configuration
      parsed = SmarterCSV.process(
        @file_path,
        strings_as_keys: true,
        strip_whitespace: true,
        keep_original_headers: true,
        field_size_limit: config.field_size_limit,
        convert_values_to_numeric: false,
        remove_empty_values: false,
        remove_empty_hashes: false,
        invalid_byte_sequence: "",
        force_utf8: true,
        duplicate_header_suffix: "_",
        nil_values_matching: config.nil_values_regexp
      )

      parsed.select do |row|
        row.except(:csv_line_number).values.any? { |cell| !cell.nil? && !cell.to_s.empty? }
      end
    end

    # Calls +strategy+ for the still-unresolved headers, logging timing and
    # per-header outcomes. Returns {} when there is nothing to resolve or the
    # strategy reports a StrategyFailure, so the next tier can continue.
    def run_strategy(strategy, unresolved_headers, attempted_strategies, strategy_name)
      return {} if unresolved_headers.empty?

      log_info("Running #{strategy_name} strategy for #{unresolved_headers.length} headers: #{unresolved_headers.join(", ")}")
      attempted_strategies << strategy_name

      started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      outcome = strategy.match(csv_headers: unresolved_headers, form_class: @form_class, sample_rows: @sample_rows)
      elapsed = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at).round(2)

      if outcome.is_a?(StrategyFailure)
        log_error("Strategy #{strategy_name} errored (#{outcome.reason}) after #{elapsed}s — continuing with next tier")
        return {}
      end

      outcome.each do |header, candidate|
        unless candidate&.matched?
          log_debug(" #{strategy_name}: '#{header}' → no match")
          next
        end

        log_info(" #{strategy_name}: '#{header}' → :#{candidate.target_field} (confidence: #{candidate.confidence})")
      end
      match_total = outcome.count { |_, candidate| candidate&.matched? }
      log_info("#{strategy_name} completed in #{elapsed}s — #{match_total} matches")

      outcome
    end

    # Merges matches meeting the threshold into a copy of +results+ and
    # removes their headers from a copy of +remaining+; inputs are not mutated.
    def accept_matches(results, tier_results, remaining)
      merged = results.dup
      unresolved = remaining.dup

      tier_results.each do |header, candidate|
        next unless candidate&.matched?

        unless candidate.confidence >= @confidence_threshold
          log_info(" BELOW THRESHOLD: '#{header}' → :#{candidate.target_field} (#{candidate.confidence} < #{@confidence_threshold})")
          next
        end

        merged[header] = candidate
        unresolved.delete(header)
      end

      [merged, unresolved]
    end

    # Returns a new hash in which each matched result's confidence has been
    # nudged by compute_value_hint, based on the column's sample values.
    def with_value_hints(tier_results, sample_rows, form_class)
      return tier_results if sample_rows.empty?

      tier_results.each_with_object({}) do |(header, candidate), hinted|
        hinted[header] = hinted_result(header, candidate, sample_rows, form_class)
      end
    end

    # Returns +candidate+ unchanged unless it is a match whose target field is
    # declared, the column has sample values, and the adjustment is non-zero.
    def hinted_result(header, candidate, sample_rows, form_class)
      return candidate unless candidate&.matched?

      field_def = form_class.csv_fields[candidate.target_field]
      samples = sample_rows.filter_map { |row| row[header] }
      delta = field_def && samples.any? ? compute_value_hint(samples, field_def) : 0
      return candidate if delta.zero?

      MatchResult.matched(
        target_field: candidate.target_field,
        confidence: (candidate.confidence + delta).clamp(0.0, 1.0).round(4),
        strategy_name: candidate.strategy_name
      )
    end

    # Picks a value pattern from keywords in the field description ("date" or
    # "birth", then "email", then "phone"). Returns VALUE_BOOST when at least
    # one sample matches the pattern, VALUE_PENALTY when none does, and 0 when
    # the description contains none of the keywords.
    def compute_value_hint(sample_values, field_def)
      description = field_def.description.downcase

      looks_right =
        if description.include?("date") || description.include?("birth")
          ->(text) { DATE_PATTERNS.any? { |pattern| pattern.match?(text) } }
        elsif description.include?("email")
          ->(text) { EMAIL_PATTERN.match?(text) }
        elsif description.include?("phone")
          ->(text) { PHONE_PATTERN.match?(text) }
        end
      return 0 unless looks_right

      sample_values.any? { |value| looks_right.call(value.to_s) } ? VALUE_BOOST : VALUE_PENALTY
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set"

module SmartCsvImport
  module Normalizers
    # Converts common textual boolean spellings to true / false.
    class BooleanConverter
      TRUTHY = Set.new(%w[true yes 1 t y]).freeze
      FALSEY = Set.new(%w[false no 0 f n]).freeze

      # @param value [Object] raw cell value
      # @return [true, false, nil] +value+ itself when already a boolean;
      #   true/false when its stripped, lowercased string form is listed in
      #   TRUTHY/FALSEY; nil for nil, blank or unrecognised input
      def call(value)
        return nil if value.nil?
        return value if value == true || value == false

        normalized = value.to_s.strip.downcase
        # A blank string is in neither set, so it falls through to nil.
        return true if TRUTHY.include?(normalized)
        return false if FALSEY.include?(normalized)

        nil
      end
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "date"
|
|
4
|
+
|
|
5
|
+
module SmartCsvImport
  module Normalizers
    # Converts a raw cell value to a Date, or nil when that is not possible.
    class DateConverter
      # month/day/year with a four-digit year, e.g. "1/15/2024"
      US_DATE = %r{\A(\d{1,2})/(\d{1,2})/(\d{4})\z}

      # @param value [Object] raw cell value
      # @return [Date, nil] +value+ itself when it already is a Date; a Date
      #   built from month/day/year when the stripped text matches US_DATE;
      #   otherwise whatever Date.parse makes of the text. nil for nil, blank
      #   or unparseable input.
      def call(value)
        case value
        when nil then return nil
        when Date then return value
        end

        text = value.to_s.strip
        return nil if text.empty?

        us_parts = US_DATE.match(text)
        return Date.parse(text) unless us_parts

        # Captures are month, day, year; Date.new wants year, month, day.
        Date.new(us_parts[3].to_i, us_parts[1].to_i, us_parts[2].to_i)
      rescue Date::Error, ArgumentError
        nil
      end
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "active_support/notifications"
|
|
4
|
+
|
|
5
|
+
module SmartCsvImport
  module Notifications
    # Builds a progress callback bound to one import.
    #
    # The returned lambda takes a Hash and instruments the
    # "smart_csv_import.import.progress" event with a payload of +import_id+
    # merged with that Hash. The Hash is splatted after +import_id+, so an
    # :import_id key in it takes precedence.
    #
    # @param import_id [Object] identifier added to every payload
    # @return [Proc] lambda accepting a single +info+ Hash
    def self.progress_notifier(import_id:)
      lambda do |info|
        payload = { import_id: import_id, **info }
        ActiveSupport::Notifications.instrument("smart_csv_import.import.progress", payload)
      end
    end
  end
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'csv'
|
|
4
|
+
|
|
5
|
+
module SmartCsvImport
  class Processor
    # Cheap checks run directly on the raw CSV file: duplicate header
    # detection and counting of cells that match the configured nil values.
    module CsvPreflightAnalyzer
      module_function

      # Best-effort scan of the header line for repeated column names.
      #
      # @param file_path [String] CSV file to inspect
      # @return [Array<RowWarning>] one warning per repeated occurrence;
      #   [] when the file is empty or anything goes wrong while reading it
      def duplicate_header_warnings(file_path:)
        headers = read_headers(file_path)
        return [] if headers.nil?

        find_duplicates(headers).map { |dup| build_duplicate_warning(dup) }
      rescue StandardError
        # Deliberately non-fatal: a failed preflight must not block the import.
        []
      end

      # @param dup [Hash] entry produced by find_duplicates
      # @return [RowWarning] warning attached to row 0
      def build_duplicate_warning(dup)
        RowWarning.new(
          row: 0,
          message: "Duplicate column '#{dup[:original]}' — occurrence renamed to '#{dup[:renamed_to]}'",
          type: :duplicate_header
        )
      end

      # Reads the first line of the file (the mode string requests BOM
      # handling) and parses it as one comma-separated CSV line.
      #
      # @param file_path [String]
      # @return [Array<String, nil>, nil] stripped header cells; nil for an
      #   empty file
      def read_headers(file_path)
        first_line = File.open(file_path, 'r:bom|utf-8:utf-8', invalid: :replace, undef: :replace, &:gets)
        return nil unless first_line

        # The invalid:/undef: options above only act during transcoding, and
        # reading UTF-8 as UTF-8 does not appear to transcode, so replace
        # invalid bytes explicitly before parsing. scrub is a no-op on valid
        # text.
        CSV.parse_line(first_line.scrub)&.map { |h| h&.strip }
      end

      # Counts, across all data rows, the cells NilCellCounter attributes to
      # +nil_values+.
      #
      # @param file_path [String] CSV file to scan
      # @param nil_values [Object, nil] nil-value specification; nil or empty
      #   means nothing to count
      # @return [Integer] cell count (partial if the file turns out malformed)
      # @raise [SmartCsvImport::Error] when the file is missing or unreadable
      def count_nil_matches(file_path:, nil_values:)
        return 0 if nil_values.nil? || (nil_values.respond_to?(:empty?) && nil_values.empty?)

        count = 0
        begin
          each_data_row(file_path) { |row| count += NilCellCounter.count_row(row, nil_values) }
        rescue CSV::MalformedCSVError
          # Stop the scan and return the partial count — SmarterCSV surfaces the parse error downstream.
        end
        count
      rescue Errno::ENOENT, Errno::EACCES => e
        raise SmartCsvImport::Error, "Failed to read CSV for nil-value scan in #{File.basename(file_path)}: #{e.message}"
      end

      # Finds repeated header names. The Nth occurrence (N >= 2) of a name is
      # reported with renamed_to "<name>_<N>"; nil headers are skipped.
      #
      # @param headers [Array<String, nil>]
      # @return [Array<Hash>] hashes with :original and :renamed_to
      def find_duplicates(headers)
        occurrences = Hash.new(0)
        headers.each_with_object([]) do |header, duplicates|
          next unless header

          occurrences[header] += 1
          next if occurrences[header] == 1

          duplicates << { original: header, renamed_to: "#{header}_#{occurrences[header]}" }
        end
      end

      # Yields every parsed row after the first (header) row.
      #
      # @param file_path [String]
      # @yieldparam row [Array<String, nil>]
      def each_data_row(file_path)
        first_row = true
        CSV.foreach(file_path, encoding: 'bom|utf-8:utf-8', invalid: :replace, undef: :replace) do |row|
          if first_row
            first_row = false
            next
          end
          yield row
        end
      end
    end
  end
end
|