smart_csv_import 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.adoc +134 -0
- data/README.md +534 -0
- data/app/jobs/smart_csv_import/import_job.rb +22 -0
- data/app/models/smart_csv_import/import.rb +36 -0
- data/app/models/smart_csv_import/import_row_error.rb +17 -0
- data/lib/generators/smart_csv_import/import/import_generator.rb +49 -0
- data/lib/generators/smart_csv_import/import/templates/import_form.rb.tt +32 -0
- data/lib/generators/smart_csv_import/import/templates/import_form_spec.rb.tt +38 -0
- data/lib/generators/smart_csv_import/install/install_generator.rb +34 -0
- data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_import_row_errors.rb.tt +18 -0
- data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_imports.rb.tt +23 -0
- data/lib/generators/smart_csv_import/install/templates/initializer.rb.tt +51 -0
- data/lib/generators/smart_csv_import/scaffold/scaffold_generator.rb +56 -0
- data/lib/generators/smart_csv_import/scaffold/templates/controller.rb.tt +33 -0
- data/lib/generators/smart_csv_import/scaffold/templates/new.html.erb.tt +12 -0
- data/lib/generators/smart_csv_import/scaffold/templates/show.html.erb.tt +59 -0
- data/lib/smart_csv_import/configuration.rb +77 -0
- data/lib/smart_csv_import/cosine_similarity.rb +15 -0
- data/lib/smart_csv_import/engine.rb +12 -0
- data/lib/smart_csv_import/failed_row_exporter.rb +78 -0
- data/lib/smart_csv_import/file_storage.rb +34 -0
- data/lib/smart_csv_import/header_normalizer.rb +76 -0
- data/lib/smart_csv_import/logging.rb +37 -0
- data/lib/smart_csv_import/match_result.rb +36 -0
- data/lib/smart_csv_import/matchable.rb +76 -0
- data/lib/smart_csv_import/matcher.rb +198 -0
- data/lib/smart_csv_import/normalizers/boolean_converter.rb +26 -0
- data/lib/smart_csv_import/normalizers/date_converter.rb +28 -0
- data/lib/smart_csv_import/notifications.rb +16 -0
- data/lib/smart_csv_import/processor/csv_preflight_analyzer.rb +74 -0
- data/lib/smart_csv_import/processor/import_result_builder.rb +97 -0
- data/lib/smart_csv_import/processor/mapping_review_policy.rb +90 -0
- data/lib/smart_csv_import/processor/nil_cell_counter.rb +19 -0
- data/lib/smart_csv_import/processor/null_progress_callback.rb +11 -0
- data/lib/smart_csv_import/processor/row_processor.rb +70 -0
- data/lib/smart_csv_import/processor.rb +294 -0
- data/lib/smart_csv_import/result.rb +101 -0
- data/lib/smart_csv_import/stability_report.rb +104 -0
- data/lib/smart_csv_import/strategies/llm.rb +106 -0
- data/lib/smart_csv_import/strategies/lookup.rb +41 -0
- data/lib/smart_csv_import/strategies/vector.rb +155 -0
- data/lib/smart_csv_import/strategy.rb +9 -0
- data/lib/smart_csv_import/strategy_failure.rb +13 -0
- data/lib/smart_csv_import/version.rb +5 -0
- data/lib/smart_csv_import.rb +79 -0
- data/smart_csv_import.gemspec +35 -0
- metadata +216 -0
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
|
|
4
|
+
class Processor
|
|
5
|
+
module ImportResultBuilder
|
|
6
|
+
module_function
|
|
7
|
+
|
|
8
|
+
def build_parse_errors(bad_rows)
|
|
9
|
+
bad_rows.map do |bad_row|
|
|
10
|
+
ParseError.new(
|
|
11
|
+
line_number: bad_row[:csv_line_number] || bad_row[:file_line_number],
|
|
12
|
+
raw_line: bad_row[:raw_logical_line].to_s.chomp,
|
|
13
|
+
error_message: bad_row[:error_message].to_s
|
|
14
|
+
)
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def build_row_error_attributes(errors:, parse_errors:, import_id:)
|
|
19
|
+
now = Time.current
|
|
20
|
+
validation_attrs = errors.map do |err|
|
|
21
|
+
{
|
|
22
|
+
import_id: import_id,
|
|
23
|
+
row_number: err.row,
|
|
24
|
+
error_type: 'validation',
|
|
25
|
+
column_name: err.column.to_s,
|
|
26
|
+
messages: err.messages,
|
|
27
|
+
raw_line: nil,
|
|
28
|
+
error_message: nil,
|
|
29
|
+
created_at: now
|
|
30
|
+
}
|
|
31
|
+
end
|
|
32
|
+
parse_attrs = parse_errors.map do |pe|
|
|
33
|
+
{
|
|
34
|
+
import_id: import_id,
|
|
35
|
+
row_number: pe.line_number,
|
|
36
|
+
error_type: 'parse',
|
|
37
|
+
column_name: nil,
|
|
38
|
+
messages: [],
|
|
39
|
+
raw_line: pe.raw_line,
|
|
40
|
+
error_message: pe.error_message,
|
|
41
|
+
created_at: now
|
|
42
|
+
}
|
|
43
|
+
end
|
|
44
|
+
validation_attrs + parse_attrs
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def build_blank_row_warnings(blank_count)
|
|
48
|
+
return [] if blank_count.zero?
|
|
49
|
+
|
|
50
|
+
noun = blank_count == 1 ? 'row' : 'rows'
|
|
51
|
+
verb = blank_count == 1 ? 'was' : 'were'
|
|
52
|
+
[RowWarning.new(row: 0, message: "#{blank_count} blank #{noun} #{verb} skipped", type: :blank_rows)]
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def build_abort_warning(parse_error_count:, total:, bad_row_limit:)
|
|
56
|
+
ratio = total.positive? ? parse_error_count.to_f / total : 0
|
|
57
|
+
pct = (ratio * 100).round(1)
|
|
58
|
+
limit_pct = (bad_row_limit * 100).round(1)
|
|
59
|
+
RowWarning.new(
|
|
60
|
+
row: 0,
|
|
61
|
+
message: "Import aborted: #{pct}% of rows were malformed (limit: #{limit_pct}%)"
|
|
62
|
+
)
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def final_result(imported:, failed:, errors:, parse_errors:, warnings:, header_mappings:, import_id:, dry_run:)
|
|
66
|
+
total = imported + failed + parse_errors.size
|
|
67
|
+
all_failed = failed + parse_errors.size
|
|
68
|
+
|
|
69
|
+
if dry_run
|
|
70
|
+
Result.dry_run(imported: imported, failed: failed, total: total, errors: errors,
|
|
71
|
+
header_mappings: header_mappings, warnings: warnings, parse_errors: parse_errors)
|
|
72
|
+
elsif all_failed.positive?
|
|
73
|
+
Result.partial_failure(imported: imported, failed: failed, total: total, errors: errors,
|
|
74
|
+
header_mappings: header_mappings, import_id: import_id,
|
|
75
|
+
warnings: warnings, parse_errors: parse_errors)
|
|
76
|
+
else
|
|
77
|
+
Result.completed(imported: imported, failed: failed, total: total, errors: errors,
|
|
78
|
+
header_mappings: header_mappings, import_id: import_id,
|
|
79
|
+
warnings: warnings, parse_errors: parse_errors)
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def abort_result(imported:, failed:, parse_errors:, warnings:, header_mappings:, import_id:, bad_row_limit:)
|
|
84
|
+
total = imported + failed + parse_errors.size
|
|
85
|
+
abort_warning = build_abort_warning(
|
|
86
|
+
parse_error_count: parse_errors.size, total: total, bad_row_limit: bad_row_limit
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
Result.partial_failure(
|
|
90
|
+
imported: imported, failed: failed, total: total, errors: [],
|
|
91
|
+
header_mappings: header_mappings, import_id: import_id,
|
|
92
|
+
warnings: [*warnings, abort_warning], parse_errors: parse_errors
|
|
93
|
+
)
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
# frozen_string_literal: true

module SmartCsvImport
  class Processor
    # Pure policy functions: turn header-matching results into field maps,
    # user-facing warnings, and the decision of whether a human must review
    # the proposed mappings before rows are imported.
    module MappingReviewPolicy
      module_function

      # Serializes mappings for persistence: header => matched field name as
      # a String ("" when the result carries a nil field), or nil when the
      # result object has no target_field method at all.
      def serialize_mappings(mappings)
        mappings.each_with_object({}) do |(header, result), acc|
          acc[header] = result.respond_to?(:target_field) ? result.target_field.to_s : nil
        end
      end

      # Only confidently matched headers make it into the field map used to
      # build row attributes.
      def build_field_map(mappings)
        mappings.each_with_object({}) do |(header, result), acc|
          next unless result.respond_to?(:target_field) && result.matched?

          acc[header] = result.target_field
        end
      end

      def unmatched_headers(mappings)
        mappings.select { |_, result| result.unmatched? }.keys
      end

      # Annotates every header with a status (:unmatched / :low_confidence /
      # :matched) so a review UI can highlight problem columns.
      def build_proposed_mappings(mappings, confidence_threshold:)
        mappings.each_with_object({}) do |(header, result), proposed|
          proposed[header] = if result.unmatched?
                               { field: nil, confidence: 0.0, status: :unmatched }
                             elsif result.confidence < confidence_threshold
                               { field: result.target_field, confidence: result.confidence, status: :low_confidence }
                             else
                               { field: result.target_field, confidence: result.confidence, status: :matched }
                             end
        end
      end

      # Decides whether the import should pause for human mapping review.
      #   :always — always pause
      #   :auto   — pause when anything is unmatched or below the threshold
      #   :skip   — pause only when a *required* field is missing/uncertain
      # Raises ArgumentError for an unrecognized mode. (Previously an unknown
      # mode fell through the case and returned nil, silently skipping review.)
      def review_gate_triggered?(mappings, review_mode:, confidence_threshold:, required_fields:)
        case review_mode
        when :always
          true
        when :auto
          auto_gate?(mappings, confidence_threshold)
        when :skip
          required_field_below_threshold?(mappings, confidence_threshold, required_fields)
        else
          raise ArgumentError, "unknown review_mode: #{review_mode.inspect}"
        end
      end

      # Builds the warning list shown to the user: optional duplicate-file
      # warning, one warning per unmatched column, plus a summary line.
      def collect_warnings(mappings, duplicate_warning: nil)
        base = duplicate_warning ? [duplicate_warning] : []
        unmatched = unmatched_headers(mappings)
        return base if unmatched.empty?

        per_column = unmatched.map do |header|
          UnmatchedColumnWarning.new(
            column_name: header,
            message: "Column '#{header}' was not imported — no matching field found"
          )
        end
        [*base, *per_column, summary_warning(unmatched)]
      end

      # :auto gate — any unmatched header, or any match below the threshold.
      def auto_gate?(mappings, confidence_threshold)
        return true if mappings.values.any?(&:unmatched?)

        mappings.values.any? do |r|
          r.respond_to?(:confidence) && r.confidence < confidence_threshold
        end
      end

      # True when any required field is either unmapped or mapped with
      # confidence below the threshold.
      def required_field_below_threshold?(mappings, confidence_threshold, required_fields)
        required_fields.any? do |field_name|
          mapping = mappings.values.find do |result|
            result.respond_to?(:target_field) && result.target_field == field_name
          end
          mapping.nil? || mapping.confidence < confidence_threshold
        end
      end

      # One-line summary of all unmatched columns, with number agreement.
      def summary_warning(unmatched)
        noun = unmatched.size == 1 ? 'column' : 'columns'
        verb = unmatched.size == 1 ? 'was' : 'were'
        RowWarning.new(
          row: 0,
          message: "#{unmatched.size} #{noun} from your file #{verb} not imported: #{unmatched.join(', ')}"
        )
      end
    end
  end
end
# frozen_string_literal: true

module SmartCsvImport
  class Processor
    # Counts raw CSV cells that match the configured nil-value markers
    # (e.g. Excel error strings) so the user can be warned about cleanup.
    module NilCellCounter
      module_function

      # Total matching cells across all rows; 0 when no markers configured.
      def count_cells(rows:, nil_values:)
        return 0 if nil_values.blank?

        rows.reduce(0) { |acc, row| acc + count_row(row, nil_values) }
      end

      # Matching cells within a single row; nil cells never match.
      def count_row(row, nil_values)
        row.count { |cell| cell ? nil_values.include?(cell.strip) : false }
      end
    end
  end
end
# frozen_string_literal: true

module SmartCsvImport
  class Processor
    # Runs one chunk of parsed CSV rows through the form object, tallying
    # imports, failures, blank rows, and any database-level abort.
    module RowProcessor
      module_function

      ChunkResult = Struct.new(:imported, :failed, :blank, :errors, :aborted_error, keyword_init: true)

      # Processes a chunk and returns a ChunkResult. Stops early — leaving
      # the remaining rows of the chunk unprocessed — when the database
      # raises on a row.
      def process_chunk(chunk, field_map:, form_class:, dry_run:)
        blank_rows, data_rows = chunk.partition { |row| blank_row?(row) }
        imported_total = 0
        failed_total = 0
        row_errors = []
        abort_error = nil

        data_rows.each do |csv_row|
          status, payload = submit_row(csv_row, field_map: field_map, form_class: form_class, dry_run: dry_run)
          case status
          when :imported
            imported_total += 1
          when :failed
            failed_total += 1
            row_errors.concat(payload)
          when :aborted
            abort_error = payload
            break
          end
        end

        ChunkResult.new(
          imported: imported_total, failed: failed_total, blank: blank_rows.size,
          errors: row_errors, aborted_error: abort_error
        )
      end

      # Validates (dry run) or saves a single row. Returns [:imported],
      # [:failed, row_errors], or [:aborted, exception] for DB-level errors.
      def submit_row(csv_row, field_map:, form_class:, dry_run:)
        form = form_class.new(build_attributes(csv_row, field_map))
        if dry_run ? form.valid? : form.save
          [:imported]
        else
          [:failed, collect_row_errors(form, csv_row[:csv_line_number] || 0)]
        end
      rescue ActiveRecord::StatementInvalid => e
        [:aborted, e]
      end

      # A row is blank when every cell (ignoring the injected line-number
      # key) is nil or stringifies to "".
      def blank_row?(row)
        row.except(:csv_line_number).values.all? { |value| value.nil? || value.to_s.empty? }
      end

      # Maps CSV headers to form attribute names via the field map.
      def build_attributes(csv_row, field_map)
        attributes = {}
        field_map.each do |header, field_name|
          next if header == :csv_line_number

          attributes[field_name] = csv_row[header]
        end
        attributes
      end

      # Converts the form's errors into RowError structs, one per attribute.
      def collect_row_errors(form, row_number)
        form.errors.group_by_attribute.map do |attribute, attr_errors|
          RowError.new(row: row_number, column: attribute.to_sym, messages: attr_errors.map(&:message))
        end
      end
    end
  end
end
# frozen_string_literal: true

require "csv"
require "smarter_csv"

module SmartCsvImport
  # Orchestrates one CSV import end to end: creates/reuses the Import record,
  # matches CSV headers to form fields, optionally pauses for human mapping
  # review, streams rows in chunks through the form object, and persists
  # final counts plus per-row errors.
  class Processor
    include Logging

    # Used when no chunk_size is configured — effectively loads the whole
    # file as a single chunk.
    FULL_LOAD_CHUNK_SIZE = 10_000_000

    # file_path:          path of the CSV file to import
    # form_class:         must include SmartCsvImport::Matchable and define #save
    # mode:               :sync runs inline; :async enqueues ImportJob
    # batch_size:         stored but not referenced anywhere in this file
    #                     (NOTE(review): possibly dead — confirm against callers)
    # dry_run:            validate rows without persisting them
    # import:             existing Import record to resume (skips duplicate check)
    # confirmed_mappings: user-confirmed header => field map; bypasses matching
    # on_progress:        object responding to #call, invoked after each chunk
    def initialize(file_path:, form_class:, mode: :sync, batch_size: SmartCsvImport.configuration.batch_size, dry_run: false, import: nil, confirmed_mappings: nil, on_progress: nil)
      @file_path = file_path
      @form_class = form_class
      @mode = mode
      @batch_size = batch_size
      @dry_run = dry_run
      @existing_import = import
      @confirmed_mappings = confirmed_mappings
      @on_progress = on_progress

      validate_form_class!
      validate_on_progress!

      # No-op callback keeps fire_progress_callback unconditional.
      @on_progress ||= NullProgressCallback.new
    end

    # Entry point. Returns a Result whose status is one of :queued,
    # :review_required, :dry_run, :completed, or :partial_failure.
    def call
      import = @existing_import || create_import
      check_duplicate(import) unless @existing_import

      return enqueue_async(import) if @mode == :async

      process_csv(import)
    end

    private

    # Fails fast on a misconfigured form class before any file work happens.
    def validate_form_class!
      unless @form_class.respond_to?(:csv_fields) && @form_class.ancestors.include?(Matchable)
        raise ConfigurationError, "form_class must include SmartCsvImport::Matchable"
      end

      unless @form_class.method_defined?(:save)
        raise ConfigurationError, "form_class must define a #save method"
      end
    end

    # Copies the source file into managed storage, hashes it, and records
    # a new Import in "processing" state.
    def create_import
      stored_path = FileStorage.store(source_path: @file_path, import_type: @form_class.name)
      file_hash = FileStorage.compute_hash(file_path: @file_path)

      Import.create!(
        import_type: @form_class.name,
        original_filename: File.basename(@file_path),
        file_path: stored_path,
        file_hash: file_hash,
        status: "processing"
      )
    end

    # Sets @duplicate_warning (consumed later via collect_warnings) when an
    # earlier import with the same file hash exists. Does not block the import.
    def check_duplicate(import)
      duplicate = Import.find_duplicate(file_hash: import.file_hash, import_type: import.import_type)
      return unless duplicate && duplicate.id != import.id

      @duplicate_warning = RowWarning.new(
        row: 0,
        message: "File appears to be a duplicate of import ##{duplicate.id}"
      )
    end

    # Hands the work to ImportJob and reports a :queued Result immediately.
    def enqueue_async(import)
      ImportJob.perform_later(import.id, @form_class.name)
      Result.queued(import_id: import.id)
    end

    # Preflights the file, resolves header mappings (confirmed or matched),
    # then either pauses for mapping review or proceeds to row processing.
    # Marks the Import "failed" and re-raises on any unexpected error.
    def process_csv(import)
      dup_header_warnings = CsvPreflightAnalyzer.duplicate_header_warnings(file_path: @file_path)
      nil_match_count = CsvPreflightAnalyzer.count_nil_matches(
        file_path: @file_path, nil_values: SmartCsvImport.configuration.nil_values_matching
      )
      nil_warnings = if nil_match_count > 0
                       noun = nil_match_count == 1 ? "cell" : "cells"
                       [RowWarning.new(row: 0, message: "#{nil_match_count} #{noun} contained Excel error markers and were treated as empty", type: :nil_cleaned)]
                     else
                       []
                     end

      # Second pass after a review: the user already confirmed mappings, so
      # skip matching and the review gate entirely.
      if @confirmed_mappings
        field_map = @confirmed_mappings
        serialized_mappings = field_map.transform_values(&:to_s)
        import.update!(header_mappings: serialized_mappings)
        return process_rows(import, field_map, [*dup_header_warnings, *nil_warnings])
      end

      mappings = run_matcher
      serialized_mappings = MappingReviewPolicy.serialize_mappings(mappings)
      import.update!(header_mappings: serialized_mappings)

      warnings = MappingReviewPolicy.collect_warnings(mappings, duplicate_warning: @duplicate_warning)

      threshold = SmartCsvImport.configuration.confidence_threshold

      if MappingReviewPolicy.review_gate_triggered?(
        mappings,
        review_mode: SmartCsvImport.configuration.review_mode,
        confidence_threshold: threshold,
        required_fields: @form_class.required_csv_fields
      )
        import.update!(status: "mapping_review")
        proposed = MappingReviewPolicy.build_proposed_mappings(mappings, confidence_threshold: threshold)
        unmatched = MappingReviewPolicy.unmatched_headers(mappings)
        return Result.review_required(
          header_mappings: serialized_mappings,
          import_id: import.id,
          proposed_mappings: proposed,
          unmatched_columns: unmatched,
          warnings: [*dup_header_warnings, *nil_warnings, *warnings]
        )
      end

      field_map = MappingReviewPolicy.build_field_map(mappings)
      process_rows(import, field_map, [*dup_header_warnings, *nil_warnings, *warnings])
    rescue StandardError => e
      log_error("Processing failed: #{e.message}")
      import.update!(status: "failed")
      raise
    end

    # Delegates header-to-field matching to the Matcher strategy pipeline.
    def run_matcher
      Matcher.new(
        file_path: @file_path,
        form_class: @form_class,
        confidence_threshold: SmartCsvImport.configuration.confidence_threshold
      ).call
    end

    # Streams the CSV in chunks, accumulating counts and errors. Aborts when
    # the proportion of malformed rows exceeds bad_row_limit, and returns
    # immediately on a database-level failure inside a chunk.
    def process_rows(import, field_map, warnings)
      imported_count = 0
      failed_count = 0
      errors = []
      blank_count = 0
      aborted = false

      reader = SmarterCSV::Reader.new(@file_path, smarter_csv_options.merge(chunk_size: effective_chunk_size))
      # NOTE(review): assumes SmarterCSV::Reader#each_chunk yields
      # (chunk, zero-based index) and exposes #errors with a :bad_rows key —
      # confirm against the smarter_csv version in use.
      reader.each_chunk do |chunk, chunk_index|
        chunk_number = chunk_index + 1

        # Mid-stream bad-row check: stop before processing this chunk when
        # malformed rows already exceed the configured ratio.
        if chunk_number > 1
          bad_rows_so_far = reader.errors.fetch(:bad_rows, []).size
          total_seen = imported_count + failed_count + bad_rows_so_far
          if total_seen > 0 && bad_rows_so_far.to_f / total_seen > SmartCsvImport.configuration.bad_row_limit
            aborted = true
            break
          end
        end

        chunk_result = RowProcessor.process_chunk(
          chunk, field_map: field_map, form_class: @form_class, dry_run: @dry_run
        )
        imported_count += chunk_result.imported
        failed_count += chunk_result.failed
        blank_count += chunk_result.blank
        errors = [*errors, *chunk_result.errors]

        # A database-level error aborts the whole import: persist what we
        # have and return a partial_failure immediately.
        if chunk_result.aborted_error
          log_error("Database error during chunk #{chunk_number}: #{chunk_result.aborted_error.message}")
          chunk_parse_errors = ImportResultBuilder.build_parse_errors(reader.errors.fetch(:bad_rows, []))
          persist_final_import(
            import,
            status: "partial_failure",
            imported_count: imported_count,
            failed_count: failed_count + chunk_parse_errors.size,
            total_rows: imported_count + failed_count + chunk_parse_errors.size,
            errors: errors, parse_errors: chunk_parse_errors
          )
          return Result.partial_failure(
            imported: imported_count,
            failed: failed_count,
            total: imported_count + failed_count + chunk_parse_errors.size,
            errors: errors,
            header_mappings: import.header_mappings,
            import_id: import.id,
            warnings: [*warnings, *ImportResultBuilder.build_blank_row_warnings(blank_count)],
            parse_errors: chunk_parse_errors
          )
        end

        # Keep the Import record's live counts current for progress polling.
        parse_errors_count = reader.errors.fetch(:bad_rows, []).size
        import.update!(imported_count: imported_count, failed_count: failed_count + parse_errors_count)

        fire_progress_callback(imported_count, failed_count, chunk_number)
      end

      parse_errors = ImportResultBuilder.build_parse_errors(reader.errors.fetch(:bad_rows, []))
      all_warnings = [*warnings, *ImportResultBuilder.build_blank_row_warnings(blank_count)]

      # Final bad-row check covers files small enough to fit a single chunk
      # (the mid-stream check only runs from chunk 2 onward).
      total = imported_count + failed_count + parse_errors.size
      if !aborted && parse_errors.any? && total > 0
        ratio = parse_errors.size.to_f / total
        aborted = ratio > SmartCsvImport.configuration.bad_row_limit
      end

      return build_abort_result(import, imported_count, failed_count, errors, parse_errors, all_warnings) if aborted

      build_final_result(import, imported_count, failed_count, errors, parse_errors, all_warnings)
    end

    # Options passed to SmarterCSV. Headers are kept as original strings,
    # bad rows are collected (not raised) along with their raw lines, and
    # the gem's own bad_row_limit is disabled because this class enforces
    # its own ratio-based limit.
    def smarter_csv_options
      {
        strings_as_keys: true,
        strip_whitespace: true,
        keep_original_headers: true,
        field_size_limit: SmartCsvImport.configuration.field_size_limit,
        with_line_numbers: true,
        on_bad_row: :collect,
        collect_raw_lines: true,
        convert_values_to_numeric: false,
        remove_empty_values: false,
        remove_empty_hashes: false,
        invalid_byte_sequence: "",
        force_utf8: true,
        duplicate_header_suffix: "_",
        nil_values_matching: SmartCsvImport.configuration.nil_values_regexp,
        bad_row_limit: nil
      }
    end

    # Configured chunk size, or one huge chunk when none is set.
    def effective_chunk_size
      SmartCsvImport.configuration.chunk_size || FULL_LOAD_CHUNK_SIZE
    end

    # Persists a "failed" Import and returns the abort Result with the
    # explanatory warning appended. Row-level errors are still persisted
    # but the returned Result carries an empty errors list (see
    # ImportResultBuilder.abort_result).
    def build_abort_result(import, imported_count, failed_count, errors, parse_errors, warnings)
      persist_final_import(
        import, status: "failed",
        imported_count: imported_count, failed_count: failed_count + parse_errors.size,
        total_rows: imported_count + failed_count + parse_errors.size,
        errors: errors, parse_errors: parse_errors
      )
      ImportResultBuilder.abort_result(
        imported: imported_count, failed: failed_count, parse_errors: parse_errors,
        warnings: warnings, header_mappings: import.header_mappings, import_id: import.id,
        bad_row_limit: SmartCsvImport.configuration.bad_row_limit
      )
    end

    # Persists final counts/status and delegates Result construction.
    # NOTE(review): runs even in dry_run mode, so the Import record's status
    # reflects completed/partial_failure — confirm this is intended.
    def build_final_result(import, imported_count, failed_count, errors, parse_errors, warnings)
      all_failed = failed_count + parse_errors.size
      total = imported_count + failed_count + parse_errors.size
      persist_final_import(
        import, status: all_failed > 0 ? "partial_failure" : "completed",
        imported_count: imported_count, failed_count: all_failed,
        total_rows: total, errors: errors, parse_errors: parse_errors
      )
      ImportResultBuilder.final_result(
        imported: imported_count, failed: failed_count, errors: errors,
        parse_errors: parse_errors, warnings: warnings,
        header_mappings: import.header_mappings, import_id: import.id, dry_run: @dry_run
      )
    end

    # Atomically writes the final Import state and bulk-inserts the row
    # errors in batches of 1000.
    def persist_final_import(import, status:, imported_count:, failed_count:, total_rows:, errors:, parse_errors:)
      ImportRowError.transaction do
        import.update!(
          status: status, imported_count: imported_count, failed_count: failed_count,
          total_rows: total_rows
        )
        attrs = ImportResultBuilder.build_row_error_attributes(
          errors: errors, parse_errors: parse_errors, import_id: import.id
        )
        # rubocop:disable Rails/SkipsModelValidations
        # Bulk insert is intentional — attributes built from pre-validated RowError/ParseError structs.
        attrs.each_slice(1000) { |batch| ImportRowError.insert_all(batch) } if attrs.any?
        # rubocop:enable Rails/SkipsModelValidations
      end
    end

    # Invokes the user-supplied progress callback with cumulative counts.
    def fire_progress_callback(imported_count, failed_count, chunk_number)
      @on_progress.call({
        imported: imported_count,
        failed: failed_count,
        chunk_number: chunk_number
      })
    end

    # A nil callback is fine (replaced by NullProgressCallback); anything
    # else must be callable.
    def validate_on_progress!
      return if @on_progress.nil?

      unless @on_progress.respond_to?(:call)
        raise ConfigurationError, "on_progress must respond to :call"
      end
    end
  end
end
# frozen_string_literal: true

module SmartCsvImport
  RowError = Struct.new(:row, :column, :messages, keyword_init: true)
  RowWarning = Struct.new(:row, :message, :type, keyword_init: true)
  ParseError = Struct.new(:line_number, :raw_line, :error_message, keyword_init: true)
  UnmatchedColumnWarning = Struct.new(:column_name, :message, keyword_init: true)

  # Outcome object returned by the import pipeline. Instances are built only
  # through the named factory methods below — `new` is private — so every
  # Result carries a well-defined status symbol.
  class Result
    attr_reader :status, :imported, :failed, :total, :errors,
                :header_mappings, :import_id, :warnings, :parse_errors,
                :proposed_mappings, :unmatched_columns

    private_class_method :new

    # Processing finished; failed rows (if any) were still recorded.
    def self.completed(imported:, failed:, total:, errors:, header_mappings:, import_id: nil, warnings: [], parse_errors: [])
      new(status: :completed, imported: imported, failed: failed, total: total,
          errors: errors, header_mappings: header_mappings, import_id: import_id,
          warnings: warnings, parse_errors: parse_errors)
    end

    # Processing finished but some rows failed validation or parsing.
    def self.partial_failure(imported:, failed:, total:, errors:, header_mappings:, import_id: nil, warnings: [], parse_errors: [])
      new(status: :partial_failure, imported: imported, failed: failed, total: total,
          errors: errors, header_mappings: header_mappings, import_id: import_id,
          warnings: warnings, parse_errors: parse_errors)
    end

    # Import paused so the user can confirm or correct the proposed mappings.
    def self.review_required(header_mappings:, import_id: nil, proposed_mappings: {}, unmatched_columns: [], warnings: [])
      new(status: :review_required, header_mappings: header_mappings, import_id: import_id,
          proposed_mappings: proposed_mappings, unmatched_columns: unmatched_columns,
          warnings: warnings)
    end

    # Async import enqueued; count keywords are rejected because nothing
    # has been processed yet.
    def self.queued(import_id:, **rest)
      invalid_keys = rest.keys & %i[imported failed total]
      raise ArgumentError, "queued result does not accept: #{invalid_keys.join(", ")}" if invalid_keys.any?

      new(status: :queued, import_id: import_id)
    end

    # Validation-only run: counts reflect what *would* have happened.
    def self.dry_run(imported:, failed:, total:, errors:, header_mappings:, warnings: [], parse_errors: [])
      new(status: :dry_run, imported: imported, failed: failed, total: total,
          errors: errors, header_mappings: header_mappings,
          warnings: warnings, parse_errors: parse_errors)
    end

    def completed? = status == :completed
    def partial_failure? = status == :partial_failure
    def review_required? = status == :review_required
    def queued? = status == :queued
    def dry_run? = status == :dry_run
    def success? = completed? && errors.empty?

    private

    def initialize(status:, imported: nil, failed: nil, total: nil, errors: [], header_mappings: {}, import_id: nil, warnings: [], parse_errors: [], proposed_mappings: {}, unmatched_columns: [])
      @status = status
      @imported = imported
      @failed = failed
      @total = total
      @errors = errors
      @header_mappings = header_mappings
      @import_id = import_id
      @warnings = warnings
      @parse_errors = parse_errors
      @proposed_mappings = proposed_mappings
      @unmatched_columns = unmatched_columns
    end
  end
end