smart_csv_import 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.adoc +134 -0
- data/README.md +534 -0
- data/app/jobs/smart_csv_import/import_job.rb +22 -0
- data/app/models/smart_csv_import/import.rb +36 -0
- data/app/models/smart_csv_import/import_row_error.rb +17 -0
- data/lib/generators/smart_csv_import/import/import_generator.rb +49 -0
- data/lib/generators/smart_csv_import/import/templates/import_form.rb.tt +32 -0
- data/lib/generators/smart_csv_import/import/templates/import_form_spec.rb.tt +38 -0
- data/lib/generators/smart_csv_import/install/install_generator.rb +34 -0
- data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_import_row_errors.rb.tt +18 -0
- data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_imports.rb.tt +23 -0
- data/lib/generators/smart_csv_import/install/templates/initializer.rb.tt +51 -0
- data/lib/generators/smart_csv_import/scaffold/scaffold_generator.rb +56 -0
- data/lib/generators/smart_csv_import/scaffold/templates/controller.rb.tt +33 -0
- data/lib/generators/smart_csv_import/scaffold/templates/new.html.erb.tt +12 -0
- data/lib/generators/smart_csv_import/scaffold/templates/show.html.erb.tt +59 -0
- data/lib/smart_csv_import/configuration.rb +77 -0
- data/lib/smart_csv_import/cosine_similarity.rb +15 -0
- data/lib/smart_csv_import/engine.rb +12 -0
- data/lib/smart_csv_import/failed_row_exporter.rb +78 -0
- data/lib/smart_csv_import/file_storage.rb +34 -0
- data/lib/smart_csv_import/header_normalizer.rb +76 -0
- data/lib/smart_csv_import/logging.rb +37 -0
- data/lib/smart_csv_import/match_result.rb +36 -0
- data/lib/smart_csv_import/matchable.rb +76 -0
- data/lib/smart_csv_import/matcher.rb +198 -0
- data/lib/smart_csv_import/normalizers/boolean_converter.rb +26 -0
- data/lib/smart_csv_import/normalizers/date_converter.rb +28 -0
- data/lib/smart_csv_import/notifications.rb +16 -0
- data/lib/smart_csv_import/processor/csv_preflight_analyzer.rb +74 -0
- data/lib/smart_csv_import/processor/import_result_builder.rb +97 -0
- data/lib/smart_csv_import/processor/mapping_review_policy.rb +90 -0
- data/lib/smart_csv_import/processor/nil_cell_counter.rb +19 -0
- data/lib/smart_csv_import/processor/null_progress_callback.rb +11 -0
- data/lib/smart_csv_import/processor/row_processor.rb +70 -0
- data/lib/smart_csv_import/processor.rb +294 -0
- data/lib/smart_csv_import/result.rb +101 -0
- data/lib/smart_csv_import/stability_report.rb +104 -0
- data/lib/smart_csv_import/strategies/llm.rb +106 -0
- data/lib/smart_csv_import/strategies/lookup.rb +41 -0
- data/lib/smart_csv_import/strategies/vector.rb +155 -0
- data/lib/smart_csv_import/strategy.rb +9 -0
- data/lib/smart_csv_import/strategy_failure.rb +13 -0
- data/lib/smart_csv_import/version.rb +5 -0
- data/lib/smart_csv_import.rb +79 -0
- data/smart_csv_import.gemspec +35 -0
- metadata +216 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # ActiveJob wrapper that runs a stored import through SmartCsvImport::Processor.
  #
  # Jobs are enqueued on the :smart_csv_import queue.
  class ImportJob < ActiveJob::Base
    queue_as :smart_csv_import

    # Loads the import record, resolves the form class and runs the processor.
    #
    # @param import_id [Integer] id passed to Import.find
    # @param form_class_name [String] form class name, resolved with constantize
    # @raise [StandardError] whatever the lookup or the processor raised; the
    #   import (when it was found) is flagged "failed" before the error is re-raised
    def perform(import_id, form_class_name)
      import = Import.find(import_id)
      form_class = form_class_name.constantize

      Processor.new(
        file_path: import.file_path,
        form_class: form_class,
        mode: :sync,
        import: import
      ).call
    rescue StandardError
      # `import` is nil here when Import.find itself raised.
      mark_failed(import)
      raise
    end

    private

    # Flags the import as failed without letting a failure of the status
    # update itself replace the original processing error: an error raised by
    # update! is logged and swallowed so the caller's bare `raise` re-raises
    # the real cause.
    def mark_failed(import)
      import&.update!(status: "failed")
    rescue StandardError => e
      SmartCsvImport.configuration.logger.error(
        "[SmartCsvImport] could not mark import #{import.id} as failed: #{e.class}: #{e.message}"
      )
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # One CSV import run, persisted in the "smart_csv_import_imports" table.
  #
  # The superclass expression is evaluated when this class body is loaded, so
  # SmartCsvImport.base_model_class has to be set (for example in an
  # initializer) before the model is autoloaded if a custom ActiveRecord
  # subclass should be used as the parent.
  class Import < SmartCsvImport.base_model_class
    STATUSES = %w[pending mapping_review processing completed partial_failure failed].freeze

    self.table_name = "smart_csv_import_imports"

    # Row-level failures, always returned ordered by row_number and removed
    # together with the import.
    has_many :row_errors, -> { order(:row_number) },
             class_name: "SmartCsvImport::ImportRowError",
             foreign_key: :import_id,
             inverse_of: :import,
             dependent: :destroy

    validates :import_type, :original_filename, presence: true
    validates :status, inclusion: { in: STATUSES }

    # Defines one predicate per status: pending?, mapping_review?, processing?,
    # completed?, partial_failure? and failed?.
    STATUSES.each do |candidate|
      define_method("#{candidate}?") { status == candidate }
    end

    # Looks up an earlier import of the same file.
    #
    # @param file_hash [String, nil] value compared against the file_hash column
    # @param import_type [String, nil] when given, additionally restricts the
    #   lookup to this import_type
    # @return [Import, nil] the first matching record; nil when file_hash is
    #   nil or nothing matches
    def self.find_duplicate(file_hash:, import_type: nil)
      return if file_hash.nil?

      conditions = { file_hash: file_hash }
      conditions[:import_type] = import_type if import_type
      where(conditions).first
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # A single row-level failure of an import, persisted in the
  # "smart_csv_import_import_row_errors" table.
  class ImportRowError < SmartCsvImport.base_model_class
    # Allowed values of the error_type column.
    ERROR_TYPES = %w[parse validation].freeze

    self.table_name = "smart_csv_import_import_row_errors"

    # Every row error must belong to an import (optional: false).
    belongs_to :import,
               class_name: "SmartCsvImport::Import",
               inverse_of: :row_errors,
               optional: false

    validates :error_type, inclusion: { in: ERROR_TYPES }
    validates :row_number, presence: true

    # Scopes narrowing the relation to one error_type each.
    scope :parse_errors, -> { where(error_type: "parse") }
    scope :validation_errors, -> { where(error_type: "validation") }
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rails/generators"
|
|
4
|
+
|
|
5
|
+
module SmartCsvImport
  module Generators
    # Generates an import form object (and its spec) for a model.
    #
    # Takes the model name followed by zero or more `name[:type]` field specs.
    class ImportGenerator < Rails::Generators::Base
      source_root File.expand_path("templates", __dir__)

      argument :model_name, type: :string, desc: "The model name (e.g., Employee)"
      argument :fields, type: :array, default: [], desc: "Fields with optional types (e.g., first_name last_name:string email:string hire_date:date)"

      desc "Generates a SmartCsvImport import form object with csv_field declarations"

      # Writes app/forms/<model>_import_form.rb from the form template.
      def create_form_object
        template "import_form.rb.tt", File.join("app/forms", "#{form_file_name}.rb")
      end

      # Writes spec/forms/<model>_import_form_spec.rb from the spec template.
      def create_form_spec
        template "import_form_spec.rb.tt", File.join("spec/forms", "#{form_file_name}_spec.rb")
      end

      private

      # Class name of the generated form, e.g. "EmployeeImportForm".
      def form_class_name
        "#{model_name.camelize}ImportForm"
      end

      # File base name of the generated form, e.g. "employee_import_form".
      def form_file_name
        "#{model_name.underscore}_import_form"
      end

      # Field specs split on ":" into { name:, type: } hashes; the type falls
      # back to "string" when the spec carries none. Memoized.
      def parsed_fields
        @parsed_fields ||= fields.map do |spec|
          field_name, field_type = spec.split(":")
          { name: field_name, type: field_type || "string" }
        end
      end

      # Names of all parsed fields, in argument order.
      def field_names
        parsed_fields.map { |field| field[:name] }
      end

      # Names of the fields whose name contains "email".
      def email_fields
        field_names.grep(/email/)
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Import form for <%= model_name.camelize %> records: declares the attributes
# and csv_field mappings, validates each row and saves it as a
# <%= model_name.camelize %>.
class <%= form_class_name %>
  include ActiveModel::API
  include ActiveModel::Attributes
  include SmartCsvImport::Matchable

<% parsed_fields.each do |field| -%>
  attribute :<%= field[:name] %>, :<%= field[:type] %>
<% end -%>

<% parsed_fields.each do |field| -%>
  csv_field :<%= field[:name] %>, description: "<%= model_name.camelize %> <%= field[:name].humanize.downcase %>"
<% end -%>

<% if field_names.any? -%>
  validates <%= field_names.map { |n| ":#{n}" }.join(", ") %>, presence: true
<% end -%>
<% email_fields.each do |email_field| -%>
  validates :<%= email_field %>, format: { with: URI::MailTo::EMAIL_REGEXP }, allow_blank: true
<% end -%>

  # Builds a <%= model_name.camelize %> from the form attributes and saves it.
  # Returns false without touching the database when the form is invalid;
  # otherwise returns the result of record.save.
  def save
    return false unless valid?

    record = <%= model_name.camelize %>.new(
<% parsed_fields.each_with_index do |field, index| -%>
      <%= field[:name] %>: <%= field[:name] %><%= index < parsed_fields.length - 1 ? "," : "" %>
<% end -%>
    )

    record.save
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rails_helper"
|
|
4
|
+
|
|
5
|
+
RSpec.describe <%= form_class_name %> do
  describe ".csv_fields" do
    it "declares all expected csv_fields" do
      declared_fields = described_class.csv_fields.keys

      expect(declared_fields).to match_array(%i[<%= field_names.join(" ") %>])
    end
  end

  describe "validations" do
<% parsed_fields.each do |field| -%>
    it "requires <%= field[:name] %>" do
      # Run the validations once, then inspect the errors they produced.
      form = described_class.new(<%= field[:name] %>: nil).tap(&:valid?)

      expect(form.errors[:<%= field[:name] %>]).to include("can't be blank")
    end

<% end -%>
<% email_fields.each do |email_field| -%>
    it "validates format of <%= email_field %>" do
      form = described_class.new(<%= email_field %>: "invalid-email").tap(&:valid?)

      expect(form.errors[:<%= email_field %>]).to include("is invalid")
    end

<% end -%>
  end

  describe "#save" do
    it "responds to save" do
      expect(described_class.new).to respond_to(:save)
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rails/generators"
|
|
4
|
+
|
|
5
|
+
module SmartCsvImport
  module Generators
    # Installs the gem into a host application: initializer, the two
    # migrations and the tmp storage directories.
    class InstallGenerator < Rails::Generators::Base
      # strftime pattern of the version prefix in migration file names.
      MIGRATION_TIMESTAMP_FORMAT = "%Y%m%d%H%M%S"

      source_root File.expand_path("templates", __dir__)

      desc "Installs SmartCsvImport: initializer, migration, and storage directories"

      # Writes config/initializers/smart_csv_import.rb.
      def copy_initializer
        template "initializer.rb.tt", "config/initializers/smart_csv_import.rb"
      end

      # Writes both migrations. The row-errors migration gets a version one
      # second later than the imports migration so it runs second (it has a
      # foreign key to the imports table). Versions are taken in UTC, the
      # convention Rails uses for migration versions, so they sort correctly
      # next to migrations created by the stock Rails generators.
      def create_migration
        base_time = Time.now.utc

        template(
          "create_smart_csv_import_imports.rb.tt",
          migration_path(base_time, "create_smart_csv_import_imports")
        )
        template(
          "create_smart_csv_import_import_row_errors.rb.tt",
          migration_path(base_time + 1, "create_smart_csv_import_import_row_errors")
        )
      end

      # Creates the directories for uploaded files and the embeddings cache.
      def create_storage_directories
        empty_directory "tmp/smart_csv_import/imports"
        empty_directory "tmp/smart_csv_import/embeddings_cache"
      end

      private

      # Path of a migration file with the given version time and base name.
      def migration_path(time, name)
        "db/migrate/#{time.strftime(MIGRATION_TIMESTAMP_FORMAT)}_#{name}.rb"
      end
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Creates the table holding one record per failed row of an import.
class CreateSmartCsvImportImportRowErrors < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
  def change
    create_table :smart_csv_import_import_row_errors do |t|
      # Row errors are deleted by the database when their import is deleted.
      t.references :import, null: false, foreign_key: { to_table: :smart_csv_import_imports, on_delete: :cascade }
      t.integer :row_number, null: false
      t.string :error_type, null: false
      t.string :column_name
      t.json :messages, default: []
      t.text :raw_line
      t.text :error_message
      t.datetime :created_at, null: false
    end

    # Explicit name: the auto-generated one
    # ("index_smart_csv_import_import_row_errors_on_import_id_and_error_type_and_row_number")
    # is 83 characters long, which exceeds the index-name limit of the
    # supported databases and makes add_index raise on Rails versions that do
    # not shorten generated names.
    add_index :smart_csv_import_import_row_errors,
              [:import_id, :error_type, :row_number],
              name: "index_smart_csv_import_row_errors_on_import_type_row"
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Creates the table holding one record per CSV import run.
class CreateSmartCsvImportImports < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
  def change
    create_table :smart_csv_import_imports do |t|
      # What was imported and from which file.
      t.string :import_type, null: false
      t.string :original_filename, null: false
      t.string :file_path
      t.string :file_hash

      # Outcome of the run.
      t.json :header_mappings, default: {}
      t.integer :total_rows, default: 0
      t.integer :imported_count, default: 0
      t.integer :failed_count, default: 0
      t.json :warnings, default: []
      t.string :status, null: false, default: "pending"

      t.timestamps

      t.index :file_hash
      t.index :status
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# SmartCsvImport uses RubyLLM for AI-powered header matching.
|
|
4
|
+
# Configure your AI provider(s) below. At minimum, set one embedding model
|
|
5
|
+
# (for vector matching) and one LLM model (for fallback matching).
|
|
6
|
+
#
|
|
7
|
+
# Recommended free-tier setup: Gemini embeddings + Anthropic LLM
|
|
8
|
+
#
|
|
9
|
+
# RubyLLM.configure do |config|
|
|
10
|
+
# # --- Gemini (embeddings) ---
|
|
11
|
+
# # Free tier available at https://aistudio.google.com/apikey
|
|
12
|
+
# # config.gemini_api_key = ENV["GEMINI_API_KEY"]
|
|
13
|
+
#
|
|
14
|
+
# # --- Anthropic (LLM fallback) ---
|
|
15
|
+
# # config.anthropic_api_key = ENV["ANTHROPIC_API_KEY"]
|
|
16
|
+
#
|
|
17
|
+
# # --- OpenAI (alternative for both) ---
|
|
18
|
+
# # config.openai_api_key = ENV["OPENAI_API_KEY"]
|
|
19
|
+
# end
|
|
20
|
+
|
|
21
|
+
SmartCsvImport.configure do |config|
|
|
22
|
+
# Path where uploaded CSV files are stored
|
|
23
|
+
# config.storage_path = "tmp/smart_csv_import"
|
|
24
|
+
|
|
25
|
+
# Number of rows to process per batch
|
|
26
|
+
# config.batch_size = 500
|
|
27
|
+
|
|
28
|
+
# Minimum confidence threshold for header matching (0.0 - 1.0)
|
|
29
|
+
# config.confidence_threshold = 0.80
|
|
30
|
+
|
|
31
|
+
# Which strategy tier runs first when no custom strategy is set on the form class.
|
|
32
|
+
# :vector (default) — vector similarity first, LLM as fallback.
|
|
33
|
+
# :llm — LLM first, vector as fallback.
|
|
34
|
+
# config.default_strategy = :vector
|
|
35
|
+
|
|
36
|
+
# LLM model for fallback header matching (gem default when unset: "gpt-4o-mini")
|
|
37
|
+
# Anthropic: "claude-haiku-4-5-20251001" (fast, cheap)
|
|
38
|
+
# OpenAI: "gpt-4o-mini"
|
|
39
|
+
# config.llm_model = "claude-haiku-4-5-20251001"
|
|
40
|
+
|
|
41
|
+
# Embedding model for vector similarity matching (gem default when unset: "text-embedding-3-small")
|
|
42
|
+
# Gemini: "gemini-embedding-001" (free tier, recommended)
|
|
43
|
+
# OpenAI: "text-embedding-3-small"
|
|
44
|
+
# config.embedding_model = "gemini-embedding-001"
|
|
45
|
+
|
|
46
|
+
# Number of sample rows to use for value-based confidence hints (gem default when unset: 5)
|
|
47
|
+
# config.value_hint_rows = 3
|
|
48
|
+
|
|
49
|
+
# Process imports asynchronously with ActiveJob
|
|
50
|
+
# config.async = false
|
|
51
|
+
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rails/generators"
|
|
4
|
+
|
|
5
|
+
module SmartCsvImport
  module Generators
    # Generates the upload UI for a model's import: controller, the `new` and
    # `show` views, and the matching route.
    class ScaffoldGenerator < Rails::Generators::Base
      source_root File.expand_path("templates", __dir__)

      argument :model_name, type: :string, desc: "The model name (e.g., Employee)"

      desc "Generates SmartCsvImport import scaffold: controller, views, and routes"

      # Writes app/controllers/<model>_imports_controller.rb.
      def create_controller
        template "controller.rb.tt", "app/controllers/#{imports_controller_file_name}.rb"
      end

      # Writes the new and show views under app/views/<model>_imports/.
      def create_views
        template "new.html.erb.tt", "app/views/#{imports_name}/new.html.erb"
        template "show.html.erb.tt", "app/views/#{imports_name}/show.html.erb"
      end

      # Adds `resources :<model>_imports, only: [:new, :create, :show]` to
      # config/routes.rb. Uses the generator's built-in `route` action rather
      # than matching the exact text "Rails.application.routes.draw do\n", so
      # the line is still inserted (and indented) when the draw line is
      # formatted slightly differently.
      def add_routes
        route "resources :#{imports_name}, only: [:new, :create, :show]"
      end

      private

      # Camelized model name, e.g. "Employee".
      def model_class_name
        model_name.camelize
      end

      # Name of the form class the controller uses, e.g. "EmployeeImportForm".
      def form_class_name
        "#{model_class_name}ImportForm"
      end

      # Resource / view directory name, e.g. "employee_imports".
      def imports_name
        "#{model_name.underscore}_imports"
      end

      # Controller class name, e.g. "EmployeeImportsController".
      def imports_controller_class_name
        "#{model_class_name}ImportsController"
      end

      # Controller file base name, e.g. "employee_imports_controller".
      def imports_controller_file_name
        "#{model_name.underscore}_imports_controller"
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Upload, process and inspect <%= model_class_name %> CSV imports.
class <%= imports_controller_class_name %> < ApplicationController
  # Renders the upload form.
  def new
    @import_form = <%= form_class_name %>.new
  end

  # Processes the uploaded CSV and redirects to the import's result page.
  # Any error raised while processing is reported in the flash and the user
  # is sent back to the upload form.
  def create
    uploaded_file = params[:file]

    # Without this guard a missing upload surfaces as
    # "Import failed: undefined method `tempfile' for nil".
    unless uploaded_file.respond_to?(:tempfile)
      flash[:alert] = "Please choose a CSV file to import."
      redirect_to new_<%= imports_name.singularize %>_path
      return
    end

    result = SmartCsvImport.process(
      uploaded_file.tempfile.path,
      form_class: <%= form_class_name %>
    )

    if result.review_required?
      flash[:notice] = "Import requires header mapping review."
    elsif result.completed?
      flash[:notice] = "Imported #{result.imported} of #{result.total} rows."
    elsif result.partial_failure?
      flash[:alert] = "Imported #{result.imported} of #{result.total} rows with #{result.failed} failures."
    end

    redirect_to <%= imports_name.singularize %>_path(result.import_id)
  rescue StandardError => e
    flash[:alert] = "Import failed: #{e.message}"
    redirect_to new_<%= imports_name.singularize %>_path
  end

  # Shows the stored result of one import.
  def show
    @import = SmartCsvImport::Import.find(params[:id])
  end
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
<%%# Upload form: posts the chosen CSV file as the "file" parameter. %>
<h1>Import <%= model_class_name.pluralize %></h1>

<%%= form_with(url: <%= imports_name %>_path, method: :post, multipart: true) do |f| %>
  <div>
    <%%= f.label :file, "Choose CSV file" %>
    <%%= f.file_field :file, accept: ".csv" %>
  </div>

  <div>
    <%%= f.submit "Upload and Import" %>
  </div>
<%% end %>
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
<%%# Result page of one import: counters, warnings and the failed rows. %>
<h1>Import Result</h1>

<dl>
  <dt>Status</dt>
  <dd><%%= @import.status %></dd>

  <dt>File</dt>
  <dd><%%= @import.original_filename %></dd>

  <dt>Total Rows</dt>
  <dd><%%= @import.total_rows %></dd>

  <dt>Imported</dt>
  <dd><%%= @import.imported_count %></dd>

  <dt>Failed</dt>
  <dd><%%= @import.failed_count %></dd>
</dl>

<%%# warnings is a nullable JSON column; Array() keeps a NULL from raising. %>
<%% import_warnings = Array(@import.warnings) %>
<%% if import_warnings.any? %>
  <h2>Warnings</h2>
  <ul>
    <%% import_warnings.each do |warning| %>
      <li><%%= warning %></li>
    <%% end %>
  </ul>
<%% end %>

<%% if @import.row_errors.any? %>
  <h2>Failed Rows</h2>
  <table>
    <thead>
      <tr>
        <th>Row</th>
        <th>Type</th>
        <th>Column</th>
        <th>Errors</th>
      </tr>
    </thead>
    <tbody>
      <%% @import.row_errors.each do |row_error| %>
        <tr>
          <td><%%= row_error.row_number %></td>
          <td><%%= row_error.error_type %></td>
          <td><%%= row_error.column_name %></td>
          <td>
            <%% if row_error.error_type == "parse" %>
              <%%= row_error.error_message %>
            <%% else %>
              <%%# messages is a nullable JSON column as well. %>
              <%%= Array(row_error.messages).join(", ") %>
            <%% end %>
          </td>
        </tr>
      <%% end %>
    </tbody>
  </table>
<%% end %>

<%%= link_to "New Import", new_<%= imports_name.singularize %>_path %>
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # Settings object for the gem.
  #
  # Plain settings are exposed through attr_accessor; review_mode, logger,
  # nil_values_matching and chunk_size have validating writers that raise
  # ConfigurationError on unacceptable values.
  class Configuration
    # Accepted values for review_mode.
    REVIEW_MODES = %i[auto always skip].freeze

    attr_accessor :confidence_threshold,
                  :batch_size,
                  :storage_path,
                  :default_strategy,
                  :llm_model,
                  :embedding_model,
                  :async,
                  :value_hint_rows,
                  :field_size_limit,
                  :bad_row_limit

    attr_reader :review_mode, :logger, :nil_values_matching, :chunk_size

    # Sets every option to its default value.
    def initialize
      @confidence_threshold = 0.80
      @batch_size = 500
      @storage_path = "tmp/smart_csv_import"
      @default_strategy = :vector
      @llm_model = "gpt-4o-mini"
      @embedding_model = "text-embedding-3-small"
      @async = false
      @value_hint_rows = 5
      @field_size_limit = 1_048_576
      @bad_row_limit = 0.10
      @chunk_size = 1000
      @review_mode = :skip
      @nil_values_matching = %w[#N/A #VALUE! #REF! #DIV/0! #NUM! #NAME? #NULL! NULL null N/A n/a nil Nil].freeze
      @logger = Logger.new($stdout)
    end

    # Sets the logger. nil installs a Logger that discards everything.
    #
    # @param logger [#info, #debug, #error, nil]
    # @raise [ConfigurationError] when the object lacks one of the three methods
    def logger=(logger)
      if logger.nil?
        @logger = Logger.new(nil)
      elsif %i[info debug error].all? { |level| logger.respond_to?(level) }
        @logger = logger
      else
        raise ConfigurationError, "logger must respond to :info, :debug, and :error"
      end
    end

    # Regexp matching any configured nil marker as a whole string.
    #
    # @return [Regexp, nil] nil when nil_values_matching is nil or empty
    def nil_values_regexp
      return nil if nil_values_matching.nil? || nil_values_matching.empty?

      Regexp.union(nil_values_matching.map { |v| /\A#{Regexp.escape(v)}\z/ })
    end

    # Sets the list of cell values treated as nil.
    #
    # Elements are validated here so that a bad entry is reported at
    # configuration time instead of surfacing later as a TypeError from
    # Regexp.escape inside nil_values_regexp. Symbols are tolerated because
    # Regexp.escape accepts them.
    #
    # @param value [Array<String>, nil]
    # @raise [ConfigurationError] when value is neither nil nor such an Array
    def nil_values_matching=(value)
      acceptable = value.nil? ||
                   (value.is_a?(Array) && value.all? { |v| v.is_a?(String) || v.is_a?(Symbol) })
      raise ConfigurationError, "nil_values_matching must be an Array of strings or nil" unless acceptable

      @nil_values_matching = value
    end

    # @param value [Integer, nil] a positive integer, or nil
    # @raise [ConfigurationError] for anything else
    def chunk_size=(value)
      unless value.nil? || (value.is_a?(Integer) && value > 0)
        raise ConfigurationError, "chunk_size must be a positive integer or nil"
      end

      @chunk_size = value
    end

    # @param value [Symbol] one of REVIEW_MODES
    # @raise [ConfigurationError] for any other value
    def review_mode=(value)
      unless REVIEW_MODES.include?(value)
        raise ConfigurationError, "review_mode must be one of: #{REVIEW_MODES.join(", ")}"
      end

      @review_mode = value
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # Cosine similarity of two numeric vectors.
  module CosineSimilarity
    # @param vec_a [Array<Numeric>]
    # @param vec_b [Array<Numeric>]
    # @return [Float] dot(a, b) / (|a| * |b|); 0.0 when either vector has
    #   zero magnitude (which includes empty vectors)
    # @raise [ArgumentError] when the vectors differ in length. Without this
    #   check a shorter vec_b fails with an opaque TypeError (nil operand) and
    #   a longer vec_b silently yields a wrong score, because the dot product
    #   ignores the extra components while the magnitude does not.
    def self.call(vec_a, vec_b)
      unless vec_a.size == vec_b.size
        raise ArgumentError, "vectors must have the same length (got #{vec_a.size} and #{vec_b.size})"
      end

      magnitude_a = Math.sqrt(vec_a.sum { |v| v**2 })
      magnitude_b = Math.sqrt(vec_b.sum { |v| v**2 })
      return 0.0 if magnitude_a.zero? || magnitude_b.zero?

      dot_product = vec_a.zip(vec_b).sum { |a, b| a * b }
      dot_product / (magnitude_a * magnitude_b)
    end
  end
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmartCsvImport
  # Rails engine of the gem, isolated under the SmartCsvImport namespace.
  class Engine < ::Rails::Engine
    isolate_namespace SmartCsvImport

    # Generator defaults: RSpec as test framework, factory_bot factories
    # written to spec/factories.
    config.generators do |generators|
      generators.test_framework :rspec
      generators.fixture_replacement :factory_bot, dir: "spec/factories"
    end
  end
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
require "fileutils"
|
|
4
|
+
|
|
5
|
+
module SmartCsvImport
  # Writes the rows of a CSV file that failed to import into a new CSV file,
  # with an extra "_error" column describing what went wrong.
  #
  # The failures come either from an in-memory result (its #errors, each
  # responding to #row, #column and #messages) or from a persisted import
  # (its validation row errors).
  class FailedRowExporter
    # @param csv_path [String] path of the CSV file that was imported
    # @param result [#errors, nil] in-memory import result
    # @param import [#row_errors, nil] persisted import record
    # @raise [ArgumentError] unless exactly one of result: / import: is given
    def initialize(csv_path:, result: nil, import: nil)
      raise ArgumentError, "provide either result: or import:" if result.nil? && import.nil?
      raise ArgumentError, "provide only one of result: or import:" if result && import

      @csv_path = csv_path
      @failed_lines = if result
                        errors_by_line_from_result(result)
                      else
                        errors_by_line_from_import(import)
                      end
    end

    # Writes the export file.
    #
    # @return [String, nil] path of the written file, or nil (nothing is
    #   written) when there are no failed rows
    def call
      return nil if @failed_lines.empty?

      original_headers = read_original_headers
      output_path = generate_output_path

      write_failed_rows(original_headers, @failed_lines, output_path)

      output_path
    end

    private

    # Groups a result's errors by row: { row => [{ column:, messages: }, ...] }.
    def errors_by_line_from_result(result)
      result.errors.each_with_object({}) do |error, acc|
        acc[error.row] ||= []
        acc[error.row] << { column: error.column, messages: error.messages }
      end
    end

    # Same grouping for a persisted import; only validation errors are used.
    def errors_by_line_from_import(import)
      import.row_errors.validation_errors.each_with_object({}) do |error, acc|
        acc[error.row_number] ||= []
        acc[error.row_number] << { column: error.column_name, messages: error.messages }
      end
    end

    # First record of the source file, i.e. its header row.
    def read_original_headers
      CSV.open(@csv_path, "r", &:readline)
    end

    # Builds "<storage_path>/failed_rows/<timestamp>_failed.csv", creating the
    # directory when needed.
    def generate_output_path
      output_dir = File.join(SmartCsvImport.configuration.storage_path, "failed_rows")
      FileUtils.mkdir_p(output_dir)
      timestamp = Time.current.strftime("%Y%m%d%H%M%S")
      File.join(output_dir, "#{timestamp}_failed.csv")
    end

    # Copies every failed data row to the output file and appends its error
    # text. Rows are numbered from 2 because row 1 is the header.
    def write_failed_rows(original_headers, failed_lines, output_path)
      output_headers = [*original_headers, "_error"]

      CSV.open(output_path, "w") do |csv|
        csv << output_headers

        CSV.foreach(@csv_path, headers: true).with_index(2) do |row, line_number|
          next unless failed_lines.key?(line_number)

          error_message = format_errors(failed_lines[line_number])
          csv << [*row.fields, error_message]
        end
      end
    end

    # Renders a row's errors as "column: msg, msg; column: msg".
    # Array() guards against messages being nil (the persisted messages
    # column is nullable), which would otherwise raise NoMethodError on join.
    def format_errors(errors)
      errors.map do |error|
        "#{error[:column]}: #{Array(error[:messages]).join(", ")}"
      end.join("; ")
    end
  end
end
|