smart_csv_import 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.adoc +134 -0
  3. data/README.md +534 -0
  4. data/app/jobs/smart_csv_import/import_job.rb +22 -0
  5. data/app/models/smart_csv_import/import.rb +36 -0
  6. data/app/models/smart_csv_import/import_row_error.rb +17 -0
  7. data/lib/generators/smart_csv_import/import/import_generator.rb +49 -0
  8. data/lib/generators/smart_csv_import/import/templates/import_form.rb.tt +32 -0
  9. data/lib/generators/smart_csv_import/import/templates/import_form_spec.rb.tt +38 -0
  10. data/lib/generators/smart_csv_import/install/install_generator.rb +34 -0
  11. data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_import_row_errors.rb.tt +18 -0
  12. data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_imports.rb.tt +23 -0
  13. data/lib/generators/smart_csv_import/install/templates/initializer.rb.tt +51 -0
  14. data/lib/generators/smart_csv_import/scaffold/scaffold_generator.rb +56 -0
  15. data/lib/generators/smart_csv_import/scaffold/templates/controller.rb.tt +33 -0
  16. data/lib/generators/smart_csv_import/scaffold/templates/new.html.erb.tt +12 -0
  17. data/lib/generators/smart_csv_import/scaffold/templates/show.html.erb.tt +59 -0
  18. data/lib/smart_csv_import/configuration.rb +77 -0
  19. data/lib/smart_csv_import/cosine_similarity.rb +15 -0
  20. data/lib/smart_csv_import/engine.rb +12 -0
  21. data/lib/smart_csv_import/failed_row_exporter.rb +78 -0
  22. data/lib/smart_csv_import/file_storage.rb +34 -0
  23. data/lib/smart_csv_import/header_normalizer.rb +76 -0
  24. data/lib/smart_csv_import/logging.rb +37 -0
  25. data/lib/smart_csv_import/match_result.rb +36 -0
  26. data/lib/smart_csv_import/matchable.rb +76 -0
  27. data/lib/smart_csv_import/matcher.rb +198 -0
  28. data/lib/smart_csv_import/normalizers/boolean_converter.rb +26 -0
  29. data/lib/smart_csv_import/normalizers/date_converter.rb +28 -0
  30. data/lib/smart_csv_import/notifications.rb +16 -0
  31. data/lib/smart_csv_import/processor/csv_preflight_analyzer.rb +74 -0
  32. data/lib/smart_csv_import/processor/import_result_builder.rb +97 -0
  33. data/lib/smart_csv_import/processor/mapping_review_policy.rb +90 -0
  34. data/lib/smart_csv_import/processor/nil_cell_counter.rb +19 -0
  35. data/lib/smart_csv_import/processor/null_progress_callback.rb +11 -0
  36. data/lib/smart_csv_import/processor/row_processor.rb +70 -0
  37. data/lib/smart_csv_import/processor.rb +294 -0
  38. data/lib/smart_csv_import/result.rb +101 -0
  39. data/lib/smart_csv_import/stability_report.rb +104 -0
  40. data/lib/smart_csv_import/strategies/llm.rb +106 -0
  41. data/lib/smart_csv_import/strategies/lookup.rb +41 -0
  42. data/lib/smart_csv_import/strategies/vector.rb +155 -0
  43. data/lib/smart_csv_import/strategy.rb +9 -0
  44. data/lib/smart_csv_import/strategy_failure.rb +13 -0
  45. data/lib/smart_csv_import/version.rb +5 -0
  46. data/lib/smart_csv_import.rb +79 -0
  47. data/smart_csv_import.gemspec +35 -0
  48. metadata +216 -0
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  # Background job that runs the Processor against a stored Import record.
  class ImportJob < ActiveJob::Base
    queue_as :smart_csv_import

    # Loads the import, resolves the form class from its name and runs the
    # Processor.
    #
    # @param import_id [Object] id handed to Import.find
    # @param form_class_name [String] constant name of the form class
    # @raise [StandardError] whatever lookup or processing raised; it is
    #   re-raised after the import (if it was loaded) is flagged "failed"
    def perform(import_id, form_class_name)
      import = Import.find(import_id)
      form_class = form_class_name.constantize

      Processor.new(
        file_path: import.file_path,
        form_class: form_class,
        mode: :sync,
        import: import
      ).call
    rescue StandardError => e
      mark_failed(import)
      # Re-raise the processing error itself so the job backend sees the real cause.
      raise e
    end

    private

    # Best-effort status update. A failure here (validation, database) is
    # logged rather than raised so it cannot replace the error that made the
    # job fail in the first place. `import` is nil when Import.find raised.
    def mark_failed(import)
      import&.update!(status: "failed")
    rescue StandardError => update_error
      logger&.error(
        "SmartCsvImport::ImportJob could not mark import as failed: " \
        "#{update_error.class}: #{update_error.message}"
      )
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  # One CSV import run, stored in the smart_csv_import_imports table.
  #
  # The superclass is resolved when this class is defined; set
  # SmartCsvImport.base_model_class in an initializer before this model is
  # autoloaded if you want a custom ActiveRecord base class.
  class Import < SmartCsvImport.base_model_class
    self.table_name = "smart_csv_import_imports"

    # Every value the status column may hold.
    STATUSES = %w[pending mapping_review processing completed partial_failure failed].freeze

    # Row-level errors, ordered by row number and destroyed with the import.
    has_many :row_errors, -> { order(:row_number) },
             class_name: "SmartCsvImport::ImportRowError",
             foreign_key: :import_id,
             inverse_of: :import,
             dependent: :destroy

    validates :import_type, :original_filename, presence: true
    validates :status, inclusion: { in: STATUSES }

    # Defines one predicate per status: pending?, completed?, failed?, ...
    STATUSES.each do |candidate|
      define_method("#{candidate}?") { status == candidate }
    end

    # Finds the first import with the given file hash.
    #
    # @param file_hash [String, nil] hash to look up; nil short-circuits to nil
    # @param import_type [String, nil] when given, also restricts by import type
    # @return [Import, nil]
    def self.find_duplicate(file_hash:, import_type: nil)
      return if file_hash.nil?

      matches = where(file_hash: file_hash)
      matches = matches.where(import_type: import_type) if import_type
      matches.first
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  # A single row-level failure recorded for an Import, stored in the
  # smart_csv_import_import_row_errors table.
  class ImportRowError < SmartCsvImport.base_model_class
    self.table_name = "smart_csv_import_import_row_errors"

    # Allowed values of the error_type column.
    ERROR_TYPES = %w[parse validation].freeze

    belongs_to :import,
               class_name: "SmartCsvImport::Import",
               inverse_of: :row_errors,
               optional: false

    validates :row_number, presence: true
    validates :error_type, inclusion: { in: ERROR_TYPES }

    # Filters by error_type.
    scope :parse_errors, -> { where(error_type: "parse") }
    scope :validation_errors, -> { where(error_type: "validation") }
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rails/generators"
4
+
5
module SmartCsvImport
  module Generators
    # Generates an import form object and its spec for a model from a list of
    # `name[:type]` field specs.
    class ImportGenerator < Rails::Generators::Base
      source_root File.expand_path("templates", __dir__)

      argument :model_name, type: :string, desc: "The model name (e.g., Employee)"
      argument :fields, type: :array, default: [], desc: "Fields with optional types (e.g., first_name last_name:string email:string hire_date:date)"

      desc "Generates a SmartCsvImport import form object with csv_field declarations"

      # Renders the form object into app/forms.
      def create_form_object
        template("import_form.rb.tt", "app/forms/#{form_file_name}.rb")
      end

      # Renders the form's spec into spec/forms.
      def create_form_spec
        template("import_form_spec.rb.tt", "spec/forms/#{form_file_name}_spec.rb")
      end

      private

      # Class name of the generated form, e.g. "EmployeeImportForm".
      def form_class_name
        "#{model_name.camelize}ImportForm"
      end

      # File base name of the generated form, e.g. "employee_import_form".
      def form_file_name
        "#{model_name.underscore}_import_form"
      end

      # Splits each "name:type" spec into a hash; the type falls back to
      # "string" when the spec carries none.
      def parsed_fields
        @parsed_fields ||= fields.map do |spec|
          field_name, field_type = spec.split(":")
          { name: field_name, type: field_type || "string" }
        end
      end

      # Names of all declared fields, in declaration order.
      def field_names
        parsed_fields.map { |field| field[:name] }
      end

      # Field names containing "email"; the template adds a format validation for these.
      def email_fields
        field_names.grep(/email/)
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
class <%= form_class_name %>
  include ActiveModel::API
  include ActiveModel::Attributes
  include SmartCsvImport::Matchable

<% parsed_fields.each do |field| -%>
  attribute :<%= field[:name] %>, :<%= field[:type] %>
<% end -%>

<% parsed_fields.each do |field| -%>
  csv_field :<%= field[:name] %>, description: "<%= model_name.camelize %> <%= field[:name].humanize.downcase %>"
<% end -%>

<%# Emit the presence validation only when fields were given: the generator's fields argument defaults to [], and a bare "validates , presence: true" does not parse. -%>
<% if field_names.any? -%>
  validates <%= field_names.map { |n| ":#{n}" }.join(", ") %>, presence: true
<% end -%>
<% email_fields.each do |email_field| -%>
  validates :<%= email_field %>, format: { with: URI::MailTo::EMAIL_REGEXP }, allow_blank: true
<% end -%>

  # Builds a <%= model_name.camelize %> from the form attributes and saves it.
  # Returns false without touching the model when the form itself is invalid.
  def save
    return false unless valid?

    record = <%= model_name.camelize %>.new(
<% parsed_fields.each_with_index do |field, index| -%>
      <%= field[:name] %>: <%= field[:name] %><%= index < parsed_fields.length - 1 ? "," : "" %>
<% end -%>
    )

    record.save
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rails_helper"
4
+
5
+ RSpec.describe <%= form_class_name %> do
6
+ describe ".csv_fields" do
7
+ it "declares all expected csv_fields" do
8
+ expected_fields = %i[<%= field_names.join(" ") %>]
9
+ expect(described_class.csv_fields.keys).to match_array(expected_fields)
10
+ end
11
+ end
12
+
13
+ describe "validations" do
14
+ <% parsed_fields.each do |field| -%>
15
+ it "requires <%= field[:name] %>" do
16
+ form = described_class.new(<%= field[:name] %>: nil)
17
+ form.valid?
18
+ expect(form.errors[:<%= field[:name] %>]).to include("can't be blank")
19
+ end
20
+
21
+ <% end -%>
22
+ <% email_fields.each do |email_field| -%>
23
+ it "validates format of <%= email_field %>" do
24
+ form = described_class.new(<%= email_field %>: "invalid-email")
25
+ form.valid?
26
+ expect(form.errors[:<%= email_field %>]).to include("is invalid")
27
+ end
28
+
29
+ <% end -%>
30
+ end
31
+
32
+ describe "#save" do
33
+ it "responds to save" do
34
+ form = described_class.new
35
+ expect(form).to respond_to(:save)
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rails/generators"
4
+
5
module SmartCsvImport
  module Generators
    # Installs the gem into a host application: initializer, the two
    # migrations, and the storage directories under tmp/.
    class InstallGenerator < Rails::Generators::Base
      source_root File.expand_path("templates", __dir__)

      desc "Installs SmartCsvImport: initializer, migration, and storage directories"

      # Writes config/initializers/smart_csv_import.rb.
      def copy_initializer
        template "initializer.rb.tt", "config/initializers/smart_csv_import.rb"
      end

      # Writes both migrations. The row-errors migration is stamped one second
      # later so it always sorts (and runs) after the imports table it references.
      def create_migration
        # UTC, matching how Rails stamps generated migrations; a local-time
        # stamp can sort before or after migrations created by `rails generate`.
        base_time = Time.now.utc

        template(
          "create_smart_csv_import_imports.rb.tt",
          migration_path("create_smart_csv_import_imports", base_time)
        )
        template(
          "create_smart_csv_import_import_row_errors.rb.tt",
          migration_path("create_smart_csv_import_import_row_errors", base_time + 1)
        )
      end

      # Creates the directories for uploaded files and the embeddings cache.
      def create_storage_directories
        empty_directory "tmp/smart_csv_import/imports"
        empty_directory "tmp/smart_csv_import/embeddings_cache"
      end

      private

      # db/migrate path for `name`, versioned with `time` as YYYYMMDDHHMMSS.
      def migration_path(name, time)
        "db/migrate/#{time.strftime('%Y%m%d%H%M%S')}_#{name}.rb"
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ class CreateSmartCsvImportImportRowErrors < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
4
+ def change
5
+ create_table :smart_csv_import_import_row_errors do |t|
6
+ t.references :import, null: false, foreign_key: { to_table: :smart_csv_import_imports, on_delete: :cascade }
7
+ t.integer :row_number, null: false
8
+ t.string :error_type, null: false
9
+ t.string :column_name
10
+ t.json :messages, default: []
11
+ t.text :raw_line
12
+ t.text :error_message
13
+ t.datetime :created_at, null: false
14
+ end
15
+
16
+ add_index :smart_csv_import_import_row_errors, [:import_id, :error_type, :row_number]
17
+ end
18
+ end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ class CreateSmartCsvImportImports < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
4
+ def change
5
+ create_table :smart_csv_import_imports do |t|
6
+ t.string :import_type, null: false
7
+ t.string :original_filename, null: false
8
+ t.string :file_path
9
+ t.string :file_hash
10
+ t.json :header_mappings, default: {}
11
+ t.integer :total_rows, default: 0
12
+ t.integer :imported_count, default: 0
13
+ t.integer :failed_count, default: 0
14
+ t.json :warnings, default: []
15
+ t.string :status, null: false, default: "pending"
16
+
17
+ t.timestamps
18
+ end
19
+
20
+ add_index :smart_csv_import_imports, :file_hash
21
+ add_index :smart_csv_import_imports, :status
22
+ end
23
+ end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ # SmartCsvImport uses RubyLLM for AI-powered header matching.
4
+ # Configure your AI provider(s) below. At minimum, set one embedding model
5
+ # (for vector matching) and one LLM model (for fallback matching).
6
+ #
7
+ # Recommended free-tier setup: Gemini embeddings + Anthropic LLM
8
+ #
9
+ # RubyLLM.configure do |config|
10
+ # # --- Gemini (embeddings) ---
11
+ # # Free tier available at https://aistudio.google.com/apikey
12
+ # # config.gemini_api_key = ENV["GEMINI_API_KEY"]
13
+ #
14
+ # # --- Anthropic (LLM fallback) ---
15
+ # # config.anthropic_api_key = ENV["ANTHROPIC_API_KEY"]
16
+ #
17
+ # # --- OpenAI (alternative for both) ---
18
+ # # config.openai_api_key = ENV["OPENAI_API_KEY"]
19
+ # end
20
+
21
+ SmartCsvImport.configure do |config|
22
+ # Path where uploaded CSV files are stored
23
+ # config.storage_path = "tmp/smart_csv_import"
24
+
25
+ # Number of rows to process per batch
26
+ # config.batch_size = 500
27
+
28
+ # Minimum confidence threshold for header matching (0.0 - 1.0)
29
+ # config.confidence_threshold = 0.80
30
+
31
+ # Which strategy tier runs first when no custom strategy is set on the form class.
32
+ # :vector (default) — vector similarity first, LLM as fallback.
33
+ # :llm — LLM first, vector as fallback.
34
+ # config.default_strategy = :vector
35
+
36
+ # LLM model for fallback header matching
37
+ # Anthropic: "claude-haiku-4-5-20251001" (fast, cheap)
38
+ # OpenAI: "gpt-4o-mini"
39
+ # config.llm_model = "claude-haiku-4-5-20251001"
40
+
41
+ # Embedding model for vector similarity matching
42
+ # Gemini: "gemini-embedding-001" (free tier, recommended)
43
+ # OpenAI: "text-embedding-3-small"
44
+ # config.embedding_model = "gemini-embedding-001"
45
+
46
+ # Number of sample rows to use for value-based confidence hints (default: 5)
47
+ # config.value_hint_rows = 3
48
+
49
+ # Process imports asynchronously with ActiveJob
50
+ # config.async = false
51
+ end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rails/generators"
4
+
5
module SmartCsvImport
  module Generators
    # Generates the upload UI for a model's import: controller, new/show
    # views, and a resources route.
    class ScaffoldGenerator < Rails::Generators::Base
      source_root File.expand_path("templates", __dir__)

      argument :model_name, type: :string, desc: "The model name (e.g., Employee)"

      desc "Generates SmartCsvImport import scaffold: controller, views, and routes"

      # Renders the imports controller into app/controllers.
      def create_controller
        template "controller.rb.tt", "app/controllers/#{imports_controller_file_name}.rb"
      end

      # Renders the new and show views into app/views/<model>_imports.
      def create_views
        template "new.html.erb.tt", "app/views/#{imports_name}/new.html.erb"
        template "show.html.erb.tt", "app/views/#{imports_name}/show.html.erb"
      end

      # Adds the resources route through the Rails generator `route` action,
      # which locates the routes block and indents the line itself. The
      # previous exact-string anchor silently added nothing when the opening
      # line of config/routes.rb was written differently.
      def add_routes
        route "resources :#{imports_name}, only: [:new, :create, :show]"
      end

      private

      # Model constant name, e.g. "Employee".
      def model_class_name
        model_name.camelize
      end

      # Form class the generated controller instantiates, e.g. "EmployeeImportForm".
      def form_class_name
        "#{model_class_name}ImportForm"
      end

      # Resource / view directory name, e.g. "employee_imports".
      def imports_name
        "#{model_name.underscore}_imports"
      end

      # Controller constant name, e.g. "EmployeeImportsController".
      def imports_controller_class_name
        "#{model_class_name}ImportsController"
      end

      # Controller file base name, e.g. "employee_imports_controller".
      def imports_controller_file_name
        "#{model_name.underscore}_imports_controller"
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
class <%= imports_controller_class_name %> < ApplicationController
  def new
    @import_form = <%= form_class_name %>.new
  end

  # Runs the uploaded CSV through SmartCsvImport and redirects to the result
  # page; any error during processing sends the user back to the upload form.
  def create
    uploaded_file = params[:file]

    # Nothing (or something that is not a file upload) was submitted: say so
    # directly instead of reporting a NoMethodError through the rescue below.
    unless uploaded_file.respond_to?(:tempfile)
      flash[:alert] = "Please choose a CSV file to import."
      return redirect_to(new_<%= imports_name.singularize %>_path)
    end

    result = SmartCsvImport.process(
      uploaded_file.tempfile.path,
      form_class: <%= form_class_name %>
    )

    if result.review_required?
      flash[:notice] = "Import requires header mapping review."
    elsif result.completed?
      flash[:notice] = "Imported #{result.imported} of #{result.total} rows."
    elsif result.partial_failure?
      flash[:alert] = "Imported #{result.imported} of #{result.total} rows with #{result.failed} failures."
    end

    redirect_to <%= imports_name.singularize %>_path(result.import_id)
  rescue StandardError => e
    flash[:alert] = "Import failed: #{e.message}"
    redirect_to new_<%= imports_name.singularize %>_path
  end

  def show
    @import = SmartCsvImport::Import.find(params[:id])
  end
end
@@ -0,0 +1,12 @@
1
+ <h1>Import <%= model_class_name.pluralize %></h1>
2
+
3
+ <%%= form_with url: <%= imports_name %>_path, method: :post, multipart: true do |form| %>
4
+ <div>
5
+ <%%= form.label :file, "Choose CSV file" %>
6
+ <%%= form.file_field :file, accept: ".csv" %>
7
+ </div>
8
+
9
+ <div>
10
+ <%%= form.submit "Upload and Import" %>
11
+ </div>
12
+ <%% end %>
@@ -0,0 +1,59 @@
1
+ <h1>Import Result</h1>
2
+
3
+ <dl>
4
+ <dt>Status</dt>
5
+ <dd><%%= @import.status %></dd>
6
+
7
+ <dt>File</dt>
8
+ <dd><%%= @import.original_filename %></dd>
9
+
10
+ <dt>Total Rows</dt>
11
+ <dd><%%= @import.total_rows %></dd>
12
+
13
+ <dt>Imported</dt>
14
+ <dd><%%= @import.imported_count %></dd>
15
+
16
+ <dt>Failed</dt>
17
+ <dd><%%= @import.failed_count %></dd>
18
+ </dl>
19
+
20
+ <%% if @import.warnings.any? %>
21
+ <h2>Warnings</h2>
22
+ <ul>
23
+ <%% @import.warnings.each do |warning| %>
24
+ <li><%%= warning %></li>
25
+ <%% end %>
26
+ </ul>
27
+ <%% end %>
28
+
29
+ <%% if @import.row_errors.any? %>
30
+ <h2>Failed Rows</h2>
31
+ <table>
32
+ <thead>
33
+ <tr>
34
+ <th>Row</th>
35
+ <th>Type</th>
36
+ <th>Column</th>
37
+ <th>Errors</th>
38
+ </tr>
39
+ </thead>
40
+ <tbody>
41
+ <%% @import.row_errors.each do |row_error| %>
42
+ <tr>
43
+ <td><%%= row_error.row_number %></td>
44
+ <td><%%= row_error.error_type %></td>
45
+ <td><%%= row_error.column_name %></td>
46
+ <td>
47
+ <%% if row_error.error_type == "parse" %>
48
+ <%%= row_error.error_message %>
49
+ <%% else %>
50
+ <%%= row_error.messages.join(", ") %>
51
+ <%% end %>
52
+ </td>
53
+ </tr>
54
+ <%% end %>
55
+ </tbody>
56
+ </table>
57
+ <%% end %>
58
+
59
+ <%%= link_to "New Import", new_<%= imports_name.singularize %>_path %>
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
# Logger is referenced below; require it here so this file does not depend on
# something else having loaded it first.
require "logger"

module SmartCsvImport
  # Settings object for the gem. Plain accessors are unvalidated; the settings
  # with a custom writer below raise ConfigurationError on invalid values.
  class Configuration
    # Values accepted by #review_mode=.
    REVIEW_MODES = %i[auto always skip].freeze

    attr_accessor :confidence_threshold,
                  :batch_size,
                  :storage_path,
                  :default_strategy,
                  :llm_model,
                  :embedding_model,
                  :async,
                  :value_hint_rows,
                  :field_size_limit,
                  :bad_row_limit

    attr_reader :review_mode, :logger, :nil_values_matching, :chunk_size

    # Sets every option to its default.
    def initialize
      @confidence_threshold = 0.80
      @batch_size = 500
      @storage_path = "tmp/smart_csv_import"
      @default_strategy = :vector
      @llm_model = "gpt-4o-mini"
      @embedding_model = "text-embedding-3-small"
      @async = false
      @value_hint_rows = 5
      @field_size_limit = 1_048_576
      @bad_row_limit = 0.10
      @chunk_size = 1000
      @review_mode = :skip
      @nil_values_matching = %w[#N/A #VALUE! #REF! #DIV/0! #NUM! #NAME? #NULL! NULL null N/A n/a nil Nil].freeze
      @logger = Logger.new($stdout)
    end

    # Sets the logger. nil installs a Logger with no output device.
    #
    # @raise [ConfigurationError] when the object lacks :info, :debug or :error
    def logger=(logger)
      if logger.nil?
        @logger = Logger.new(nil)
      elsif %i[info debug error].all? { |level| logger.respond_to?(level) }
        @logger = logger
      else
        raise ConfigurationError, "logger must respond to :info, :debug, and :error"
      end
    end

    # Builds a regexp that matches a value only when it is exactly one of the
    # configured nil markers (each alternative is anchored with \A and \z).
    #
    # @return [Regexp, nil] nil when no markers are configured
    def nil_values_regexp
      return nil if nil_values_matching.nil? || nil_values_matching.empty?

      Regexp.union(nil_values_matching.map { |v| /\A#{Regexp.escape(v)}\z/ })
    end

    # Sets the list of cell values treated as nil.
    #
    # Elements are checked here so that a bad entry is reported at
    # configuration time instead of as a TypeError from Regexp.escape when the
    # regexp is built. Symbols are accepted because Regexp.escape accepts them.
    #
    # @raise [ConfigurationError] unless value is nil or an Array of strings
    def nil_values_matching=(value)
      unless value.nil? || nil_marker_list?(value)
        raise ConfigurationError, "nil_values_matching must be an Array of strings or nil"
      end

      @nil_values_matching = value
    end

    # @raise [ConfigurationError] unless value is nil or a positive Integer
    def chunk_size=(value)
      unless value.nil? || (value.is_a?(Integer) && value > 0)
        raise ConfigurationError, "chunk_size must be a positive integer or nil"
      end

      @chunk_size = value
    end

    # @raise [ConfigurationError] unless value is one of REVIEW_MODES
    def review_mode=(value)
      unless REVIEW_MODES.include?(value)
        raise ConfigurationError, "review_mode must be one of: #{REVIEW_MODES.join(", ")}"
      end

      @review_mode = value
    end

    private

    # True when value is an Array whose elements Regexp.escape can handle.
    def nil_marker_list?(value)
      value.is_a?(Array) && value.all? { |marker| marker.is_a?(String) || marker.is_a?(Symbol) }
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  # Cosine similarity between two numeric vectors.
  module CosineSimilarity
    # @param vec_a [Array<Numeric>]
    # @param vec_b [Array<Numeric>]
    # @return [Float] similarity in -1.0..1.0; 0.0 when either vector has zero
    #   magnitude (which includes empty vectors)
    # @raise [ArgumentError] when the vectors differ in length
    def self.call(vec_a, vec_b)
      # zip pads or truncates to vec_a's length, so a mismatch would otherwise
      # end in a TypeError on nil or in a silently wrong score.
      if vec_a.length != vec_b.length
        raise ArgumentError, "vectors must have the same length (got #{vec_a.length} and #{vec_b.length})"
      end

      dot_product = vec_a.zip(vec_b).sum { |a, b| a * b }
      magnitude_a = Math.sqrt(vec_a.sum { |v| v**2 })
      magnitude_b = Math.sqrt(vec_b.sum { |v| v**2 })

      return 0.0 if magnitude_a.zero? || magnitude_b.zero?

      # Floating-point rounding can push the ratio slightly outside -1.0..1.0.
      (dot_product / (magnitude_a * magnitude_b)).clamp(-1.0, 1.0)
    end
  end
end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
module SmartCsvImport
  # Rails engine for the gem, isolated under the SmartCsvImport namespace.
  class Engine < ::Rails::Engine
    isolate_namespace SmartCsvImport

    # Generator defaults: RSpec as the test framework, FactoryBot for
    # fixtures with factories under spec/factories.
    config.generators do |generators|
      generators.test_framework(:rspec)
      generators.fixture_replacement(:factory_bot, dir: "spec/factories")
    end
  end
end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
require "csv"
require "fileutils"
4
+
5
module SmartCsvImport
  # Copies the rows of a source CSV that failed to import into a new CSV that
  # carries an extra "_error" column describing what went wrong.
  class FailedRowExporter
    # Exactly one of result: / import: must be given.
    #
    # @param csv_path [String] path of the CSV that was imported
    # @param result [#errors, nil] object whose errors respond to row, column
    #   and messages
    # @param import [#row_errors, nil] import whose validation row errors
    #   (row_number, column_name, messages) are exported
    # @raise [ArgumentError] when neither or both sources are given
    def initialize(csv_path:, result: nil, import: nil)
      raise ArgumentError, "provide either result: or import:" if result.nil? && import.nil?
      raise ArgumentError, "provide only one of result: or import:" if result && import

      @csv_path = csv_path
      @failed_lines = result ? errors_by_line_from_result(result) : errors_by_line_from_import(import)
    end

    # Writes the failed-rows file.
    #
    # @return [String, nil] path of the written file, or nil when there are
    #   no failed rows (nothing is written in that case)
    def call
      return nil if @failed_lines.empty?

      original_headers = read_original_headers
      output_path = generate_output_path

      write_failed_rows(original_headers, @failed_lines, output_path)

      output_path
    end

    private

    # Groups a result's errors by row: { row => [{ column:, messages: }, ...] }.
    def errors_by_line_from_result(result)
      result.errors.each_with_object({}) do |error, by_line|
        (by_line[error.row] ||= []) << { column: error.column, messages: error.messages }
      end
    end

    # Same grouping for a persisted import; only validation errors are included.
    def errors_by_line_from_import(import)
      import.row_errors.validation_errors.each_with_object({}) do |error, by_line|
        (by_line[error.row_number] ||= []) << { column: error.column_name, messages: error.messages }
      end
    end

    # First line of the source CSV, parsed as a row (nil for an empty file).
    def read_original_headers
      CSV.open(@csv_path, "r", &:readline)
    end

    # Creates <storage_path>/failed_rows if needed and returns a
    # timestamp-named path inside it.
    # NOTE(review): the name has one-second resolution, so two exports in the
    # same second write to the same file — confirm whether that can happen.
    def generate_output_path
      output_dir = File.join(SmartCsvImport.configuration.storage_path, "failed_rows")
      FileUtils.mkdir_p(output_dir)
      timestamp = Time.current.strftime("%Y%m%d%H%M%S")
      File.join(output_dir, "#{timestamp}_failed.csv")
    end

    # Streams the source CSV and copies every row whose number is a key of
    # failed_lines, appending the formatted error text as the last column.
    # Rows are numbered from 2 because the header occupies line 1.
    # NOTE(review): this counts parsed rows, not physical lines; assumes the
    # recorded row numbers use the same scheme — confirm against the processor.
    def write_failed_rows(original_headers, failed_lines, output_path)
      output_headers = [*original_headers, "_error"]

      CSV.open(output_path, "w") do |csv|
        csv << output_headers

        CSV.foreach(@csv_path, headers: true).with_index(2) do |row, line_number|
          next unless failed_lines.key?(line_number)

          csv << [*row.fields, format_errors(failed_lines[line_number])]
        end
      end
    end

    # Renders a row's errors as "column: msg, msg; column: msg".
    # Array() tolerates an error whose messages are nil or a single string.
    def format_errors(errors)
      errors.map do |error|
        "#{error[:column]}: #{Array(error[:messages]).join(", ")}"
      end.join("; ")
    end
  end
end