RubyGems - smart_csv_import - Versions diffs - 0.1.0 - Mend

smart_csv_import 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

checksums.yaml +7 -0
data/LICENSE.adoc +134 -0
data/README.md +534 -0
data/app/jobs/smart_csv_import/import_job.rb +22 -0
data/app/models/smart_csv_import/import.rb +36 -0
data/app/models/smart_csv_import/import_row_error.rb +17 -0
data/lib/generators/smart_csv_import/import/import_generator.rb +49 -0
data/lib/generators/smart_csv_import/import/templates/import_form.rb.tt +32 -0
data/lib/generators/smart_csv_import/import/templates/import_form_spec.rb.tt +38 -0
data/lib/generators/smart_csv_import/install/install_generator.rb +34 -0
data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_import_row_errors.rb.tt +18 -0
data/lib/generators/smart_csv_import/install/templates/create_smart_csv_import_imports.rb.tt +23 -0
data/lib/generators/smart_csv_import/install/templates/initializer.rb.tt +51 -0
data/lib/generators/smart_csv_import/scaffold/scaffold_generator.rb +56 -0
data/lib/generators/smart_csv_import/scaffold/templates/controller.rb.tt +33 -0
data/lib/generators/smart_csv_import/scaffold/templates/new.html.erb.tt +12 -0
data/lib/generators/smart_csv_import/scaffold/templates/show.html.erb.tt +59 -0
data/lib/smart_csv_import/configuration.rb +77 -0
data/lib/smart_csv_import/cosine_similarity.rb +15 -0
data/lib/smart_csv_import/engine.rb +12 -0
data/lib/smart_csv_import/failed_row_exporter.rb +78 -0
data/lib/smart_csv_import/file_storage.rb +34 -0
data/lib/smart_csv_import/header_normalizer.rb +76 -0
data/lib/smart_csv_import/logging.rb +37 -0
data/lib/smart_csv_import/match_result.rb +36 -0
data/lib/smart_csv_import/matchable.rb +76 -0
data/lib/smart_csv_import/matcher.rb +198 -0
data/lib/smart_csv_import/normalizers/boolean_converter.rb +26 -0
data/lib/smart_csv_import/normalizers/date_converter.rb +28 -0
data/lib/smart_csv_import/notifications.rb +16 -0
data/lib/smart_csv_import/processor/csv_preflight_analyzer.rb +74 -0
data/lib/smart_csv_import/processor/import_result_builder.rb +97 -0
data/lib/smart_csv_import/processor/mapping_review_policy.rb +90 -0
data/lib/smart_csv_import/processor/nil_cell_counter.rb +19 -0
data/lib/smart_csv_import/processor/null_progress_callback.rb +11 -0
data/lib/smart_csv_import/processor/row_processor.rb +70 -0
data/lib/smart_csv_import/processor.rb +294 -0
data/lib/smart_csv_import/result.rb +101 -0
data/lib/smart_csv_import/stability_report.rb +104 -0
data/lib/smart_csv_import/strategies/llm.rb +106 -0
data/lib/smart_csv_import/strategies/lookup.rb +41 -0
data/lib/smart_csv_import/strategies/vector.rb +155 -0
data/lib/smart_csv_import/strategy.rb +9 -0
data/lib/smart_csv_import/strategy_failure.rb +13 -0
data/lib/smart_csv_import/version.rb +5 -0
data/lib/smart_csv_import.rb +79 -0
data/smart_csv_import.gemspec +35 -0
metadata +216 -0

data/lib/smart_csv_import/stability_report.rb ADDED Viewed

@@ -0,0 +1,104 @@
+# frozen_string_literal: true
+module SmartCsvImport
+  StableField = Struct.new(:csv_header, :target_field, :strategy, :consistency_rate, keyword_init: true)
+  UnstableField = Struct.new(:csv_header, :resolutions, keyword_init: true)
+  StabilityAnalysis = Struct.new(:import_type, :imports_analyzed, :stable_fields, :unstable_fields, keyword_init: true)
+  class StabilityReport
+    STABILITY_THRESHOLD = 0.9
+    ANALYZABLE_STATUSES = %w[completed partial_failure].freeze
+    def initialize(import_type:, lookback: 20)
+      @import_type = import_type
+      @lookback = lookback
+    end
+    def analyze
+      imports = fetch_imports
+      header_tallies = tally_header_mappings(imports)
+      total = imports.length
+      stable_fields = []
+      unstable_fields = []
+      header_tallies.each do |csv_header, target_counts|
+        top_target, top_count = target_counts.max_by { |_, count| count }
+        consistency_rate = (top_count.to_f / total).round(4)
+        if consistency_rate >= STABILITY_THRESHOLD
+          stable_fields << StableField.new(
+            csv_header: csv_header,
+            target_field: top_target.to_sym,
+            strategy: nil,
+            consistency_rate: consistency_rate
+          )
+        else
+          resolutions = target_counts.map { |target, count| { target: target, count: count } }
+          unstable_fields << UnstableField.new(
+            csv_header: csv_header,
+            resolutions: resolutions
+          )
+        end
+      end
+      StabilityAnalysis.new(
+        import_type: @import_type,
+        imports_analyzed: total,
+        stable_fields: stable_fields,
+        unstable_fields: unstable_fields
+      )
+    end
+    def summary
+      analysis = analyze
+      if analysis.imports_analyzed.zero?
+        return "No completed imports found for #{@import_type}."
+      end
+      lines = ["Stability report for #{@import_type} (#{analysis.imports_analyzed} imports analyzed):"]
+      if analysis.stable_fields.any?
+        lines << "  Stable fields (#{analysis.stable_fields.length}):"
+        analysis.stable_fields.each do |field|
+          lines << "    - #{field.csv_header} → #{field.target_field} (#{(field.consistency_rate * 100).round(1)}% consistent)"
+        end
+      end
+      if analysis.unstable_fields.any?
+        lines << "  Unstable fields (#{analysis.unstable_fields.length}):"
+        analysis.unstable_fields.each do |field|
+          resolutions_desc = field.resolutions.map { |r| "#{r[:target]}(#{r[:count]})" }.join(", ")
+          lines << "    - #{field.csv_header}: #{resolutions_desc}"
+        end
+      end
+      lines.join("\n")
+    end
+    private
+    def fetch_imports
+      Import
+        .where(import_type: @import_type, status: ANALYZABLE_STATUSES)
+        .where.not(header_mappings: [nil, {}])
+        .order(created_at: :desc)
+        .limit(@lookback)
+        .to_a
+    end
+    def tally_header_mappings(imports)
+      imports.each_with_object(Hash.new { |h, k| h[k] = Hash.new(0) }) do |import, tallies|
+        mappings = import.header_mappings
+        next unless mappings.is_a?(Hash)
+        mappings.each do |csv_header, target_field|
+          next if target_field.nil?
+          tallies[csv_header][target_field] += 1
+        end
+      end
+    end
+  end
+end

data/lib/smart_csv_import/strategies/llm.rb ADDED Viewed

@@ -0,0 +1,106 @@
+# frozen_string_literal: true
+require "json"
+require "ruby_llm"
+module SmartCsvImport
+  module Strategies
+    class Llm < Strategy
+      include Logging
+      # Why we do NOT use HyDE (Hypothetical Document Embeddings) here:
+      #
+      # HyDE would ask the LLM to generate a description of each header in
+      # isolation, then compare those descriptions to field descriptions via
+      # embeddings. It was trialled and rejected for two reasons:
+      #
+      # 1. It throws away the best signal we have. The LLM here already sees
+      #    both sides — all headers AND all field definitions — in one prompt.
+      #    That cross-field context is what disambiguates genuinely ambiguous
+      #    headers. "Cell" next to first_name/last_name/email is clearly a
+      #    phone number. "Cell" described in isolation could be a phone, a
+      #    prison cell, or a biological cell — the LLM can't know which.
+      #
+      # 2. It adds indirection without benefit. Direct matching lets the LLM
+      #    reason holistically. HyDE turns that into a blind embedding lookup
+      #    that loses the reasoning context.
+      #
+      # The right path for genuinely ambiguous headers is: enrich this prompt
+      # with business context (csv_source, csv_context on the form class) so
+      # the LLM has more signal — not strip signal away via HyDE. If even that
+      # isn't enough, surface the header as UnmatchedResult for human review.
+      def match(csv_headers:, form_class:, sample_rows: [])
+        field_definitions = form_class.csv_fields
+        return {} if field_definitions.empty?
+        prompt = build_prompt(csv_headers, field_definitions, form_class)
+        response = fetch_llm_response(prompt)
+        parse_response(response, csv_headers)
+      rescue StandardError => e
+        log_error("LLM strategy failed: #{e.message}")
+        {}
+      end
+      private
+      def build_prompt(csv_headers, field_definitions, form_class)
+        fields_desc  = field_definitions.map { |name, defn| "- #{name}: #{defn.description}" }.join("\n")
+        context_line = form_class.csv_context ? "\nBusiness context: #{form_class.csv_context}" : ""
+        source_line  = form_class.csv_source  ? "\nCSV source: #{form_class.csv_source}"       : ""
+        <<~PROMPT
+          You are a CSV column matching assistant. Match the following CSV headers to the target fields.#{context_line}#{source_line}
+          CSV Headers:
+          #{csv_headers.map { |h| "- #{h}" }.join("\n")}
+          Target Fields:
+          #{fields_desc}
+          Return a JSON object with this exact format:
+          {"mappings": {"<csv_header>": {"field": "<field_name>", "confidence": <0.0-1.0>}}}
+          Only include headers you can confidently match. Return valid JSON only, no other text.
+        PROMPT
+      end
+      def fetch_llm_response(prompt)
+        model = SmartCsvImport.configuration.llm_model
+        log_info("Sending prompt to #{model}")
+        chat     = RubyLLM.chat(model: model)
+        response = chat.ask(prompt)
+        log_info("Received response (#{response.content.length} chars)")
+        response
+      end
+      def parse_response(response, csv_headers)
+        content  = strip_code_fences(response.content)
+        data     = JSON.parse(content)
+        mappings = data["mappings"] || {}
+        csv_headers.each_with_object({}) do |header, results|
+          mapping = mappings[header]
+          next unless mapping
+          confidence = mapping["confidence"].to_f.clamp(0.0, 1.0)
+          field      = mapping["field"]&.to_sym
+          next unless field && confidence > 0
+          results[header] = MatchResult.matched(
+            target_field: field,
+            confidence: confidence,
+            strategy_name: "llm"
+          )
+        end
+      rescue JSON::ParserError => e
+        log_error("Failed to parse LLM response: #{e.message}")
+        {}
+      end
+      def strip_code_fences(text)
+        text.gsub(/\A\s*```\w*\n/, "").gsub(/\n```\s*\z/, "")
+      end
+    end
+  end
+end

data/lib/smart_csv_import/strategies/lookup.rb ADDED Viewed

@@ -0,0 +1,41 @@
+# frozen_string_literal: true
+module SmartCsvImport
+  module Strategies
+    class Lookup < Strategy
+      class << self
+        def mappings(hash = nil)
+          return @defined_mappings ||= {} unless hash
+          @defined_mappings = hash.each_with_object({}) do |(header, field), acc|
+            acc[header.downcase] = field
+          end
+        end
+        def defined_mappings
+          @defined_mappings ||= {}
+        end
+        def inherited(subclass)
+          super
+          subclass.instance_variable_set(:@defined_mappings, {})
+        end
+      end
+      def match(csv_headers:, form_class:, sample_rows: [])
+        mappings = self.class.defined_mappings
+        csv_headers.each_with_object({}) do |header, results|
+          field = mappings[header.downcase]
+          next unless field
+          results[header] = MatchResult.matched(
+            target_field: field,
+            confidence: 1.0,
+            strategy_name: "lookup"
+          )
+        end
+      end
+    end
+  end
+end

data/lib/smart_csv_import/strategies/vector.rb ADDED Viewed

@@ -0,0 +1,155 @@
+# frozen_string_literal: true
+require "json"
+require "digest"
+require "fileutils"
+require "ruby_llm"
+require "faraday"
+require_relative "../header_normalizer"
+module SmartCsvImport
+  module Strategies
+    # Header-matching strategy based on embedding similarity.
+    #
+    # Headers whose normalized form equals a humanized field name are matched
+    # directly; the rest are embedded and matched by cosine similarity, keeping
+    # only pairs where header and field are each other's best candidate.
+    class Vector < Strategy
+      include Logging
+      # @param csv_headers [Array] CSV header names
+      # @param form_class [Class] must respond to csv_fields
+      # @param sample_rows [Array] accepted for interface parity; not used
+      # @return [Hash, StrategyFailure] header => MatchResult, or a
+      #   StrategyFailure when RubyLLM or Faraday raises
+      def match(csv_headers:, form_class:, sample_rows: [])
+        field_definitions = form_class.csv_fields
+        return {} if field_definitions.empty?
+        field_names = field_definitions.keys
+        humanized_names = field_names.map { |name| name.to_s.tr("_", " ") }
+        # Index humanized names for O(1) exact-match lookup
+        humanized_index = humanized_names.each_with_index.to_h { |name, i| [name.downcase, field_names[i]] }
+        results = {}
+        needs_embedding = []
+        csv_headers.each do |header|
+          normalized = HeaderNormalizer.normalize(header)
+          if (field = humanized_index[normalized.downcase])
+            log_info("Exact match: '#{header}' → :#{field} (normalized: '#{normalized}')")
+            results[header] = MatchResult.matched(
+              target_field: field,
+              confidence: 1.0,
+              strategy_name: "vector"
+            )
+          else
+            needs_embedding << header
+          end
+        end
+        return results if needs_embedding.empty?
+        field_embeddings = fetch_field_embeddings(humanized_names, field_names)
+        normalized_remaining = needs_embedding.map { |h| HeaderNormalizer.normalize(h) }
+        # Embed each distinct normalized header once, then map back to the original headers.
+        raw_header_embeddings = compute_embeddings(normalized_remaining.uniq)
+        header_embeddings = needs_embedding.zip(normalized_remaining).to_h do |orig, norm|
+          [orig, raw_header_embeddings[norm]]
+        end
+        # Build full score matrix so we can check both directions
+        # NOTE(review): fields already claimed by an exact match above are still
+        # candidates here, so two headers can end up on the same field — confirm
+        # whether a later stage resolves that.
+        score_matrix = needs_embedding.each_with_object({}) do |header, matrix|
+          header_vec = header_embeddings[header]
+          next unless header_vec
+          matrix[header] = field_names.each_with_object({}) do |field_name, scores|
+            field_vec = field_embeddings[field_name]
+            scores[field_name] = CosineSimilarity.call(header_vec, field_vec) if field_vec
+          end
+        end
+        # Best field for each header
+        best_field_for = score_matrix.transform_values { |scores| scores.max_by { |_, s| s }&.first }
+        # Best header for each field (among headers needing embedding)
+        best_header_for = field_names.each_with_object({}) do |field_name, bh|
+          bh[field_name] = score_matrix.max_by { |_, scores| scores[field_name] || -1 }&.first
+        end
+        needs_embedding.each do |header|
+          best_field = best_field_for[header]
+          next unless best_field
+          score = score_matrix[header][best_field]
+          # Accept only mutual best matches.
+          unless best_header_for[best_field] == header
+            log_info("Non-mutual: '#{header}' → :#{best_field} (#{score.round(4)}) — field's best header is '#{best_header_for[best_field]}'")
+            next
+          end
+          results[header] = MatchResult.matched(
+            target_field: best_field,
+            confidence: score.round(4),
+            strategy_name: "vector"
+          )
+        end
+        results
+      rescue RubyLLM::Error, Faraday::Error => e
+        log_error("Vector strategy errored (#{e.class}): #{e.message}")
+        StrategyFailure.new(strategy_name: "vector", error: e)
+      end
+      private
+      # Returns field name => embedding, from the on-disk cache when available,
+      # otherwise computed and then written to the cache.
+      def fetch_field_embeddings(embed_texts, field_names)
+        cache = load_cache(embed_texts)
+        if cache
+          log_info("Using cached field embeddings (#{cache.size} fields)")
+          return cache
+        end
+        log_info("No embedding cache found, computing fresh")
+        embeddings = compute_embeddings(embed_texts)
+        result = field_names.zip(embed_texts).each_with_object({}) do |(name, text), acc|
+          acc[name] = embeddings[text]
+        end
+        save_cache(embed_texts, result)
+        result
+      end
+      # Embeds the texts with the configured embedding model.
+      #
+      # @return [Hash] text => vector, pairing texts and vectors by position
+      def compute_embeddings(texts)
+        model = SmartCsvImport.configuration.embedding_model
+        log_info("Computing embeddings for #{texts.length} texts via #{model}")
+        response = RubyLLM.embed(texts, model: model)
+        vectors = response.vectors
+        log_info("Received #{vectors.length} embedding vectors")
+        texts.zip(vectors).to_h
+      end
+      # Directory under the configured storage path that holds cache files.
+      def cache_dir
+        File.join(SmartCsvImport.configuration.storage_path, "embeddings_cache")
+      end
+      # Digest identifying a cache entry.
+      #
+      # The embedding model is part of the key: vectors produced by different
+      # models are not comparable, so a model change must not reuse old entries.
+      def cache_key(texts)
+        model = SmartCsvImport.configuration.embedding_model
+        Digest::SHA256.hexdigest([model, *texts.sort].join("|"))
+      end
+      # Reads the cached embeddings for these texts.
+      #
+      # @return [Hash, nil] symbol-keyed embeddings, or nil when the cache file
+      #   is missing or cannot be read/parsed
+      def load_cache(texts)
+        key = cache_key(texts)
+        path = File.join(cache_dir, "#{key}.json")
+        return nil unless File.exist?(path)
+        data = JSON.parse(File.read(path))
+        data.transform_keys(&:to_sym)
+      rescue StandardError => e
+        # Best effort: an unusable cache is treated as a miss, but is logged like a failed save.
+        log_error("Failed to load embedding cache: #{e.message}")
+        nil
+      end
+      # Writes the embeddings to the cache; failures are logged and otherwise ignored.
+      def save_cache(texts, embeddings)
+        FileUtils.mkdir_p(cache_dir)
+        key = cache_key(texts)
+        path = File.join(cache_dir, "#{key}.json")
+        serializable = embeddings.transform_keys(&:to_s)
+        File.write(path, JSON.generate(serializable))
+      rescue StandardError => e
+        log_error("Failed to save embedding cache: #{e.message}")
+      end
+    end
+  end
+end

data/lib/smart_csv_import/strategy.rb ADDED Viewed

@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+module SmartCsvImport
+  class Strategy
+    def match(csv_headers:, form_class:, sample_rows: [])
+      raise NotImplementedError, "#{self.class}#match must be implemented"
+    end
+  end
+end

data/lib/smart_csv_import/strategy_failure.rb ADDED Viewed

@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+module SmartCsvImport
+  StrategyFailure = Struct.new(:strategy_name, :error, keyword_init: true) do
+    def reason
+      "#{error.class}: #{error.message}"
+    end
+    def failure?
+      true
+    end
+  end
+end

data/lib/smart_csv_import/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module SmartCsvImport
+  VERSION = "0.1.0" # Gem version string; the gemspec assigns it to spec.version.
+end

data/lib/smart_csv_import.rb ADDED Viewed

@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+require "logger"
+require "active_support"
+require "active_support/core_ext/module/attribute_accessors"
+require "active_support/core_ext/time/calculations"
+require_relative "smart_csv_import/version"
+require_relative "smart_csv_import/engine" if defined?(Rails)
+module SmartCsvImport
+  class Error < StandardError; end
+  class ConfigurationError < Error; end
+  class << self
+    attr_writer :configuration, :base_model_class
+    def configuration
+      @configuration ||= Configuration.new
+    end
+    def configure
+      yield(configuration)
+    end
+    def reset_configuration!
+      @configuration = Configuration.new
+      @base_model_class = nil
+    end
+    def base_model_class
+      klass = @base_model_class || ActiveRecord::Base
+      klass.is_a?(String) ? klass.constantize : klass
+    end
+    def process(file_path, form_class:, mode: :sync, batch_size: configuration.batch_size, dry_run: false, confirmed_mappings: nil)
+      Processor.new(
+        file_path: file_path,
+        form_class: form_class,
+        mode: mode,
+        batch_size: batch_size,
+        dry_run: dry_run,
+        confirmed_mappings: confirmed_mappings
+      ).call
+    end
+    def match_headers(file_path, form_class:)
+      Matcher.new(file_path: file_path, form_class: form_class).call
+    end
+  end
+end
+require_relative "smart_csv_import/configuration"
+require_relative "smart_csv_import/logging"
+require_relative "smart_csv_import/result"
+require_relative "smart_csv_import/match_result"
+require_relative "smart_csv_import/strategy"
+require_relative "smart_csv_import/strategy_failure"
+require_relative "smart_csv_import/matchable"
+# User-space normalizer utilities.
+# Not called by the processing pipeline — available for use in form objects
+# or ETL scripts: SmartCsvImport::Normalizers::BooleanConverter.new.call(value)
+require_relative "smart_csv_import/normalizers/date_converter"
+require_relative "smart_csv_import/normalizers/boolean_converter"
+require_relative "smart_csv_import/file_storage"
+require_relative "smart_csv_import/cosine_similarity"
+require_relative "smart_csv_import/strategies/lookup"
+require_relative "smart_csv_import/strategies/vector"
+require_relative "smart_csv_import/strategies/llm"
+require_relative "smart_csv_import/matcher"
+require_relative "smart_csv_import/processor"
+require_relative "smart_csv_import/processor/nil_cell_counter"
+require_relative "smart_csv_import/processor/null_progress_callback"
+require_relative "smart_csv_import/processor/row_processor"
+require_relative "smart_csv_import/processor/mapping_review_policy"
+require_relative "smart_csv_import/processor/csv_preflight_analyzer"
+require_relative "smart_csv_import/processor/import_result_builder"
+require_relative "smart_csv_import/failed_row_exporter"
+require_relative "smart_csv_import/stability_report"

data/smart_csv_import.gemspec ADDED Viewed

@@ -0,0 +1,35 @@
+require_relative "lib/smart_csv_import/version"
+Gem::Specification.new do |spec|
+  spec.name = "smart_csv_import"
+  spec.version = SmartCsvImport::VERSION
+  spec.authors = ["Nico Roulston"]
+  spec.email = ["nicolas.roulston@gmail.com"]
+  spec.homepage = "https://github.com/Nroulston/smart_csv_import"
+  spec.summary = "AI-powered CSV import with automatic header matching for Rails"
+  spec.description = "A Rails Engine wrapping SmarterCSV for CSV importing with " \
+                     "three-tier AI header matching (lookup + vector similarity + LLM fallback), " \
+                     "batch processing, and import tracking. Only headers are sent to AI — " \
+                     "row data never leaves your application."
+  spec.license = "MIT"
+  spec.metadata = {
+    "bug_tracker_uri" => "https://github.com/Nroulston/smart_csv_import/issues",
+    "changelog_uri" => "https://github.com/Nroulston/smart_csv_import/blob/main/CHANGELOG.md",
+    "source_code_uri" => "https://github.com/Nroulston/smart_csv_import",
+    "rubygems_mfa_required" => "true"
+  }
+  spec.required_ruby_version = ">= 3.1"
+  spec.add_dependency "activemodel", ">= 7.0", "< 9"
+  spec.add_dependency "activerecord", ">= 7.0", "< 9"
+  spec.add_dependency "activejob", ">= 7.0", "< 9"
+  spec.add_dependency "activesupport", ">= 7.0", "< 9"
+  spec.add_dependency "csv", "~> 3.0"
+  spec.add_dependency "smarter_csv", "~> 1.10"
+  spec.add_dependency "ruby_llm", "~> 1.0"
+  spec.extra_rdoc_files = Dir["README*", "LICENSE*"]
+  spec.files = Dir["*.gemspec", "lib/**/*", "app/**/*", "config/**/*", "db/**/*"]
+end