RubyGems - codebase_index - Versions diffs - 0.1.0 - Mend

codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +29 -0
data/CODE_OF_CONDUCT.md +83 -0
data/CONTRIBUTING.md +65 -0
data/LICENSE.txt +21 -0
data/README.md +481 -0
data/exe/codebase-console-mcp +22 -0
data/exe/codebase-index-mcp +61 -0
data/exe/codebase-index-mcp-http +64 -0
data/exe/codebase-index-mcp-start +58 -0
data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
data/lib/codebase_index/ast/method_extractor.rb +76 -0
data/lib/codebase_index/ast/node.rb +88 -0
data/lib/codebase_index/ast/parser.rb +653 -0
data/lib/codebase_index/ast.rb +6 -0
data/lib/codebase_index/builder.rb +137 -0
data/lib/codebase_index/chunking/chunk.rb +84 -0
data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
data/lib/codebase_index/console/audit_logger.rb +75 -0
data/lib/codebase_index/console/bridge.rb +170 -0
data/lib/codebase_index/console/confirmation.rb +90 -0
data/lib/codebase_index/console/connection_manager.rb +173 -0
data/lib/codebase_index/console/console_response_renderer.rb +78 -0
data/lib/codebase_index/console/model_validator.rb +81 -0
data/lib/codebase_index/console/safe_context.rb +82 -0
data/lib/codebase_index/console/server.rb +557 -0
data/lib/codebase_index/console/sql_validator.rb +172 -0
data/lib/codebase_index/console/tools/tier1.rb +118 -0
data/lib/codebase_index/console/tools/tier2.rb +117 -0
data/lib/codebase_index/console/tools/tier3.rb +110 -0
data/lib/codebase_index/console/tools/tier4.rb +79 -0
data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
data/lib/codebase_index/cost_model/estimator.rb +128 -0
data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
data/lib/codebase_index/cost_model.rb +22 -0
data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
data/lib/codebase_index/db/migrator.rb +71 -0
data/lib/codebase_index/db/schema_version.rb +73 -0
data/lib/codebase_index/dependency_graph.rb +227 -0
data/lib/codebase_index/embedding/indexer.rb +130 -0
data/lib/codebase_index/embedding/openai.rb +105 -0
data/lib/codebase_index/embedding/provider.rb +135 -0
data/lib/codebase_index/embedding/text_preparer.rb +112 -0
data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
data/lib/codebase_index/evaluation/evaluator.rb +146 -0
data/lib/codebase_index/evaluation/metrics.rb +79 -0
data/lib/codebase_index/evaluation/query_set.rb +148 -0
data/lib/codebase_index/evaluation/report_generator.rb +90 -0
data/lib/codebase_index/extracted_unit.rb +145 -0
data/lib/codebase_index/extractor.rb +956 -0
data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
data/lib/codebase_index/extractors/event_extractor.rb +211 -0
data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
data/lib/codebase_index/extractors/job_extractor.rb +369 -0
data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
data/lib/codebase_index/extractors/model_extractor.rb +960 -0
data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
data/lib/codebase_index/extractors/route_extractor.rb +181 -0
data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
data/lib/codebase_index/extractors/service_extractor.rb +254 -0
data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
data/lib/codebase_index/feedback/gap_detector.rb +89 -0
data/lib/codebase_index/feedback/store.rb +119 -0
data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
data/lib/codebase_index/flow_assembler.rb +290 -0
data/lib/codebase_index/flow_document.rb +191 -0
data/lib/codebase_index/flow_precomputer.rb +102 -0
data/lib/codebase_index/formatting/base.rb +40 -0
data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
data/lib/codebase_index/formatting/human_adapter.rb +78 -0
data/lib/codebase_index/graph_analyzer.rb +374 -0
data/lib/codebase_index/mcp/index_reader.rb +394 -0
data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
data/lib/codebase_index/mcp/server.rb +935 -0
data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
data/lib/codebase_index/model_name_cache.rb +51 -0
data/lib/codebase_index/notion/client.rb +217 -0
data/lib/codebase_index/notion/exporter.rb +219 -0
data/lib/codebase_index/notion/mapper.rb +39 -0
data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
data/lib/codebase_index/notion/rate_limiter.rb +68 -0
data/lib/codebase_index/observability/health_check.rb +81 -0
data/lib/codebase_index/observability/instrumentation.rb +34 -0
data/lib/codebase_index/observability/structured_logger.rb +75 -0
data/lib/codebase_index/operator/error_escalator.rb +81 -0
data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
data/lib/codebase_index/operator/status_reporter.rb +80 -0
data/lib/codebase_index/railtie.rb +26 -0
data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
data/lib/codebase_index/resilience/index_validator.rb +185 -0
data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
data/lib/codebase_index/retrieval/ranker.rb +273 -0
data/lib/codebase_index/retrieval/search_executor.rb +327 -0
data/lib/codebase_index/retriever.rb +160 -0
data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
data/lib/codebase_index/ruby_analyzer.rb +87 -0
data/lib/codebase_index/session_tracer/file_store.rb +111 -0
data/lib/codebase_index/session_tracer/middleware.rb +143 -0
data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
data/lib/codebase_index/session_tracer/store.rb +67 -0
data/lib/codebase_index/storage/graph_store.rb +120 -0
data/lib/codebase_index/storage/metadata_store.rb +169 -0
data/lib/codebase_index/storage/pgvector.rb +163 -0
data/lib/codebase_index/storage/qdrant.rb +172 -0
data/lib/codebase_index/storage/vector_store.rb +156 -0
data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
data/lib/codebase_index/version.rb +5 -0
data/lib/codebase_index.rb +223 -0
data/lib/generators/codebase_index/install_generator.rb +32 -0
data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
data/lib/tasks/codebase_index.rake +583 -0
data/lib/tasks/codebase_index_evaluation.rake +115 -0
metadata +252 -0

data/lib/codebase_index/db/migrator.rb ADDED Viewed

@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+require_relative 'schema_version'
+require_relative 'migrations/001_create_units'
+require_relative 'migrations/002_create_edges'
+require_relative 'migrations/003_create_embeddings'
+require_relative 'migrations/004_create_snapshots'
+require_relative 'migrations/005_create_snapshot_units'
+module CodebaseIndex
+  module Db
+    # Runs schema migrations against a database connection.
+    #
+    # Tracks applied migrations via {SchemaVersion} and only runs pending ones.
+    # Migrations are defined as modules in `db/migrations/` with a VERSION
+    # constant and a `.up(connection)` class method.
+    #
+    # @example
+    #   db = SQLite3::Database.new('codebase_index.db')
+    #   migrator = Migrator.new(connection: db)
+    #   migrator.migrate!  # => [1, 2, 3]
+    #
+    class Migrator
+      MIGRATIONS = [
+        Migrations::CreateUnits,
+        Migrations::CreateEdges,
+        Migrations::CreateEmbeddings,
+        Migrations::CreateSnapshots,
+        Migrations::CreateSnapshotUnits
+      ].freeze
+      attr_reader :schema_version
+      # @param connection [Object] Database connection supporting #execute
+      def initialize(connection:)
+        @connection = connection
+        @schema_version = SchemaVersion.new(connection: connection)
+        @schema_version.ensure_table!
+      end
+      # Run all pending migrations.
+      #
+      # @return [Array<Integer>] Version numbers of newly applied migrations
+      def migrate!
+        applied = []
+        pending_migrations.each do |migration|
+          migration.up(@connection)
+          @schema_version.record_version(migration::VERSION)
+          applied << migration::VERSION
+        end
+        applied
+      end
+      # List version numbers of pending (unapplied) migrations.
+      #
+      # @return [Array<Integer>]
+      def pending_versions
+        applied = @schema_version.applied_versions
+        MIGRATIONS.map { |m| m::VERSION }.reject { |v| applied.include?(v) }
+      end
+      private
+      # @return [Array<Module>] Pending migration modules
+      def pending_migrations
+        applied = @schema_version.applied_versions
+        MIGRATIONS.reject { |m| applied.include?(m::VERSION) }
+      end
+    end
+  end
+end

data/lib/codebase_index/db/schema_version.rb ADDED Viewed

@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+module CodebaseIndex
+  module Db
+    # Tracks which schema migrations have been applied.
+    #
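+    # Uses a simple `codebase_index_schema_migrations` table with an integer
+    # `version` primary key and an `applied_at` timestamp column. The connection
+    # must support `execute` and return rows as arrays or hashes. Note that the
+    # SQL issued here (`datetime('now')`, `INSERT OR IGNORE`) is SQLite dialect,
+    # so other drivers (pg, mysql2) need an adapter that accepts it.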
+    #
+    # @example
+    #   db = SQLite3::Database.new('codebase_index.db')
+    #   sv = SchemaVersion.new(connection: db)
+    #   sv.ensure_table!
+    #   sv.current_version  # => 0
+    #   sv.record_version(1)
+    #   sv.current_version  # => 1
+    #
+    class SchemaVersion
+      TABLE_NAME = 'codebase_index_schema_migrations'
+      # @param connection [Object] Database connection supporting #execute
+      def initialize(connection:)
+        @connection = connection
+      end
+      # Create the schema migrations table if it does not exist.
+      #
+      # @return [void]
+      def ensure_table!
+        @connection.execute(<<~SQL)
+          CREATE TABLE IF NOT EXISTS #{TABLE_NAME} (
+            version INTEGER PRIMARY KEY NOT NULL,
+            applied_at TEXT NOT NULL DEFAULT (datetime('now'))
+          )
+        SQL
+      end
+      # List all applied migration version numbers, sorted ascending.
+      #
+      # @return [Array<Integer>]
+      def applied_versions
+        rows = @connection.execute("SELECT version FROM #{TABLE_NAME} ORDER BY version ASC")
+        rows.map { |row| row.is_a?(Array) ? row[0] : row['version'] }
+      end
+      # Record a migration version as applied.
+      #
+      # @param version [Integer] The migration version number
+      # @return [void]
+      def record_version(version)
+        @connection.execute(
+          "INSERT OR IGNORE INTO #{TABLE_NAME} (version) VALUES (?)", [version]
+        )
+      end
+      # Check whether a version has been applied.
+      #
+      # @param version [Integer]
+      # @return [Boolean]
+      def applied?(version)
+        applied_versions.include?(version)
+      end
+      # The highest applied version, or 0 if none.
+      #
+      # @return [Integer]
+      def current_version
+        applied_versions.last || 0
+      end
+    end
+  end
+end

data/lib/codebase_index/dependency_graph.rb ADDED Viewed

@@ -0,0 +1,227 @@
+# frozen_string_literal: true
+require 'set'
+require 'json'
+module CodebaseIndex
+  # DependencyGraph tracks relationships between code units for:
+  # 1. Understanding what depends on what
+  # 2. Computing "blast radius" for incremental re-indexing
+  # 3. Enabling graph-based retrieval queries
+  #
+  # The graph is bidirectional - we track both what a unit depends on
+  # and what depends on that unit (reverse edges).
+  #
+  # @example Building and querying the graph
+  #   graph = DependencyGraph.new
+  #   graph.register(user_model_unit)
+  #   graph.register(user_service_unit)
+  #
+  #   # Find everything affected by a change to user.rb
+  #   affected = graph.affected_by(["app/models/user.rb"])
+  #
+  class DependencyGraph
+    def initialize
+      @nodes = {}      # identifier => { type:, file_path: }
+      @edges = {}      # identifier => [dependency identifiers]
+      @reverse = {}    # identifier => [dependent identifiers]
+      @file_map = {}   # file_path => identifier
+      @type_index = {} # type => [identifiers]
+    end
+    # Register a unit in the graph
+    #
+    # @param unit [ExtractedUnit] The unit to register
+    def register(unit)
+      @nodes[unit.identifier] = {
+        type: unit.type,
+        file_path: unit.file_path,
+        namespace: unit.namespace
+      }
+      @edges[unit.identifier] = unit.dependencies.map { |d| d[:target] }
+      @file_map[unit.file_path] = unit.identifier if unit.file_path
+      # Type index for filtering
+      @type_index[unit.type] ||= []
+      @type_index[unit.type] << unit.identifier unless @type_index[unit.type].include?(unit.identifier)
+      # Build reverse edges
+      unit.dependencies.each do |dep|
+        @reverse[dep[:target]] ||= []
+        @reverse[dep[:target]] << unit.identifier unless @reverse[dep[:target]].include?(unit.identifier)
+      end
+    end
+    # Find all units affected by changes to given files
+    # Uses BFS to find transitive dependents
+    #
+    # @param changed_files [Array<String>] List of changed file paths
+    # @param max_depth [Integer] Maximum traversal depth (nil for unlimited)
+    # @return [Array<String>] List of affected unit identifiers
+    def affected_by(changed_files, max_depth: nil)
+      directly_changed = changed_files.filter_map { |f| @file_map[f] }
+      affected = Set.new(directly_changed)
+      queue = directly_changed.map { |id| [id, 0] } # [identifier, depth]
+      while queue.any?
+        current, depth = queue.shift
+        next if max_depth && depth >= max_depth
+        dependents = @reverse[current] || []
+        dependents.each do |dep|
+          unless affected.include?(dep)
+            affected.add(dep)
+            queue.push([dep, depth + 1])
+          end
+        end
+      end
+      affected.to_a
+    end
+    # Check if a node exists in the graph by exact identifier.
+    #
+    # @param identifier [String] Unit identifier to check
+    # @return [Boolean] true if the node exists
+    def node_exists?(identifier)
+      @nodes.key?(identifier)
+    end
+    # Find a node by suffix matching (e.g., "Update" matches "Order::Update").
+    #
+    # When multiple nodes share the same suffix, the first match wins.
+    # Suffix matching requires a "::" separator — bare identifiers (no namespace)
+    # are not matched by this method; use {#node_exists?} for exact lookups.
+    #
+    # @param suffix [String] The suffix to match against
+    # @return [String, nil] The first matching identifier, or nil
+    def find_node_by_suffix(suffix)
+      target_suffix = "::#{suffix}"
+      @nodes.keys.find { |id| id.end_with?(target_suffix) }
+    end
+    # Get direct dependencies of a unit
+    #
+    # @param identifier [String] Unit identifier
+    # @return [Array<String>] List of dependency identifiers
+    def dependencies_of(identifier)
+      @edges[identifier] || []
+    end
+    # Get direct dependents of a unit (what depends on it)
+    #
+    # @param identifier [String] Unit identifier
+    # @return [Array<String>] List of dependent identifiers
+    def dependents_of(identifier)
+      @reverse[identifier] || []
+    end
+    # Get all units of a specific type
+    #
+    # @param type [Symbol] Unit type (:model, :controller, etc.)
+    # @return [Array<String>] List of unit identifiers
+    def units_of_type(type)
+      @type_index[type] || []
+    end
+    # Compute PageRank scores for all nodes
+    #
+    # Uses the reverse edges (dependents) as the link structure: a node
+    # with many dependents gets a higher score. This matches Aider's insight
+    # that structural importance correlates with retrieval relevance.
+    #
+    # @param damping [Float] Damping factor (default: 0.85)
+    # @param iterations [Integer] Number of iterations (default: 20)
+    # @return [Hash<String, Float>] Identifier => PageRank score
+    def pagerank(damping: 0.85, iterations: 20)
+      n = @nodes.size
+      return {} if n.zero?
+      base_score = 1.0 / n
+      scores = @nodes.keys.to_h { |id| [id, base_score] }
+      iterations.times do
+        # Collect rank from dangling nodes (no outgoing edges) and redistribute
+        dangling_sum = @nodes.keys.sum do |id|
+          @edges[id].nil? || @edges[id].empty? ? scores[id] : 0.0
+        end
+        new_scores = {}
+        @nodes.each_key do |id|
+          # Sum contributions from nodes that depend on this one
+          incoming = @reverse[id] || []
+          rank_sum = incoming.sum do |src|
+            out_degree = (@edges[src] || []).size
+            out_degree.positive? ? scores[src] / out_degree : 0.0
+          end
+          new_scores[id] = ((1.0 - damping) / n) + (damping * (rank_sum + (dangling_sum / n)))
+        end
+        scores = new_scores
+      end
+      scores
+    end
+    # Serialize graph for persistence
+    #
+    # @return [Hash] Complete graph data
+    def to_h
+      {
+        nodes: @nodes,
+        edges: @edges,
+        reverse: @reverse,
+        file_map: @file_map,
+        type_index: @type_index,
+        stats: {
+          node_count: @nodes.size,
+          edge_count: @edges.values.sum(&:size),
+          types: @type_index.transform_values(&:size)
+        }
+      }
+    end
+    # Load graph from persisted data
+    #
+    # After JSON round-trip all keys become strings. This method normalizes
+    # them back to the expected types: node values use symbol keys (:type,
+    # :file_path, :namespace), and type_index uses symbol keys for types.
+    #
+    # @param data [Hash] Previously serialized graph data
+    # @return [DependencyGraph] Restored graph
+    def self.from_h(data)
+      graph = new
+      raw_nodes = data[:nodes] || data['nodes'] || {}
+      graph.instance_variable_set(:@nodes, raw_nodes.transform_values { |v| symbolize_node(v) })
+      graph.instance_variable_set(:@edges, data[:edges] || data['edges'] || {})
+      graph.instance_variable_set(:@reverse, data[:reverse] || data['reverse'] || {})
+      graph.instance_variable_set(:@file_map, data[:file_map] || data['file_map'] || {})
+      raw_type_index = data[:type_index] || data['type_index'] || {}
+      graph.instance_variable_set(:@type_index, raw_type_index.transform_keys(&:to_sym))
+      graph
+    end
+    # Normalize a node hash to use symbol keys
+    #
+    # @param node [Hash] Node data with string or symbol keys
+    # @return [Hash] Node data with symbol keys
+    def self.symbolize_node(node)
+      return node unless node.is_a?(Hash)
+      {
+        type: (node[:type] || node['type'])&.to_sym,
+        file_path: node[:file_path] || node['file_path'],
+        namespace: node[:namespace] || node['namespace']
+      }
+    end
+  end
+end

data/lib/codebase_index/embedding/indexer.rb ADDED Viewed

@@ -0,0 +1,130 @@
+# frozen_string_literal: true
+require 'json'
+require 'digest'
+module CodebaseIndex
+  module Embedding
+    # Orchestrates the indexing pipeline: reads extracted units, prepares text,
+    # generates embeddings, and stores vectors. Supports full and incremental
+    # modes with checkpoint-based resumability.
+    class Indexer
+      def initialize(provider:, text_preparer:, vector_store:, output_dir:, batch_size: 32)
+        @provider = provider
+        @text_preparer = text_preparer
+        @vector_store = vector_store
+        @output_dir = output_dir
+        @batch_size = batch_size
+      end
+      # Index all extracted units (full mode). Returns stats hash.
+      # @return [Hash] Stats with :processed, :skipped, :errors counts
+      def index_all
+        process_units(load_units, incremental: false)
+      end
+      # Index only changed units (incremental mode). Returns stats hash.
+      # @return [Hash] Stats with :processed, :skipped, :errors counts
+      def index_incremental
+        process_units(load_units, incremental: true)
+      end
+      private
+      def load_units
+        Dir.glob(File.join(@output_dir, '**', '*.json')).filter_map do |path|
+          next if File.basename(path) == 'checkpoint.json'
+          JSON.parse(File.read(path))
+        rescue JSON::ParserError
+          nil
+        end
+      end
+      def process_units(units, incremental:)
+        checkpoint = incremental ? load_checkpoint : {}
+        stats = { processed: 0, skipped: 0, errors: 0 }
+        units.each_slice(@batch_size) do |batch|
+          process_batch(batch, checkpoint, stats, incremental: incremental)
+          save_checkpoint(checkpoint)
+        end
+        stats
+      end
+      def process_batch(batch, checkpoint, stats, incremental:)
+        to_embed = batch.each_with_object([]) do |unit_data, items|
+          if incremental && checkpoint[unit_data['identifier']] == unit_data['source_hash']
+            stats[:skipped] += 1
+            next
+          end
+          collect_embed_items(unit_data, items)
+        end
+        embed_and_store(to_embed, checkpoint, stats)
+      end
+      def collect_embed_items(unit_data, items)
+        texts = prepare_texts(unit_data)
+        identifier = unit_data['identifier']
+        texts.each_with_index do |text, idx|
+          embed_id = texts.length > 1 ? "#{identifier}#chunk_#{idx}" : identifier
+          items << { id: embed_id, text: text, unit_data: unit_data,
+                     source_hash: unit_data['source_hash'], identifier: identifier }
+        end
+      end
+      def prepare_texts(unit_data)
+        unit = build_unit(unit_data)
+        unit.chunks&.any? ? @text_preparer.prepare_chunks(unit) : [@text_preparer.prepare(unit)]
+      end
+      def build_unit(data)
+        unit = ExtractedUnit.new(type: data['type']&.to_sym, identifier: data['identifier'],
+                                 file_path: data['file_path'])
+        unit.namespace = data['namespace']
+        unit.source_code = data['source_code']
+        unit.dependencies = data['dependencies'] || []
+        unit.chunks = (data['chunks'] || []).map { |c| c.transform_keys(&:to_sym) }
+        unit
+      end
+      def embed_and_store(items, checkpoint, stats)
+        return if items.empty?
+        vectors = @provider.embed_batch(items.map { |i| i[:text] })
+        store_vectors(items, vectors, checkpoint, stats)
+      rescue StandardError => e
+        stats[:errors] += items.size
+        stats[:error_messages] ||= []
+        stats[:error_messages] << e.message
+        raise CodebaseIndex::Error, "Embedding failed: #{e.message}"
+      end
+      def store_vectors(items, vectors, checkpoint, stats)
+        items.each_with_index do |item, idx|
+          metadata = { type: item[:unit_data]['type'], identifier: item[:identifier],
+                       file_path: item[:unit_data]['file_path'] }
+          @vector_store.store(item[:id], vectors[idx], metadata)
+          checkpoint[item[:identifier]] = item[:source_hash]
+          stats[:processed] += 1
+        end
+      end
+      def load_checkpoint
+        path = File.join(@output_dir, 'checkpoint.json')
+        return {} unless File.exist?(path)
+        JSON.parse(File.read(path))
+      rescue JSON::ParserError
+        {}
+      end
+      def save_checkpoint(checkpoint)
+        File.write(File.join(@output_dir, 'checkpoint.json'), JSON.generate(checkpoint))
+      end
+    end
+  end
+end

data/lib/codebase_index/embedding/openai.rb ADDED Viewed

@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+require 'net/http'
+require 'json'
+module CodebaseIndex
+  module Embedding
+    module Provider
+      # OpenAI adapter for cloud embeddings via the OpenAI HTTP API.
+      #
+      # Uses the `/v1/embeddings` endpoint to generate embeddings. Requires a valid
+      # OpenAI API key.
+      #
+      # @example
+      #   provider = CodebaseIndex::Embedding::Provider::OpenAI.new(api_key: ENV['OPENAI_API_KEY'])
+      #   vector = provider.embed("class User < ApplicationRecord; end")
+      #   vectors = provider.embed_batch(["text1", "text2"])
+      class OpenAI
+        include Interface
+        ENDPOINT = URI('https://api.openai.com/v1/embeddings')
+        DEFAULT_MODEL = 'text-embedding-3-small'
+        # Known output vector sizes, keyed by model name.
+        DIMENSIONS = {
+          'text-embedding-3-small' => 1536,
+          'text-embedding-3-large' => 3072
+        }.freeze
+        # @param api_key [String] OpenAI API key
+        # @param model [String] OpenAI embedding model name (default: text-embedding-3-small)
+        def initialize(api_key:, model: DEFAULT_MODEL)
+          @api_key = api_key
+          @model = model
+        end
+        # Embed a single text string.
+        #
+        # @param text [String] the text to embed
+        # @return [Array<Float>] the embedding vector
+        # @raise [CodebaseIndex::Error] if the API returns an error
+        def embed(text)
+          response = post_request({ model: @model, input: text })
+          response['data'].first['embedding']
+        end
+        # Embed multiple texts in a single request.
+        #
+        # Sorts results by the index field to guarantee ordering matches input.
+        # A nil or empty input returns an empty array without calling the API.
+        #
+        # @param texts [Array<String>] the texts to embed
+        # @return [Array<Array<Float>>] array of embedding vectors
+        # @raise [CodebaseIndex::Error] if the API returns an error
+        def embed_batch(texts)
+          # Nothing to embed, so skip the HTTP round trip entirely.
+          return [] if texts.nil? || texts.empty?
+          response = post_request({ model: @model, input: texts })
+          response['data']
+            .sort_by { |item| item['index'] }
+            .map { |item| item['embedding'] }
+        end
+        # Return the dimensionality of vectors produced by this model.
+        #
+        # Uses the known dimensions for standard models, falling back to a
+        # test embedding for unknown models. The result is memoized so the
+        # probe request is made at most once per provider instance.
+        #
+        # @return [Integer] number of dimensions
+        def dimensions
+          @dimensions ||= DIMENSIONS[@model] || embed('test').length
+        end
+        # Return the model name.
+        #
+        # @return [String] the OpenAI model name
+        def model_name
+          @model
+        end
+        private
+        # Send a POST request to the OpenAI embeddings API.
+        #
+        # Uses a 10 second connect timeout and a 30 second read timeout.
+        #
+        # @param body [Hash] request body
+        # @return [Hash] parsed JSON response
+        # @raise [CodebaseIndex::Error] if the API returns a non-success status
+        def post_request(body)
+          http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
+          http.use_ssl = true
+          http.open_timeout = 10
+          http.read_timeout = 30
+          request = Net::HTTP::Post.new(ENDPOINT.path)
+          request['Content-Type'] = 'application/json'
+          request['Authorization'] = "Bearer #{@api_key}"
+          request.body = body.to_json
+          response = http.request(request)
+          unless response.is_a?(Net::HTTPSuccess)
+            raise CodebaseIndex::Error, "OpenAI API error: #{response.code} #{response.body}"
+          end
+          JSON.parse(response.body)
+        end
+      end
+    end
+  end
+end