RubyGems - codebase_index - Versions diffs - 0.3.2 → 0.4.0 - Mend

codebase_index 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (184) hide show

checksums.yaml +4 -4
data/lib/codebase_index.rb +3 -243
metadata +28 -223
data/CHANGELOG.md +0 -89
data/CODE_OF_CONDUCT.md +0 -83
data/CONTRIBUTING.md +0 -65
data/LICENSE.txt +0 -21
data/README.md +0 -325
data/exe/codebase-console +0 -59
data/exe/codebase-console-mcp +0 -22
data/exe/codebase-index-mcp +0 -34
data/exe/codebase-index-mcp-http +0 -37
data/exe/codebase-index-mcp-start +0 -58
data/lib/codebase_index/ast/call_site_extractor.rb +0 -106
data/lib/codebase_index/ast/method_extractor.rb +0 -71
data/lib/codebase_index/ast/node.rb +0 -116
data/lib/codebase_index/ast/parser.rb +0 -614
data/lib/codebase_index/ast.rb +0 -6
data/lib/codebase_index/builder.rb +0 -200
data/lib/codebase_index/cache/cache_middleware.rb +0 -199
data/lib/codebase_index/cache/cache_store.rb +0 -264
data/lib/codebase_index/cache/redis_cache_store.rb +0 -116
data/lib/codebase_index/cache/solid_cache_store.rb +0 -111
data/lib/codebase_index/chunking/chunk.rb +0 -84
data/lib/codebase_index/chunking/semantic_chunker.rb +0 -295
data/lib/codebase_index/console/adapters/cache_adapter.rb +0 -58
data/lib/codebase_index/console/adapters/good_job_adapter.rb +0 -33
data/lib/codebase_index/console/adapters/job_adapter.rb +0 -68
data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +0 -33
data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +0 -33
data/lib/codebase_index/console/audit_logger.rb +0 -75
data/lib/codebase_index/console/bridge.rb +0 -177
data/lib/codebase_index/console/confirmation.rb +0 -90
data/lib/codebase_index/console/connection_manager.rb +0 -173
data/lib/codebase_index/console/console_response_renderer.rb +0 -74
data/lib/codebase_index/console/embedded_executor.rb +0 -373
data/lib/codebase_index/console/model_validator.rb +0 -81
data/lib/codebase_index/console/rack_middleware.rb +0 -87
data/lib/codebase_index/console/safe_context.rb +0 -82
data/lib/codebase_index/console/server.rb +0 -612
data/lib/codebase_index/console/sql_validator.rb +0 -172
data/lib/codebase_index/console/tools/tier1.rb +0 -118
data/lib/codebase_index/console/tools/tier2.rb +0 -117
data/lib/codebase_index/console/tools/tier3.rb +0 -110
data/lib/codebase_index/console/tools/tier4.rb +0 -79
data/lib/codebase_index/coordination/pipeline_lock.rb +0 -109
data/lib/codebase_index/cost_model/embedding_cost.rb +0 -88
data/lib/codebase_index/cost_model/estimator.rb +0 -128
data/lib/codebase_index/cost_model/provider_pricing.rb +0 -67
data/lib/codebase_index/cost_model/storage_cost.rb +0 -52
data/lib/codebase_index/cost_model.rb +0 -22
data/lib/codebase_index/db/migrations/001_create_units.rb +0 -38
data/lib/codebase_index/db/migrations/002_create_edges.rb +0 -35
data/lib/codebase_index/db/migrations/003_create_embeddings.rb +0 -37
data/lib/codebase_index/db/migrations/004_create_snapshots.rb +0 -45
data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +0 -40
data/lib/codebase_index/db/migrator.rb +0 -71
data/lib/codebase_index/db/schema_version.rb +0 -73
data/lib/codebase_index/dependency_graph.rb +0 -236
data/lib/codebase_index/embedding/indexer.rb +0 -140
data/lib/codebase_index/embedding/openai.rb +0 -126
data/lib/codebase_index/embedding/provider.rb +0 -162
data/lib/codebase_index/embedding/text_preparer.rb +0 -112
data/lib/codebase_index/evaluation/baseline_runner.rb +0 -115
data/lib/codebase_index/evaluation/evaluator.rb +0 -139
data/lib/codebase_index/evaluation/metrics.rb +0 -79
data/lib/codebase_index/evaluation/query_set.rb +0 -148
data/lib/codebase_index/evaluation/report_generator.rb +0 -90
data/lib/codebase_index/extracted_unit.rb +0 -145
data/lib/codebase_index/extractor.rb +0 -1028
data/lib/codebase_index/extractors/action_cable_extractor.rb +0 -201
data/lib/codebase_index/extractors/ast_source_extraction.rb +0 -46
data/lib/codebase_index/extractors/behavioral_profile.rb +0 -309
data/lib/codebase_index/extractors/caching_extractor.rb +0 -261
data/lib/codebase_index/extractors/callback_analyzer.rb +0 -246
data/lib/codebase_index/extractors/concern_extractor.rb +0 -292
data/lib/codebase_index/extractors/configuration_extractor.rb +0 -219
data/lib/codebase_index/extractors/controller_extractor.rb +0 -404
data/lib/codebase_index/extractors/database_view_extractor.rb +0 -278
data/lib/codebase_index/extractors/decorator_extractor.rb +0 -253
data/lib/codebase_index/extractors/engine_extractor.rb +0 -223
data/lib/codebase_index/extractors/event_extractor.rb +0 -211
data/lib/codebase_index/extractors/factory_extractor.rb +0 -289
data/lib/codebase_index/extractors/graphql_extractor.rb +0 -892
data/lib/codebase_index/extractors/i18n_extractor.rb +0 -117
data/lib/codebase_index/extractors/job_extractor.rb +0 -374
data/lib/codebase_index/extractors/lib_extractor.rb +0 -218
data/lib/codebase_index/extractors/mailer_extractor.rb +0 -269
data/lib/codebase_index/extractors/manager_extractor.rb +0 -188
data/lib/codebase_index/extractors/middleware_extractor.rb +0 -133
data/lib/codebase_index/extractors/migration_extractor.rb +0 -469
data/lib/codebase_index/extractors/model_extractor.rb +0 -988
data/lib/codebase_index/extractors/phlex_extractor.rb +0 -252
data/lib/codebase_index/extractors/policy_extractor.rb +0 -191
data/lib/codebase_index/extractors/poro_extractor.rb +0 -229
data/lib/codebase_index/extractors/pundit_extractor.rb +0 -223
data/lib/codebase_index/extractors/rails_source_extractor.rb +0 -473
data/lib/codebase_index/extractors/rake_task_extractor.rb +0 -343
data/lib/codebase_index/extractors/route_extractor.rb +0 -181
data/lib/codebase_index/extractors/scheduled_job_extractor.rb +0 -331
data/lib/codebase_index/extractors/serializer_extractor.rb +0 -339
data/lib/codebase_index/extractors/service_extractor.rb +0 -217
data/lib/codebase_index/extractors/shared_dependency_scanner.rb +0 -91
data/lib/codebase_index/extractors/shared_utility_methods.rb +0 -281
data/lib/codebase_index/extractors/state_machine_extractor.rb +0 -398
data/lib/codebase_index/extractors/test_mapping_extractor.rb +0 -225
data/lib/codebase_index/extractors/validator_extractor.rb +0 -211
data/lib/codebase_index/extractors/view_component_extractor.rb +0 -311
data/lib/codebase_index/extractors/view_template_extractor.rb +0 -261
data/lib/codebase_index/feedback/gap_detector.rb +0 -89
data/lib/codebase_index/feedback/store.rb +0 -119
data/lib/codebase_index/filename_utils.rb +0 -32
data/lib/codebase_index/flow_analysis/operation_extractor.rb +0 -206
data/lib/codebase_index/flow_analysis/response_code_mapper.rb +0 -154
data/lib/codebase_index/flow_assembler.rb +0 -290
data/lib/codebase_index/flow_document.rb +0 -191
data/lib/codebase_index/flow_precomputer.rb +0 -102
data/lib/codebase_index/formatting/base.rb +0 -30
data/lib/codebase_index/formatting/claude_adapter.rb +0 -98
data/lib/codebase_index/formatting/generic_adapter.rb +0 -56
data/lib/codebase_index/formatting/gpt_adapter.rb +0 -64
data/lib/codebase_index/formatting/human_adapter.rb +0 -78
data/lib/codebase_index/graph_analyzer.rb +0 -374
data/lib/codebase_index/mcp/bootstrapper.rb +0 -96
data/lib/codebase_index/mcp/index_reader.rb +0 -394
data/lib/codebase_index/mcp/renderers/claude_renderer.rb +0 -81
data/lib/codebase_index/mcp/renderers/json_renderer.rb +0 -17
data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +0 -353
data/lib/codebase_index/mcp/renderers/plain_renderer.rb +0 -240
data/lib/codebase_index/mcp/server.rb +0 -961
data/lib/codebase_index/mcp/tool_response_renderer.rb +0 -85
data/lib/codebase_index/model_name_cache.rb +0 -51
data/lib/codebase_index/notion/client.rb +0 -217
data/lib/codebase_index/notion/exporter.rb +0 -219
data/lib/codebase_index/notion/mapper.rb +0 -40
data/lib/codebase_index/notion/mappers/column_mapper.rb +0 -57
data/lib/codebase_index/notion/mappers/migration_mapper.rb +0 -39
data/lib/codebase_index/notion/mappers/model_mapper.rb +0 -161
data/lib/codebase_index/notion/mappers/shared.rb +0 -22
data/lib/codebase_index/notion/rate_limiter.rb +0 -68
data/lib/codebase_index/observability/health_check.rb +0 -79
data/lib/codebase_index/observability/instrumentation.rb +0 -34
data/lib/codebase_index/observability/structured_logger.rb +0 -57
data/lib/codebase_index/operator/error_escalator.rb +0 -81
data/lib/codebase_index/operator/pipeline_guard.rb +0 -92
data/lib/codebase_index/operator/status_reporter.rb +0 -80
data/lib/codebase_index/railtie.rb +0 -38
data/lib/codebase_index/resilience/circuit_breaker.rb +0 -99
data/lib/codebase_index/resilience/index_validator.rb +0 -167
data/lib/codebase_index/resilience/retryable_provider.rb +0 -108
data/lib/codebase_index/retrieval/context_assembler.rb +0 -261
data/lib/codebase_index/retrieval/query_classifier.rb +0 -133
data/lib/codebase_index/retrieval/ranker.rb +0 -277
data/lib/codebase_index/retrieval/search_executor.rb +0 -316
data/lib/codebase_index/retriever.rb +0 -152
data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +0 -170
data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +0 -77
data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +0 -18
data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +0 -280
data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +0 -143
data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +0 -143
data/lib/codebase_index/ruby_analyzer.rb +0 -87
data/lib/codebase_index/session_tracer/file_store.rb +0 -104
data/lib/codebase_index/session_tracer/middleware.rb +0 -143
data/lib/codebase_index/session_tracer/redis_store.rb +0 -106
data/lib/codebase_index/session_tracer/session_flow_assembler.rb +0 -254
data/lib/codebase_index/session_tracer/session_flow_document.rb +0 -223
data/lib/codebase_index/session_tracer/solid_cache_store.rb +0 -139
data/lib/codebase_index/session_tracer/store.rb +0 -81
data/lib/codebase_index/storage/graph_store.rb +0 -120
data/lib/codebase_index/storage/metadata_store.rb +0 -196
data/lib/codebase_index/storage/pgvector.rb +0 -195
data/lib/codebase_index/storage/qdrant.rb +0 -205
data/lib/codebase_index/storage/vector_store.rb +0 -167
data/lib/codebase_index/temporal/json_snapshot_store.rb +0 -245
data/lib/codebase_index/temporal/snapshot_store.rb +0 -345
data/lib/codebase_index/token_utils.rb +0 -19
data/lib/codebase_index/version.rb +0 -5
data/lib/generators/codebase_index/install_generator.rb +0 -32
data/lib/generators/codebase_index/pgvector_generator.rb +0 -37
data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +0 -15
data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +0 -43
data/lib/tasks/codebase_index.rake +0 -597
data/lib/tasks/codebase_index_evaluation.rake +0 -115

data/lib/codebase_index/embedding/provider.rb DELETED Viewed

@@ -1,162 +0,0 @@
-# frozen_string_literal: true
-require 'net/http'
-require 'json'
-module CodebaseIndex
-  module Embedding
-    # Interface and adapters for embedding providers.
-    #
-    # All embedding providers implement the {Interface} module, which defines
-    # the contract for generating vector embeddings from text.
-    module Provider
-      # Interface that all embedding providers must implement.
-      #
-      # Defines the contract for embedding text into vector representations.
-      # Implementations must provide single-text embedding, batch embedding,
-      # dimension reporting, and model identification.
-      module Interface
-        # Embed a single text string into a vector.
-        #
-        # @param text [String] the text to embed
-        # @return [Array<Float>] the embedding vector
-        # @raise [NotImplementedError] if not implemented by the provider
-        def embed(text)
-          raise NotImplementedError
-        end
-        # Embed multiple texts into vectors in a single request.
-        #
-        # @param texts [Array<String>] the texts to embed
-        # @return [Array<Array<Float>>] array of embedding vectors
-        # @raise [NotImplementedError] if not implemented by the provider
-        def embed_batch(texts)
-          raise NotImplementedError
-        end
-        # Return the dimensionality of the embedding vectors.
-        #
-        # @return [Integer] number of dimensions
-        # @raise [NotImplementedError] if not implemented by the provider
-        def dimensions
-          raise NotImplementedError
-        end
-        # Return the name of the embedding model.
-        #
-        # @return [String] model name
-        # @raise [NotImplementedError] if not implemented by the provider
-        def model_name
-          raise NotImplementedError
-        end
-      end
-      # Ollama adapter for local embeddings via the Ollama HTTP API.
-      #
-      # Uses the `/api/embed` endpoint to generate embeddings. Requires a running
-      # Ollama instance (default: localhost:11434) with the specified model pulled.
-      #
-      # @example
-      #   provider = CodebaseIndex::Embedding::Provider::Ollama.new
-      #   vector = provider.embed("class User < ApplicationRecord; end")
-      #   vectors = provider.embed_batch(["text1", "text2"])
-      class Ollama
-        include Interface
-        DEFAULT_MODEL = 'nomic-embed-text'
-        DEFAULT_HOST = 'http://localhost:11434'
-        # @param model [String] Ollama model name (default: nomic-embed-text)
-        # @param host [String] Ollama server URL (default: http://localhost:11434)
-        def initialize(model: DEFAULT_MODEL, host: DEFAULT_HOST)
-          @model = model
-          @host = host
-          @uri = URI("#{host}/api/embed")
-        end
-        # Embed a single text string.
-        #
-        # @param text [String] the text to embed
-        # @return [Array<Float>] the embedding vector
-        # @raise [CodebaseIndex::Error] if the API returns an error
-        def embed(text)
-          response = post_request({ model: @model, input: text })
-          response['embeddings'].first
-        end
-        # Embed multiple texts in a single request.
-        #
-        # @param texts [Array<String>] the texts to embed
-        # @return [Array<Array<Float>>] array of embedding vectors
-        # @raise [CodebaseIndex::Error] if the API returns an error
-        def embed_batch(texts)
-          response = post_request({ model: @model, input: texts })
-          response['embeddings']
-        end
-        # Return the dimensionality of vectors produced by this model.
-        #
-        # Determined dynamically by embedding a test string on first call.
-        #
-        # @return [Integer] number of dimensions
-        def dimensions
-          @dimensions ||= embed('test').length
-        end
-        # Return the model name.
-        #
-        # @return [String] the Ollama model name
-        def model_name
-          @model
-        end
-        private
-        # Send a POST request to the Ollama API.
-        #
-        # @param body [Hash] request body
-        # @return [Hash] parsed JSON response
-        # @raise [CodebaseIndex::Error] if the API returns a non-success status
-        def post_request(body)
-          request = Net::HTTP::Post.new(@uri.path, 'Content-Type' => 'application/json')
-          request.body = body.to_json
-          response = http_client.request(request)
-          unless response.is_a?(Net::HTTPSuccess)
-            raise CodebaseIndex::Error, "Ollama API error: #{response.code} #{response.body}"
-          end
-          JSON.parse(response.body)
-        rescue Errno::ECONNRESET, Net::OpenTimeout, Net::ReadTimeout, IOError
-          # Connection dropped — reset and retry once
-          @http_client = nil
-          begin
-            response = http_client.request(request)
-          rescue StandardError => retry_error
-            raise CodebaseIndex::Error, "Ollama API error (retry failed): #{retry_error.message}"
-          end
-          unless response.is_a?(Net::HTTPSuccess)
-            raise CodebaseIndex::Error, "Ollama API error: #{response.code} #{response.body}"
-          end
-          JSON.parse(response.body)
-        end
-        # Return a reusable, started HTTP client for the Ollama API.
-        #
-        # @return [Net::HTTP]
-        def http_client
-          return @http_client if @http_client&.started?
-          http = Net::HTTP.new(@uri.host, @uri.port)
-          http.use_ssl = @uri.scheme == 'https'
-          http.open_timeout = 10
-          http.read_timeout = 30
-          http.keep_alive_timeout = 30
-          http.start
-          @http_client = http
-        end
-      end
-    end
-  end
-end

data/lib/codebase_index/embedding/text_preparer.rb DELETED Viewed

@@ -1,112 +0,0 @@
-# frozen_string_literal: true
-module CodebaseIndex
-  module Embedding
-    # Prepares ExtractedUnit data for embedding by building context-prefixed text.
-    #
-    # Follows the context prefix format from docs/CONTEXT_AND_CHUNKING.md:
-    #   [type] identifier
-    #   namespace: ...
-    #   file: ...
-    #   dependencies: dep1, dep2, ...
-    #
-    # Handles token limit enforcement by truncating text that exceeds the
-    # embedding model's context window.
-    #
-    # @example
-    #   preparer = CodebaseIndex::Embedding::TextPreparer.new(max_tokens: 8192)
-    #   text = preparer.prepare(unit)
-    #   chunks = preparer.prepare_chunks(unit)
-    class TextPreparer
-      DEFAULT_MAX_TOKENS = 8192
-      # @param max_tokens [Integer] maximum token budget for prepared text
-      def initialize(max_tokens: DEFAULT_MAX_TOKENS)
-        @max_tokens = max_tokens
-      end
-      # Prepare text for embedding from an ExtractedUnit.
-      #
-      # Builds a context prefix and appends the unit's source code (or first
-      # chunk content for chunked units). Enforces token limits via truncation.
-      #
-      # @param unit [CodebaseIndex::ExtractedUnit] the unit to prepare
-      # @return [String] context-prefixed text ready for embedding
-      def prepare(unit)
-        prefix = build_prefix(unit)
-        content = select_content(unit)
-        text = "#{prefix}\n#{content}"
-        enforce_token_limit(text)
-      end
-      # Prepare text for each chunk of an ExtractedUnit.
-      #
-      # If the unit has no chunks, returns a single-element array with the
-      # full prepared text. For chunked units, each chunk gets the same
-      # context prefix prepended.
-      #
-      # @param unit [CodebaseIndex::ExtractedUnit] the unit to prepare
-      # @return [Array<String>] array of context-prefixed texts
-      def prepare_chunks(unit)
-        return [prepare(unit)] unless unit.chunks&.any?
-        prefix = build_prefix(unit)
-        unit.chunks.map do |chunk|
-          text = "#{prefix}\n#{chunk[:content]}"
-          enforce_token_limit(text)
-        end
-      end
-      private
-      # Build the context prefix for a unit.
-      #
-      # @param unit [CodebaseIndex::ExtractedUnit] the unit
-      # @return [String] formatted prefix lines
-      def build_prefix(unit)
-        lines = []
-        lines << "[#{unit.type}] #{unit.identifier}"
-        lines << "namespace: #{unit.namespace}" if unit.namespace
-        lines << "file: #{unit.file_path}" if unit.file_path
-        append_dependency_line(lines, unit.dependencies)
-        lines.join("\n")
-      end
-      # Append a formatted dependency line if dependencies exist.
-      #
-      # @param lines [Array<String>] lines to append to
-      # @param dependencies [Array<Hash>, nil] dependency list
-      # @return [void]
-      def append_dependency_line(lines, dependencies)
-        return unless dependencies&.any?
-        dep_names = dependencies.map { |d| d[:target] }.compact.first(10)
-        lines << "dependencies: #{dep_names.join(', ')}" if dep_names.any?
-      end
-      # Select the content to embed for a unit.
-      #
-      # @param unit [CodebaseIndex::ExtractedUnit] the unit
-      # @return [String] source code or first chunk content
-      def select_content(unit)
-        if unit.chunks&.any?
-          unit.chunks.first[:content]
-        else
-          unit.source_code || ''
-        end
-      end
-      # Truncate text to fit within the token budget.
-      #
-      # @param text [String] the text to truncate
-      # @return [String] text within token limits
-      def enforce_token_limit(text)
-        estimated = (text.length / 4.0).ceil
-        return text if estimated <= @max_tokens
-        max_chars = (@max_tokens * 4.0).floor
-        text[0...max_chars]
-      end
-    end
-  end
-end

data/lib/codebase_index/evaluation/baseline_runner.rb DELETED Viewed

@@ -1,115 +0,0 @@
-# frozen_string_literal: true
-module CodebaseIndex
-  module Evaluation
-    # Runs simple baseline strategies for comparison against the full
-    # retrieval pipeline.
-    #
-    # Provides three baseline strategies:
-    # - `:grep` — substring match on unit identifiers
-    # - `:random` — random selection from available units
-    # - `:file_level` — returns identifiers matching file paths
-    #
-    # @example
-    #   runner = BaselineRunner.new(metadata_store: store)
-    #   results = runner.run("User model", strategy: :grep, limit: 10)
-    #   results  # => ["User", "UserProfile", "UserSerializer"]
-    #
-    class BaselineRunner
-      VALID_STRATEGIES = %i[grep random file_level].freeze
-      # @param metadata_store [Object] Store that responds to #all_identifiers and #find_by_type
-      def initialize(metadata_store:)
-        @metadata_store = metadata_store
-      end
-      # Run a baseline strategy for a query.
-      #
-      # @param query [String] Natural language query
-      # @param strategy [Symbol] Baseline strategy (:grep, :random, :file_level)
-      # @param limit [Integer] Maximum number of results
-      # @return [Array<String>] Unit identifiers
-      # @raise [ArgumentError] if the strategy is invalid
-      def run(query, strategy:, limit: 10)
-        unless VALID_STRATEGIES.include?(strategy)
-          raise ArgumentError, "Invalid strategy: #{strategy}. Must be one of #{VALID_STRATEGIES.join(', ')}"
-        end
-        send(:"run_#{strategy}", query, limit)
-      end
-      private
-      # Grep strategy: substring match on unit identifiers.
-      #
-      # Extracts words from the query and matches identifiers that contain
-      # any query word (case-insensitive).
-      #
-      # @param query [String] Query string
-      # @param limit [Integer] Max results
-      # @return [Array<String>]
-      def run_grep(query, limit)
-        all_ids = @metadata_store.all_identifiers
-        keywords = extract_keywords(query)
-        return all_ids.first(limit) if keywords.empty?
-        matches = all_ids.select do |id|
-          id_lower = id.downcase
-          keywords.any? { |kw| id_lower.include?(kw) }
-        end
-        matches.first(limit)
-      end
-      # Random strategy: random selection from all available units.
-      #
-      # @param _query [String] Query string (unused)
-      # @param limit [Integer] Max results
-      # @return [Array<String>]
-      def run_random(_query, limit)
-        @metadata_store.all_identifiers.sample(limit)
-      end
-      # File-level strategy: matches identifiers that look like file paths
-      # or class names extracted from the query.
-      #
-      # @param query [String] Query string
-      # @param limit [Integer] Max results
-      # @return [Array<String>]
-      def run_file_level(query, limit)
-        all_ids = @metadata_store.all_identifiers
-        keywords = extract_keywords(query)
-        return all_ids.first(limit) if keywords.empty?
-        # Score each identifier by how many keywords it matches
-        scored = all_ids.map do |id|
-          id_lower = id.downcase
-          score = keywords.count { |kw| id_lower.include?(kw) }
-          [id, score]
-        end
-        scored.select { |_, score| score.positive? }
-              .sort_by { |_, score| -score }
-              .first(limit)
-              .map(&:first)
-      end
-      # Extract lowercase keywords from a query string.
-      #
-      # Filters out common stop words and short words.
-      #
-      # @param query [String] Query text
-      # @return [Array<String>] Keywords
-      def extract_keywords(query)
-        stop_words = %w[the a an is are was were how does do what which where when why
-                        this that these those in on at to for of and or but with from by]
-        query.downcase
-             .scan(/[a-z0-9_]+/)
-             .reject { |w| stop_words.include?(w) || w.length < 2 }
-      end
-    end
-  end
-end

data/lib/codebase_index/evaluation/evaluator.rb DELETED Viewed

@@ -1,139 +0,0 @@
-# frozen_string_literal: true
-require_relative 'metrics'
-module CodebaseIndex
-  module Evaluation
-    # Runs evaluation queries through a Retriever and scores results
-    # against ground truth annotations.
-    #
-    # Takes a configured retriever and a query set, runs each query,
-    # and produces per-query and aggregate metrics.
-    #
-    # @example
-    #   evaluator = Evaluator.new(retriever: retriever, query_set: query_set)
-    #   report = evaluator.evaluate
-    #   report.aggregates[:mean_mrr]  # => 0.75
-    #
-    class Evaluator
-      # Result for a single evaluation query.
-      QueryResult = Struct.new(:query, :expected_units, :retrieved_units, :scores, :tokens_used,
-                               keyword_init: true)
-      # Aggregate report across all queries.
-      EvaluationReport = Struct.new(:results, :aggregates, keyword_init: true)
-      METRIC_KEYS = %i[precision_at5 precision_at10 recall mrr context_completeness token_efficiency].freeze
-      # @param retriever [CodebaseIndex::Retriever] Configured retriever instance
-      # @param query_set [QuerySet] Set of evaluation queries with ground truth
-      # @param budget [Integer] Token budget per query
-      def initialize(retriever:, query_set:, budget: 8000)
-        @retriever = retriever
-        @query_set = query_set
-        @budget = budget
-      end
-      # Run all queries and produce an evaluation report.
-      #
-      # @return [EvaluationReport] Per-query results and aggregate metrics
-      def evaluate
-        results = @query_set.queries.map { |q| evaluate_query(q) }
-        aggregates = compute_aggregates(results)
-        EvaluationReport.new(results: results, aggregates: aggregates)
-      end
-      private
-      # Evaluate a single query against the retriever.
-      #
-      # @param query [QuerySet::Query] Evaluation query
-      # @return [QueryResult]
-      def evaluate_query(query)
-        retrieval_result = @retriever.retrieve(query.query, budget: @budget)
-        retrieved_ids = extract_identifiers(retrieval_result)
-        scores = compute_scores(retrieved_ids, query.expected_units, retrieval_result)
-        QueryResult.new(
-          query: query.query,
-          expected_units: query.expected_units,
-          retrieved_units: retrieved_ids,
-          scores: scores,
-          tokens_used: retrieval_result.tokens_used
-        )
-      end
-      # Extract unit identifiers from retrieval result sources.
-      #
-      # @param result [Retriever::RetrievalResult] Retrieval result
-      # @return [Array<String>] Ordered list of unit identifiers
-      def extract_identifiers(result)
-        return [] unless result.sources
-        result.sources.map { |s| s.is_a?(Hash) ? s[:identifier] || s['identifier'] : s.to_s }
-      end
-      # Compute all metrics for a query result.
-      #
-      # @param retrieved [Array<String>] Retrieved identifiers
-      # @param expected [Array<String>] Expected identifiers
-      # @param result [Retriever::RetrievalResult] Retrieval result
-      # @return [Hash] Metric scores
-      def compute_scores(retrieved, expected, result)
-        {
-          precision_at5: Metrics.precision_at_k(retrieved, expected, cutoff: 5),
-          precision_at10: Metrics.precision_at_k(retrieved, expected, cutoff: 10),
-          recall: Metrics.recall(retrieved, expected),
-          mrr: Metrics.mrr(retrieved, expected),
-          context_completeness: Metrics.context_completeness(retrieved, expected),
-          token_efficiency: compute_token_efficiency(retrieved, expected, result)
-        }
-      end
-      # Compute token efficiency from the retrieval result.
-      #
-      # @param retrieved [Array<String>] Retrieved identifiers
-      # @param expected [Array<String>] Expected identifiers
-      # @param result [Retriever::RetrievalResult] Retrieval result
-      # @return [Float]
-      def compute_token_efficiency(retrieved, expected, result)
-        return 0.0 if result.tokens_used.nil? || result.tokens_used.zero?
-        expected_set = expected.to_set
-        relevant_count = retrieved.count { |id| expected_set.include?(id) }
-        total_count = [retrieved.size, 1].max
-        relevant_ratio = relevant_count.to_f / total_count
-        Metrics.token_efficiency((result.tokens_used * relevant_ratio).ceil, result.tokens_used)
-      end
-      # Compute aggregate metrics across all query results.
-      #
-      # @param results [Array<QueryResult>] Individual query results
-      # @return [Hash] Aggregate metrics
-      def compute_aggregates(results)
-        return empty_aggregates if results.empty?
-        aggregates = {}
-        METRIC_KEYS.each do |key|
-          values = results.map { |r| r.scores[key] }
-          aggregates[:"mean_#{key}"] = values.sum / values.size.to_f
-        end
-        aggregates[:total_queries] = results.size
-        aggregates[:mean_tokens_used] = results.sum(&:tokens_used) / results.size.to_f
-        aggregates
-      end
-      # Return zero-valued aggregates for empty result sets.
-      #
-      # @return [Hash]
-      def empty_aggregates
-        METRIC_KEYS.to_h { |key| [:"mean_#{key}", 0.0] }
-                   .merge(total_queries: 0, mean_tokens_used: 0.0)
-      end
-    end
-  end
-end

data/lib/codebase_index/evaluation/metrics.rb DELETED Viewed

@@ -1,79 +0,0 @@
-# frozen_string_literal: true
-module CodebaseIndex
-  module Evaluation
-    # Retrieval quality metrics.
-    #
-    # All methods are stateless pure functions that take arrays of identifiers
-    # and return numeric scores.
-    #
-    module Metrics
-      module_function
-      # Fraction of top-k results that are relevant.
-      #
-      # @param retrieved [Array<String>] Retrieved unit identifiers (ordered)
-      # @param relevant [Array<String>] Ground-truth relevant identifiers
-      # @param cutoff [Integer] Number of top results to consider
-      # @return [Float] 0.0 to 1.0
-      def precision_at_k(retrieved, relevant, cutoff: 5)
-        return 0.0 if retrieved.empty? || relevant.empty?
-        top_k = retrieved.first(cutoff)
-        relevant_set = relevant.to_set
-        hits = top_k.count { |id| relevant_set.include?(id) }
-        hits.to_f / cutoff
-      end
-      # Fraction of relevant items that were retrieved.
-      #
-      # @param retrieved [Array<String>] Retrieved identifiers
-      # @param relevant [Array<String>] Ground-truth relevant identifiers
-      # @return [Float] 0.0 to 1.0
-      def recall(retrieved, relevant)
-        return 0.0 if relevant.empty?
-        retrieved_set = retrieved.to_set
-        found = relevant.count { |id| retrieved_set.include?(id) }
-        found.to_f / relevant.size
-      end
-      # Mean Reciprocal Rank — inverse of the rank of the first relevant result.
-      #
-      # @param retrieved [Array<String>] Retrieved identifiers (ordered)
-      # @param relevant [Array<String>] Ground-truth relevant identifiers
-      # @return [Float] 0.0 to 1.0
-      def mrr(retrieved, relevant)
-        relevant_set = relevant.to_set
-        retrieved.each_with_index do |id, idx|
-          return 1.0 / (idx + 1) if relevant_set.include?(id)
-        end
-        0.0
-      end
-      # Fraction of required units present in retrieved results.
-      #
-      # @param retrieved [Array<String>] Retrieved identifiers
-      # @param required [Array<String>] Required identifiers (subset of relevant)
-      # @return [Float] 0.0 to 1.0
-      def context_completeness(retrieved, required)
-        return 1.0 if required.empty?
-        retrieved_set = retrieved.to_set
-        found = required.count { |id| retrieved_set.include?(id) }
-        found.to_f / required.size
-      end
-      # Ratio of relevant tokens to total tokens in context.
-      #
-      # @param relevant_tokens [Integer] Tokens from relevant units
-      # @param total_tokens [Integer] Total tokens in assembled context
-      # @return [Float] 0.0 to 1.0
-      def token_efficiency(relevant_tokens, total_tokens)
-        return 0.0 if total_tokens.zero?
-        [relevant_tokens.to_f / total_tokens, 1.0].min
-      end
-    end
-  end
-end