codebase_index 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +29 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +481 -0
- data/exe/codebase-console-mcp +22 -0
- data/exe/codebase-index-mcp +61 -0
- data/exe/codebase-index-mcp-http +64 -0
- data/exe/codebase-index-mcp-start +58 -0
- data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
- data/lib/codebase_index/ast/method_extractor.rb +76 -0
- data/lib/codebase_index/ast/node.rb +88 -0
- data/lib/codebase_index/ast/parser.rb +653 -0
- data/lib/codebase_index/ast.rb +6 -0
- data/lib/codebase_index/builder.rb +137 -0
- data/lib/codebase_index/chunking/chunk.rb +84 -0
- data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
- data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
- data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
- data/lib/codebase_index/console/audit_logger.rb +75 -0
- data/lib/codebase_index/console/bridge.rb +170 -0
- data/lib/codebase_index/console/confirmation.rb +90 -0
- data/lib/codebase_index/console/connection_manager.rb +173 -0
- data/lib/codebase_index/console/console_response_renderer.rb +78 -0
- data/lib/codebase_index/console/model_validator.rb +81 -0
- data/lib/codebase_index/console/safe_context.rb +82 -0
- data/lib/codebase_index/console/server.rb +557 -0
- data/lib/codebase_index/console/sql_validator.rb +172 -0
- data/lib/codebase_index/console/tools/tier1.rb +118 -0
- data/lib/codebase_index/console/tools/tier2.rb +117 -0
- data/lib/codebase_index/console/tools/tier3.rb +110 -0
- data/lib/codebase_index/console/tools/tier4.rb +79 -0
- data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
- data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
- data/lib/codebase_index/cost_model/estimator.rb +128 -0
- data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
- data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
- data/lib/codebase_index/cost_model.rb +22 -0
- data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
- data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
- data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/codebase_index/db/migrator.rb +71 -0
- data/lib/codebase_index/db/schema_version.rb +73 -0
- data/lib/codebase_index/dependency_graph.rb +227 -0
- data/lib/codebase_index/embedding/indexer.rb +130 -0
- data/lib/codebase_index/embedding/openai.rb +105 -0
- data/lib/codebase_index/embedding/provider.rb +135 -0
- data/lib/codebase_index/embedding/text_preparer.rb +112 -0
- data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
- data/lib/codebase_index/evaluation/evaluator.rb +146 -0
- data/lib/codebase_index/evaluation/metrics.rb +79 -0
- data/lib/codebase_index/evaluation/query_set.rb +148 -0
- data/lib/codebase_index/evaluation/report_generator.rb +90 -0
- data/lib/codebase_index/extracted_unit.rb +145 -0
- data/lib/codebase_index/extractor.rb +956 -0
- data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
- data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
- data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
- data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
- data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
- data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
- data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
- data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
- data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
- data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
- data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
- data/lib/codebase_index/extractors/event_extractor.rb +211 -0
- data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
- data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
- data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
- data/lib/codebase_index/extractors/job_extractor.rb +369 -0
- data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
- data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
- data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
- data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
- data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
- data/lib/codebase_index/extractors/model_extractor.rb +960 -0
- data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
- data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
- data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
- data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
- data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
- data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
- data/lib/codebase_index/extractors/route_extractor.rb +181 -0
- data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
- data/lib/codebase_index/extractors/service_extractor.rb +254 -0
- data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
- data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
- data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
- data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
- data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
- data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
- data/lib/codebase_index/feedback/gap_detector.rb +89 -0
- data/lib/codebase_index/feedback/store.rb +119 -0
- data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
- data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/codebase_index/flow_assembler.rb +290 -0
- data/lib/codebase_index/flow_document.rb +191 -0
- data/lib/codebase_index/flow_precomputer.rb +102 -0
- data/lib/codebase_index/formatting/base.rb +40 -0
- data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
- data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
- data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
- data/lib/codebase_index/formatting/human_adapter.rb +78 -0
- data/lib/codebase_index/graph_analyzer.rb +374 -0
- data/lib/codebase_index/mcp/index_reader.rb +394 -0
- data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
- data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
- data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/codebase_index/mcp/server.rb +935 -0
- data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
- data/lib/codebase_index/model_name_cache.rb +51 -0
- data/lib/codebase_index/notion/client.rb +217 -0
- data/lib/codebase_index/notion/exporter.rb +219 -0
- data/lib/codebase_index/notion/mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
- data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
- data/lib/codebase_index/notion/rate_limiter.rb +68 -0
- data/lib/codebase_index/observability/health_check.rb +81 -0
- data/lib/codebase_index/observability/instrumentation.rb +34 -0
- data/lib/codebase_index/observability/structured_logger.rb +75 -0
- data/lib/codebase_index/operator/error_escalator.rb +81 -0
- data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
- data/lib/codebase_index/operator/status_reporter.rb +80 -0
- data/lib/codebase_index/railtie.rb +26 -0
- data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
- data/lib/codebase_index/resilience/index_validator.rb +185 -0
- data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
- data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
- data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
- data/lib/codebase_index/retrieval/ranker.rb +273 -0
- data/lib/codebase_index/retrieval/search_executor.rb +327 -0
- data/lib/codebase_index/retriever.rb +160 -0
- data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
- data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
- data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
- data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
- data/lib/codebase_index/ruby_analyzer.rb +87 -0
- data/lib/codebase_index/session_tracer/file_store.rb +111 -0
- data/lib/codebase_index/session_tracer/middleware.rb +143 -0
- data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
- data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
- data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
- data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
- data/lib/codebase_index/session_tracer/store.rb +67 -0
- data/lib/codebase_index/storage/graph_store.rb +120 -0
- data/lib/codebase_index/storage/metadata_store.rb +169 -0
- data/lib/codebase_index/storage/pgvector.rb +163 -0
- data/lib/codebase_index/storage/qdrant.rb +172 -0
- data/lib/codebase_index/storage/vector_store.rb +156 -0
- data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
- data/lib/codebase_index/version.rb +5 -0
- data/lib/codebase_index.rb +223 -0
- data/lib/generators/codebase_index/install_generator.rb +32 -0
- data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
- data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
- data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
- data/lib/tasks/codebase_index.rake +583 -0
- data/lib/tasks/codebase_index_evaluation.rake +115 -0
- metadata +252 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
# Railtie integrates CodebaseIndex into Rails applications.
|
|
5
|
+
# Loads rake tasks automatically when the gem is bundled.
|
|
6
|
+
# Conditionally inserts session tracer middleware when enabled.
|
|
7
|
+
class Railtie < Rails::Railtie
  # Make the gem's rake tasks available to the host application.
  rake_tasks do
    load File.expand_path('../tasks/codebase_index.rake', __dir__)
  end

  # Append the session tracer middleware to the application's stack,
  # but only when the configuration switches it on.
  initializer 'codebase_index.session_tracer' do |app|
    settings = CodebaseIndex.configuration
    next unless settings.session_tracer_enabled

    # Loaded lazily so the middleware file is only read when tracing is enabled.
    require 'codebase_index/session_tracer/middleware'

    tracer_options = {
      store: settings.session_store,
      session_id_proc: settings.session_id_proc,
      exclude_paths: settings.session_exclude_paths
    }
    app.middleware.use(CodebaseIndex::SessionTracer::Middleware, **tracer_options)
  end
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Resilience
|
|
5
|
+
# Raised when the circuit breaker is open and calls are being rejected.
|
|
6
|
+
#
|
|
7
|
+
# @example Handling a circuit open condition
|
|
8
|
+
# begin
|
|
9
|
+
# breaker.call { provider.embed(text) }
|
|
10
|
+
# rescue CircuitOpenError => e
|
|
11
|
+
# use_cached_result(text)
|
|
12
|
+
# end
|
|
13
|
+
class CircuitOpenError < CodebaseIndex::Error
  # Intentionally empty: the class exists only so callers can rescue it by name.
end
|
|
14
|
+
|
|
15
|
+
# Circuit breaker pattern for protecting external service calls.
|
|
16
|
+
#
|
|
17
|
+
# Tracks failures and transitions between three states:
|
|
18
|
+
# - **:closed** — normal operation, calls pass through
|
|
19
|
+
# - **:open** — too many failures, calls are rejected immediately
|
|
20
|
+
# - **:half_open** — testing recovery, one call is allowed through
|
|
21
|
+
#
|
|
22
|
+
# @example Basic usage
|
|
23
|
+
# breaker = CircuitBreaker.new(threshold: 5, reset_timeout: 60)
|
|
24
|
+
# result = breaker.call { external_service.request }
|
|
25
|
+
#
|
|
26
|
+
# @example With retry logic
|
|
27
|
+
# breaker = CircuitBreaker.new(threshold: 3, reset_timeout: 30)
|
|
28
|
+
# begin
|
|
29
|
+
# breaker.call { api.embed(text) }
|
|
30
|
+
# rescue CircuitOpenError
|
|
31
|
+
# # Service is down, use fallback
|
|
32
|
+
# end
|
|
33
|
+
class CircuitBreaker
  # @return [Symbol] Current state — :closed, :open, or :half_open
  attr_reader :state

  # @param threshold [Integer] Number of consecutive failures before opening the circuit
  # @param reset_timeout [Numeric] Seconds to wait before transitioning from open to half_open
  def initialize(threshold: 5, reset_timeout: 60)
    @threshold = threshold
    @reset_timeout = reset_timeout
    @state = :closed
    @failure_count = 0
    # Monotonic clock reading (seconds) taken at the most recent failure.
    @last_failure_time = nil
    # True while the single half-open recovery probe is executing.
    @probe_in_flight = false
    @mutex = Mutex.new
  end

  # Execute a block through the circuit breaker.
  #
  # The block itself runs outside the internal mutex, so slow calls do not
  # serialize each other; only the state bookkeeping is synchronized.
  #
  # @yield The block to execute
  # @return [Object] The return value of the block
  # @raise [ArgumentError] if no block is given
  # @raise [CircuitOpenError] if the circuit is open and the timeout has not elapsed,
  #   or if the circuit is half-open and the recovery probe is still running
  # @raise [StandardError] re-raises any error from the block
  def call(&block)
    # A missing block is a caller bug, not a service failure — reject it
    # before it can be counted against the protected service.
    raise ArgumentError, 'CircuitBreaker#call requires a block' unless block

    probe = @mutex.synchronize { admit_call }
    outcome = :aborted
    begin
      result = block.call
      outcome = :success
      result
    rescue CircuitOpenError
      # Raised by the block itself (e.g. a nested breaker); never counted as a failure.
      raise
    rescue StandardError
      outcome = :failure
      raise
    ensure
      # One synchronized step records the outcome and releases the probe slot,
      # including when the block exits via a non-StandardError or a non-local jump.
      @mutex.synchronize { settle(outcome, probe) }
    end
  end

  private

  # Decide whether a call may proceed. Must be called with the mutex held.
  #
  # @return [Boolean] true when the admitted call is the half-open recovery probe
  # @raise [CircuitOpenError] when the call must be rejected
  def admit_call
    case @state
    when :open
      if monotonic_now - @last_failure_time < @reset_timeout
        raise CircuitOpenError, "Circuit breaker is open (#{@failure_count} failures)"
      end

      @state = :half_open
      @probe_in_flight = true
    when :half_open
      # Only one probe may test recovery at a time; everyone else is rejected.
      if @probe_in_flight
        raise CircuitOpenError, 'Circuit breaker is half-open (recovery probe in progress)'
      end

      @probe_in_flight = true
    else
      false
    end
  end

  # Record the outcome of a finished call. Must be called with the mutex held.
  #
  # @param outcome [Symbol] :success, :failure, or :aborted (nothing recorded)
  # @param probe [Boolean] whether the finished call was the recovery probe
  def settle(outcome, probe)
    @probe_in_flight = false if probe

    case outcome
    when :success then reset!
    when :failure then record_failure
    end
  end

  # Record a failure and potentially open the circuit.
  def record_failure
    @failure_count += 1
    @last_failure_time = monotonic_now
    @state = :open if @failure_count >= @threshold
  end

  # Reset the circuit breaker to closed state with zero failures.
  def reset!
    @state = :closed
    @failure_count = 0
    @last_failure_time = nil
  end

  # Seconds from the monotonic clock, which is unaffected by wall-clock
  # adjustments and therefore safe for measuring the reset timeout.
  #
  # @return [Float]
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'digest'
require 'json'
require 'set'
|
|
5
|
+
|
|
6
|
+
module CodebaseIndex
|
|
7
|
+
module Resilience
|
|
8
|
+
# Validates the integrity of a codebase index output directory.
|
|
9
|
+
#
|
|
10
|
+
# Checks that:
|
|
11
|
+
# - Each type directory has a valid `_index.json`
|
|
12
|
+
# - All files referenced in the index exist on disk
|
|
13
|
+
# - Content hashes (source_hash) match the actual source_code
|
|
14
|
+
# - No stale unit files exist that aren't listed in the index
|
|
15
|
+
#
|
|
16
|
+
# @example
|
|
17
|
+
# validator = IndexValidator.new(index_dir: "tmp/codebase_index")
|
|
18
|
+
# report = validator.validate
|
|
19
|
+
# puts report.errors if !report.valid?
|
|
20
|
+
class IndexValidator
  # Report produced by {#validate}.
  #
  # @!attribute [r] valid?
  #   @return [Boolean] true if no errors were found
  # @!attribute [r] warnings
  #   @return [Array<String>] non-fatal issues (e.g., stale files)
  # @!attribute [r] errors
  #   @return [Array<String>] fatal integrity issues
  ValidationReport = Struct.new(:valid?, :warnings, :errors, keyword_init: true)

  # Maximum number of characters of a parser/OS error message copied into a report line.
  ERROR_EXCERPT_LENGTH = 120

  # @param index_dir [String] Path to the codebase index output directory
  def initialize(index_dir:)
    @index_dir = index_dir
  end

  # Validate the index directory and return a report.
  #
  # Problems found on disk (missing, unreadable or malformed files) are
  # collected into the report instead of being raised.
  #
  # @return [ValidationReport] the validation results
  def validate
    warnings = []
    errors = []

    unless Dir.exist?(@index_dir)
      errors << "Index directory does not exist: #{@index_dir}"
      return ValidationReport.new(valid?: false, warnings: warnings, errors: errors)
    end

    # Sorted so the report order does not depend on filesystem enumeration order.
    type_dirs = Dir.children(@index_dir).sort.filter_map do |name|
      full_path = File.join(@index_dir, name)
      full_path if File.directory?(full_path)
    end

    type_dirs.each { |type_dir| validate_type_directory(type_dir, warnings, errors) }

    ValidationReport.new(valid?: errors.empty?, warnings: warnings, errors: errors)
  end

  private

  # Validate a single type directory (e.g., models/, controllers/).
  #
  # @param type_dir [String] Absolute path to the type directory
  # @param warnings [Array<String>] Accumulated warnings
  # @param errors [Array<String>] Accumulated errors
  def validate_type_directory(type_dir, warnings, errors)
    type_name = File.basename(type_dir)
    index_path = File.join(type_dir, '_index.json')

    unless File.exist?(index_path)
      errors << "Missing _index.json in #{type_name}/"
      return
    end

    begin
      index_entries = JSON.parse(File.read(index_path))
    rescue JSON::ParserError, SystemCallError => e
      # A corrupt index is exactly what this validator exists to report.
      errors << "Invalid _index.json in #{type_name}/: #{error_excerpt(e)}"
      return
    end

    unless index_entries.is_a?(Array)
      errors << "Invalid _index.json in #{type_name}/: expected a JSON array of entries"
      return
    end

    indexed_identifiers = Set.new

    index_entries.each do |entry|
      identifier = entry['identifier'] if entry.is_a?(Hash)

      unless identifier.is_a?(String)
        errors << "Index entry without identifier in #{type_name}/"
        next
      end

      indexed_identifiers << identifier
      validate_index_entry(type_dir, type_name, identifier, errors)
    end

    check_stale_files(type_dir, type_name, indexed_identifiers, warnings)
  end

  # Validate that a single index entry has a corresponding unit file with correct hash.
  #
  # @param type_dir [String] Path to the type directory
  # @param type_name [String] Name of the type (for error messages)
  # @param identifier [String] The unit identifier from the index
  # @param errors [Array<String>] Accumulated errors
  def validate_index_entry(type_dir, type_name, identifier, errors)
    unit_file = find_unit_file(type_dir, identifier)

    unless unit_file
      errors << "Missing unit file for #{identifier} in #{type_name}/"
      return
    end

    validate_content_hash(unit_file, identifier, errors)
  end

  # Find the JSON file for a given identifier in a type directory.
  #
  # @param type_dir [String] Path to the type directory
  # @param identifier [String] The unit identifier
  # @return [String, nil] Path to the unit file, or nil if not found
  def find_unit_file(type_dir, identifier)
    # Try collision-safe first (current format), then legacy safe_filename, then exact match
    candidates = [
      File.join(type_dir, collision_safe_filename(identifier)),
      File.join(type_dir, safe_filename(identifier)),
      File.join(type_dir, "#{identifier}.json")
    ]

    candidates.find { |path| File.exist?(path) }
  end

  # Validate that the source_hash in a unit file matches the actual source_code.
  #
  # Units that lack either field are skipped; unreadable or malformed unit
  # files are reported as errors.
  #
  # @param unit_file [String] Path to the unit JSON file
  # @param identifier [String] The unit identifier (for error messages)
  # @param errors [Array<String>] Accumulated errors
  def validate_content_hash(unit_file, identifier, errors)
    begin
      data = JSON.parse(File.read(unit_file))
    rescue JSON::ParserError, SystemCallError => e
      errors << "Invalid unit file for #{identifier}: #{error_excerpt(e)}"
      return
    end

    unless data.is_a?(Hash)
      errors << "Invalid unit file for #{identifier}: expected a JSON object"
      return
    end

    source_code = data['source_code']
    stored_hash = data['source_hash']

    return unless source_code && stored_hash

    # to_s keeps a non-string field from raising; it simply fails the comparison below.
    expected_hash = Digest::SHA256.hexdigest(source_code.to_s)
    return if stored_hash == expected_hash

    errors << "Content hash mismatch for #{identifier}: expected #{expected_hash[0..7]}..., " \
              "got #{stored_hash.to_s[0..7]}..."
  end

  # Check for unit files that exist on disk but aren't referenced in the index.
  #
  # @param type_dir [String] Path to the type directory
  # @param type_name [String] Name of the type (for warning messages)
  # @param indexed_identifiers [Set<String>] Identifiers listed in the index
  # @param warnings [Array<String>] Accumulated warnings
  def check_stale_files(type_dir, type_name, indexed_identifiers, warnings)
    # Build a set of expected filenames from indexed identifiers (both current and legacy formats)
    expected_filenames = Set.new
    indexed_identifiers.each do |id|
      expected_filenames << collision_safe_filename(id)
      expected_filenames << safe_filename(id)
      expected_filenames << "#{id}.json"
    end

    # `base:` keeps the directory path out of the glob pattern, so paths that
    # contain glob metacharacters ([, {, *, ?) are still listed correctly.
    Dir.glob('*.json', base: type_dir).sort.each do |basename|
      next if basename == '_index.json'
      next if expected_filenames.include?(basename)

      warnings << "Stale file not in index: #{type_name}/#{basename}"
    end
  end

  # Convert an identifier to a safe filename (legacy format, mirrors Extractor#safe_filename).
  #
  # @param identifier [String] The unit identifier (e.g., "Admin::UsersController")
  # @return [String] A filesystem-safe filename (e.g., "Admin__UsersController.json")
  def safe_filename(identifier)
    "#{sanitized_basename(identifier)}.json"
  end

  # Convert an identifier to a collision-safe filename (current format).
  # Mirrors {Extractor#collision_safe_filename} — appends a short SHA256 digest
  # to disambiguate identifiers that normalize to the same safe_filename.
  #
  # @param identifier [String] The unit identifier
  # @return [String] Collision-safe filename (e.g., "Admin__UsersController_a1b2c3d4.json")
  def collision_safe_filename(identifier)
    digest = Digest::SHA256.hexdigest(identifier)[0, 8]
    "#{sanitized_basename(identifier)}_#{digest}.json"
  end

  # Replace namespace separators and any character outside [a-zA-Z0-9_-]
  # so the identifier can be used as a file basename.
  #
  # @param identifier [String] The unit identifier
  # @return [String] Sanitized basename without extension
  def sanitized_basename(identifier)
    identifier.gsub('::', '__').gsub(/[^a-zA-Z0-9_-]/, '_')
  end

  # Shorten an exception message so one bad file cannot flood the report.
  #
  # @param error [Exception]
  # @return [String]
  def error_excerpt(error)
    error.message[0, ERROR_EXCERPT_LENGTH]
  end
end
|
|
184
|
+
end
|
|
185
|
+
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../embedding/provider'
|
|
4
|
+
require_relative 'circuit_breaker'
|
|
5
|
+
|
|
6
|
+
module CodebaseIndex
|
|
7
|
+
module Resilience
|
|
8
|
+
# Wraps an embedding provider with retry logic and optional circuit breaker.
|
|
9
|
+
#
|
|
10
|
+
# Transparently retries transient failures with exponential backoff.
|
|
11
|
+
# When a circuit breaker is provided, all calls are routed through it,
|
|
12
|
+
# and {CircuitOpenError} is never retried.
|
|
13
|
+
#
|
|
14
|
+
# @example Without circuit breaker
|
|
15
|
+
# retryable = RetryableProvider.new(provider: ollama_provider, max_retries: 3)
|
|
16
|
+
# vector = retryable.embed("some text")
|
|
17
|
+
#
|
|
18
|
+
# @example With circuit breaker
|
|
19
|
+
# breaker = CircuitBreaker.new(threshold: 5, reset_timeout: 60)
|
|
20
|
+
# retryable = RetryableProvider.new(
|
|
21
|
+
# provider: ollama_provider,
|
|
22
|
+
# max_retries: 3,
|
|
23
|
+
# circuit_breaker: breaker
|
|
24
|
+
# )
|
|
25
|
+
# vector = retryable.embed("some text")
|
|
26
|
+
class RetryableProvider
  include CodebaseIndex::Embedding::Provider::Interface

  # @param provider [#embed, #embed_batch, #dimensions, #model_name] Provider being wrapped
  # @param max_retries [Integer] How many times a failed call is retried
  # @param circuit_breaker [CircuitBreaker, nil] Breaker to route calls through, if any
  def initialize(provider:, max_retries: 3, circuit_breaker: nil)
    @provider = provider
    @max_retries = max_retries
    @circuit_breaker = circuit_breaker
  end

  # Embed one text, retrying on failure.
  #
  # @param text [String] the text to embed
  # @return [Array<Float>] the embedding vector
  # @raise [CircuitOpenError] if the circuit breaker is open
  # @raise [StandardError] if all retries are exhausted
  def embed(text)
    with_retries do
      call_provider { @provider.embed(text) }
    end
  end

  # Embed several texts in one provider call, retrying on failure.
  #
  # @param texts [Array<String>] the texts to embed
  # @return [Array<Array<Float>>] array of embedding vectors
  # @raise [CircuitOpenError] if the circuit breaker is open
  # @raise [StandardError] if all retries are exhausted
  def embed_batch(texts)
    with_retries do
      call_provider { @provider.embed_batch(texts) }
    end
  end

  # Dimensionality reported by the wrapped provider.
  #
  # @return [Integer] number of dimensions
  def dimensions
    @provider.dimensions
  end

  # Model name reported by the wrapped provider.
  #
  # @return [String] model name
  def model_name
    @provider.model_name
  end

  private

  # Run the block, retrying after each StandardError with exponential backoff
  # (0.2s, 0.4s, 0.8s, ...) until @max_retries retries have been used.
  #
  # @yield The block to execute
  # @return [Object] The return value of the block
  # @raise [CircuitOpenError] immediately without retrying
  # @raise [StandardError] the last error if all retries are exhausted
  def with_retries
    failures = 0
    begin
      yield
    rescue CircuitOpenError
      # An open circuit will not recover by retrying right away.
      raise
    rescue StandardError => e
      failures += 1
      raise e if failures > @max_retries

      sleep((2**failures) * 0.1)
      retry
    end
  end

  # Invoke the block directly, or through the circuit breaker when one is configured.
  #
  # @yield The block to execute
  # @return [Object] The return value of the block
  def call_provider(&block)
    return block.call unless @circuit_breaker

    @circuit_breaker.call(&block)
  end
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Retrieval
|
|
5
|
+
# Transforms ranked search candidates into a token-budgeted context string
|
|
6
|
+
# for LLM consumption.
|
|
7
|
+
#
|
|
8
|
+
# Allocates a fixed token budget across four sections:
|
|
9
|
+
# - Structural (10%): Always-included codebase overview
|
|
10
|
+
# - Primary (50%): Direct query results
|
|
11
|
+
# - Supporting (25%): Dependencies and related context
|
|
12
|
+
# - Framework (15%): Rails/gem source when query has framework context
|
|
13
|
+
#
|
|
14
|
+
# When framework context is not needed, primary and supporting sections
|
|
15
|
+
# receive the framework allocation proportionally.
|
|
16
|
+
#
|
|
17
|
+
# @example
|
|
18
|
+
# assembler = ContextAssembler.new(metadata_store: store)
|
|
19
|
+
# result = assembler.assemble(candidates: ranked, classification: cls)
|
|
20
|
+
# result.context # => "## User (model)\n..."
|
|
21
|
+
# result.tokens_used # => 4200
|
|
22
|
+
# result.sections # => [:structural, :primary, :supporting]
|
|
23
|
+
#
|
|
24
|
+
class ContextAssembler
|
|
25
|
+
DEFAULT_BUDGET = 8000 # tokens
|
|
26
|
+
|
|
27
|
+
BUDGET_ALLOCATION = {
|
|
28
|
+
structural: 0.10,
|
|
29
|
+
primary: 0.50,
|
|
30
|
+
supporting: 0.25,
|
|
31
|
+
framework: 0.15
|
|
32
|
+
}.freeze
|
|
33
|
+
|
|
34
|
+
# Minimum token count for a section to be worth including.
|
|
35
|
+
MIN_USEFUL_TOKENS = 200
|
|
36
|
+
|
|
37
|
+
# @param metadata_store [#find] Store that resolves identifiers to unit data
|
|
38
|
+
# @param budget [Integer] Total token budget
|
|
39
|
+
def initialize(metadata_store:, budget: DEFAULT_BUDGET)
  # Default token budget; #assemble may override it per call.
  @budget = budget
  @metadata_store = metadata_store
end
|
|
43
|
+
|
|
44
|
+
# Assemble context from ranked candidates within token budget.
|
|
45
|
+
#
|
|
46
|
+
# @param candidates [Array<Candidate>] Ranked search candidates
|
|
47
|
+
# @param classification [QueryClassifier::Classification] Query classification
|
|
48
|
+
# @param structural_context [String, nil] Optional codebase overview text
|
|
49
|
+
# @param budget [Integer, nil] Override token budget; falls back to @budget
|
|
50
|
+
# @return [AssembledContext] Token-budgeted context with source attribution
|
|
51
|
+
def assemble(candidates:, classification:, structural_context: nil, budget: nil)
  effective_budget = budget || @budget
  sections = []
  sources = []

  # The structural overview goes first; whatever it consumes is deducted
  # before the rest of the budget is split between the other sections.
  structural_tokens = add_structural_section(sections, structural_context, 0, effective_budget)
  budgets = compute_section_budgets(effective_budget - structural_tokens, classification)

  # Graph-expansion hits are supporting context; everything else is primary.
  expanded, direct = candidates.partition { |c| c.source == :graph_expansion }
  add_candidate_section(sections, sources, :primary, direct, budgets[:primary])
  add_candidate_section(sections, sources, :supporting, expanded, budgets[:supporting])

  if budgets[:framework].positive?
    framework_hits = candidates.select { |c| framework_candidate?(c) }
    add_candidate_section(sections, sources, :framework, framework_hits, budgets[:framework])
  end

  build_result(sections, sources, effective_budget)
end
|
|
75
|
+
|
|
76
|
+
private
|
|
77
|
+
|
|
78
|
+
# Add structural context section if provided.
|
|
79
|
+
#
|
|
80
|
+
# @return [Integer] Updated tokens_used count
|
|
81
|
+
def add_structural_section(sections, structural_context, tokens_used, effective_budget)
  if structural_context
    # The overview is capped at the structural share of the total budget.
    allowance = (effective_budget * BUDGET_ALLOCATION[:structural]).to_i
    overview = truncate_to_budget(structural_context, allowance)
    sections << { section: :structural, content: overview }
    tokens_used += estimate_tokens(overview)
  end

  tokens_used
end
|
|
89
|
+
|
|
90
|
+
# Add a candidate-based section if candidates produce content.
|
|
91
|
+
#
|
|
92
|
+
# @return [void]
|
|
93
|
+
def add_candidate_section(sections, sources, section_name, candidates, budget)
  # An empty candidate list never yields a section.
  return if candidates.empty?

  rendered, attributions = assemble_section(candidates, budget)
  # Candidates existed but nothing could be rendered: still no section.
  return if rendered.empty?

  sections << { section: section_name, content: rendered }
  sources.concat(attributions)
end
|
|
102
|
+
|
|
103
|
+
# Compute token budgets for primary/supporting/framework sections.
|
|
104
|
+
#
|
|
105
|
+
# @param remaining [Integer] Tokens available after structural
|
|
106
|
+
# @param classification [QueryClassifier::Classification]
|
|
107
|
+
# @return [Hash<Symbol, Integer>]
|
|
108
|
+
def compute_section_budgets(remaining, classification)
  # Each section receives a truncated (floored for positive input) share.
  share = ->(ratio) { (remaining * ratio).to_i }

  # Without framework context the whole remainder is split between primary
  # and supporting, and the framework section is disabled outright.
  unless classification.framework_context
    return { primary: share.call(0.65), supporting: share.call(0.35), framework: 0 }
  end

  { primary: share.call(0.55), supporting: share.call(0.25), framework: share.call(0.20) }
end
|
|
123
|
+
|
|
124
|
+
# Assemble content for a single section within a token budget.
|
|
125
|
+
#
|
|
126
|
+
# @param candidates [Array<Candidate>] Candidates for this section
|
|
127
|
+
# @param budget [Integer] Token budget for this section
|
|
128
|
+
# @return [Array(String, Array<Hash>)] Content string and source attributions
|
|
129
|
+
def assemble_section(candidates, budget)
  rendered = []
  attributions = []
  running_total = 0

  # Highest-scoring candidates get first claim on the section budget.
  ranked = candidates.sort_by { |entry| -entry.score }
  ranked.each do |entry|
    running_total = append_candidate(rendered, attributions, entry, budget, running_total)
    # append_candidate returns nil to signal that the section is full.
    break if running_total.nil?
  end

  [rendered.join("\n\n"), attributions]
end
|
|
141
|
+
|
|
142
|
+
# Append a single candidate to the section. Returns updated tokens_used, or nil to stop.
|
|
143
|
+
def append_candidate(parts, sources, candidate, budget, tokens_used)
  unit = @metadata_store.find(candidate.identifier)
  # Candidates without a stored unit are skipped and cost nothing.
  return tokens_used unless unit

  rendered = format_unit(unit, candidate)
  cost = estimate_tokens(rendered)
  room = budget - tokens_used

  # Whole unit fits: take it and keep going.
  if cost <= room
    parts << rendered
    sources << build_source_attribution(candidate, unit)
    return tokens_used + cost
  end

  # Too large, but enough room is left for a useful excerpt.
  if room > MIN_USEFUL_TOKENS
    parts << truncate_to_budget(rendered, room)
    sources << build_source_attribution(candidate, unit, truncated: true)
  end

  # Either a truncated excerpt was added or nothing fit: the section is done.
  nil
end
|
|
161
|
+
|
|
162
|
+
# Format a unit for inclusion in context.
|
|
163
|
+
#
|
|
164
|
+
# @param unit [Hash] Unit data from metadata store
|
|
165
|
+
# @param candidate [Candidate] The search candidate
|
|
166
|
+
# @return [String]
|
|
167
|
+
def format_unit(unit, _candidate)
  # Unit hashes may be keyed by symbols or by strings; try both.
  read = ->(key) { unit[key] || unit[key.to_s] }

  heading = "## #{read.call(:identifier)} (#{read.call(:type)})"
  location = "File: #{read.call(:file_path)}"
  body = read.call(:source_code) || ''

  # strip drops the trailing newline and any blank tail left by empty source.
  "#{heading}\n#{location}\n\n#{body}\n".strip
end
|
|
180
|
+
|
|
181
|
+
# Build source attribution hash for a candidate.
|
|
182
|
+
#
|
|
183
|
+
# @return [Hash]
|
|
184
|
+
def build_source_attribution(candidate, unit, truncated: false)
  # Identity and score come from the candidate; type and path from the stored
  # unit, which may be keyed by symbols or by strings.
  base = {
    identifier: candidate.identifier,
    type: unit[:type] || unit['type'],
    score: candidate.score,
    file_path: unit[:file_path] || unit['file_path']
  }

  # The :truncated key is present only when the unit was cut short.
  truncated ? base.merge(truncated: true) : base
end
|
|
194
|
+
|
|
195
|
+
# Check if a candidate is framework source.
|
|
196
|
+
#
|
|
197
|
+
# @param candidate [Candidate]
|
|
198
|
+
# @return [Boolean]
|
|
199
|
+
def framework_candidate?(candidate)
  meta = candidate.metadata
  # Candidates without metadata cannot be classified as framework source.
  return false unless meta

  # Type may be stored under a symbol or string key, as a symbol or string.
  kind = (meta[:type] || meta['type']).to_s
  kind == 'rails_source' || kind == 'gem_source'
end
|
|
206
|
+
|
|
207
|
+
# Truncate text to fit within a token budget.
|
|
208
|
+
#
|
|
209
|
+
# @param text [String]
|
|
210
|
+
# @param token_budget [Integer]
|
|
211
|
+
# @return [String]
|
|
212
|
+
def truncate_to_budget(text, token_budget)
  return text if estimate_tokens(text) <= token_budget

  marker = "\n... [truncated]"
  # Hard ceiling implied by the 4-characters-per-token estimate.
  max_chars = (token_budget * 4).to_i

  # A non-positive budget admits no content at all. Without this guard a
  # negative length would become a negative range end, which slices relative
  # to the END of the string and returns almost the entire text.
  return '' unless max_chars.positive?

  # Budget too small to carry the marker plus any text: plain hard cut.
  return text[0, max_chars] if max_chars <= marker.length

  # Keep the 10% safety margin, but never let text plus marker exceed the
  # ceiling. The cap only bites for budgets under 40 tokens; larger budgets
  # produce the same output as before.
  target_chars = [(token_budget * 4.0 * 0.9).to_i, max_chars - marker.length].min
  "#{text[0, target_chars]}#{marker}"
end
|
|
219
|
+
|
|
220
|
+
# Estimate token count using the project convention.
|
|
221
|
+
#
|
|
222
|
+
# @param text [String]
|
|
223
|
+
# @return [Integer]
|
|
224
|
+
def estimate_tokens(text)
  # Rough heuristic: about four characters per token, rounded up.
  text.length.fdiv(4).ceil
end
|
|
227
|
+
|
|
228
|
+
# Build the final AssembledContext result.
|
|
229
|
+
#
|
|
230
|
+
# @param sections [Array<Hash>] Assembled sections
|
|
231
|
+
# @param sources [Array<Hash>] Source attributions
|
|
232
|
+
# @param effective_budget [Integer] The budget actually used for assembly
|
|
233
|
+
# @return [AssembledContext]
|
|
234
|
+
def build_result(sections, sources, effective_budget)
  names = sections.map { |entry| entry[:section] }
  bodies = sections.map { |entry| entry[:content] }
  # Sections are separated by a horizontal rule on its own paragraph.
  joined = bodies.join("\n\n---\n\n")

  AssembledContext.new(
    context: joined,
    # Measured on the joined text, so separators are included in the count.
    tokens_used: estimate_tokens(joined),
    budget: effective_budget,
    sources: sources.uniq,
    sections: names
  )
end
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
# Result of context assembly.
|
|
247
|
+
AssembledContext = Struct.new(
  :context,      # section contents joined into one string
  :tokens_used,  # estimated token count of :context
  :budget,       # token budget the assembly ran against
  :sources,      # de-duplicated source attribution hashes
  :sections,     # names of the sections present, in order
  keyword_init: true
)
|
|
248
|
+
end
|
|
249
|
+
end
|