codebase_index 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +29 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +481 -0
- data/exe/codebase-console-mcp +22 -0
- data/exe/codebase-index-mcp +61 -0
- data/exe/codebase-index-mcp-http +64 -0
- data/exe/codebase-index-mcp-start +58 -0
- data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
- data/lib/codebase_index/ast/method_extractor.rb +76 -0
- data/lib/codebase_index/ast/node.rb +88 -0
- data/lib/codebase_index/ast/parser.rb +653 -0
- data/lib/codebase_index/ast.rb +6 -0
- data/lib/codebase_index/builder.rb +137 -0
- data/lib/codebase_index/chunking/chunk.rb +84 -0
- data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
- data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
- data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
- data/lib/codebase_index/console/audit_logger.rb +75 -0
- data/lib/codebase_index/console/bridge.rb +170 -0
- data/lib/codebase_index/console/confirmation.rb +90 -0
- data/lib/codebase_index/console/connection_manager.rb +173 -0
- data/lib/codebase_index/console/console_response_renderer.rb +78 -0
- data/lib/codebase_index/console/model_validator.rb +81 -0
- data/lib/codebase_index/console/safe_context.rb +82 -0
- data/lib/codebase_index/console/server.rb +557 -0
- data/lib/codebase_index/console/sql_validator.rb +172 -0
- data/lib/codebase_index/console/tools/tier1.rb +118 -0
- data/lib/codebase_index/console/tools/tier2.rb +117 -0
- data/lib/codebase_index/console/tools/tier3.rb +110 -0
- data/lib/codebase_index/console/tools/tier4.rb +79 -0
- data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
- data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
- data/lib/codebase_index/cost_model/estimator.rb +128 -0
- data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
- data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
- data/lib/codebase_index/cost_model.rb +22 -0
- data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
- data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
- data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/codebase_index/db/migrator.rb +71 -0
- data/lib/codebase_index/db/schema_version.rb +73 -0
- data/lib/codebase_index/dependency_graph.rb +227 -0
- data/lib/codebase_index/embedding/indexer.rb +130 -0
- data/lib/codebase_index/embedding/openai.rb +105 -0
- data/lib/codebase_index/embedding/provider.rb +135 -0
- data/lib/codebase_index/embedding/text_preparer.rb +112 -0
- data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
- data/lib/codebase_index/evaluation/evaluator.rb +146 -0
- data/lib/codebase_index/evaluation/metrics.rb +79 -0
- data/lib/codebase_index/evaluation/query_set.rb +148 -0
- data/lib/codebase_index/evaluation/report_generator.rb +90 -0
- data/lib/codebase_index/extracted_unit.rb +145 -0
- data/lib/codebase_index/extractor.rb +956 -0
- data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
- data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
- data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
- data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
- data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
- data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
- data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
- data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
- data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
- data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
- data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
- data/lib/codebase_index/extractors/event_extractor.rb +211 -0
- data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
- data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
- data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
- data/lib/codebase_index/extractors/job_extractor.rb +369 -0
- data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
- data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
- data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
- data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
- data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
- data/lib/codebase_index/extractors/model_extractor.rb +960 -0
- data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
- data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
- data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
- data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
- data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
- data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
- data/lib/codebase_index/extractors/route_extractor.rb +181 -0
- data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
- data/lib/codebase_index/extractors/service_extractor.rb +254 -0
- data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
- data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
- data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
- data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
- data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
- data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
- data/lib/codebase_index/feedback/gap_detector.rb +89 -0
- data/lib/codebase_index/feedback/store.rb +119 -0
- data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
- data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/codebase_index/flow_assembler.rb +290 -0
- data/lib/codebase_index/flow_document.rb +191 -0
- data/lib/codebase_index/flow_precomputer.rb +102 -0
- data/lib/codebase_index/formatting/base.rb +40 -0
- data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
- data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
- data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
- data/lib/codebase_index/formatting/human_adapter.rb +78 -0
- data/lib/codebase_index/graph_analyzer.rb +374 -0
- data/lib/codebase_index/mcp/index_reader.rb +394 -0
- data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
- data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
- data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/codebase_index/mcp/server.rb +935 -0
- data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
- data/lib/codebase_index/model_name_cache.rb +51 -0
- data/lib/codebase_index/notion/client.rb +217 -0
- data/lib/codebase_index/notion/exporter.rb +219 -0
- data/lib/codebase_index/notion/mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
- data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
- data/lib/codebase_index/notion/rate_limiter.rb +68 -0
- data/lib/codebase_index/observability/health_check.rb +81 -0
- data/lib/codebase_index/observability/instrumentation.rb +34 -0
- data/lib/codebase_index/observability/structured_logger.rb +75 -0
- data/lib/codebase_index/operator/error_escalator.rb +81 -0
- data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
- data/lib/codebase_index/operator/status_reporter.rb +80 -0
- data/lib/codebase_index/railtie.rb +26 -0
- data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
- data/lib/codebase_index/resilience/index_validator.rb +185 -0
- data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
- data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
- data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
- data/lib/codebase_index/retrieval/ranker.rb +273 -0
- data/lib/codebase_index/retrieval/search_executor.rb +327 -0
- data/lib/codebase_index/retriever.rb +160 -0
- data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
- data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
- data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
- data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
- data/lib/codebase_index/ruby_analyzer.rb +87 -0
- data/lib/codebase_index/session_tracer/file_store.rb +111 -0
- data/lib/codebase_index/session_tracer/middleware.rb +143 -0
- data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
- data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
- data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
- data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
- data/lib/codebase_index/session_tracer/store.rb +67 -0
- data/lib/codebase_index/storage/graph_store.rb +120 -0
- data/lib/codebase_index/storage/metadata_store.rb +169 -0
- data/lib/codebase_index/storage/pgvector.rb +163 -0
- data/lib/codebase_index/storage/qdrant.rb +172 -0
- data/lib/codebase_index/storage/vector_store.rb +156 -0
- data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
- data/lib/codebase_index/version.rb +5 -0
- data/lib/codebase_index.rb +223 -0
- data/lib/generators/codebase_index/install_generator.rb +32 -0
- data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
- data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
- data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
- data/lib/tasks/codebase_index.rake +583 -0
- data/lib/tasks/codebase_index_evaluation.rake +115 -0
- metadata +252 -0
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Retrieval
|
|
5
|
+
# SearchExecutor maps a query classification to a retrieval strategy and
|
|
6
|
+
# executes it against the configured stores.
|
|
7
|
+
#
|
|
8
|
+
# Strategies:
|
|
9
|
+
# - :vector — semantic similarity search (understand, implement, debug)
|
|
10
|
+
# - :keyword — exact identifier/text matching (locate, reference)
|
|
11
|
+
# - :graph — dependency traversal (trace)
|
|
12
|
+
# - :hybrid — vector + keyword + graph expansion (exploratory/comprehensive)
|
|
13
|
+
# - :direct — direct metadata lookup (pinpoint + locate/reference)
|
|
14
|
+
#
|
|
15
|
+
# @example
|
|
16
|
+
# executor = SearchExecutor.new(
|
|
17
|
+
# vector_store: vector_store,
|
|
18
|
+
# metadata_store: metadata_store,
|
|
19
|
+
# graph_store: graph_store,
|
|
20
|
+
# embedding_provider: embedding_provider
|
|
21
|
+
# )
|
|
22
|
+
# classification = QueryClassifier.new.classify("How does User model work?")
|
|
23
|
+
# result = executor.execute(query: "How does User model work?", classification: classification)
|
|
24
|
+
# result.candidates # => [Candidate, ...]
|
|
25
|
+
# result.strategy # => :hybrid
|
|
26
|
+
#
|
|
27
|
+
class SearchExecutor
|
|
28
|
+
# One search hit, tagged with the strategy that produced it.
#
# Members (all keyword-initialised): identifier, score, source, metadata.
# Within this class +source+ is set to :vector, :keyword, :graph,
# :graph_expansion or :direct.
Candidate = Struct.new(
  :identifier,
  :score,
  :source,
  :metadata,
  keyword_init: true
)
|
|
30
|
+
|
|
31
|
+
# Outcome of one SearchExecutor#execute call.
#
# Members (all keyword-initialised): candidates, strategy, query.
ExecutionResult = Struct.new(
  :candidates,
  :strategy,
  :query,
  keyword_init: true
)
|
|
33
|
+
|
|
34
|
+
# Explicit [intent, scope] => strategy overrides, consulted before any
# scope- or intent-based default.
#
# - locate/reference at pinpoint scope => :direct
# - trace at every scope               => :graph
# - framework at every scope           => :keyword
#
# Pairs that are not listed here are resolved by #select_strategy.
STRATEGY_MAP = {
  %i[locate pinpoint] => :direct,
  %i[reference pinpoint] => :direct
}.merge(
  # Trace always uses graph
  %i[pinpoint focused exploratory comprehensive].to_h { |scope| [[:trace, scope], :graph] },
  # Framework always keyword
  %i[pinpoint focused exploratory comprehensive].to_h { |scope| [[:framework, scope], :keyword] }
).freeze
|
|
57
|
+
|
|
58
|
+
# Store the collaborators that the strategies query.
#
# @param vector_store [Storage::VectorStore::Interface] Vector store adapter
# @param metadata_store [Storage::MetadataStore::Interface] Metadata store adapter
# @param graph_store [Storage::GraphStore::Interface] Graph store adapter
# @param embedding_provider [Embedding::Provider::Interface] Embedding provider
def initialize(vector_store:, metadata_store:, graph_store:, embedding_provider:)
  @vector_store, @metadata_store = vector_store, metadata_store
  @graph_store, @embedding_provider = graph_store, embedding_provider
end
|
|
68
|
+
|
|
69
|
+
# Run a search for an already-classified query.
#
# Picks a strategy from the classification, runs it, and truncates the
# candidates to +limit+ before wrapping them in an ExecutionResult.
#
# @param query [String] The original query text
# @param classification [QueryClassifier::Classification] Classified query
# @param limit [Integer] Maximum candidates to return
# @return [ExecutionResult] Candidates with strategy metadata
def execute(query:, classification:, limit: 20)
  chosen = select_strategy(classification)
  found = run_strategy(chosen, query: query, classification: classification, limit: limit)

  ExecutionResult.new(candidates: found.first(limit), strategy: chosen, query: query)
end
|
|
85
|
+
|
|
86
|
+
private
|
|
87
|
+
|
|
88
|
+
# Choose the retrieval strategy for a classification.
#
# Resolution order: an explicit STRATEGY_MAP entry for [intent, scope];
# then :hybrid for comprehensive/exploratory scopes; then :keyword for
# locate/reference intents; otherwise :vector.
#
# @param classification [QueryClassifier::Classification]
# @return [Symbol] One of :vector, :keyword, :graph, :hybrid, :direct
def select_strategy(classification)
  intent = classification.intent
  scope = classification.scope

  STRATEGY_MAP.fetch([intent, scope]) do
    if %i[comprehensive exploratory].include?(scope)
      :hybrid
    elsif %i[locate reference].include?(intent)
      :keyword
    else
      :vector
    end
  end
end
|
|
111
|
+
|
|
112
|
+
# Dispatch to the execute_* method that implements +strategy+.
#
# Only the :vector and :hybrid strategies receive the raw query text; the
# others work from the classification alone. An unrecognised strategy
# matches no branch and yields nil.
#
# @param strategy [Symbol] Strategy to execute
# @param query [String] Original query text
# @param classification [QueryClassifier::Classification]
# @param limit [Integer] Max results
# @return [Array<Candidate>]
def run_strategy(strategy, query:, classification:, limit:)
  shared = { classification: classification, limit: limit }

  case strategy
  when :vector then execute_vector(query, **shared)
  when :keyword then execute_keyword(**shared)
  when :graph then execute_graph(**shared)
  when :hybrid then execute_hybrid(query, **shared)
  when :direct then execute_direct(**shared)
  end
end
|
|
133
|
+
|
|
134
|
+
# Vector strategy: embed the query text and run a similarity search.
#
# Filters derived from the classification are passed through to the vector
# store; every hit becomes a Candidate with source :vector.
#
# @param query [String] Query text to embed
# @param classification [QueryClassifier::Classification]
# @param limit [Integer] Max results requested from the vector store
# @return [Array<Candidate>]
def execute_vector(query, classification:, limit:)
  embedding = @embedding_provider.embed(query)
  hits = @vector_store.search(embedding, limit: limit, filters: build_vector_filters(classification))

  hits.map do |hit|
    Candidate.new(identifier: hit.id, score: hit.score, source: :vector, metadata: hit.metadata)
  end
end
|
|
146
|
+
|
|
147
|
+
# Keyword strategy: look up the classification's keywords in the metadata
# store.
#
# Each keyword is searched on its own; the merged hits keep the best score
# per identifier and are then ranked. No keywords means no candidates.
#
# @param classification [QueryClassifier::Classification]
# @param limit [Integer] Max results
# @return [Array<Candidate>]
def execute_keyword(classification:, limit:)
  terms = classification.keywords
  return [] if terms.empty?

  rank_keyword_results(merge_keyword_results(terms), limit)
end
|
|
160
|
+
|
|
161
|
+
# Search the metadata store once per keyword and merge the hits, keeping
# the highest score seen for each id.
#
# A hit's score comes from its position in that keyword's result list:
# 1.0 for the first hit, dropping by 1 / max(result count, 10) per position.
#
# @param keywords [Array<String>]
# @return [Hash<String, Hash>] id => { score:, metadata: }
def merge_keyword_results(keywords)
  keywords.each_with_object({}) do |keyword, best|
    hits = @metadata_store.search(keyword)
    divisor = [hits.size, 10].max

    hits.each_with_index do |hit, position|
      id = hit['id']
      score = 1.0 - (position.to_f / divisor)
      current = best[id]
      # Only a strictly better score replaces an earlier entry.
      best[id] = { score: score, metadata: hit } unless current && current[:score] >= score
    end
  end
end
|
|
177
|
+
|
|
178
|
+
# Turn merged keyword hits into Candidates (source :keyword), ordered by
# descending score and capped at +limit+.
#
# @param results [Hash<String, Hash>] id => { score:, metadata: }
# @param limit [Integer]
# @return [Array<Candidate>]
def rank_keyword_results(results, limit)
  results
    .map { |id, entry| Candidate.new(identifier: id, score: entry[:score], source: :keyword, metadata: entry[:metadata]) }
    .sort_by { |candidate| -candidate.score }
    .first(limit)
end
|
|
189
|
+
|
|
190
|
+
# Graph strategy: walk one hop out from seed identifiers.
#
# Seeds are resolved from the classification's keywords. For each seed the
# forward dependencies score 0.8, the dependents score 0.7 and the seed
# itself scores 1.0; duplicates are then collapsed to their best score.
#
# @param classification [QueryClassifier::Classification]
# @param limit [Integer] Max results
# @return [Array<Candidate>]
def execute_graph(classification:, limit:)
  seeds = find_seed_identifiers(classification)
  return [] if seeds.empty?

  graph_hit = lambda do |id, score|
    Candidate.new(identifier: id, score: score, source: :graph, metadata: {})
  end

  found = seeds.flat_map do |seed|
    forward = @graph_store.dependencies_of(seed).map { |id| graph_hit.call(id, 0.8) }
    reverse = @graph_store.dependents_of(seed).map { |id| graph_hit.call(id, 0.7) }
    forward + reverse + [graph_hit.call(seed, 1.0)]
  end

  deduplicate(found).first(limit)
end
|
|
219
|
+
|
|
220
|
+
# Hybrid strategy: merge vector hits, keyword hits and a graph expansion.
#
# The graph expansion adds the forward dependencies of the three best
# vector hits at a fixed score of 0.5 (source :graph_expansion). The
# combined list is deduplicated by identifier and capped at +limit+.
#
# @param query [String] Query text for the vector search
# @param classification [QueryClassifier::Classification]
# @param limit [Integer] Max results
# @return [Array<Candidate>]
def execute_hybrid(query, classification:, limit:)
  semantic = execute_vector(query, classification: classification, limit: limit)
  lexical = execute_keyword(classification: classification, limit: limit)

  expanded = semantic.first(3).flat_map do |top|
    @graph_store.dependencies_of(top.identifier).map do |dep|
      Candidate.new(identifier: dep, score: 0.5, source: :graph_expansion, metadata: {})
    end
  end

  deduplicate(semantic + lexical + expanded).first(limit)
end
|
|
242
|
+
|
|
243
|
+
# Direct strategy: treat the keywords as identifiers and fetch them.
#
# Each keyword is tried as-is and in capitalized spellings (e.g. "user" →
# "User"). When no keyword resolves to a stored unit, the keyword strategy
# is used as a fallback.
#
# @param classification [QueryClassifier::Classification]
# @param limit [Integer] Max results
# @return [Array<Candidate>]
def execute_direct(classification:, limit:)
  terms = classification.keywords
  return [] if terms.empty?

  exact = lookup_keyword_variants(terms)
  if exact.empty?
    execute_keyword(classification: classification, limit: limit)
  else
    exact.first(limit)
  end
end
|
|
260
|
+
|
|
261
|
+
# Resolve keywords to stored units by trying several spellings of each.
#
# For every keyword the spellings are tried in order — the keyword as
# given, its capitalized form, and its underscore-split PascalCase form —
# and the first one the metadata store returns a result for becomes a
# Candidate (score 1.0, source :direct). Keywords with no match are skipped.
#
# @param keywords [Array<String>]
# @return [Array<Candidate>]
def lookup_keyword_variants(keywords)
  keywords.filter_map do |keyword|
    pascal = keyword.split('_').map(&:capitalize).join
    spellings = [keyword, keyword.capitalize, pascal].uniq

    hit = nil
    # find stops at the first spelling the store knows; hit keeps its record.
    matched = spellings.find { |spelling| hit = @metadata_store.find(spelling) }
    Candidate.new(identifier: matched, score: 1.0, source: :direct, metadata: hit) if matched
  end
end
|
|
279
|
+
|
|
280
|
+
# Derive vector-search filters from a classification.
#
# The only filter produced is :type, set to the stringified target type;
# without a target type the filter hash is empty.
#
# @param classification [QueryClassifier::Classification]
# @return [Hash]
def build_vector_filters(classification)
  target = classification.target_type
  target ? { type: target.to_s } : {}
end
|
|
289
|
+
|
|
290
|
+
# Resolve the classification's keywords to seed identifiers for graph
# traversal.
#
# Each keyword is converted to PascalCase (split on "_", capitalize, join)
# and kept when the metadata store has a unit under that name. If none
# match and there are keywords, the ids of the first three hits of a
# combined-keyword search are used instead.
#
# @param classification [QueryClassifier::Classification]
# @return [Array<String>]
def find_seed_identifiers(classification)
  direct_hits = classification.keywords.filter_map do |keyword|
    class_name = keyword.split('_').map(&:capitalize).join
    class_name if @metadata_store.find(class_name)
  end
  return direct_hits unless direct_hits.empty? && classification.keywords.any?

  # Fall back to search if no direct hits
  @metadata_store.search(classification.keywords.join(' ')).first(3).map { |hit| hit['id'] }
end
|
|
312
|
+
|
|
313
|
+
# Collapse candidates that share an identifier, keeping the best-scored
# one, and return the survivors ordered by descending score.
#
# On equal scores the candidate seen first is kept.
#
# @param candidates [Array<Candidate>]
# @return [Array<Candidate>]
def deduplicate(candidates)
  winners = candidates.each_with_object({}) do |candidate, kept|
    incumbent = kept[candidate.identifier]
    kept[candidate.identifier] = candidate unless incumbent && incumbent.score >= candidate.score
  end

  winners.values.sort_by { |candidate| -candidate.score }
end
|
|
325
|
+
end
|
|
326
|
+
end
|
|
327
|
+
end
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'retrieval/query_classifier'
|
|
4
|
+
require_relative 'retrieval/search_executor'
|
|
5
|
+
require_relative 'retrieval/ranker'
|
|
6
|
+
require_relative 'retrieval/context_assembler'
|
|
7
|
+
|
|
8
|
+
module CodebaseIndex
|
|
9
|
+
# Retriever orchestrates the full retrieval pipeline: classify, execute,
|
|
10
|
+
# rank, and assemble context from a natural language query.
|
|
11
|
+
#
|
|
12
|
+
# Coordinates four internal components:
|
|
13
|
+
# - {Retrieval::QueryClassifier} — determines intent, scope, target type
|
|
14
|
+
# - {Retrieval::SearchExecutor} — maps classification to search strategy
|
|
15
|
+
# - {Retrieval::Ranker} — re-ranks candidates with weighted signals
|
|
16
|
+
# - {Retrieval::ContextAssembler} — builds token-budgeted context string
|
|
17
|
+
#
|
|
18
|
+
# Optionally builds a structural context overview (codebase unit counts
|
|
19
|
+
# by type) that is prepended to the assembled context.
|
|
20
|
+
#
|
|
21
|
+
# @example
|
|
22
|
+
# retriever = CodebaseIndex::Retriever.new(
|
|
23
|
+
# vector_store: vector_store,
|
|
24
|
+
# metadata_store: metadata_store,
|
|
25
|
+
# graph_store: graph_store,
|
|
26
|
+
# embedding_provider: embedding_provider
|
|
27
|
+
# )
|
|
28
|
+
# result = retriever.retrieve("How does the User model work?")
|
|
29
|
+
# result.context # => "Codebase: 42 units (10 models, ...)\n\n---\n\n## User (model)..."
|
|
30
|
+
# result.strategy # => :vector
|
|
31
|
+
# result.tokens_used # => 4200
|
|
32
|
+
#
|
|
33
|
+
class Retriever
|
|
34
|
+
# Per-call diagnostics recorded by #retrieve for retrieval quality analysis.
#
# Members (all keyword-initialised): classification, strategy,
# candidate_count, ranked_count, tokens_used, elapsed_ms.
RetrievalTrace = Struct.new(
  :classification,
  :strategy,
  :candidate_count,
  :ranked_count,
  :tokens_used,
  :elapsed_ms,
  keyword_init: true
)
|
|
38
|
+
|
|
39
|
+
# What #retrieve hands back to the caller.
#
# Members (all keyword-initialised): context, sources, classification,
# strategy, tokens_used, budget, trace.
RetrievalResult = Struct.new(
  :context,
  :sources,
  :classification,
  :strategy,
  :tokens_used,
  :budget,
  :trace,
  keyword_init: true
)
|
|
42
|
+
|
|
43
|
+
# Unit types that are counted, in this order, when building the structural
# context overview.
STRUCTURAL_TYPES = %w[
  model
  controller
  service
  job
  mailer
  component
  graphql
].freeze
|
|
45
|
+
|
|
46
|
+
# Wire up the retrieval pipeline components.
#
# The metadata store is shared by the executor, the ranker and the context
# assembler, and is also kept for the structural overview.
#
# @param vector_store [Storage::VectorStore::Interface] Vector store adapter
# @param metadata_store [Storage::MetadataStore::Interface] Metadata store adapter
# @param graph_store [Storage::GraphStore::Interface] Graph store adapter
# @param embedding_provider [Embedding::Provider::Interface] Embedding provider
# @param formatter [#call, nil] Optional callable to post-process the context string
def initialize(vector_store:, metadata_store:, graph_store:, embedding_provider:, formatter: nil)
  @formatter = formatter
  @metadata_store = metadata_store

  backends = {
    vector_store: vector_store,
    metadata_store: metadata_store,
    graph_store: graph_store,
    embedding_provider: embedding_provider
  }

  @classifier = Retrieval::QueryClassifier.new
  @executor = Retrieval::SearchExecutor.new(**backends)
  @ranker = Retrieval::Ranker.new(metadata_store: metadata_store)
  @assembler = Retrieval::ContextAssembler.new(metadata_store: metadata_store)
end
|
|
65
|
+
|
|
66
|
+
# Run the whole retrieval pipeline for a natural language query.
#
# Pipeline: classify -> execute -> rank -> assemble -> format.
# The wall time of the first four stages is measured on the monotonic
# clock and stored, with the other diagnostics, in the result's trace.
#
# @param query [String] Natural language query
# @param budget [Integer] Token budget for context assembly
# @return [RetrievalResult] Complete retrieval result
def retrieve(query, budget: 8000)
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  classification = @classifier.classify(query)
  execution = @executor.execute(query: query, classification: classification)
  strategy = execution.strategy
  candidates = execution.candidates
  ranked = @ranker.rank(candidates, classification: classification)
  assembled = assemble_context(ranked, classification, budget)

  elapsed_seconds = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

  trace = RetrievalTrace.new(
    classification: classification,
    strategy: strategy,
    candidate_count: candidates.size,
    ranked_count: ranked.size,
    tokens_used: assembled.tokens_used,
    elapsed_ms: (elapsed_seconds * 1000).round(1)
  )

  build_result(assembled, classification, strategy, budget, trace)
end
|
|
94
|
+
|
|
95
|
+
private
|
|
96
|
+
|
|
97
|
+
# Hand the ranked candidates to the context assembler, together with a
# freshly built structural overview of the codebase.
#
# @param ranked [Array<Candidate>] Ranked search candidates
# @param classification [QueryClassifier::Classification] Query classification
# @param budget [Integer] Token budget for the assembled context
# @return [AssembledContext]
def assemble_context(ranked, classification, budget)
  overview = build_structural_context

  @assembler.assemble(
    candidates: ranked,
    classification: classification,
    structural_context: overview,
    budget: budget
  )
end
|
|
110
|
+
|
|
111
|
+
# Package the assembled context and pipeline metadata as a RetrievalResult.
#
# When a formatter was supplied at construction time the context string is
# passed through it; otherwise it is used unchanged.
#
# @param assembled [AssembledContext] Assembled context
# @param classification [QueryClassifier::Classification] Query classification
# @param strategy [Symbol] Search strategy used
# @param budget [Integer] Token budget
# @param trace [RetrievalTrace, nil] Diagnostic trace to attach
# @return [RetrievalResult]
def build_result(assembled, classification, strategy, budget, trace = nil)
  rendered = assembled.context
  rendered = @formatter.call(rendered) if @formatter

  RetrievalResult.new(
    context: rendered,
    sources: assembled.sources,
    classification: classification,
    strategy: strategy,
    tokens_used: assembled.tokens_used,
    budget: budget,
    trace: trace
  )
end
|
|
131
|
+
|
|
132
|
+
# Build a one-line structural overview of the codebase from the metadata
# store.
#
# Reads the total unit count and the count for each STRUCTURAL_TYPES entry,
# producing a summary like "Codebase: 42 units (10 models, 5 controllers)".
# Types with no units are left out; when none of the structural types has
# units the parenthesised breakdown is omitted entirely instead of being
# rendered as "()". A count of exactly one is written in the singular
# ("1 unit", "1 model").
#
# @return [String, nil] Overview string, or nil if the store is empty or on error
def build_structural_context
  total = @metadata_store.count
  return nil if total.zero?

  breakdown = STRUCTURAL_TYPES.filter_map do |type|
    count = count_by_type(type)
    "#{count} #{count == 1 ? type : "#{type}s"}" if count.positive?
  end

  summary = "Codebase: #{total} #{total == 1 ? 'unit' : 'units'}"
  breakdown.empty? ? summary : "#{summary} (#{breakdown.join(', ')})"
rescue StandardError
  # Best effort: the overview is optional, so a store failure must not
  # abort retrieval.
  nil
end
|
|
151
|
+
|
|
152
|
+
# Number of units of one type, obtained by fetching them from the metadata
# store and taking the size of the result.
#
# @param type [String] The unit type to count
# @return [Integer] Number of units of this type
def count_by_type(type)
  matching = @metadata_store.find_by_type(type)
  matching.size
end
|
|
159
|
+
end
|
|
160
|
+
end
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative '../ast/parser'
|
|
4
|
+
require_relative '../extracted_unit'
|
|
5
|
+
require_relative 'fqn_builder'
|
|
6
|
+
|
|
7
|
+
module CodebaseIndex
|
|
8
|
+
module RubyAnalyzer
|
|
9
|
+
# Extracts class and module definitions from Ruby source code using the AST layer.
|
|
10
|
+
#
|
|
11
|
+
# Produces ExtractedUnit objects with type :ruby_class or :ruby_module, including
|
|
12
|
+
# metadata about superclass, includes, extends, constants, and method count.
|
|
13
|
+
#
|
|
14
|
+
# @example
|
|
15
|
+
# analyzer = RubyAnalyzer::ClassAnalyzer.new
|
|
16
|
+
# units = analyzer.analyze(source: File.read(path), file_path: path)
|
|
17
|
+
# units.first.type #=> :ruby_class
|
|
18
|
+
#
|
|
19
|
+
class ClassAnalyzer
|
|
20
|
+
include FqnBuilder
|
|
21
|
+
|
|
22
|
+
# Store the parser used to turn source text into an AST.
#
# @param parser [Ast::Parser, nil] Parser instance (a default Ast::Parser is created when nil)
def initialize(parser: nil)
  @parser = parser
  @parser ||= Ast::Parser.new
end
|
|
26
|
+
|
|
27
|
+
# Parse Ruby source and collect a unit for every class/module definition found.
#
# The source is parsed once; the resulting tree is walked starting with an
# empty namespace stack, and every unit discovered is appended to the
# returned array.
#
# @param source [String] Ruby source code
# @param file_path [String] Absolute path to the source file
# @return [Array<ExtractedUnit>] Extracted class and module units
def analyze(source:, file_path:)
  [].tap do |collected|
    extract_definitions(@parser.parse(source), source, file_path, [], collected)
  end
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
# Walk the AST looking for class and module definitions.
#
# :class and :module nodes are handed to their processors (which take care
# of recursing into the definition body). Any other node is transparent:
# its children are searched with the same namespace stack. Values that are
# not Ast::Node instances are ignored.
#
# @param node [Object] Current AST node (anything else is skipped)
# @param source [String] Full source text of the file
# @param file_path [String] Path of the file being analyzed
# @param namespace_stack [Array<String>] Names of the enclosing definitions
# @param units [Array<ExtractedUnit>] Accumulator that receives extracted units
def extract_definitions(node, source, file_path, namespace_stack, units)
  return unless node.is_a?(Ast::Node)

  kind = node.type
  if kind == :class
    process_class(node, source, file_path, namespace_stack, units)
  elsif kind == :module
    process_module(node, source, file_path, namespace_stack, units)
  else
    descendants = node.children || []
    descendants.each do |descendant|
      extract_definitions(descendant, source, file_path, namespace_stack, units)
    end
  end
end
|
|
55
|
+
|
|
56
|
+
# Handle a :class node by extracting it as a :ruby_class unit.
#
# @param node [Ast::Node] The class node
# @param source [String] Full source text of the file
# @param file_path [String] Path of the file being analyzed
# @param namespace_stack [Array<String>] Names of the enclosing definitions
# @param units [Array<ExtractedUnit>] Accumulator that receives extracted units
def process_class(node, source, file_path, namespace_stack, units)
  unit_type = :ruby_class
  process_definition(node, unit_type, source, file_path, namespace_stack, units)
end
|
|
59
|
+
|
|
60
|
+
# Handle a :module node by extracting it as a :ruby_module unit.
#
# @param node [Ast::Node] The module node
# @param source [String] Full source text of the file
# @param file_path [String] Path of the file being analyzed
# @param namespace_stack [Array<String>] Names of the enclosing definitions
# @param units [Array<ExtractedUnit>] Accumulator that receives extracted units
def process_module(node, source, file_path, namespace_stack, units)
  unit_type = :ruby_module
  process_definition(node, unit_type, source, file_path, namespace_stack, units)
end
|
|
63
|
+
|
|
64
|
+
# Turn one class/module node into an ExtractedUnit and recurse into its body.
#
# The unit is appended to +units+ before the body is searched, so an
# enclosing definition always precedes the definitions nested inside it.
#
# @param node [Ast::Node] The class or module node
# @param type [Symbol] :ruby_class or :ruby_module
# @param source [String] Full source text of the file
# @param file_path [String] Path of the file being analyzed
# @param namespace_stack [Array<String>] Names of the enclosing definitions
# @param units [Array<ExtractedUnit>] Accumulator that receives extracted units
def process_definition(node, type, source, file_path, namespace_stack, units)
  name = node.method_name
  body = body_children(node, type)

  # Only classes can have a superclass; modules always record nil.
  parent = extract_superclass(node) if type == :ruby_class
  mixed_in = extract_mixins(body, 'include')
  extended = extract_mixins(body, 'extend')

  unit = ExtractedUnit.new(
    type: type,
    identifier: build_fqn(name, namespace_stack),
    file_path: file_path
  )
  unit.namespace = build_namespace(name, namespace_stack)
  unit.source_code = extract_source(node, source)
  unit.metadata = {
    superclass: parent,
    includes: mixed_in,
    extends: extended,
    constants: extract_constants(body),
    method_count: count_methods(body)
  }
  unit.dependencies = build_dependencies(parent, mixed_in, extended)
  units << unit

  # Nested definitions live under this definition's name; a fresh array is
  # built so the caller's namespace stack is left untouched.
  nested_stack = namespace_stack + fqn_parts(name)
  body.each do |member|
    extract_definitions(member, source, file_path, nested_stack, units)
  end
end
|
|
95
|
+
|
|
96
|
+
# Compute the namespace a definition lives in: the enclosing names plus any
# qualifier in the definition's own name, without the final (leaf) segment.
#
# @param name [#to_s] Definition name, possibly containing "::"
# @param namespace_stack [Array<String>] Names of the enclosing definitions
# @return [String, nil] Namespace joined with "::", or nil when there is none
def build_namespace(name, namespace_stack)
  *enclosing, _leaf = namespace_stack + fqn_parts(name)
  enclosing.join('::') unless enclosing.empty?
end
|
|
102
|
+
|
|
103
|
+
# Break a (possibly qualified) constant name into its "::"-separated segments.
#
# @param name [#to_s] Name such as "Foo" or "Foo::Bar"
# @return [Array<String>] The individual segments
def fqn_parts(name)
  text = name.to_s
  text.split('::')
end
|
|
107
|
+
|
|
108
|
+
# Extract superclass name from a class node.
# Children[0] is name, children[1] is superclass (or nil).
#
# A node without a children list is treated as having no superclass, which
# matches how the other tree-walking helpers guard against nil children.
# Only :const superclass nodes are resolved; anything else yields nil.
#
# @param class_node [Ast::Node] The class node
# @return [String, nil] Superclass name, or nil when absent or not a constant
def extract_superclass(class_node)
  superclass_node = (class_node.children || [])[1]
  return nil unless superclass_node.is_a?(Ast::Node) && superclass_node.type == :const

  build_const_name(superclass_node)
end
|
|
116
|
+
|
|
117
|
+
# Return the children of a class or module node that make up its body.
#
# Leading header entries are skipped:
#   Class:  children[0] = name, children[1] = superclass, rest = body
#   Module: children[0] = name, rest = body
#
# @param node [Ast::Node] The class or module node
# @param type [Symbol] :ruby_class skips two header entries; any other type skips one
# @return [Array] Body children (empty when the node has none)
def body_children(node, type)
  header_size = type == :ruby_class ? 2 : 1
  all_children = node.children || []
  all_children.drop(header_size)
end
|
|
124
|
+
|
|
125
|
+
# Extract include/extend module names from body send nodes.
#
# Every argument of a matching call is collected, so a multi-module call
# such as `include Comparable, Enumerable` contributes both names rather
# than only the first one. Calls without arguments contribute nothing.
#
# @param body_children [Array] Direct children of a class/module body
# @param method_name [String] Mixin method to look for ('include' or 'extend')
# @return [Array] Arguments of all matching calls, in source order
def extract_mixins(body_children, method_name)
  body_children.flat_map do |child|
    next [] unless child.is_a?(Ast::Node) && child.type == :send
    next [] unless child.method_name == method_name

    # nil arguments (no argument list) and nil entries are both skipped.
    (child.arguments || []).compact
  end
end
|
|
135
|
+
|
|
136
|
+
# Collect the names of constants assigned directly in a class/module body.
#
# Only :casgn nodes among the direct children are considered; entries whose
# name is missing are left out.
#
# @param body_children [Array] Direct children of a class/module body
# @return [Array] Names of the assigned constants, in source order
def extract_constants(body_children)
  body_children.each_with_object([]) do |child, names|
    next unless child.is_a?(Ast::Node) && child.type == :casgn

    constant_name = child.method_name
    names << constant_name if constant_name
  end
end
|
|
144
|
+
|
|
145
|
+
# Count the methods defined directly in a class/module body.
#
# Both :def and :defs nodes are counted. The count is not recursive: only
# the direct children passed in are inspected.
#
# @param body_children [Array] Direct children of a class/module body
# @return [Integer] Number of method definitions
def count_methods(body_children)
  body_children.count do |child|
    child.is_a?(Ast::Node) && %i[def defs].include?(child.type)
  end
end
|
|
155
|
+
|
|
156
|
+
# Assemble the textual name of a constant from a :const node.
#
# The receiver (present for namespaced constants) comes first, followed by
# the constant's own name; whichever of the two is missing is skipped.
#
# @param const_node [Ast::Node] The :const node
# @return [String] Name joined with "::" (empty string when both parts are missing)
def build_const_name(const_node)
  segments = [const_node.receiver, const_node.method_name]
  segments.select(&:itself).join('::')
end
|
|
163
|
+
|
|
164
|
+
# Extract source text for a node using line range.
#
# Line numbers on the node are 1-based and inclusive. nil is returned when
# the node carries no line information or when the range does not describe
# a valid slice of the source: start before line 1, end before start, or
# end past the last line.
#
# @param node [Ast::Node] Node providing #line and #end_line
# @param source [String] Full source text of the file
# @return [String, nil] The node's source lines joined together, or nil
def extract_source(node, source)
  first_line = node.line
  last_line = node.end_line
  return nil unless first_line && last_line

  lines = source.lines
  start_idx = first_line - 1
  end_idx = last_line - 1
  # An inverted range must be rejected explicitly: a negative end index would
  # otherwise be read as "count from the end" and return the rest of the
  # file, and a start past EOF would make the slice nil.
  return nil if start_idx.negative? || end_idx < start_idx || end_idx >= lines.length

  lines[start_idx..end_idx].join
end
|
|
175
|
+
|
|
176
|
+
# Assemble the dependency records for a definition.
#
# Records are emitted in a fixed order: the superclass (if any), then each
# included module, then each extended module.
# NOTE(review): every record is tagged type: :ruby_class, including the
# include/extend targets — confirm that is what consumers expect for modules.
#
# @param superclass [String, nil] Superclass name, if any
# @param includes [Array] Names of included modules
# @param extends [Array] Names of extended modules
# @return [Array<Hash>] Hashes with :type, :target and :via keys
def build_dependencies(superclass, includes, extends)
  dependency = ->(target, via) { { type: :ruby_class, target: target, via: via } }

  inherited = superclass ? [dependency.call(superclass, :inheritance)] : []
  inherited +
    includes.map { |mod| dependency.call(mod, :include) } +
    extends.map { |mod| dependency.call(mod, :extend) }
end
|
|
188
|
+
end
|
|
189
|
+
end
|
|
190
|
+
end
|