codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
@@ -0,0 +1,327 @@
1
+ # frozen_string_literal: true
2
+
3
module CodebaseIndex
  module Retrieval
    # SearchExecutor maps a query classification to a retrieval strategy and
    # executes it against the configured stores.
    #
    # Strategies:
    # - :vector  — semantic similarity search (understand, implement, debug)
    # - :keyword — exact identifier/text matching (locate, reference)
    # - :graph   — dependency traversal (trace)
    # - :hybrid  — vector + keyword + graph expansion (exploratory/comprehensive)
    # - :direct  — direct metadata lookup (pinpoint + locate/reference)
    #
    # @example
    #   executor = SearchExecutor.new(
    #     vector_store: vector_store,
    #     metadata_store: metadata_store,
    #     graph_store: graph_store,
    #     embedding_provider: embedding_provider
    #   )
    #   classification = QueryClassifier.new.classify("How does User model work?")
    #   result = executor.execute(query: "How does User model work?", classification: classification)
    #   result.candidates # => [Candidate, ...]
    #   result.strategy   # => :hybrid
    #
    class SearchExecutor
      # A single search candidate with provenance tracking.
      Candidate = Struct.new(:identifier, :score, :source, :metadata, keyword_init: true)

      # The result of a search execution.
      ExecutionResult = Struct.new(:candidates, :strategy, :query, keyword_init: true)

      # Strategy mapping from (intent, scope) → strategy.
      #
      # Pinpoint scope always uses :direct for locate/reference.
      # Comprehensive/exploratory scopes use :hybrid.
      # Framework intent always uses :keyword against framework sources.
      STRATEGY_MAP = {
        # [intent, scope] => strategy
        # Pinpoint
        %i[locate pinpoint] => :direct,
        %i[reference pinpoint] => :direct,

        # Trace always uses graph
        %i[trace pinpoint] => :graph,
        %i[trace focused] => :graph,
        %i[trace exploratory] => :graph,
        %i[trace comprehensive] => :graph,

        # Framework always keyword
        %i[framework pinpoint] => :keyword,
        %i[framework focused] => :keyword,
        %i[framework exploratory] => :keyword,
        %i[framework comprehensive] => :keyword
      }.freeze

      # Score given to a graph seed (the unit the query named).
      SEED_SCORE = 1.0
      # Score given to units a seed depends on.
      DEPENDENCY_SCORE = 0.8
      # Score given to units that depend on a seed.
      DEPENDENT_SCORE = 0.7
      # Score given to dependencies pulled in by hybrid graph expansion.
      EXPANSION_SCORE = 0.5
      # Number of top vector hits whose dependencies are expanded in hybrid mode.
      EXPANSION_SEED_COUNT = 3
      # Number of search hits used as graph seeds when no direct lookup matches.
      FALLBACK_SEED_COUNT = 3
      # Minimum denominator for positional keyword scores, so short result
      # lists are not spread across the whole 0..1 range.
      KEYWORD_RANK_FLOOR = 10

      # @param vector_store [Storage::VectorStore::Interface] Vector store adapter
      # @param metadata_store [Storage::MetadataStore::Interface] Metadata store adapter
      # @param graph_store [Storage::GraphStore::Interface] Graph store adapter
      # @param embedding_provider [Embedding::Provider::Interface] Embedding provider
      def initialize(vector_store:, metadata_store:, graph_store:, embedding_provider:)
        @vector_store = vector_store
        @metadata_store = metadata_store
        @graph_store = graph_store
        @embedding_provider = embedding_provider
      end

      # Execute a search based on query classification.
      #
      # @param query [String] The original query text
      # @param classification [QueryClassifier::Classification] Classified query
      # @param limit [Integer] Maximum candidates to return
      # @return [ExecutionResult] Candidates with strategy metadata
      def execute(query:, classification:, limit: 20)
        strategy = select_strategy(classification)
        candidates = run_strategy(strategy, query: query, classification: classification, limit: limit)

        ExecutionResult.new(
          candidates: candidates.first(limit),
          strategy: strategy,
          query: query
        )
      end

      private

      # Select the best retrieval strategy for a classification.
      #
      # @param classification [QueryClassifier::Classification]
      # @return [Symbol] One of :vector, :keyword, :graph, :hybrid, :direct
      def select_strategy(classification)
        intent = classification.intent
        scope = classification.scope

        # Check explicit mapping first
        mapped = STRATEGY_MAP[[intent, scope]]
        return mapped if mapped

        # Comprehensive and exploratory scopes default to hybrid
        return :hybrid if %i[comprehensive exploratory].include?(scope)

        # Scope-based defaults for remaining intents
        case intent
        when :locate, :reference
          :keyword
        else
          :vector
        end
      end

      # Execute the selected strategy.
      #
      # @param strategy [Symbol] Strategy to execute
      # @param query [String] Original query text
      # @param classification [QueryClassifier::Classification]
      # @param limit [Integer] Max results
      # @return [Array<Candidate>]
      # @raise [ArgumentError] if the strategy is not one of the known symbols
      def run_strategy(strategy, query:, classification:, limit:)
        case strategy
        when :vector
          execute_vector(query, classification: classification, limit: limit)
        when :keyword
          execute_keyword(classification: classification, limit: limit)
        when :graph
          execute_graph(classification: classification, limit: limit)
        when :hybrid
          execute_hybrid(query, classification: classification, limit: limit)
        when :direct
          execute_direct(classification: classification, limit: limit)
        else
          # Fail loudly here rather than returning nil and crashing later in #execute.
          raise ArgumentError, "Unknown retrieval strategy: #{strategy.inspect}"
        end
      end

      # Vector strategy: embed the query and search by similarity.
      #
      # @param query [String] Original query text
      # @param classification [QueryClassifier::Classification]
      # @param limit [Integer] Max results
      # @return [Array<Candidate>]
      def execute_vector(query, classification:, limit:)
        query_vector = @embedding_provider.embed(query)
        filters = build_vector_filters(classification)

        results = @vector_store.search(query_vector, limit: limit, filters: filters)
        results.map do |r|
          Candidate.new(identifier: r.id, score: r.score, source: :vector, metadata: r.metadata)
        end
      end

      # Keyword strategy: search metadata store by extracted keywords.
      #
      # Searches each keyword individually and merges results, keeping the
      # best score per identifier.
      #
      # @param classification [QueryClassifier::Classification]
      # @param limit [Integer] Max results
      # @return [Array<Candidate>]
      def execute_keyword(classification:, limit:)
        keywords = keywords_for(classification)
        return [] if keywords.empty?

        rank_keyword_results(merge_keyword_results(keywords), limit)
      end

      # Search each keyword individually and merge, keeping best score per ID.
      #
      # The score is positional: the first hit of a search scores 1.0 and later
      # hits decay linearly over max(result count, KEYWORD_RANK_FLOOR).
      #
      # @param keywords [Array<String>]
      # @return [Hash<String, Hash>] id => { score:, metadata: }
      def merge_keyword_results(keywords)
        keywords.each_with_object({}) do |keyword, best_by_id|
          results = @metadata_store.search(keyword)
          denominator = [results.size, KEYWORD_RANK_FLOOR].max

          results.each_with_index do |row, index|
            id = row['id']
            score = 1.0 - (index.to_f / denominator)
            current = best_by_id[id]
            best_by_id[id] = { score: score, metadata: row } if current.nil? || score > current[:score]
          end
        end
      end

      # Rank merged keyword results into Candidate objects.
      #
      # @param results [Hash<String, Hash>]
      # @param limit [Integer]
      # @return [Array<Candidate>]
      def rank_keyword_results(results, limit)
        scored = results.map do |id, data|
          Candidate.new(identifier: id, score: data[:score], source: :keyword, metadata: data[:metadata])
        end
        scored.sort_by { |c| -c.score }.first(limit)
      end

      # Graph strategy: find related units via dependency traversal.
      #
      # Each seed contributes its forward dependencies, its dependents, and
      # itself; duplicates are collapsed to the highest-scored entry.
      #
      # @param classification [QueryClassifier::Classification]
      # @param limit [Integer] Max results
      # @return [Array<Candidate>]
      def execute_graph(classification:, limit:)
        # First, use keywords to find seed identifiers in the metadata store
        seeds = find_seed_identifiers(classification)
        return [] if seeds.empty?

        candidates = seeds.flat_map do |seed_id|
          forward = @graph_store.dependencies_of(seed_id).map { |dep| graph_candidate(dep, DEPENDENCY_SCORE) }
          reverse = @graph_store.dependents_of(seed_id).map { |dep| graph_candidate(dep, DEPENDENT_SCORE) }
          forward + reverse + [graph_candidate(seed_id, SEED_SCORE)]
        end

        deduplicate(candidates).first(limit)
      end

      # Hybrid strategy: combine vector, keyword, and graph expansion.
      #
      # @param query [String] Original query text
      # @param classification [QueryClassifier::Classification]
      # @param limit [Integer] Max results
      # @return [Array<Candidate>]
      def execute_hybrid(query, classification:, limit:)
        vector_candidates = execute_vector(query, classification: classification, limit: limit)
        keyword_candidates = execute_keyword(classification: classification, limit: limit)

        # Graph expansion on top vector results
        graph_candidates = vector_candidates.first(EXPANSION_SEED_COUNT).flat_map do |candidate|
          @graph_store.dependencies_of(candidate.identifier).map do |dep|
            Candidate.new(identifier: dep, score: EXPANSION_SCORE, source: :graph_expansion, metadata: {})
          end
        end

        deduplicate(vector_candidates + keyword_candidates + graph_candidates).first(limit)
      end

      # Direct strategy: look up specific identifiers from keywords.
      #
      # Tries each keyword as-is and capitalized (e.g. "user" → "User")
      # since keywords are lowercased but identifiers are typically PascalCase.
      #
      # @param classification [QueryClassifier::Classification]
      # @param limit [Integer] Max results
      # @return [Array<Candidate>]
      def execute_direct(classification:, limit:)
        keywords = keywords_for(classification)
        return [] if keywords.empty?

        candidates = lookup_keyword_variants(keywords)

        # Fall back to keyword search if direct lookups miss
        return execute_keyword(classification: classification, limit: limit) if candidates.empty?

        candidates.first(limit)
      end

      # Try each keyword as-is and in capitalized forms against the metadata store.
      #
      # At most one candidate is produced per keyword (the first variant that
      # resolves), and keywords resolving to the same identifier yield a single
      # candidate.
      #
      # @param keywords [Array<String>]
      # @return [Array<Candidate>]
      def lookup_keyword_variants(keywords)
        found = keywords.map do |keyword|
          first_direct_match([keyword, keyword.capitalize, camelize(keyword)])
        end
        found.compact.uniq(&:identifier)
      end

      # Return a :direct candidate for the first variant present in the metadata store.
      #
      # @param variants [Array<String>] Identifier spellings to try, in priority order
      # @return [Candidate, nil] nil when no variant resolves
      def first_direct_match(variants)
        variants.uniq.each do |variant|
          record = @metadata_store.find(variant)
          next unless record

          return Candidate.new(identifier: variant, score: SEED_SCORE, source: :direct, metadata: record)
        end
        nil
      end

      # Build metadata filters for vector search based on classification.
      #
      # @param classification [QueryClassifier::Classification]
      # @return [Hash]
      def build_vector_filters(classification)
        filters = {}
        filters[:type] = classification.target_type.to_s if classification.target_type
        filters
      end

      # Find seed identifiers from classification keywords via metadata search.
      #
      # Each keyword is tried in its PascalCase form first ("user_profile" →
      # "UserProfile") and then verbatim. The verbatim attempt matters because
      # String#capitalize downcases the tail, so an already-PascalCase keyword
      # such as "UserProfile" would otherwise only be looked up as "Userprofile".
      #
      # @param classification [QueryClassifier::Classification]
      # @return [Array<String>] Unique seed identifiers
      def find_seed_identifiers(classification)
        keywords = keywords_for(classification)

        seeds = keywords.map do |keyword|
          [camelize(keyword), keyword].uniq.find { |name| @metadata_store.find(name) }
        end
        seeds = seeds.compact.uniq
        return seeds if seeds.any? || keywords.empty?

        # Fall back to search if no direct hits
        results = @metadata_store.search(keywords.join(' '))
        results.first(FALLBACK_SEED_COUNT).map { |r| r['id'] }
      end

      # Deduplicate candidates, keeping the highest-scored entry per identifier.
      #
      # @param candidates [Array<Candidate>]
      # @return [Array<Candidate>] Sorted by descending score
      def deduplicate(candidates)
        best = {}
        candidates.each do |c|
          existing = best[c.identifier]
          best[c.identifier] = c if existing.nil? || c.score > existing.score
        end
        best.values.sort_by { |c| -c.score }
      end

      # Keywords of a classification as an array; a nil keyword list is treated as empty.
      #
      # @param classification [QueryClassifier::Classification]
      # @return [Array<String>]
      def keywords_for(classification)
        Array(classification.keywords)
      end

      # Convert a snake_case keyword to PascalCase ("user_profile" → "UserProfile").
      #
      # @param keyword [String]
      # @return [String]
      def camelize(keyword)
        keyword.split('_').map(&:capitalize).join
      end

      # Build a graph-sourced candidate with empty metadata.
      #
      # @param identifier [String]
      # @param score [Float]
      # @return [Candidate]
      def graph_candidate(identifier, score)
        Candidate.new(identifier: identifier, score: score, source: :graph, metadata: {})
      end
    end
  end
end
@@ -0,0 +1,160 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'retrieval/query_classifier'
4
+ require_relative 'retrieval/search_executor'
5
+ require_relative 'retrieval/ranker'
6
+ require_relative 'retrieval/context_assembler'
7
+
8
+ module CodebaseIndex
9
+ # Retriever orchestrates the full retrieval pipeline: classify, execute,
10
+ # rank, and assemble context from a natural language query.
11
+ #
12
+ # Coordinates four internal components:
13
+ # - {Retrieval::QueryClassifier} — determines intent, scope, target type
14
+ # - {Retrieval::SearchExecutor} — maps classification to search strategy
15
+ # - {Retrieval::Ranker} — re-ranks candidates with weighted signals
16
+ # - {Retrieval::ContextAssembler} — builds token-budgeted context string
17
+ #
18
+ # Optionally builds a structural context overview (codebase unit counts
19
+ # by type) that is prepended to the assembled context.
20
+ #
21
+ # @example
22
+ # retriever = CodebaseIndex::Retriever.new(
23
+ # vector_store: vector_store,
24
+ # metadata_store: metadata_store,
25
+ # graph_store: graph_store,
26
+ # embedding_provider: embedding_provider
27
+ # )
28
+ # result = retriever.retrieve("How does the User model work?")
29
+ # result.context # => "Codebase: 42 units (10 models, ...)\n\n---\n\n## User (model)..."
30
+ # result.strategy # => :vector
31
+ # result.tokens_used # => 4200
32
+ #
33
+ class Retriever
34
+ # Diagnostic trace for retrieval quality analysis.
35
+ RetrievalTrace = Struct.new(:classification, :strategy, :candidate_count,
36
+ :ranked_count, :tokens_used, :elapsed_ms,
37
+ keyword_init: true)
38
+
39
+ # The result of a retrieval operation.
40
+ RetrievalResult = Struct.new(:context, :sources, :classification, :strategy, :tokens_used, :budget, :trace,
41
+ keyword_init: true)
42
+
43
+ # Unit types queried for the structural context overview.
44
+ STRUCTURAL_TYPES = %w[model controller service job mailer component graphql].freeze
45
+
46
+ # @param vector_store [Storage::VectorStore::Interface] Vector store adapter
47
+ # @param metadata_store [Storage::MetadataStore::Interface] Metadata store adapter
48
+ # @param graph_store [Storage::GraphStore::Interface] Graph store adapter
49
+ # @param embedding_provider [Embedding::Provider::Interface] Embedding provider
50
+ # @param formatter [#call, nil] Optional callable to post-process the context string
51
+ def initialize(vector_store:, metadata_store:, graph_store:, embedding_provider:, formatter: nil)
52
+ @metadata_store = metadata_store
53
+ @formatter = formatter
54
+
55
+ @classifier = Retrieval::QueryClassifier.new
56
+ @executor = Retrieval::SearchExecutor.new(
57
+ vector_store: vector_store,
58
+ metadata_store: metadata_store,
59
+ graph_store: graph_store,
60
+ embedding_provider: embedding_provider
61
+ )
62
+ @ranker = Retrieval::Ranker.new(metadata_store: metadata_store)
63
+ @assembler = Retrieval::ContextAssembler.new(metadata_store: metadata_store)
64
+ end
65
+
66
+ # Execute the full retrieval pipeline for a natural language query.
67
+ #
68
+ # Pipeline: classify -> execute -> rank -> assemble -> format
69
+ #
70
+ # @param query [String] Natural language query
71
+ # @param budget [Integer] Token budget for context assembly
72
+ # @return [RetrievalResult] Complete retrieval result
73
+ def retrieve(query, budget: 8000)
74
+ start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
75
+
76
+ classification = @classifier.classify(query)
77
+ execution_result = @executor.execute(query: query, classification: classification)
78
+ ranked = @ranker.rank(execution_result.candidates, classification: classification)
79
+ assembled = assemble_context(ranked, classification, budget)
80
+
81
+ elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round(1)
82
+
83
+ trace = RetrievalTrace.new(
84
+ classification: classification,
85
+ strategy: execution_result.strategy,
86
+ candidate_count: execution_result.candidates.size,
87
+ ranked_count: ranked.size,
88
+ tokens_used: assembled.tokens_used,
89
+ elapsed_ms: elapsed_ms
90
+ )
91
+
92
+ build_result(assembled, classification, execution_result.strategy, budget, trace)
93
+ end
94
+
95
+ private
96
+
97
+ # Assemble token-budgeted context from ranked candidates.
98
+ #
99
+ # @param ranked [Array<Candidate>] Ranked search candidates
100
+ # @param classification [QueryClassifier::Classification] Query classification
101
+ # @return [AssembledContext]
102
+ def assemble_context(ranked, classification, budget)
103
+ @assembler.assemble(
104
+ candidates: ranked,
105
+ classification: classification,
106
+ structural_context: build_structural_context,
107
+ budget: budget
108
+ )
109
+ end
110
+
111
+ # Build a RetrievalResult from assembled context and pipeline metadata.
112
+ #
113
+ # @param assembled [AssembledContext] Assembled context
114
+ # @param classification [QueryClassifier::Classification] Query classification
115
+ # @param strategy [Symbol] Search strategy used
116
+ # @param budget [Integer] Token budget
117
+ # @return [RetrievalResult]
118
+ def build_result(assembled, classification, strategy, budget, trace = nil)
119
+ context = @formatter ? @formatter.call(assembled.context) : assembled.context
120
+
121
+ RetrievalResult.new(
122
+ context: context,
123
+ sources: assembled.sources,
124
+ classification: classification,
125
+ strategy: strategy,
126
+ tokens_used: assembled.tokens_used,
127
+ budget: budget,
128
+ trace: trace
129
+ )
130
+ end
131
+
132
+ # Build a structural context overview from the metadata store.
133
+ #
134
+ # Queries the metadata store for total unit count and counts per type,
135
+ # producing a summary like "Codebase: 42 units (10 models, 5 controllers, ...)".
136
+ #
137
+ # @return [String, nil] Overview string, or nil if the store is empty or on error
138
+ def build_structural_context
139
+ total = @metadata_store.count
140
+ return nil if total.zero?
141
+
142
+ type_counts = STRUCTURAL_TYPES.filter_map do |type|
143
+ count = count_by_type(type)
144
+ "#{count} #{type}s" if count.positive?
145
+ end
146
+
147
+ "Codebase: #{total} units (#{type_counts.join(', ')})"
148
+ rescue StandardError
149
+ nil
150
+ end
151
+
152
+ # Count units of a given type in the metadata store.
153
+ #
154
+ # @param type [String] The unit type to count
155
+ # @return [Integer] Number of units of this type
156
+ def count_by_type(type)
157
+ @metadata_store.find_by_type(type).size
158
+ end
159
+ end
160
+ end
@@ -0,0 +1,190 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../ast/parser'
4
+ require_relative '../extracted_unit'
5
+ require_relative 'fqn_builder'
6
+
7
+ module CodebaseIndex
8
+ module RubyAnalyzer
9
+ # Extracts class and module definitions from Ruby source code using the AST layer.
10
+ #
11
+ # Produces ExtractedUnit objects with type :ruby_class or :ruby_module, including
12
+ # metadata about superclass, includes, extends, constants, and method count.
13
+ #
14
+ # @example
15
+ # analyzer = RubyAnalyzer::ClassAnalyzer.new
16
+ # units = analyzer.analyze(source: File.read(path), file_path: path)
17
+ # units.first.type #=> :ruby_class
18
+ #
19
+ class ClassAnalyzer
20
+ include FqnBuilder
21
+
22
# @param parser [Ast::Parser, nil] Parser to use; a new Ast::Parser is built when omitted
def initialize(parser: nil)
  parser ||= Ast::Parser.new
  @parser = parser
end
26
+
27
# Parse the source and collect a unit for every class/module definition found.
#
# @param source [String] Ruby source code
# @param file_path [String] Absolute path to the source file
# @return [Array<ExtractedUnit>] Extracted class and module units
def analyze(source:, file_path:)
  [].tap do |units|
    # The walk starts at the parsed root with an empty namespace stack and
    # appends into +units+ as it goes.
    extract_definitions(@parser.parse(source), source, file_path, [], units)
  end
end
38
+
39
+ private
40
+
41
# Walk the tree rooted at +node+, handing class and module nodes to their
# processors and descending through every other kind of node.
#
# Anything that is not an Ast::Node is ignored.
def extract_definitions(node, source, file_path, namespace_stack, units)
  return unless node.is_a?(Ast::Node)

  node_kind = node.type
  if node_kind == :class
    process_class(node, source, file_path, namespace_stack, units)
  elsif node_kind == :module
    process_module(node, source, file_path, namespace_stack, units)
  else
    # Not a definition itself: keep looking below it (children may be nil).
    descendants = node.children || []
    descendants.each do |descendant|
      extract_definitions(descendant, source, file_path, namespace_stack, units)
    end
  end
end
55
+
56
# Handle a :class node by recording it as a :ruby_class unit.
def process_class(node, source, file_path, namespace_stack, units)
  unit_type = :ruby_class
  process_definition(node, unit_type, source, file_path, namespace_stack, units)
end
59
+
60
# Handle a :module node by recording it as a :ruby_module unit.
def process_module(node, source, file_path, namespace_stack, units)
  unit_type = :ruby_module
  process_definition(node, unit_type, source, file_path, namespace_stack, units)
end
63
+
64
# Turn one class/module node into an ExtractedUnit, append it to +units+,
# then walk the node's body for nested definitions.
#
# @param node [Ast::Node] The class or module node
# @param type [Symbol] :ruby_class or :ruby_module
# @param source [String] Full source text of the file
# @param file_path [String] Path recorded on the unit
# @param namespace_stack [Array<String>] Names of the enclosing definitions
# @param units [Array<ExtractedUnit>] Accumulator that receives the new unit
def process_definition(node, type, source, file_path, namespace_stack, units)
  name = node.method_name
  body = body_children(node, type)

  # Only classes can have a superclass; modules always record nil.
  superclass = extract_superclass(node) if type == :ruby_class
  includes = extract_mixins(body, 'include')
  extends = extract_mixins(body, 'extend')

  unit = ExtractedUnit.new(type: type, identifier: build_fqn(name, namespace_stack), file_path: file_path)
  unit.namespace = build_namespace(name, namespace_stack)
  unit.source_code = extract_source(node, source)
  unit.metadata = {
    superclass: superclass,
    includes: includes,
    extends: extends,
    constants: extract_constants(body),
    method_count: count_methods(body)
  }
  unit.dependencies = build_dependencies(superclass, includes, extends)
  units << unit

  # Nested definitions live under this definition's own name segments.
  nested_namespace = namespace_stack + fqn_parts(name)
  body.each do |child|
    extract_definitions(child, source, file_path, nested_namespace, units)
  end
end
95
+
96
# Build the namespace string: every segment of the qualified name except the
# leaf. Returns nil when nothing encloses the leaf.
def build_namespace(name, namespace_stack)
  enclosing = (namespace_stack + fqn_parts(name))[0...-1]
  return nil if enclosing.empty?

  enclosing.join('::')
end
102
+
103
# Break a possibly ::-qualified name into its individual segments.
# The name is stringified first, so symbols and nil are accepted.
def fqn_parts(name)
  qualified = name.to_s
  qualified.split('::')
end
107
+
108
# Extract the superclass name from a class node.
#
# children[0] is the class name and children[1] is the superclass (or nil).
# Only a :const superclass node yields a name; any other node, a missing
# entry, or a node whose children are nil gives nil.
#
# @param class_node [Ast::Node] The class node
# @return [String, nil] Superclass name built by build_const_name, or nil
def extract_superclass(class_node)
  # Guard nil children the same way the other tree helpers in this class do.
  superclass_node = (class_node.children || [])[1]
  return nil unless superclass_node.is_a?(Ast::Node) && superclass_node.type == :const

  build_const_name(superclass_node)
end
116
+
117
# Return the body portion of a class or module node's children.
# Class: children[0] = name, children[1] = superclass, rest = body
# Module: children[0] = name, rest = body
# A node with nil or too few children has an empty body.
def body_children(node, type)
  header_size = type == :ruby_class ? 2 : 1
  all_children = node.children || []
  all_children.drop(header_size)
end
124
+
125
# Collect the modules named by include/extend calls made directly in a body.
#
# Every argument of a matching :send node is returned, so a call such as
# `include A, B` contributes both entries rather than just the first.
# Calls with nil or empty arguments contribute nothing.
#
# @param body_children [Array] Direct children of a class/module body
# @param method_name [String] Call name to match ('include' or 'extend')
# @return [Array] Arguments of the matching calls, in the order encountered
def extract_mixins(body_children, method_name)
  body_children.flat_map do |child|
    next [] unless child.is_a?(Ast::Node) && child.type == :send
    next [] unless child.method_name == method_name

    # nil entries are dropped, matching how a nil first argument was skipped.
    (child.arguments || []).compact
  end
end
135
+
136
# Collect the names of constants assigned directly in the body (:casgn nodes).
# Nodes without a name are skipped.
def extract_constants(body_children)
  body_children.each_with_object([]) do |child, names|
    next unless child.is_a?(Ast::Node) && child.type == :casgn

    constant_name = child.method_name
    names << constant_name if constant_name
  end
end
144
+
145
# Count the :def and :defs nodes among the body's direct children.
# Nested bodies are not descended into, and non-node children are ignored.
def count_methods(body_children)
  body_children.count do |child|
    child.is_a?(Ast::Node) && %i[def defs].include?(child.type)
  end
end
155
+
156
# Join a :const node's receiver (when present) and its own name with "::".
# A node with neither yields an empty string.
def build_const_name(const_node)
  segments = [const_node.receiver, const_node.method_name].select { |segment| segment }
  segments.join('::')
end
163
+
164
# Slice the node's text out of the source using its 1-based line range.
# Returns nil when either line number is missing or the range falls outside
# the source.
def extract_source(node, source)
  return nil unless node.line && node.end_line

  source_lines = source.lines
  first = node.line - 1
  last = node.end_line - 1
  in_bounds = first >= 0 && last < source_lines.length
  return nil unless in_bounds

  source_lines.slice(first..last).join
end
175
+
176
# Assemble the dependency list: the superclass first (if any), then one entry
# per included module, then one per extended module. Every entry is tagged
# type: :ruby_class and differs only in its :via marker.
def build_dependencies(superclass, includes, extends)
  inherited = superclass ? [{ type: :ruby_class, target: superclass, via: :inheritance }] : []
  included = includes.map { |mod| { type: :ruby_class, target: mod, via: :include } }
  extended = extends.map { |mod| { type: :ruby_class, target: mod, via: :extend } }

  inherited + included + extended
end
188
+ end
189
+ end
190
+ end