woods 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +89 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +406 -0
  7. data/exe/woods-console +59 -0
  8. data/exe/woods-console-mcp +22 -0
  9. data/exe/woods-mcp +34 -0
  10. data/exe/woods-mcp-http +37 -0
  11. data/exe/woods-mcp-start +58 -0
  12. data/lib/generators/woods/install_generator.rb +32 -0
  13. data/lib/generators/woods/pgvector_generator.rb +37 -0
  14. data/lib/generators/woods/templates/add_pgvector_to_woods.rb.erb +15 -0
  15. data/lib/generators/woods/templates/create_woods_tables.rb.erb +43 -0
  16. data/lib/tasks/woods.rake +621 -0
  17. data/lib/tasks/woods_evaluation.rake +115 -0
  18. data/lib/woods/ast/call_site_extractor.rb +106 -0
  19. data/lib/woods/ast/method_extractor.rb +71 -0
  20. data/lib/woods/ast/node.rb +116 -0
  21. data/lib/woods/ast/parser.rb +614 -0
  22. data/lib/woods/ast.rb +6 -0
  23. data/lib/woods/builder.rb +200 -0
  24. data/lib/woods/cache/cache_middleware.rb +199 -0
  25. data/lib/woods/cache/cache_store.rb +264 -0
  26. data/lib/woods/cache/redis_cache_store.rb +116 -0
  27. data/lib/woods/cache/solid_cache_store.rb +111 -0
  28. data/lib/woods/chunking/chunk.rb +84 -0
  29. data/lib/woods/chunking/semantic_chunker.rb +295 -0
  30. data/lib/woods/console/adapters/cache_adapter.rb +58 -0
  31. data/lib/woods/console/adapters/good_job_adapter.rb +33 -0
  32. data/lib/woods/console/adapters/job_adapter.rb +68 -0
  33. data/lib/woods/console/adapters/sidekiq_adapter.rb +33 -0
  34. data/lib/woods/console/adapters/solid_queue_adapter.rb +33 -0
  35. data/lib/woods/console/audit_logger.rb +75 -0
  36. data/lib/woods/console/bridge.rb +177 -0
  37. data/lib/woods/console/confirmation.rb +90 -0
  38. data/lib/woods/console/connection_manager.rb +173 -0
  39. data/lib/woods/console/console_response_renderer.rb +74 -0
  40. data/lib/woods/console/embedded_executor.rb +373 -0
  41. data/lib/woods/console/model_validator.rb +81 -0
  42. data/lib/woods/console/rack_middleware.rb +87 -0
  43. data/lib/woods/console/safe_context.rb +82 -0
  44. data/lib/woods/console/server.rb +612 -0
  45. data/lib/woods/console/sql_validator.rb +172 -0
  46. data/lib/woods/console/tools/tier1.rb +118 -0
  47. data/lib/woods/console/tools/tier2.rb +117 -0
  48. data/lib/woods/console/tools/tier3.rb +110 -0
  49. data/lib/woods/console/tools/tier4.rb +79 -0
  50. data/lib/woods/coordination/pipeline_lock.rb +109 -0
  51. data/lib/woods/cost_model/embedding_cost.rb +88 -0
  52. data/lib/woods/cost_model/estimator.rb +128 -0
  53. data/lib/woods/cost_model/provider_pricing.rb +67 -0
  54. data/lib/woods/cost_model/storage_cost.rb +52 -0
  55. data/lib/woods/cost_model.rb +22 -0
  56. data/lib/woods/db/migrations/001_create_units.rb +38 -0
  57. data/lib/woods/db/migrations/002_create_edges.rb +35 -0
  58. data/lib/woods/db/migrations/003_create_embeddings.rb +37 -0
  59. data/lib/woods/db/migrations/004_create_snapshots.rb +45 -0
  60. data/lib/woods/db/migrations/005_create_snapshot_units.rb +40 -0
  61. data/lib/woods/db/migrations/006_rename_tables.rb +34 -0
  62. data/lib/woods/db/migrator.rb +73 -0
  63. data/lib/woods/db/schema_version.rb +73 -0
  64. data/lib/woods/dependency_graph.rb +236 -0
  65. data/lib/woods/embedding/indexer.rb +140 -0
  66. data/lib/woods/embedding/openai.rb +126 -0
  67. data/lib/woods/embedding/provider.rb +162 -0
  68. data/lib/woods/embedding/text_preparer.rb +112 -0
  69. data/lib/woods/evaluation/baseline_runner.rb +115 -0
  70. data/lib/woods/evaluation/evaluator.rb +139 -0
  71. data/lib/woods/evaluation/metrics.rb +79 -0
  72. data/lib/woods/evaluation/query_set.rb +148 -0
  73. data/lib/woods/evaluation/report_generator.rb +90 -0
  74. data/lib/woods/extracted_unit.rb +145 -0
  75. data/lib/woods/extractor.rb +1028 -0
  76. data/lib/woods/extractors/action_cable_extractor.rb +201 -0
  77. data/lib/woods/extractors/ast_source_extraction.rb +46 -0
  78. data/lib/woods/extractors/behavioral_profile.rb +309 -0
  79. data/lib/woods/extractors/caching_extractor.rb +261 -0
  80. data/lib/woods/extractors/callback_analyzer.rb +246 -0
  81. data/lib/woods/extractors/concern_extractor.rb +292 -0
  82. data/lib/woods/extractors/configuration_extractor.rb +219 -0
  83. data/lib/woods/extractors/controller_extractor.rb +404 -0
  84. data/lib/woods/extractors/database_view_extractor.rb +278 -0
  85. data/lib/woods/extractors/decorator_extractor.rb +253 -0
  86. data/lib/woods/extractors/engine_extractor.rb +223 -0
  87. data/lib/woods/extractors/event_extractor.rb +211 -0
  88. data/lib/woods/extractors/factory_extractor.rb +289 -0
  89. data/lib/woods/extractors/graphql_extractor.rb +892 -0
  90. data/lib/woods/extractors/i18n_extractor.rb +117 -0
  91. data/lib/woods/extractors/job_extractor.rb +374 -0
  92. data/lib/woods/extractors/lib_extractor.rb +218 -0
  93. data/lib/woods/extractors/mailer_extractor.rb +269 -0
  94. data/lib/woods/extractors/manager_extractor.rb +188 -0
  95. data/lib/woods/extractors/middleware_extractor.rb +133 -0
  96. data/lib/woods/extractors/migration_extractor.rb +469 -0
  97. data/lib/woods/extractors/model_extractor.rb +988 -0
  98. data/lib/woods/extractors/phlex_extractor.rb +252 -0
  99. data/lib/woods/extractors/policy_extractor.rb +191 -0
  100. data/lib/woods/extractors/poro_extractor.rb +229 -0
  101. data/lib/woods/extractors/pundit_extractor.rb +223 -0
  102. data/lib/woods/extractors/rails_source_extractor.rb +473 -0
  103. data/lib/woods/extractors/rake_task_extractor.rb +343 -0
  104. data/lib/woods/extractors/route_extractor.rb +181 -0
  105. data/lib/woods/extractors/scheduled_job_extractor.rb +331 -0
  106. data/lib/woods/extractors/serializer_extractor.rb +339 -0
  107. data/lib/woods/extractors/service_extractor.rb +217 -0
  108. data/lib/woods/extractors/shared_dependency_scanner.rb +91 -0
  109. data/lib/woods/extractors/shared_utility_methods.rb +281 -0
  110. data/lib/woods/extractors/state_machine_extractor.rb +398 -0
  111. data/lib/woods/extractors/test_mapping_extractor.rb +225 -0
  112. data/lib/woods/extractors/validator_extractor.rb +211 -0
  113. data/lib/woods/extractors/view_component_extractor.rb +311 -0
  114. data/lib/woods/extractors/view_template_extractor.rb +261 -0
  115. data/lib/woods/feedback/gap_detector.rb +89 -0
  116. data/lib/woods/feedback/store.rb +119 -0
  117. data/lib/woods/filename_utils.rb +32 -0
  118. data/lib/woods/flow_analysis/operation_extractor.rb +206 -0
  119. data/lib/woods/flow_analysis/response_code_mapper.rb +154 -0
  120. data/lib/woods/flow_assembler.rb +290 -0
  121. data/lib/woods/flow_document.rb +191 -0
  122. data/lib/woods/flow_precomputer.rb +102 -0
  123. data/lib/woods/formatting/base.rb +30 -0
  124. data/lib/woods/formatting/claude_adapter.rb +98 -0
  125. data/lib/woods/formatting/generic_adapter.rb +56 -0
  126. data/lib/woods/formatting/gpt_adapter.rb +64 -0
  127. data/lib/woods/formatting/human_adapter.rb +78 -0
  128. data/lib/woods/graph_analyzer.rb +374 -0
  129. data/lib/woods/mcp/bootstrapper.rb +96 -0
  130. data/lib/woods/mcp/index_reader.rb +394 -0
  131. data/lib/woods/mcp/renderers/claude_renderer.rb +81 -0
  132. data/lib/woods/mcp/renderers/json_renderer.rb +17 -0
  133. data/lib/woods/mcp/renderers/markdown_renderer.rb +353 -0
  134. data/lib/woods/mcp/renderers/plain_renderer.rb +240 -0
  135. data/lib/woods/mcp/server.rb +962 -0
  136. data/lib/woods/mcp/tool_response_renderer.rb +85 -0
  137. data/lib/woods/model_name_cache.rb +51 -0
  138. data/lib/woods/notion/client.rb +217 -0
  139. data/lib/woods/notion/exporter.rb +219 -0
  140. data/lib/woods/notion/mapper.rb +40 -0
  141. data/lib/woods/notion/mappers/column_mapper.rb +57 -0
  142. data/lib/woods/notion/mappers/migration_mapper.rb +39 -0
  143. data/lib/woods/notion/mappers/model_mapper.rb +161 -0
  144. data/lib/woods/notion/mappers/shared.rb +22 -0
  145. data/lib/woods/notion/rate_limiter.rb +68 -0
  146. data/lib/woods/observability/health_check.rb +79 -0
  147. data/lib/woods/observability/instrumentation.rb +34 -0
  148. data/lib/woods/observability/structured_logger.rb +57 -0
  149. data/lib/woods/operator/error_escalator.rb +81 -0
  150. data/lib/woods/operator/pipeline_guard.rb +92 -0
  151. data/lib/woods/operator/status_reporter.rb +80 -0
  152. data/lib/woods/railtie.rb +38 -0
  153. data/lib/woods/resilience/circuit_breaker.rb +99 -0
  154. data/lib/woods/resilience/index_validator.rb +167 -0
  155. data/lib/woods/resilience/retryable_provider.rb +108 -0
  156. data/lib/woods/retrieval/context_assembler.rb +261 -0
  157. data/lib/woods/retrieval/query_classifier.rb +133 -0
  158. data/lib/woods/retrieval/ranker.rb +277 -0
  159. data/lib/woods/retrieval/search_executor.rb +316 -0
  160. data/lib/woods/retriever.rb +152 -0
  161. data/lib/woods/ruby_analyzer/class_analyzer.rb +170 -0
  162. data/lib/woods/ruby_analyzer/dataflow_analyzer.rb +77 -0
  163. data/lib/woods/ruby_analyzer/fqn_builder.rb +18 -0
  164. data/lib/woods/ruby_analyzer/mermaid_renderer.rb +280 -0
  165. data/lib/woods/ruby_analyzer/method_analyzer.rb +143 -0
  166. data/lib/woods/ruby_analyzer/trace_enricher.rb +143 -0
  167. data/lib/woods/ruby_analyzer.rb +87 -0
  168. data/lib/woods/session_tracer/file_store.rb +104 -0
  169. data/lib/woods/session_tracer/middleware.rb +143 -0
  170. data/lib/woods/session_tracer/redis_store.rb +106 -0
  171. data/lib/woods/session_tracer/session_flow_assembler.rb +254 -0
  172. data/lib/woods/session_tracer/session_flow_document.rb +223 -0
  173. data/lib/woods/session_tracer/solid_cache_store.rb +139 -0
  174. data/lib/woods/session_tracer/store.rb +81 -0
  175. data/lib/woods/storage/graph_store.rb +120 -0
  176. data/lib/woods/storage/metadata_store.rb +196 -0
  177. data/lib/woods/storage/pgvector.rb +195 -0
  178. data/lib/woods/storage/qdrant.rb +205 -0
  179. data/lib/woods/storage/vector_store.rb +167 -0
  180. data/lib/woods/temporal/json_snapshot_store.rb +245 -0
  181. data/lib/woods/temporal/snapshot_store.rb +345 -0
  182. data/lib/woods/token_utils.rb +19 -0
  183. data/lib/woods/version.rb +5 -0
  184. data/lib/woods.rb +246 -0
  185. metadata +270 -0
@@ -0,0 +1,316 @@
# frozen_string_literal: true

module Woods
  module Retrieval
    # SearchExecutor maps a query classification to a retrieval strategy and
    # executes it against the configured stores.
    #
    # Strategies:
    # - :vector  — semantic similarity search (understand, implement, debug)
    # - :keyword — exact identifier/text matching (locate, reference)
    # - :graph   — dependency traversal (trace)
    # - :hybrid  — vector + keyword + graph expansion (exploratory/comprehensive)
    # - :direct  — direct metadata lookup (pinpoint + locate/reference)
    #
    # @example
    #   executor = SearchExecutor.new(
    #     vector_store: vector_store,
    #     metadata_store: metadata_store,
    #     graph_store: graph_store,
    #     embedding_provider: embedding_provider
    #   )
    #   classification = QueryClassifier.new.classify("How does User model work?")
    #   result = executor.execute(query: "How does User model work?", classification: classification)
    #   result.candidates # => [Candidate, ...]
    #   result.strategy   # => :hybrid
    #
    class SearchExecutor
      # A single search candidate with provenance tracking.
      Candidate = Struct.new(:identifier, :score, :source, :metadata, keyword_init: true)

      # The result of a search execution.
      ExecutionResult = Struct.new(:candidates, :strategy, :query, keyword_init: true)

      # Strategy mapping from (intent, scope) → strategy.
      #
      # Covers pinpoint overrides for locate/reference (:direct).
      # Trace and framework intents are handled before this map is consulted.
      STRATEGY_MAP = {
        %i[locate pinpoint] => :direct,
        %i[reference pinpoint] => :direct
      }.freeze

      # @param vector_store [Storage::VectorStore::Interface] Vector store adapter
      # @param metadata_store [Storage::MetadataStore::Interface] Metadata store adapter
      # @param graph_store [Storage::GraphStore::Interface] Graph store adapter
      # @param embedding_provider [Embedding::Provider::Interface] Embedding provider
      def initialize(vector_store:, metadata_store:, graph_store:, embedding_provider:)
        @vector_store = vector_store
        @metadata_store = metadata_store
        @graph_store = graph_store
        @embedding_provider = embedding_provider
      end

      # Execute a search based on query classification.
      #
      # @param query [String] The original query text
      # @param classification [QueryClassifier::Classification] Classified query
      # @param limit [Integer] Maximum candidates to return
      # @return [ExecutionResult] Candidates with strategy metadata
      def execute(query:, classification:, limit: 20)
        strategy = select_strategy(classification)
        candidates = run_strategy(strategy, query: query, classification: classification, limit: limit)

        ExecutionResult.new(
          candidates: candidates.first(limit),
          strategy: strategy,
          query: query
        )
      end

      private

      # Select the best retrieval strategy for a classification.
      #
      # @param classification [QueryClassifier::Classification]
      # @return [Symbol] One of :vector, :keyword, :graph, :hybrid, :direct
      def select_strategy(classification)
        intent = classification.intent
        scope = classification.scope

        # Intent-level overrides (apply regardless of scope)
        return :graph if intent == :trace
        return :keyword if intent == :framework

        # Pinpoint overrides for locate/reference
        mapped = STRATEGY_MAP[[intent, scope]]
        return mapped if mapped

        # Comprehensive and exploratory scopes default to hybrid
        return :hybrid if %i[comprehensive exploratory].include?(scope)

        # Scope-based defaults for remaining intents
        case intent
        when :locate, :reference
          :keyword
        else
          :vector
        end
      end

      # Execute the selected strategy.
      #
      # @param strategy [Symbol] Strategy to execute
      # @param query [String] Original query text
      # @param classification [QueryClassifier::Classification]
      # @param limit [Integer] Max results
      # @return [Array<Candidate>]
      # @raise [ArgumentError] If the strategy symbol is not recognized
      def run_strategy(strategy, query:, classification:, limit:)
        case strategy
        when :vector
          execute_vector(query, classification: classification, limit: limit)
        when :keyword
          execute_keyword(classification: classification, limit: limit)
        when :graph
          execute_graph(classification: classification, limit: limit)
        when :hybrid
          execute_hybrid(query, classification: classification, limit: limit)
        when :direct
          execute_direct(classification: classification, limit: limit)
        else
          # Fail fast with a clear error. Without this branch an unknown
          # strategy returned nil, which surfaced later in #execute as a
          # confusing NoMethodError on `candidates.first`.
          raise ArgumentError, "unknown search strategy: #{strategy.inspect}"
        end
      end

      # Vector strategy: embed the query and search by similarity.
      #
      # @return [Array<Candidate>]
      def execute_vector(query, classification:, limit:)
        query_vector = @embedding_provider.embed(query)
        filters = build_vector_filters(classification)

        results = @vector_store.search(query_vector, limit: limit, filters: filters)
        results.map do |r|
          Candidate.new(identifier: r.id, score: r.score, source: :vector, metadata: r.metadata)
        end
      end

      # Keyword strategy: search metadata store by extracted keywords.
      #
      # Searches each keyword individually and merges results, keeping the
      # best score per identifier.
      #
      # @return [Array<Candidate>]
      def execute_keyword(classification:, limit:)
        keywords = classification.keywords
        return [] if keywords.empty?

        all_results = merge_keyword_results(keywords)
        rank_keyword_results(all_results, limit)
      end

      # Search each keyword individually and merge, keeping best score per ID.
      #
      # @param keywords [Array<String>]
      # @return [Hash<String, Hash>] id => { score:, metadata: }
      def merge_keyword_results(keywords)
        results_by_id = {}
        keywords.each do |keyword|
          results = @metadata_store.search(keyword)
          results.each_with_index do |r, index|
            id = r['id']
            # Rank-derived score: earlier hits score higher. The divisor is
            # floored at 10 so short result lists don't collapse toward 0.
            score = 1.0 - (index.to_f / [results.size, 10].max)
            existing = results_by_id[id]
            results_by_id[id] = { score: score, metadata: r } if existing.nil? || score > existing[:score]
          end
        end
        results_by_id
      end

      # Rank merged keyword results into Candidate objects.
      #
      # @param results [Hash<String, Hash>]
      # @param limit [Integer]
      # @return [Array<Candidate>]
      def rank_keyword_results(results, limit)
        scored = results.map do |id, data|
          Candidate.new(identifier: id, score: data[:score], source: :keyword, metadata: data[:metadata])
        end
        scored.sort_by { |c| -c.score }.first(limit)
      end

      # Graph strategy: find related units via dependency traversal.
      #
      # @return [Array<Candidate>]
      def execute_graph(classification:, limit:)
        # First, use keywords to find seed identifiers in the metadata store
        seeds = find_seed_identifiers(classification)
        return [] if seeds.empty?

        candidates = []
        seeds.each do |seed_id|
          # Forward dependencies
          deps = @graph_store.dependencies_of(seed_id)
          deps.each do |dep|
            candidates << Candidate.new(identifier: dep, score: 0.8, source: :graph, metadata: {})
          end

          # Reverse dependencies (dependents)
          dependents = @graph_store.dependents_of(seed_id)
          dependents.each do |dep|
            candidates << Candidate.new(identifier: dep, score: 0.7, source: :graph, metadata: {})
          end

          # The seed itself
          candidates << Candidate.new(identifier: seed_id, score: 1.0, source: :graph, metadata: {})
        end

        deduplicate(candidates).first(limit)
      end

      # Hybrid strategy: combine vector, keyword, and graph expansion.
      #
      # @return [Array<Candidate>]
      def execute_hybrid(query, classification:, limit:)
        # Gather from all three sources
        vector_candidates = execute_vector(query, classification: classification, limit: limit)
        keyword_candidates = execute_keyword(classification: classification, limit: limit)

        # Graph expansion on top vector results
        graph_candidates = []
        vector_candidates.first(3).each do |candidate|
          deps = @graph_store.dependencies_of(candidate.identifier)
          deps.each do |dep|
            graph_candidates << Candidate.new(
              identifier: dep, score: 0.5, source: :graph_expansion, metadata: {}
            )
          end
        end

        all = vector_candidates + keyword_candidates + graph_candidates
        deduplicate(all).first(limit)
      end

      # Direct strategy: look up specific identifiers from keywords.
      #
      # Tries each keyword as-is and capitalized (e.g. "user" → "User")
      # since keywords are lowercased but identifiers are typically PascalCase.
      #
      # @return [Array<Candidate>]
      def execute_direct(classification:, limit:)
        keywords = classification.keywords
        return [] if keywords.empty?

        candidates = lookup_keyword_variants(keywords)

        # Fall back to keyword search if direct lookups miss
        return execute_keyword(classification: classification, limit: limit) if candidates.empty?

        candidates.first(limit)
      end

      # Try each keyword as-is and in capitalized forms against the metadata store.
      #
      # @param keywords [Array<String>]
      # @return [Array<Candidate>]
      def lookup_keyword_variants(keywords)
        candidates = []
        keywords.each do |keyword|
          variants = [keyword, keyword.capitalize, keyword.split('_').map(&:capitalize).join].uniq
          variants.each do |variant|
            result = @metadata_store.find(variant)
            next unless result

            candidates << Candidate.new(identifier: variant, score: 1.0, source: :direct, metadata: result)
            break # first matching variant wins per keyword
          end
        end
        candidates
      end

      # Build metadata filters for vector search based on classification.
      #
      # @param classification [QueryClassifier::Classification]
      # @return [Hash]
      def build_vector_filters(classification)
        filters = {}
        filters[:type] = classification.target_type.to_s if classification.target_type
        filters
      end

      # Find seed identifiers from classification keywords via metadata search.
      #
      # @param classification [QueryClassifier::Classification]
      # @return [Array<String>]
      def find_seed_identifiers(classification)
        seeds = []

        # Try direct lookups for capitalized keywords (likely class names)
        classification.keywords.each do |keyword|
          capitalized = keyword.split('_').map(&:capitalize).join
          result = @metadata_store.find(capitalized)
          seeds << capitalized if result
        end

        # Fall back to search if no direct hits
        if seeds.empty? && classification.keywords.any?
          results = @metadata_store.search(classification.keywords.join(' '))
          seeds = results.first(3).map { |r| r['id'] }
        end

        seeds
      end

      # Deduplicate candidates, keeping the highest-scored entry per identifier.
      #
      # @param candidates [Array<Candidate>]
      # @return [Array<Candidate>]
      def deduplicate(candidates)
        best = {}
        candidates.each do |c|
          existing = best[c.identifier]
          best[c.identifier] = c if existing.nil? || c.score > existing.score
        end
        best.values.sort_by { |c| -c.score }
      end
    end
  end
end
@@ -0,0 +1,152 @@
# frozen_string_literal: true

require_relative 'retrieval/query_classifier'
require_relative 'retrieval/search_executor'
require_relative 'retrieval/ranker'
require_relative 'retrieval/context_assembler'

module Woods
  # Retriever orchestrates the full retrieval pipeline: classify, execute,
  # rank, and assemble context from a natural language query.
  #
  # Coordinates four internal components:
  # - {Retrieval::QueryClassifier} — determines intent, scope, target type
  # - {Retrieval::SearchExecutor} — maps classification to search strategy
  # - {Retrieval::Ranker} — re-ranks candidates with weighted signals
  # - {Retrieval::ContextAssembler} — builds token-budgeted context string
  #
  # Optionally builds a structural context overview (codebase unit counts
  # by type) that is prepended to the assembled context.
  #
  # @example
  #   retriever = Woods::Retriever.new(
  #     vector_store: vector_store,
  #     metadata_store: metadata_store,
  #     graph_store: graph_store,
  #     embedding_provider: embedding_provider
  #   )
  #   result = retriever.retrieve("How does the User model work?")
  #   result.context # => "Codebase: 42 units (10 models, ...)\n\n---\n\n## User (model)..."
  #   result.strategy # => :vector
  #   result.tokens_used # => 4200
  #
  class Retriever
    # Diagnostic trace for retrieval quality analysis.
    RetrievalTrace = Struct.new(:classification, :strategy, :candidate_count,
                                :ranked_count, :tokens_used, :elapsed_ms,
                                keyword_init: true)

    # The result of a retrieval operation.
    RetrievalResult = Struct.new(:context, :sources, :classification, :strategy, :tokens_used, :budget, :trace,
                                 keyword_init: true)

    # Unit types queried for the structural context overview.
    STRUCTURAL_TYPES = %w[model controller service job mailer component graphql].freeze

    # @param vector_store [Storage::VectorStore::Interface] Vector store adapter
    # @param metadata_store [Storage::MetadataStore::Interface] Metadata store adapter
    # @param graph_store [Storage::GraphStore::Interface] Graph store adapter
    # @param embedding_provider [Embedding::Provider::Interface] Embedding provider
    # @param formatter [#call, nil] Optional callable to post-process the context string
    def initialize(vector_store:, metadata_store:, graph_store:, embedding_provider:, formatter: nil)
      @metadata_store = metadata_store
      @formatter = formatter

      @classifier = Retrieval::QueryClassifier.new
      @executor = Retrieval::SearchExecutor.new(
        vector_store: vector_store,
        metadata_store: metadata_store,
        graph_store: graph_store,
        embedding_provider: embedding_provider
      )
      @ranker = Retrieval::Ranker.new(metadata_store: metadata_store)
      @assembler = Retrieval::ContextAssembler.new(metadata_store: metadata_store)
    end

    # Execute the full retrieval pipeline for a natural language query.
    #
    # Pipeline: classify -> execute -> rank -> assemble -> format
    #
    # @param query [String] Natural language query
    # @param budget [Integer] Token budget for context assembly
    # @return [RetrievalResult] Complete retrieval result
    def retrieve(query, budget: 8000)
      # Monotonic clock: immune to wall-clock adjustments while timing.
      start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)

      classification = @classifier.classify(query)
      execution_result = @executor.execute(query: query, classification: classification)
      ranked = @ranker.rank(execution_result.candidates, classification: classification)
      assembled = assemble_context(ranked, classification, budget)

      elapsed_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round(1)

      trace = RetrievalTrace.new(
        classification: classification,
        strategy: execution_result.strategy,
        candidate_count: execution_result.candidates.size,
        ranked_count: ranked.size,
        tokens_used: assembled.tokens_used,
        elapsed_ms: elapsed_ms
      )

      build_result(assembled, classification, execution_result.strategy, budget, trace)
    end

    private

    # Assemble token-budgeted context from ranked candidates.
    #
    # @param ranked [Array<Candidate>] Ranked search candidates
    # @param classification [QueryClassifier::Classification] Query classification
    # @param budget [Integer] Token budget for context assembly
    # @return [AssembledContext]
    def assemble_context(ranked, classification, budget)
      @assembler.assemble(
        candidates: ranked,
        classification: classification,
        structural_context: build_structural_context,
        budget: budget
      )
    end

    # Build a RetrievalResult from assembled context and pipeline metadata.
    #
    # @param assembled [AssembledContext] Assembled context
    # @param classification [QueryClassifier::Classification] Query classification
    # @param strategy [Symbol] Search strategy used
    # @param budget [Integer] Token budget
    # @param trace [RetrievalTrace, nil] Diagnostic trace for the run
    # @return [RetrievalResult]
    def build_result(assembled, classification, strategy, budget, trace = nil)
      context = @formatter ? @formatter.call(assembled.context) : assembled.context

      RetrievalResult.new(
        context: context,
        sources: assembled.sources,
        classification: classification,
        strategy: strategy,
        tokens_used: assembled.tokens_used,
        budget: budget,
        trace: trace
      )
    end

    # Build a structural context overview from the metadata store.
    #
    # Queries the metadata store for total unit count and counts per type,
    # producing a summary like "Codebase: 42 units (10 models, 5 controllers, ...)".
    #
    # @return [String, nil] Overview string, or nil if the store is empty or on error
    def build_structural_context
      total = @metadata_store.count
      return nil if total.zero?

      type_counts = STRUCTURAL_TYPES.filter_map do |type|
        count = @metadata_store.find_by_type(type).size
        "#{count} #{type}s" if count.positive?
      end

      "Codebase: #{total} units (#{type_counts.join(', ')})"
    rescue StandardError
      # Best-effort: the overview is decorative, so a failing metadata store
      # must not abort the whole retrieval. Degrade to no overview.
      nil
    end
  end
end
@@ -0,0 +1,170 @@
# frozen_string_literal: true

require_relative '../ast/parser'
require_relative '../extracted_unit'
require_relative 'fqn_builder'

module Woods
  module RubyAnalyzer
    # Walks a parsed Ruby AST and emits one ExtractedUnit per class or module
    # definition it encounters, including nested definitions.
    #
    # Each unit carries type :ruby_class or :ruby_module plus metadata about
    # the superclass, included/extended modules, constants, and direct method
    # count, along with a dependency list derived from inheritance and mixins.
    #
    # @example
    #   analyzer = RubyAnalyzer::ClassAnalyzer.new
    #   units = analyzer.analyze(source: File.read(path), file_path: path)
    #   units.first.type #=> :ruby_class
    #
    class ClassAnalyzer
      include FqnBuilder
      include Ast::SourceSpan

      # @param parser [Ast::Parser, nil] Parser instance (creates default if nil)
      def initialize(parser: nil)
        @parser = parser || Ast::Parser.new
      end

      # Analyze source code and extract class/module units.
      #
      # @param source [String] Ruby source code
      # @param file_path [String] Absolute path to the source file
      # @return [Array<ExtractedUnit>] Extracted class and module units
      def analyze(source:, file_path:)
        collected = []
        extract_definitions(@parser.parse(source), source, file_path, [], collected)
        collected
      end

      private

      # Depth-first walk: class/module nodes become units; everything else is
      # traversed looking for definitions further down.
      def extract_definitions(node, source, file_path, enclosing, collected)
        return unless node.is_a?(Ast::Node)

        unit_type = { class: :ruby_class, module: :ruby_module }[node.type]
        if unit_type
          process_definition(node, unit_type, source, file_path, enclosing, collected)
        else
          Array(node.children).each do |child|
            extract_definitions(child, source, file_path, enclosing, collected)
          end
        end
      end

      # Turn one class/module node into an ExtractedUnit and recurse into its
      # body to pick up nested definitions.
      def process_definition(node, type, source, file_path, enclosing, collected)
        name = node.method_name
        fqn = build_fqn(name, enclosing)
        body = body_children(node, type)

        parent = type == :ruby_class ? extract_superclass(node) : nil
        included = extract_mixins(body, 'include')
        extended = extract_mixins(body, 'extend')

        unit = ExtractedUnit.new(type: type, identifier: fqn, file_path: file_path)
        unit.namespace = build_namespace(name, enclosing)
        unit.source_code = extract_source(node, source)
        unit.metadata = {
          superclass: parent,
          includes: included,
          extends: extended,
          constants: extract_constants(body),
          method_count: count_methods(body)
        }
        unit.dependencies = build_dependencies(parent, included, extended)
        collected << unit

        # Nested classes/modules see this definition as part of their namespace
        nested_scope = enclosing + fqn_parts(name)
        body.each do |child|
          extract_definitions(child, source, file_path, nested_scope, collected)
        end
      end

      # Namespace string for a definition: all FQN parts except the leaf name.
      def build_namespace(name, enclosing)
        outer = (enclosing + fqn_parts(name))[0...-1]
        outer.empty? ? nil : outer.join('::')
      end

      # Split a (possibly ::-qualified) definition name into its parts.
      def fqn_parts(name)
        name.to_s.split('::')
      end

      # Superclass name from a class node, or nil when none is declared.
      # Children[0] is the name; children[1] is the superclass slot.
      def extract_superclass(class_node)
        candidate = class_node.children[1]
        return nil unless candidate.is_a?(Ast::Node) && candidate.type == :const

        build_const_name(candidate)
      end

      # Body nodes of a definition, skipping the header children:
      # class nodes carry [name, superclass, *body]; modules carry [name, *body].
      def body_children(node, type)
        skip = type == :ruby_class ? 2 : 1
        Array(node.children).drop(skip)
      end

      # Module names passed to bare `include`/`extend` calls in the body.
      def extract_mixins(entries, macro)
        entries.filter_map do |entry|
          next unless entry.is_a?(Ast::Node) && entry.type == :send
          next unless entry.method_name == macro

          args = entry.arguments
          next if args.nil? || args.empty?

          args.first
        end
      end

      # Names of constants assigned directly in the body (casgn nodes).
      def extract_constants(entries)
        entries.select { |entry| entry.is_a?(Ast::Node) && entry.type == :casgn }
               .map(&:method_name)
      end

      # Number of def/defs nodes directly in the body (non-recursive).
      def count_methods(entries)
        entries.count do |entry|
          entry.is_a?(Ast::Node) && (entry.type == :def || entry.type == :defs)
        end
      end

      # Full constant name from a :const node, joining an optional receiver
      # for namespaced references like Foo::Bar.
      def build_const_name(const_node)
        [const_node.receiver, const_node.method_name].compact.join('::')
      end

      # Source text for a node, sliced out of the file by its line range.
      def extract_source(node, source)
        extract_source_span(source, node.line, node.end_line)
      end

      # Dependency records derived from inheritance and mixin usage.
      def build_dependencies(superclass, includes, extends)
        deps = []
        deps << { type: :ruby_class, target: superclass, via: :inheritance } if superclass
        deps.concat(includes.map { |mod| { type: :ruby_class, target: mod, via: :include } })
        deps.concat(extends.map { |mod| { type: :ruby_class, target: mod, via: :extend } })
        deps
      end
    end
  end
end