woods 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +89 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +406 -0
  7. data/exe/woods-console +59 -0
  8. data/exe/woods-console-mcp +22 -0
  9. data/exe/woods-mcp +34 -0
  10. data/exe/woods-mcp-http +37 -0
  11. data/exe/woods-mcp-start +58 -0
  12. data/lib/generators/woods/install_generator.rb +32 -0
  13. data/lib/generators/woods/pgvector_generator.rb +37 -0
  14. data/lib/generators/woods/templates/add_pgvector_to_woods.rb.erb +15 -0
  15. data/lib/generators/woods/templates/create_woods_tables.rb.erb +43 -0
  16. data/lib/tasks/woods.rake +621 -0
  17. data/lib/tasks/woods_evaluation.rake +115 -0
  18. data/lib/woods/ast/call_site_extractor.rb +106 -0
  19. data/lib/woods/ast/method_extractor.rb +71 -0
  20. data/lib/woods/ast/node.rb +116 -0
  21. data/lib/woods/ast/parser.rb +614 -0
  22. data/lib/woods/ast.rb +6 -0
  23. data/lib/woods/builder.rb +200 -0
  24. data/lib/woods/cache/cache_middleware.rb +199 -0
  25. data/lib/woods/cache/cache_store.rb +264 -0
  26. data/lib/woods/cache/redis_cache_store.rb +116 -0
  27. data/lib/woods/cache/solid_cache_store.rb +111 -0
  28. data/lib/woods/chunking/chunk.rb +84 -0
  29. data/lib/woods/chunking/semantic_chunker.rb +295 -0
  30. data/lib/woods/console/adapters/cache_adapter.rb +58 -0
  31. data/lib/woods/console/adapters/good_job_adapter.rb +33 -0
  32. data/lib/woods/console/adapters/job_adapter.rb +68 -0
  33. data/lib/woods/console/adapters/sidekiq_adapter.rb +33 -0
  34. data/lib/woods/console/adapters/solid_queue_adapter.rb +33 -0
  35. data/lib/woods/console/audit_logger.rb +75 -0
  36. data/lib/woods/console/bridge.rb +177 -0
  37. data/lib/woods/console/confirmation.rb +90 -0
  38. data/lib/woods/console/connection_manager.rb +173 -0
  39. data/lib/woods/console/console_response_renderer.rb +74 -0
  40. data/lib/woods/console/embedded_executor.rb +373 -0
  41. data/lib/woods/console/model_validator.rb +81 -0
  42. data/lib/woods/console/rack_middleware.rb +87 -0
  43. data/lib/woods/console/safe_context.rb +82 -0
  44. data/lib/woods/console/server.rb +612 -0
  45. data/lib/woods/console/sql_validator.rb +172 -0
  46. data/lib/woods/console/tools/tier1.rb +118 -0
  47. data/lib/woods/console/tools/tier2.rb +117 -0
  48. data/lib/woods/console/tools/tier3.rb +110 -0
  49. data/lib/woods/console/tools/tier4.rb +79 -0
  50. data/lib/woods/coordination/pipeline_lock.rb +109 -0
  51. data/lib/woods/cost_model/embedding_cost.rb +88 -0
  52. data/lib/woods/cost_model/estimator.rb +128 -0
  53. data/lib/woods/cost_model/provider_pricing.rb +67 -0
  54. data/lib/woods/cost_model/storage_cost.rb +52 -0
  55. data/lib/woods/cost_model.rb +22 -0
  56. data/lib/woods/db/migrations/001_create_units.rb +38 -0
  57. data/lib/woods/db/migrations/002_create_edges.rb +35 -0
  58. data/lib/woods/db/migrations/003_create_embeddings.rb +37 -0
  59. data/lib/woods/db/migrations/004_create_snapshots.rb +45 -0
  60. data/lib/woods/db/migrations/005_create_snapshot_units.rb +40 -0
  61. data/lib/woods/db/migrations/006_rename_tables.rb +34 -0
  62. data/lib/woods/db/migrator.rb +73 -0
  63. data/lib/woods/db/schema_version.rb +73 -0
  64. data/lib/woods/dependency_graph.rb +236 -0
  65. data/lib/woods/embedding/indexer.rb +140 -0
  66. data/lib/woods/embedding/openai.rb +126 -0
  67. data/lib/woods/embedding/provider.rb +162 -0
  68. data/lib/woods/embedding/text_preparer.rb +112 -0
  69. data/lib/woods/evaluation/baseline_runner.rb +115 -0
  70. data/lib/woods/evaluation/evaluator.rb +139 -0
  71. data/lib/woods/evaluation/metrics.rb +79 -0
  72. data/lib/woods/evaluation/query_set.rb +148 -0
  73. data/lib/woods/evaluation/report_generator.rb +90 -0
  74. data/lib/woods/extracted_unit.rb +145 -0
  75. data/lib/woods/extractor.rb +1028 -0
  76. data/lib/woods/extractors/action_cable_extractor.rb +201 -0
  77. data/lib/woods/extractors/ast_source_extraction.rb +46 -0
  78. data/lib/woods/extractors/behavioral_profile.rb +309 -0
  79. data/lib/woods/extractors/caching_extractor.rb +261 -0
  80. data/lib/woods/extractors/callback_analyzer.rb +246 -0
  81. data/lib/woods/extractors/concern_extractor.rb +292 -0
  82. data/lib/woods/extractors/configuration_extractor.rb +219 -0
  83. data/lib/woods/extractors/controller_extractor.rb +404 -0
  84. data/lib/woods/extractors/database_view_extractor.rb +278 -0
  85. data/lib/woods/extractors/decorator_extractor.rb +253 -0
  86. data/lib/woods/extractors/engine_extractor.rb +223 -0
  87. data/lib/woods/extractors/event_extractor.rb +211 -0
  88. data/lib/woods/extractors/factory_extractor.rb +289 -0
  89. data/lib/woods/extractors/graphql_extractor.rb +892 -0
  90. data/lib/woods/extractors/i18n_extractor.rb +117 -0
  91. data/lib/woods/extractors/job_extractor.rb +374 -0
  92. data/lib/woods/extractors/lib_extractor.rb +218 -0
  93. data/lib/woods/extractors/mailer_extractor.rb +269 -0
  94. data/lib/woods/extractors/manager_extractor.rb +188 -0
  95. data/lib/woods/extractors/middleware_extractor.rb +133 -0
  96. data/lib/woods/extractors/migration_extractor.rb +469 -0
  97. data/lib/woods/extractors/model_extractor.rb +988 -0
  98. data/lib/woods/extractors/phlex_extractor.rb +252 -0
  99. data/lib/woods/extractors/policy_extractor.rb +191 -0
  100. data/lib/woods/extractors/poro_extractor.rb +229 -0
  101. data/lib/woods/extractors/pundit_extractor.rb +223 -0
  102. data/lib/woods/extractors/rails_source_extractor.rb +473 -0
  103. data/lib/woods/extractors/rake_task_extractor.rb +343 -0
  104. data/lib/woods/extractors/route_extractor.rb +181 -0
  105. data/lib/woods/extractors/scheduled_job_extractor.rb +331 -0
  106. data/lib/woods/extractors/serializer_extractor.rb +339 -0
  107. data/lib/woods/extractors/service_extractor.rb +217 -0
  108. data/lib/woods/extractors/shared_dependency_scanner.rb +91 -0
  109. data/lib/woods/extractors/shared_utility_methods.rb +281 -0
  110. data/lib/woods/extractors/state_machine_extractor.rb +398 -0
  111. data/lib/woods/extractors/test_mapping_extractor.rb +225 -0
  112. data/lib/woods/extractors/validator_extractor.rb +211 -0
  113. data/lib/woods/extractors/view_component_extractor.rb +311 -0
  114. data/lib/woods/extractors/view_template_extractor.rb +261 -0
  115. data/lib/woods/feedback/gap_detector.rb +89 -0
  116. data/lib/woods/feedback/store.rb +119 -0
  117. data/lib/woods/filename_utils.rb +32 -0
  118. data/lib/woods/flow_analysis/operation_extractor.rb +206 -0
  119. data/lib/woods/flow_analysis/response_code_mapper.rb +154 -0
  120. data/lib/woods/flow_assembler.rb +290 -0
  121. data/lib/woods/flow_document.rb +191 -0
  122. data/lib/woods/flow_precomputer.rb +102 -0
  123. data/lib/woods/formatting/base.rb +30 -0
  124. data/lib/woods/formatting/claude_adapter.rb +98 -0
  125. data/lib/woods/formatting/generic_adapter.rb +56 -0
  126. data/lib/woods/formatting/gpt_adapter.rb +64 -0
  127. data/lib/woods/formatting/human_adapter.rb +78 -0
  128. data/lib/woods/graph_analyzer.rb +374 -0
  129. data/lib/woods/mcp/bootstrapper.rb +96 -0
  130. data/lib/woods/mcp/index_reader.rb +394 -0
  131. data/lib/woods/mcp/renderers/claude_renderer.rb +81 -0
  132. data/lib/woods/mcp/renderers/json_renderer.rb +17 -0
  133. data/lib/woods/mcp/renderers/markdown_renderer.rb +353 -0
  134. data/lib/woods/mcp/renderers/plain_renderer.rb +240 -0
  135. data/lib/woods/mcp/server.rb +962 -0
  136. data/lib/woods/mcp/tool_response_renderer.rb +85 -0
  137. data/lib/woods/model_name_cache.rb +51 -0
  138. data/lib/woods/notion/client.rb +217 -0
  139. data/lib/woods/notion/exporter.rb +219 -0
  140. data/lib/woods/notion/mapper.rb +40 -0
  141. data/lib/woods/notion/mappers/column_mapper.rb +57 -0
  142. data/lib/woods/notion/mappers/migration_mapper.rb +39 -0
  143. data/lib/woods/notion/mappers/model_mapper.rb +161 -0
  144. data/lib/woods/notion/mappers/shared.rb +22 -0
  145. data/lib/woods/notion/rate_limiter.rb +68 -0
  146. data/lib/woods/observability/health_check.rb +79 -0
  147. data/lib/woods/observability/instrumentation.rb +34 -0
  148. data/lib/woods/observability/structured_logger.rb +57 -0
  149. data/lib/woods/operator/error_escalator.rb +81 -0
  150. data/lib/woods/operator/pipeline_guard.rb +92 -0
  151. data/lib/woods/operator/status_reporter.rb +80 -0
  152. data/lib/woods/railtie.rb +38 -0
  153. data/lib/woods/resilience/circuit_breaker.rb +99 -0
  154. data/lib/woods/resilience/index_validator.rb +167 -0
  155. data/lib/woods/resilience/retryable_provider.rb +108 -0
  156. data/lib/woods/retrieval/context_assembler.rb +261 -0
  157. data/lib/woods/retrieval/query_classifier.rb +133 -0
  158. data/lib/woods/retrieval/ranker.rb +277 -0
  159. data/lib/woods/retrieval/search_executor.rb +316 -0
  160. data/lib/woods/retriever.rb +152 -0
  161. data/lib/woods/ruby_analyzer/class_analyzer.rb +170 -0
  162. data/lib/woods/ruby_analyzer/dataflow_analyzer.rb +77 -0
  163. data/lib/woods/ruby_analyzer/fqn_builder.rb +18 -0
  164. data/lib/woods/ruby_analyzer/mermaid_renderer.rb +280 -0
  165. data/lib/woods/ruby_analyzer/method_analyzer.rb +143 -0
  166. data/lib/woods/ruby_analyzer/trace_enricher.rb +143 -0
  167. data/lib/woods/ruby_analyzer.rb +87 -0
  168. data/lib/woods/session_tracer/file_store.rb +104 -0
  169. data/lib/woods/session_tracer/middleware.rb +143 -0
  170. data/lib/woods/session_tracer/redis_store.rb +106 -0
  171. data/lib/woods/session_tracer/session_flow_assembler.rb +254 -0
  172. data/lib/woods/session_tracer/session_flow_document.rb +223 -0
  173. data/lib/woods/session_tracer/solid_cache_store.rb +139 -0
  174. data/lib/woods/session_tracer/store.rb +81 -0
  175. data/lib/woods/storage/graph_store.rb +120 -0
  176. data/lib/woods/storage/metadata_store.rb +196 -0
  177. data/lib/woods/storage/pgvector.rb +195 -0
  178. data/lib/woods/storage/qdrant.rb +205 -0
  179. data/lib/woods/storage/vector_store.rb +167 -0
  180. data/lib/woods/temporal/json_snapshot_store.rb +245 -0
  181. data/lib/woods/temporal/snapshot_store.rb +345 -0
  182. data/lib/woods/token_utils.rb +19 -0
  183. data/lib/woods/version.rb +5 -0
  184. data/lib/woods.rb +246 -0
  185. metadata +270 -0
@@ -0,0 +1,261 @@
# frozen_string_literal: true

module Woods
  module Retrieval
    # Transforms ranked search candidates into a token-budgeted context string
    # for LLM consumption.
    #
    # A fixed token budget is split across four sections:
    # - Structural (10%): always-included codebase overview
    # - Primary (50%): direct query results
    # - Supporting (25%): dependencies and related context
    # - Framework (15%): Rails/gem source when the query has framework context
    #
    # When framework context is not needed, the primary and supporting
    # sections absorb the framework allocation proportionally.
    #
    # @example
    #   assembler = ContextAssembler.new(metadata_store: store)
    #   result = assembler.assemble(candidates: ranked, classification: cls)
    #   result.context     # => "## User (model)\n..."
    #   result.tokens_used # => 4200
    #   result.sections    # => [:structural, :primary, :supporting]
    #
    class ContextAssembler
      DEFAULT_BUDGET = 8000 # tokens

      BUDGET_ALLOCATION = {
        structural: 0.10,
        primary: 0.50,
        supporting: 0.25,
        framework: 0.15
      }.freeze

      # Truncated leftovers smaller than this are not worth emitting.
      MIN_USEFUL_TOKENS = 200

      # @param metadata_store [#find] Store that resolves identifiers to unit data
      # @param budget [Integer] Total token budget
      def initialize(metadata_store:, budget: DEFAULT_BUDGET)
        @metadata_store = metadata_store
        @budget = budget
      end

      # Assemble context from ranked candidates within the token budget.
      #
      # @param candidates [Array<Candidate>] Ranked search candidates
      # @param classification [QueryClassifier::Classification] Query classification
      # @param structural_context [String, nil] Optional codebase overview text
      # @param budget [Integer, nil] Override token budget; falls back to @budget
      # @return [AssembledContext] Token-budgeted context with source attribution
      def assemble(candidates:, classification:, structural_context: nil, budget: nil)
        total_budget = budget || @budget
        sections = []
        sources = []

        # One batch query up front instead of per-candidate store lookups.
        @unit_cache = @metadata_store.find_batch(candidates.map(&:identifier))

        # Structural overview always leads when provided.
        used = add_structural_section(sections, structural_context, 0, total_budget)

        # Split whatever remains across the candidate-backed sections.
        budgets = compute_section_budgets(total_budget - used, classification)

        expanded, direct = candidates.partition { |c| c.source == :graph_expansion }
        add_candidate_section(sections, sources, :primary, direct, budgets[:primary])
        add_candidate_section(sections, sources, :supporting, expanded, budgets[:supporting])
        if budgets[:framework].positive?
          framework = candidates.select { |c| framework_candidate?(c) }
          add_candidate_section(sections, sources, :framework, framework, budgets[:framework])
        end

        build_result(sections, sources, total_budget)
      end

      private

      # Prepend the structural overview section when one was supplied.
      #
      # @return [Integer] Updated running token count
      def add_structural_section(sections, structural_context, tokens_used, effective_budget)
        return tokens_used unless structural_context

        cap = (effective_budget * BUDGET_ALLOCATION[:structural]).to_i
        trimmed = truncate_to_budget(structural_context, cap)
        sections << { section: :structural, content: trimmed }
        tokens_used + estimate_tokens(trimmed)
      end

      # Append a candidate-backed section, skipping it when nothing renders.
      #
      # @return [void]
      def add_candidate_section(sections, sources, section_name, candidates, budget)
        return if candidates.empty?

        content, attributions = assemble_section(candidates, budget)
        return if content.empty?

        sections << { section: section_name, content: content }
        sources.concat(attributions)
      end

      # Split the remaining budget across primary/supporting/framework.
      #
      # @param remaining [Integer] Tokens available after structural
      # @param classification [QueryClassifier::Classification]
      # @return [Hash<Symbol, Integer>]
      def compute_section_budgets(remaining, classification)
        ratios =
          if classification.framework_context
            { primary: 0.55, supporting: 0.25, framework: 0.20 }
          else
            { primary: 0.65, supporting: 0.35, framework: 0.0 }
          end
        ratios.transform_values { |ratio| (remaining * ratio).to_i }
      end

      # Render candidates (best score first) until the section budget is spent.
      #
      # @param candidates [Array<Candidate>] Candidates for this section
      # @param budget [Integer] Token budget for this section
      # @return [Array(String, Array<Hash>)] Content string and source attributions
      def assemble_section(candidates, budget)
        parts = []
        attributions = []
        spent = 0

        candidates.sort_by { |c| -c.score }.each do |candidate|
          spent = append_candidate(parts, attributions, candidate, budget, spent)
          break if spent.nil?
        end

        [parts.join("\n\n"), attributions]
      end

      # Append one candidate. Returns the updated token count, or nil once the
      # budget is exhausted (which stops the section).
      def append_candidate(parts, sources, candidate, budget, tokens_used)
        unit = @unit_cache[candidate.identifier]
        return tokens_used unless unit

        text = format_unit(unit, candidate)
        needed = estimate_tokens(text)
        available = budget - tokens_used

        if needed <= available
          parts << text
          sources << build_source_attribution(candidate, unit)
          return tokens_used + needed
        end

        # Not enough room for the whole unit: include a truncated slice only
        # when the leftover space is still useful, then stop either way.
        if available > MIN_USEFUL_TOKENS
          parts << truncate_to_budget(text, available)
          sources << build_source_attribution(candidate, unit, truncated: true)
        end
        nil
      end

      # Render one unit as a markdown-ish snippet for the context window.
      #
      # @param unit [Hash] Unit data from metadata store
      # @param _candidate [Candidate] The search candidate (currently unused)
      # @return [String]
      def format_unit(unit, _candidate)
        <<~UNIT.strip
          ## #{unit_field(unit, :identifier)} (#{unit_field(unit, :type)})
          File: #{unit_field(unit, :file_path)}

          #{unit_field(unit, :source_code) || ''}
        UNIT
      end

      # Attribution record describing where a context chunk came from.
      #
      # @return [Hash]
      def build_source_attribution(candidate, unit, truncated: false)
        record = {
          identifier: candidate.identifier,
          type: unit_field(unit, :type),
          score: candidate.score,
          file_path: unit_field(unit, :file_path)
        }
        record[:truncated] = true if truncated
        record
      end

      # Fetch a unit field regardless of symbol/string keying.
      #
      # @param unit [Hash]
      # @param key [Symbol]
      # @return [Object, nil]
      def unit_field(unit, key)
        unit[key] || unit[key.to_s]
      end

      # Whether the candidate points at Rails/gem source rather than app code.
      #
      # @param candidate [Candidate]
      # @return [Boolean]
      def framework_candidate?(candidate)
        meta = candidate.metadata
        return false unless meta

        %w[rails_source gem_source].include?((meta[:type] || meta['type']).to_s)
      end

      # Cut text down so it fits the given token budget.
      #
      # @param text [String]
      # @param token_budget [Integer]
      # @return [String]
      def truncate_to_budget(text, token_budget)
        return text if estimate_tokens(text) <= token_budget

        # ~4 chars per token, with a 10% safety margin.
        keep = (token_budget * 4.0 * 0.9).to_i
        "#{text[0...keep]}\n... [truncated]"
      end

      # Project convention: roughly four characters per token.
      #
      # @param text [String]
      # @return [Integer]
      def estimate_tokens(text)
        text.length.fdiv(4).ceil
      end

      # Wrap the assembled sections into the final result struct.
      #
      # @param sections [Array<Hash>] Assembled sections
      # @param sources [Array<Hash>] Source attributions
      # @param effective_budget [Integer] The budget actually used for assembly
      # @return [AssembledContext]
      def build_result(sections, sources, effective_budget)
        joined = sections.map { |s| s[:content] }.join("\n\n---\n\n")
        AssembledContext.new(
          context: joined,
          tokens_used: estimate_tokens(joined),
          budget: effective_budget,
          sources: sources.uniq,
          sections: sections.map { |s| s[:section] }
        )
      end
    end

    # Result of context assembly.
    AssembledContext = Struct.new(:context, :tokens_used, :budget, :sources, :sections, keyword_init: true)
  end
end
@@ -0,0 +1,133 @@
# frozen_string_literal: true

require 'set'

module Woods
  module Retrieval
    # Classifies natural language queries to determine retrieval strategy.
    #
    # Heuristic pattern matching yields four facets:
    # - Intent: what the user wants to do
    # - Scope: how broad the search should be
    # - Target type: what kind of code unit to look for
    # - Framework context: whether this is about Rails/gems vs app code
    #
    class QueryClassifier
      # Classification result
      Classification = Struct.new(:intent, :scope, :target_type, :framework_context, :keywords, keyword_init: true)

      INTENTS = %i[understand locate trace debug implement reference compare framework].freeze
      SCOPES = %i[pinpoint focused exploratory comprehensive].freeze

      STOP_WORDS = Set.new(%w[the a an is are was were be been being have has had do does did will would could
                              should may might can shall in on at to for of and or but not with by from as
                              this that these those it its how what when where why who which]).freeze

      # Intent patterns — order matters (first match wins)
      INTENT_PATTERNS = {
        locate: /\b(where|find|which file|locate|look for|search for)\b/i,
        trace: /\b(trace|follow|track|call(s|ed by)|depends on|used by|who calls|what calls)\b/i,
        debug: /\b(bug|error|fix|broken|failing|wrong|issue|problem|crash|exception)\b/i,
        implement: /\b(implement|add|create|build|write|make|generate)\b/i,
        compare: /\b(compare|difference|vs|versus|between|contrast)\b/i,
        # rubocop:disable Layout/LineLength
        framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob)\b/i,
        # rubocop:enable Layout/LineLength
        reference: /\b(show me|what is|what are|list|options for|api|interface|signature)\b/i,
        understand: /\b(how|why|explain|understand|what happens|describe|overview)\b/i
      }.freeze

      # Scope patterns
      SCOPE_PATTERNS = {
        pinpoint: /\b(exactly|specific|this one|just the|only the)\b/i,
        comprehensive: /\b(all|every|entire|whole|complete|everything)\b/i,
        exploratory: /\b(related|around|near|similar|like|associated)\b/i
      }.freeze

      # Target type patterns
      TARGET_PATTERNS = {
        model: /\b(model|activerecord|association|schema|table|column|scope|validation)\b/i,
        controller: /\b(controller|action|route|endpoint|api|request|response|filter|callback)\b/i,
        service: /\b(service|interactor|operation|command|use.?case|business.?logic)\b/i,
        job: /\b(job|worker|background|async|sidekiq|queue|perform)\b/i,
        mailer: /\b(mailer|email|notification|send.?mail)\b/i,
        graphql: /\b(graphql|mutation|query|type|resolver|field|argument|schema)\b/i,
        concern: /\b(concern|mixin|module|included|extend)\b/i,
        route: /\b(route|path|url|endpoint|uri|http|get|post|put|patch|delete)\b/i,
        middleware: /\b(middleware|rack|request.?pipeline|before.?action)\b/i,
        i18n: /\b(i18n|translation|locale|internationalization|t\(|translate)\b/i,
        pundit_policy: /\b(pundit|authorize|policy|allowed|permitted)\b/i,
        configuration: /\b(config|initializer|environment|setting|configure)\b/i,
        engine: /\b(engine|mountable|mount|railtie|plugin|isolated.?namespace)\b/i,
        view_template: /\b(view|template|partial|render|erb|layout|html)\b/i,
        # rubocop:disable Layout/LineLength
        migration: /\b(migration|migrate|schema.?change|add.?column|remove.?column|create.?table|drop.?table|db.?migrate)\b/i,
        action_cable_channel: /\b(action.?cable|websocket|broadcast|cable.?channel|subscription.?channel|realtime|real.?time)\b/i,
        scheduled_job: /\b(schedule[dr]?|recurring|cron|periodic|every\s+\d|daily|hourly|weekly|solid.?queue.*recur|sidekiq.?cron|whenever)\b/i,
        rake_task: /\b(rake|rake.?task|lib.?tasks?|maintenance.?script|batch.?script)\b/i
        # rubocop:enable Layout/LineLength
      }.freeze

      # Classify a query string
      #
      # @param query [String] Natural language query
      # @return [Classification] Classified query
      def classify(query)
        Classification.new(
          intent: detect_intent(query),
          scope: detect_scope(query),
          target_type: detect_target_type(query),
          framework_context: framework_query?(query),
          keywords: extract_keywords(query)
        )
      end

      private

      # First matching intent wins; defaults to :understand.
      #
      # @param query [String]
      # @return [Symbol]
      def detect_intent(query)
        match_first(INTENT_PATTERNS, query, default: :understand)
      end

      # First matching scope wins; defaults to :focused.
      #
      # @param query [String]
      # @return [Symbol]
      def detect_scope(query)
        match_first(SCOPE_PATTERNS, query, default: :focused)
      end

      # First matching target type wins; nil when nothing matches.
      #
      # @param query [String]
      # @return [Symbol, nil]
      def detect_target_type(query)
        match_first(TARGET_PATTERNS, query, default: nil)
      end

      # Scan a {key => pattern} hash and return the first key whose pattern
      # matches the query.
      #
      # @param patterns [Hash{Symbol => Regexp}]
      # @param query [String]
      # @param default [Object] value if no pattern matches
      # @return [Object]
      def match_first(patterns, query, default:)
        hit = patterns.find { |_key, pattern| query.match?(pattern) }
        hit ? hit.first : default
      end

      # @param query [String]
      # @return [Boolean]
      def framework_query?(query)
        query.match?(/\b(rails|activerecord|actioncontroller|activejob|actionmailer|activesupport|rack|middleware)\b/i)
      end

      # Lowercase, strip punctuation, drop stop words and one-character tokens.
      #
      # @param query [String]
      # @return [Array<String>]
      def extract_keywords(query)
        words = query.downcase.gsub(/[^\w\s]/, ' ').split
        words.uniq.reject { |word| word.length < 2 || STOP_WORDS.include?(word) }
      end
    end
  end
end
@@ -0,0 +1,277 @@
# frozen_string_literal: true

module Woods
  module Retrieval
    # Ranks search candidates using weighted signal scoring and diversity adjustment.
    #
    # Combines multiple ranking signals into a final score:
    # - Semantic similarity from vector search
    # - Keyword match quality
    # - Recency (git change frequency)
    # - Importance (PageRank / structural importance)
    # - Type match (bonus when result type matches query target_type)
    # - Diversity (penalty for too many results of same type/namespace)
    #
    # After initial scoring, applies Reciprocal Rank Fusion (RRF) when
    # candidates come from multiple retrieval sources.
    #
    # @example
    #   ranker = Ranker.new(metadata_store: store)
    #   ranked = ranker.rank(candidates, classification: classification)
    #
    class Ranker
      # Signal weights for ranking — sum to 1.0.
      WEIGHTS = {
        semantic: 0.40,
        keyword: 0.20,
        recency: 0.15,
        importance: 0.10,
        type_match: 0.10,
        diversity: 0.05
      }.freeze

      # RRF constant — balances rank position vs. absolute score.
      # Standard value from the original RRF paper (Cormack et al., 2009).
      RRF_K = 60

      # @param metadata_store [#find] Store that resolves identifiers to unit metadata
      def initialize(metadata_store:)
        @metadata_store = metadata_store
      end

      # Rank candidates by weighted signal scoring with diversity adjustment.
      #
      # @param candidates [Array<Candidate>] Search candidates from executor
      # @param classification [QueryClassifier::Classification] Query classification
      # @return [Array<Candidate>] Re-ranked candidates (best first)
      def rank(candidates, classification:)
        return [] if candidates.empty?

        # Apply RRF if candidates come from multiple sources
        candidates = apply_rrf(candidates) if multi_source?(candidates)

        scored = score_candidates(candidates, classification)
        sorted = sorted_by_weighted_score(scored)
        apply_diversity_penalty(sorted)

        sorted.map { |item| item[:candidate] }
      end

      private

      # Check if candidates come from multiple retrieval sources.
      #
      # @param candidates [Array<Candidate>]
      # @return [Boolean]
      def multi_source?(candidates)
        candidates.map(&:source).uniq.size > 1
      end

      # Apply Reciprocal Rank Fusion across sources.
      #
      # RRF formula: score(d) = sum(1/(k + rank_i(d)))
      # Each source's candidates are ranked independently, then RRF
      # merges ranks into a single score.
      #
      # @param candidates [Array<Candidate>]
      # @return [Array<Candidate>] Merged candidates with RRF scores
      def apply_rrf(candidates)
        rrf_scores, metadata_map = compute_rrf_scores(candidates)
        rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
      end

      # Compute RRF scores across all sources.
      #
      # @return [Array(Hash, Hash)] [rrf_scores, metadata_map]
      def compute_rrf_scores(candidates)
        rrf_scores = Hash.new(0.0)
        metadata_map = {}

        candidates.group_by(&:source).each_value do |source_candidates|
          ranked = source_candidates.sort_by { |c| -c.score }
          ranked.each_with_index do |candidate, rank|
            rrf_scores[candidate.identifier] += 1.0 / (RRF_K + rank)
            metadata_map[candidate.identifier] ||= candidate.metadata
          end
        end

        [rrf_scores, metadata_map]
      end

      # Rebuild candidates with merged RRF scores.
      #
      # @return [Array<Candidate>]
      def rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
        # FIX: previously used Enumerable#index_by, which is ActiveSupport-only
        # and not required anywhere in this file — loading the gem without
        # ActiveSupport raised NoMethodError here. Core Array#to_h with a block
        # has identical semantics (last duplicate identifier wins).
        original_by_id = candidates.to_h { |c| [c.identifier, c] }
        rrf_scores.sort_by { |_id, score| -score }.map do |identifier, score|
          original = original_by_id[identifier]
          build_candidate(
            identifier: identifier,
            score: score,
            source: original&.source || :rrf,
            metadata: metadata_map[identifier]
          )
        end
      end

      # Score each candidate across all signals.
      #
      # @param candidates [Array<Candidate>]
      # @param classification [QueryClassifier::Classification]
      # @return [Array<Hash>]
      def score_candidates(candidates, classification)
        # Batch-fetch all metadata in one query instead of per-candidate lookups
        unit_map = @metadata_store.find_batch(candidates.map(&:identifier))

        candidates.map do |candidate|
          unit = unit_map[candidate.identifier]

          {
            candidate: candidate,
            unit: unit, # cached to avoid double lookup in apply_diversity_penalty
            scores: {
              semantic: candidate.score.to_f,
              keyword: keyword_score(candidate),
              recency: recency_score(unit),
              importance: importance_score(unit),
              type_match: type_match_score(unit, classification),
              diversity: 1.0 # Adjusted after initial sort
            }
          }
        end
      end

      # Calculate weighted score for each item.
      #
      # @param scored [Array<Hash>]
      # @return [Array<Hash>] Sorted by weighted_score descending
      def sorted_by_weighted_score(scored)
        scored.each do |item|
          item[:weighted_score] = WEIGHTS.sum do |signal, weight|
            item[:scores][signal] * weight
          end
        end

        scored.sort_by { |item| -item[:weighted_score] }
      end

      # Keyword match score based on matched field count.
      #
      # @param candidate [Candidate]
      # @return [Float] 0.0 to 1.0
      def keyword_score(candidate)
        return 0.0 unless candidate.respond_to?(:matched_fields) && candidate.matched_fields

        [candidate.matched_fields.size * 0.25, 1.0].min
      end

      # Recency score based on git change frequency metadata.
      #
      # @param unit [Hash, nil] Unit metadata from store
      # @return [Float] 0.0 to 1.0
      def recency_score(unit)
        return 0.5 unless unit

        frequency = dig_metadata(unit, :git, :change_frequency)
        case frequency&.to_sym
        when :hot then 1.0
        when :active then 0.8
        when :dormant then 0.3
        when :new then 0.7
        else 0.5 # stable or unknown
        end
      end

      # Importance score based on PageRank / structural importance.
      #
      # @param unit [Hash, nil] Unit metadata from store
      # @return [Float] 0.0 to 1.0
      def importance_score(unit)
        return 0.5 unless unit

        importance = dig_metadata(unit, :importance)
        case importance&.to_s
        when 'high' then 1.0
        when 'medium' then 0.6
        when 'low' then 0.3
        else 0.5
        end
      end

      # Type match score — bonus when result type matches query target_type.
      #
      # @param unit [Hash, nil] Unit metadata from store
      # @param classification [QueryClassifier::Classification]
      # @return [Float] 0.0 to 1.0
      def type_match_score(unit, classification)
        return 0.5 unless unit
        return 0.5 unless classification.target_type

        unit_type = dig_metadata(unit, :type) || unit[:type]
        unit_type&.to_sym == classification.target_type ? 1.0 : 0.3
      end

      # Apply diversity penalty to avoid clustering by type/namespace.
      #
      # @param sorted [Array<Hash>] Scored items sorted by weighted_score
      # @return [void] Mutates items in place
      def apply_diversity_penalty(sorted)
        seen_namespaces = Hash.new(0)
        seen_types = Hash.new(0)

        sorted.each do |item|
          penalty = diversity_penalty_for(item, seen_namespaces, seen_types)
          next unless penalty

          item[:scores][:diversity] = 1.0 - penalty
          item[:weighted_score] -= penalty * WEIGHTS[:diversity]
        end

        sorted.sort_by! { |item| -item[:weighted_score] }
      end

      # Compute diversity penalty for a single item and update seen counts.
      #
      # Uses the unit cached in item[:unit] to avoid a redundant metadata store lookup.
      #
      # @return [Float, nil] Penalty amount, or nil if unit not found
      def diversity_penalty_for(item, seen_namespaces, seen_types)
        unit = item[:unit]
        return nil unless unit

        namespace = dig_metadata(unit, :namespace) || 'root'
        type = (dig_metadata(unit, :type) || 'unknown').to_s

        # Each previously-seen unit in the same namespace or of the same type
        # adds 0.1, capped at 0.5 so late duplicates are dampened, not buried.
        penalty = [(seen_namespaces[namespace] + seen_types[type]) * 0.1, 0.5].min
        seen_namespaces[namespace] += 1
        seen_types[type] += 1
        penalty
      end

      # Dig into unit metadata, handling both hash and object access.
      #
      # @param unit [Hash, Object] Unit data
      # @param keys [Array<Symbol>] Key path
      # @return [Object, nil]
      def dig_metadata(unit, *keys)
        if keys.size == 1
          unit.is_a?(Hash) ? (unit.dig(:metadata, keys[0]) || unit[keys[0]]) : nil
        else
          unit.is_a?(Hash) ? unit.dig(:metadata, *keys) : nil
        end
      end

      # Build a Candidate struct compatible with SearchExecutor::Candidate.
      #
      # @return [Candidate-like Struct]
      def build_candidate(identifier:, score:, source:, metadata:)
        SearchExecutor::Candidate.new(
          identifier: identifier,
          score: score,
          source: source,
          metadata: metadata
        )
      end
    end
  end
end