woods 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +89 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +406 -0
- data/exe/woods-console +59 -0
- data/exe/woods-console-mcp +22 -0
- data/exe/woods-mcp +34 -0
- data/exe/woods-mcp-http +37 -0
- data/exe/woods-mcp-start +58 -0
- data/lib/generators/woods/install_generator.rb +32 -0
- data/lib/generators/woods/pgvector_generator.rb +37 -0
- data/lib/generators/woods/templates/add_pgvector_to_woods.rb.erb +15 -0
- data/lib/generators/woods/templates/create_woods_tables.rb.erb +43 -0
- data/lib/tasks/woods.rake +621 -0
- data/lib/tasks/woods_evaluation.rake +115 -0
- data/lib/woods/ast/call_site_extractor.rb +106 -0
- data/lib/woods/ast/method_extractor.rb +71 -0
- data/lib/woods/ast/node.rb +116 -0
- data/lib/woods/ast/parser.rb +614 -0
- data/lib/woods/ast.rb +6 -0
- data/lib/woods/builder.rb +200 -0
- data/lib/woods/cache/cache_middleware.rb +199 -0
- data/lib/woods/cache/cache_store.rb +264 -0
- data/lib/woods/cache/redis_cache_store.rb +116 -0
- data/lib/woods/cache/solid_cache_store.rb +111 -0
- data/lib/woods/chunking/chunk.rb +84 -0
- data/lib/woods/chunking/semantic_chunker.rb +295 -0
- data/lib/woods/console/adapters/cache_adapter.rb +58 -0
- data/lib/woods/console/adapters/good_job_adapter.rb +33 -0
- data/lib/woods/console/adapters/job_adapter.rb +68 -0
- data/lib/woods/console/adapters/sidekiq_adapter.rb +33 -0
- data/lib/woods/console/adapters/solid_queue_adapter.rb +33 -0
- data/lib/woods/console/audit_logger.rb +75 -0
- data/lib/woods/console/bridge.rb +177 -0
- data/lib/woods/console/confirmation.rb +90 -0
- data/lib/woods/console/connection_manager.rb +173 -0
- data/lib/woods/console/console_response_renderer.rb +74 -0
- data/lib/woods/console/embedded_executor.rb +373 -0
- data/lib/woods/console/model_validator.rb +81 -0
- data/lib/woods/console/rack_middleware.rb +87 -0
- data/lib/woods/console/safe_context.rb +82 -0
- data/lib/woods/console/server.rb +612 -0
- data/lib/woods/console/sql_validator.rb +172 -0
- data/lib/woods/console/tools/tier1.rb +118 -0
- data/lib/woods/console/tools/tier2.rb +117 -0
- data/lib/woods/console/tools/tier3.rb +110 -0
- data/lib/woods/console/tools/tier4.rb +79 -0
- data/lib/woods/coordination/pipeline_lock.rb +109 -0
- data/lib/woods/cost_model/embedding_cost.rb +88 -0
- data/lib/woods/cost_model/estimator.rb +128 -0
- data/lib/woods/cost_model/provider_pricing.rb +67 -0
- data/lib/woods/cost_model/storage_cost.rb +52 -0
- data/lib/woods/cost_model.rb +22 -0
- data/lib/woods/db/migrations/001_create_units.rb +38 -0
- data/lib/woods/db/migrations/002_create_edges.rb +35 -0
- data/lib/woods/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/woods/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/woods/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/woods/db/migrations/006_rename_tables.rb +34 -0
- data/lib/woods/db/migrator.rb +73 -0
- data/lib/woods/db/schema_version.rb +73 -0
- data/lib/woods/dependency_graph.rb +236 -0
- data/lib/woods/embedding/indexer.rb +140 -0
- data/lib/woods/embedding/openai.rb +126 -0
- data/lib/woods/embedding/provider.rb +162 -0
- data/lib/woods/embedding/text_preparer.rb +112 -0
- data/lib/woods/evaluation/baseline_runner.rb +115 -0
- data/lib/woods/evaluation/evaluator.rb +139 -0
- data/lib/woods/evaluation/metrics.rb +79 -0
- data/lib/woods/evaluation/query_set.rb +148 -0
- data/lib/woods/evaluation/report_generator.rb +90 -0
- data/lib/woods/extracted_unit.rb +145 -0
- data/lib/woods/extractor.rb +1028 -0
- data/lib/woods/extractors/action_cable_extractor.rb +201 -0
- data/lib/woods/extractors/ast_source_extraction.rb +46 -0
- data/lib/woods/extractors/behavioral_profile.rb +309 -0
- data/lib/woods/extractors/caching_extractor.rb +261 -0
- data/lib/woods/extractors/callback_analyzer.rb +246 -0
- data/lib/woods/extractors/concern_extractor.rb +292 -0
- data/lib/woods/extractors/configuration_extractor.rb +219 -0
- data/lib/woods/extractors/controller_extractor.rb +404 -0
- data/lib/woods/extractors/database_view_extractor.rb +278 -0
- data/lib/woods/extractors/decorator_extractor.rb +253 -0
- data/lib/woods/extractors/engine_extractor.rb +223 -0
- data/lib/woods/extractors/event_extractor.rb +211 -0
- data/lib/woods/extractors/factory_extractor.rb +289 -0
- data/lib/woods/extractors/graphql_extractor.rb +892 -0
- data/lib/woods/extractors/i18n_extractor.rb +117 -0
- data/lib/woods/extractors/job_extractor.rb +374 -0
- data/lib/woods/extractors/lib_extractor.rb +218 -0
- data/lib/woods/extractors/mailer_extractor.rb +269 -0
- data/lib/woods/extractors/manager_extractor.rb +188 -0
- data/lib/woods/extractors/middleware_extractor.rb +133 -0
- data/lib/woods/extractors/migration_extractor.rb +469 -0
- data/lib/woods/extractors/model_extractor.rb +988 -0
- data/lib/woods/extractors/phlex_extractor.rb +252 -0
- data/lib/woods/extractors/policy_extractor.rb +191 -0
- data/lib/woods/extractors/poro_extractor.rb +229 -0
- data/lib/woods/extractors/pundit_extractor.rb +223 -0
- data/lib/woods/extractors/rails_source_extractor.rb +473 -0
- data/lib/woods/extractors/rake_task_extractor.rb +343 -0
- data/lib/woods/extractors/route_extractor.rb +181 -0
- data/lib/woods/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/woods/extractors/serializer_extractor.rb +339 -0
- data/lib/woods/extractors/service_extractor.rb +217 -0
- data/lib/woods/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/woods/extractors/shared_utility_methods.rb +281 -0
- data/lib/woods/extractors/state_machine_extractor.rb +398 -0
- data/lib/woods/extractors/test_mapping_extractor.rb +225 -0
- data/lib/woods/extractors/validator_extractor.rb +211 -0
- data/lib/woods/extractors/view_component_extractor.rb +311 -0
- data/lib/woods/extractors/view_template_extractor.rb +261 -0
- data/lib/woods/feedback/gap_detector.rb +89 -0
- data/lib/woods/feedback/store.rb +119 -0
- data/lib/woods/filename_utils.rb +32 -0
- data/lib/woods/flow_analysis/operation_extractor.rb +206 -0
- data/lib/woods/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/woods/flow_assembler.rb +290 -0
- data/lib/woods/flow_document.rb +191 -0
- data/lib/woods/flow_precomputer.rb +102 -0
- data/lib/woods/formatting/base.rb +30 -0
- data/lib/woods/formatting/claude_adapter.rb +98 -0
- data/lib/woods/formatting/generic_adapter.rb +56 -0
- data/lib/woods/formatting/gpt_adapter.rb +64 -0
- data/lib/woods/formatting/human_adapter.rb +78 -0
- data/lib/woods/graph_analyzer.rb +374 -0
- data/lib/woods/mcp/bootstrapper.rb +96 -0
- data/lib/woods/mcp/index_reader.rb +394 -0
- data/lib/woods/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/woods/mcp/renderers/json_renderer.rb +17 -0
- data/lib/woods/mcp/renderers/markdown_renderer.rb +353 -0
- data/lib/woods/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/woods/mcp/server.rb +962 -0
- data/lib/woods/mcp/tool_response_renderer.rb +85 -0
- data/lib/woods/model_name_cache.rb +51 -0
- data/lib/woods/notion/client.rb +217 -0
- data/lib/woods/notion/exporter.rb +219 -0
- data/lib/woods/notion/mapper.rb +40 -0
- data/lib/woods/notion/mappers/column_mapper.rb +57 -0
- data/lib/woods/notion/mappers/migration_mapper.rb +39 -0
- data/lib/woods/notion/mappers/model_mapper.rb +161 -0
- data/lib/woods/notion/mappers/shared.rb +22 -0
- data/lib/woods/notion/rate_limiter.rb +68 -0
- data/lib/woods/observability/health_check.rb +79 -0
- data/lib/woods/observability/instrumentation.rb +34 -0
- data/lib/woods/observability/structured_logger.rb +57 -0
- data/lib/woods/operator/error_escalator.rb +81 -0
- data/lib/woods/operator/pipeline_guard.rb +92 -0
- data/lib/woods/operator/status_reporter.rb +80 -0
- data/lib/woods/railtie.rb +38 -0
- data/lib/woods/resilience/circuit_breaker.rb +99 -0
- data/lib/woods/resilience/index_validator.rb +167 -0
- data/lib/woods/resilience/retryable_provider.rb +108 -0
- data/lib/woods/retrieval/context_assembler.rb +261 -0
- data/lib/woods/retrieval/query_classifier.rb +133 -0
- data/lib/woods/retrieval/ranker.rb +277 -0
- data/lib/woods/retrieval/search_executor.rb +316 -0
- data/lib/woods/retriever.rb +152 -0
- data/lib/woods/ruby_analyzer/class_analyzer.rb +170 -0
- data/lib/woods/ruby_analyzer/dataflow_analyzer.rb +77 -0
- data/lib/woods/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/woods/ruby_analyzer/mermaid_renderer.rb +280 -0
- data/lib/woods/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/woods/ruby_analyzer/trace_enricher.rb +143 -0
- data/lib/woods/ruby_analyzer.rb +87 -0
- data/lib/woods/session_tracer/file_store.rb +104 -0
- data/lib/woods/session_tracer/middleware.rb +143 -0
- data/lib/woods/session_tracer/redis_store.rb +106 -0
- data/lib/woods/session_tracer/session_flow_assembler.rb +254 -0
- data/lib/woods/session_tracer/session_flow_document.rb +223 -0
- data/lib/woods/session_tracer/solid_cache_store.rb +139 -0
- data/lib/woods/session_tracer/store.rb +81 -0
- data/lib/woods/storage/graph_store.rb +120 -0
- data/lib/woods/storage/metadata_store.rb +196 -0
- data/lib/woods/storage/pgvector.rb +195 -0
- data/lib/woods/storage/qdrant.rb +205 -0
- data/lib/woods/storage/vector_store.rb +167 -0
- data/lib/woods/temporal/json_snapshot_store.rb +245 -0
- data/lib/woods/temporal/snapshot_store.rb +345 -0
- data/lib/woods/token_utils.rb +19 -0
- data/lib/woods/version.rb +5 -0
- data/lib/woods.rb +246 -0
- metadata +270 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Woods
  module Retrieval
    # Transforms ranked search candidates into a token-budgeted context string
    # for LLM consumption.
    #
    # Allocates a fixed token budget across four sections:
    # - Structural (10%): Always-included codebase overview
    # - Primary (50%): Direct query results
    # - Supporting (25%): Dependencies and related context
    # - Framework (15%): Rails/gem source when query has framework context
    #
    # When framework context is not needed, primary and supporting sections
    # receive the framework allocation proportionally.
    #
    # @example
    #   assembler = ContextAssembler.new(metadata_store: store)
    #   result = assembler.assemble(candidates: ranked, classification: cls)
    #   result.context # => "## User (model)\n..."
    #   result.tokens_used # => 4200
    #   result.sections # => [:structural, :primary, :supporting]
    #
    class ContextAssembler
      DEFAULT_BUDGET = 8000 # tokens

      BUDGET_ALLOCATION = {
        structural: 0.10,
        primary: 0.50,
        supporting: 0.25,
        framework: 0.15
      }.freeze

      # Minimum token count for a section to be worth including.
      MIN_USEFUL_TOKENS = 200

      # @param metadata_store [#find_batch] Store that resolves identifiers to unit data
      # @param budget [Integer] Total token budget
      def initialize(metadata_store:, budget: DEFAULT_BUDGET)
        @metadata_store = metadata_store
        @budget = budget
      end

      # Assemble context from ranked candidates within token budget.
      #
      # @param candidates [Array<Candidate>] Ranked search candidates
      # @param classification [QueryClassifier::Classification] Query classification
      # @param structural_context [String, nil] Optional codebase overview text
      # @param budget [Integer, nil] Override token budget; falls back to @budget
      # @return [AssembledContext] Token-budgeted context with source attribution
      def assemble(candidates:, classification:, structural_context: nil, budget: nil)
        effective_budget = budget || @budget
        sections = []
        sources = []

        # Pre-fetch all candidate metadata in one batch query. Kept in a local
        # (not an instance variable) so repeated or concurrent #assemble calls
        # on one assembler cannot observe stale state from a prior invocation.
        unit_cache = @metadata_store.find_batch(candidates.map(&:identifier))

        # 1. Structural context (always first if provided)
        tokens_used = add_structural_section(sections, structural_context, 0, effective_budget)

        # 2. Compute per-section budgets from remaining tokens
        budgets = compute_section_budgets(effective_budget - tokens_used, classification)

        # 3. Primary, supporting, and framework sections
        add_candidate_section(sections, sources, :primary,
                              candidates.reject { |c| c.source == :graph_expansion }, budgets[:primary], unit_cache)
        add_candidate_section(sections, sources, :supporting,
                              candidates.select { |c| c.source == :graph_expansion }, budgets[:supporting], unit_cache)
        if budgets[:framework].positive?
          add_candidate_section(sections, sources, :framework,
                                candidates.select { |c| framework_candidate?(c) }, budgets[:framework], unit_cache)
        end

        build_result(sections, sources, effective_budget)
      end

      private

      # Add structural context section if provided.
      #
      # @return [Integer] Updated tokens_used count
      def add_structural_section(sections, structural_context, tokens_used, effective_budget)
        return tokens_used unless structural_context

        budget = (effective_budget * BUDGET_ALLOCATION[:structural]).to_i
        text = truncate_to_budget(structural_context, budget)
        sections << { section: :structural, content: text }
        tokens_used + estimate_tokens(text)
      end

      # Add a candidate-based section if candidates produce content.
      #
      # @param unit_cache [Hash] identifier => unit data, batch-fetched in #assemble
      # @return [void]
      def add_candidate_section(sections, sources, section_name, candidates, budget, unit_cache)
        return if candidates.empty?

        content, section_sources = assemble_section(candidates, budget, unit_cache)
        return if content.empty?

        sections << { section: section_name, content: content }
        sources.concat(section_sources)
      end

      # Compute token budgets for primary/supporting/framework sections.
      #
      # @param remaining [Integer] Tokens available after structural
      # @param classification [QueryClassifier::Classification]
      # @return [Hash<Symbol, Integer>]
      def compute_section_budgets(remaining, classification)
        if classification.framework_context
          {
            primary: (remaining * 0.55).to_i,
            supporting: (remaining * 0.25).to_i,
            framework: (remaining * 0.20).to_i
          }
        else
          # No framework section: redistribute its share to primary/supporting.
          {
            primary: (remaining * 0.65).to_i,
            supporting: (remaining * 0.35).to_i,
            framework: 0
          }
        end
      end

      # Assemble content for a single section within a token budget.
      #
      # @param candidates [Array<Candidate>] Candidates for this section
      # @param budget [Integer] Token budget for this section
      # @param unit_cache [Hash] identifier => unit data
      # @return [Array(String, Array<Hash>)] Content string and source attributions
      def assemble_section(candidates, budget, unit_cache)
        content_parts = []
        sources = []
        tokens_used = 0

        candidates.sort_by { |c| -c.score }.each do |candidate|
          tokens_used = append_candidate(content_parts, sources, candidate, budget, tokens_used, unit_cache)
          # nil signals the budget is exhausted for this section.
          break if tokens_used.nil?
        end

        [content_parts.join("\n\n"), sources]
      end

      # Append a single candidate to the section. Returns updated tokens_used, or nil to stop.
      def append_candidate(parts, sources, candidate, budget, tokens_used, unit_cache)
        unit = unit_cache[candidate.identifier]
        return tokens_used unless unit

        text = format_unit(unit, candidate)
        tokens = estimate_tokens(text)
        remaining = budget - tokens_used

        if tokens <= remaining
          parts << text
          sources << build_source_attribution(candidate, unit)
          tokens_used + tokens
        elsif remaining > MIN_USEFUL_TOKENS
          # Partial inclusion: truncate to fit, then stop this section.
          parts << truncate_to_budget(text, remaining)
          sources << build_source_attribution(candidate, unit, truncated: true)
          nil
        end
        # Implicit nil when remaining <= MIN_USEFUL_TOKENS: budget exhausted.
      end

      # Format a unit for inclusion in context.
      #
      # @param unit [Hash] Unit data from metadata store
      # @param _candidate [Candidate] The search candidate (unused, kept for interface symmetry)
      # @return [String]
      def format_unit(unit, _candidate)
        identifier = unit_field(unit, :identifier)
        type = unit_field(unit, :type)
        file_path = unit_field(unit, :file_path)
        source = unit_field(unit, :source_code) || ''

        <<~UNIT.strip
          ## #{identifier} (#{type})
          File: #{file_path}

          #{source}
        UNIT
      end

      # Build source attribution hash for a candidate.
      #
      # @return [Hash]
      def build_source_attribution(candidate, unit, truncated: false)
        attribution = {
          identifier: candidate.identifier,
          type: unit_field(unit, :type),
          score: candidate.score,
          file_path: unit_field(unit, :file_path)
        }
        attribution[:truncated] = true if truncated
        attribution
      end

      # Read a field from a unit hash, accepting either symbol or string keys.
      #
      # @param unit [Hash]
      # @param key [Symbol]
      # @return [Object, nil]
      def unit_field(unit, key)
        unit[key] || unit[key.to_s]
      end

      # Check if a candidate is framework source.
      #
      # @param candidate [Candidate]
      # @return [Boolean]
      def framework_candidate?(candidate)
        metadata = candidate.metadata
        return false unless metadata

        type = metadata[:type] || metadata['type']
        %w[rails_source gem_source].include?(type.to_s)
      end

      # Truncate text to fit within a token budget.
      #
      # @param text [String]
      # @param token_budget [Integer]
      # @return [String]
      def truncate_to_budget(text, token_budget)
        return text if estimate_tokens(text) <= token_budget

        # Estimate target character count with 10% safety margin
        target_chars = (token_budget * 4.0 * 0.9).to_i
        "#{text[0...target_chars]}\n... [truncated]"
      end

      # Estimate token count using the project convention (~4 chars/token).
      #
      # @param text [String]
      # @return [Integer]
      def estimate_tokens(text)
        (text.length / 4.0).ceil
      end

      # Build the final AssembledContext result.
      #
      # @param sections [Array<Hash>] Assembled sections
      # @param sources [Array<Hash>] Source attributions
      # @param effective_budget [Integer] The budget actually used for assembly
      # @return [AssembledContext]
      def build_result(sections, sources, effective_budget)
        context = sections.map { |s| s[:content] }.join("\n\n---\n\n")
        AssembledContext.new(
          context: context,
          tokens_used: estimate_tokens(context),
          budget: effective_budget,
          sources: sources.uniq,
          sections: sections.map { |s| s[:section] }
        )
      end
    end

    # Result of context assembly.
    AssembledContext = Struct.new(:context, :tokens_used, :budget, :sources, :sections, keyword_init: true)
  end
end
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'set'

module Woods
  module Retrieval
    # Classifies natural language queries to determine retrieval strategy.
    #
    # Uses heuristic pattern matching to determine:
    # - Intent: what the user wants to do
    # - Scope: how broad the search should be
    # - Target type: what kind of code unit to look for
    # - Framework context: whether this is about Rails/gems vs app code
    #
    class QueryClassifier
      # Classification result
      Classification = Struct.new(:intent, :scope, :target_type, :framework_context, :keywords, keyword_init: true)

      INTENTS = %i[understand locate trace debug implement reference compare framework].freeze
      SCOPES = %i[pinpoint focused exploratory comprehensive].freeze

      STOP_WORDS = Set.new(%w[the a an is are was were be been being have has had do does did will would could
                              should may might can shall in on at to for of and or but not with by from as
                              this that these those it its how what when where why who which]).freeze

      # Intent patterns — order matters (first match wins)
      INTENT_PATTERNS = {
        locate: /\b(where|find|which file|locate|look for|search for)\b/i,
        trace: /\b(trace|follow|track|call(s|ed by)|depends on|used by|who calls|what calls)\b/i,
        debug: /\b(bug|error|fix|broken|failing|wrong|issue|problem|crash|exception)\b/i,
        implement: /\b(implement|add|create|build|write|make|generate)\b/i,
        compare: /\b(compare|difference|vs|versus|between|contrast)\b/i,
        # rubocop:disable Layout/LineLength
        framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob)\b/i,
        # rubocop:enable Layout/LineLength
        reference: /\b(show me|what is|what are|list|options for|api|interface|signature)\b/i,
        understand: /\b(how|why|explain|understand|what happens|describe|overview)\b/i
      }.freeze

      # Scope patterns
      SCOPE_PATTERNS = {
        pinpoint: /\b(exactly|specific|this one|just the|only the)\b/i,
        comprehensive: /\b(all|every|entire|whole|complete|everything)\b/i,
        exploratory: /\b(related|around|near|similar|like|associated)\b/i
      }.freeze

      # Target type patterns
      TARGET_PATTERNS = {
        model: /\b(model|activerecord|association|schema|table|column|scope|validation)\b/i,
        controller: /\b(controller|action|route|endpoint|api|request|response|filter|callback)\b/i,
        service: /\b(service|interactor|operation|command|use.?case|business.?logic)\b/i,
        job: /\b(job|worker|background|async|sidekiq|queue|perform)\b/i,
        mailer: /\b(mailer|email|notification|send.?mail)\b/i,
        graphql: /\b(graphql|mutation|query|type|resolver|field|argument|schema)\b/i,
        concern: /\b(concern|mixin|module|included|extend)\b/i,
        route: /\b(route|path|url|endpoint|uri|http|get|post|put|patch|delete)\b/i,
        middleware: /\b(middleware|rack|request.?pipeline|before.?action)\b/i,
        i18n: /\b(i18n|translation|locale|internationalization|t\(|translate)\b/i,
        pundit_policy: /\b(pundit|authorize|policy|allowed|permitted)\b/i,
        configuration: /\b(config|initializer|environment|setting|configure)\b/i,
        engine: /\b(engine|mountable|mount|railtie|plugin|isolated.?namespace)\b/i,
        view_template: /\b(view|template|partial|render|erb|layout|html)\b/i,
        # rubocop:disable Layout/LineLength
        migration: /\b(migration|migrate|schema.?change|add.?column|remove.?column|create.?table|drop.?table|db.?migrate)\b/i,
        action_cable_channel: /\b(action.?cable|websocket|broadcast|cable.?channel|subscription.?channel|realtime|real.?time)\b/i,
        scheduled_job: /\b(schedule[dr]?|recurring|cron|periodic|every\s+\d|daily|hourly|weekly|solid.?queue.*recur|sidekiq.?cron|whenever)\b/i,
        rake_task: /\b(rake|rake.?task|lib.?tasks?|maintenance.?script|batch.?script)\b/i
        # rubocop:enable Layout/LineLength
      }.freeze

      # Classify a query string into intent, scope, target type, framework
      # context, and salient keywords.
      #
      # @param query [String] Natural language query
      # @return [Classification] Classified query
      def classify(query)
        Classification.new(
          intent: detect_intent(query),
          scope: detect_scope(query),
          target_type: detect_target_type(query),
          framework_context: framework_query?(query),
          keywords: extract_keywords(query)
        )
      end

      private

      # Pick the user's intent; defaults to :understand when nothing matches.
      #
      # @param query [String]
      # @return [Symbol]
      def detect_intent(query)
        match_first(INTENT_PATTERNS, query, default: :understand)
      end

      # Pick search breadth; defaults to :focused when nothing matches.
      #
      # @param query [String]
      # @return [Symbol]
      def detect_scope(query)
        match_first(SCOPE_PATTERNS, query, default: :focused)
      end

      # Pick the kind of code unit targeted, if any.
      #
      # @param query [String]
      # @return [Symbol, nil]
      def detect_target_type(query)
        match_first(TARGET_PATTERNS, query, default: nil)
      end

      # Match query against a hash of {key => pattern}, returning the first matching key.
      #
      # @param patterns [Hash{Symbol => Regexp}]
      # @param query [String]
      # @param default [Object] value if no pattern matches
      # @return [Object]
      def match_first(patterns, query, default:)
        hit = patterns.find { |_name, pattern| query.match?(pattern) }
        hit ? hit.first : default
      end

      # Does the query mention Rails or framework-level machinery?
      #
      # @param query [String]
      # @return [Boolean]
      def framework_query?(query)
        query.match?(/\b(rails|activerecord|actioncontroller|activejob|actionmailer|activesupport|rack|middleware)\b/i)
      end

      # Lowercase, strip punctuation, drop stop words and one-char tokens.
      #
      # @param query [String]
      # @return [Array<String>]
      def extract_keywords(query)
        tokens = query.downcase.gsub(/[^\w\s]/, ' ').split
        tokens.uniq.reject { |word| word.length < 2 || STOP_WORDS.include?(word) }
      end
    end
  end
end
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Woods
|
|
4
|
+
module Retrieval
|
|
5
|
+
# Ranks search candidates using weighted signal scoring and diversity adjustment.
|
|
6
|
+
#
|
|
7
|
+
# Combines multiple ranking signals into a final score:
|
|
8
|
+
# - Semantic similarity from vector search
|
|
9
|
+
# - Keyword match quality
|
|
10
|
+
# - Recency (git change frequency)
|
|
11
|
+
# - Importance (PageRank / structural importance)
|
|
12
|
+
# - Type match (bonus when result type matches query target_type)
|
|
13
|
+
# - Diversity (penalty for too many results of same type/namespace)
|
|
14
|
+
#
|
|
15
|
+
# After initial scoring, applies Reciprocal Rank Fusion (RRF) when
|
|
16
|
+
# candidates come from multiple retrieval sources.
|
|
17
|
+
#
|
|
18
|
+
# @example
|
|
19
|
+
# ranker = Ranker.new(metadata_store: store)
|
|
20
|
+
# ranked = ranker.rank(candidates, classification: classification)
|
|
21
|
+
#
|
|
22
|
+
class Ranker
|
|
23
|
+
# Signal weights for ranking — sum to 1.0.
|
|
24
|
+
WEIGHTS = {
|
|
25
|
+
semantic: 0.40,
|
|
26
|
+
keyword: 0.20,
|
|
27
|
+
recency: 0.15,
|
|
28
|
+
importance: 0.10,
|
|
29
|
+
type_match: 0.10,
|
|
30
|
+
diversity: 0.05
|
|
31
|
+
}.freeze
|
|
32
|
+
|
|
33
|
+
# RRF constant — balances rank position vs. absolute score.
|
|
34
|
+
# Standard value from the original RRF paper (Cormack et al., 2009).
|
|
35
|
+
RRF_K = 60
|
|
36
|
+
|
|
37
|
+
# @param metadata_store [#find] Store that resolves identifiers to unit metadata
|
|
38
|
+
# Remember the metadata store used to resolve candidate identifiers to unit
# metadata during scoring.
#
# @param metadata_store [#find_batch] Store that resolves identifiers to unit metadata
def initialize(metadata_store:)
  @metadata_store = metadata_store
end
|
|
41
|
+
|
|
42
|
+
# Rank candidates by weighted signal scoring with diversity adjustment.
|
|
43
|
+
#
|
|
44
|
+
# @param candidates [Array<Candidate>] Search candidates from executor
|
|
45
|
+
# @param classification [QueryClassifier::Classification] Query classification
|
|
46
|
+
# @return [Array<Candidate>] Re-ranked candidates (best first)
|
|
47
|
+
# Rank candidates by weighted signal scoring with diversity adjustment.
#
# @param candidates [Array<Candidate>] Search candidates from executor
# @param classification [QueryClassifier::Classification] Query classification
# @return [Array<Candidate>] Re-ranked candidates (best first)
def rank(candidates, classification:)
  return [] if candidates.empty?

  # Merge per-source rankings via RRF only when more than one source contributed.
  pool = multi_source?(candidates) ? apply_rrf(candidates) : candidates

  ranked = sorted_by_weighted_score(score_candidates(pool, classification))
  apply_diversity_penalty(ranked)

  ranked.map { |entry| entry[:candidate] }
end
|
|
59
|
+
|
|
60
|
+
private
|
|
61
|
+
|
|
62
|
+
# Check if candidates come from multiple retrieval sources.
|
|
63
|
+
#
|
|
64
|
+
# @param candidates [Array<Candidate>]
|
|
65
|
+
# @return [Boolean]
|
|
66
|
+
# Check if candidates come from multiple retrieval sources.
#
# @param candidates [Array<Candidate>]
# @return [Boolean]
def multi_source?(candidates)
  distinct_sources = candidates.map(&:source).uniq
  distinct_sources.length > 1
end
|
|
69
|
+
|
|
70
|
+
# Apply Reciprocal Rank Fusion across sources.
|
|
71
|
+
#
|
|
72
|
+
# RRF formula: score(d) = sum(1/(k + rank_i(d)))
|
|
73
|
+
# Each source's candidates are ranked independently, then RRF
|
|
74
|
+
# merges ranks into a single score.
|
|
75
|
+
#
|
|
76
|
+
# @param candidates [Array<Candidate>]
|
|
77
|
+
# @return [Array<Candidate>] Merged candidates with RRF scores
|
|
78
|
+
# Apply Reciprocal Rank Fusion across sources.
#
# RRF formula: score(d) = sum(1/(k + rank_i(d))). Each source's candidates
# are ranked independently, then RRF merges ranks into a single score.
#
# @param candidates [Array<Candidate>]
# @return [Array<Candidate>] Merged candidates with RRF scores
def apply_rrf(candidates)
  scores, metadata = compute_rrf_scores(candidates)
  rebuild_rrf_candidates(candidates, scores, metadata)
end
|
|
82
|
+
|
|
83
|
+
# Compute RRF scores across all sources.
|
|
84
|
+
#
|
|
85
|
+
# @return [Array(Hash, Hash)] [rrf_scores, metadata_map]
|
|
86
|
+
# Compute RRF scores across all sources.
#
# Candidates are grouped per source, ranked by descending score, and each
# rank position contributes 1/(RRF_K + rank) to the identifier's total.
#
# @return [Array(Hash, Hash)] [rrf_scores, metadata_map]
def compute_rrf_scores(candidates)
  scores = Hash.new(0.0)
  metadata_by_id = {}

  candidates.group_by(&:source).each_value do |group|
    group.sort_by { |c| -c.score }.each_with_index do |cand, position|
      scores[cand.identifier] += 1.0 / (RRF_K + position)
      # First non-nil metadata seen for an identifier wins.
      metadata_by_id[cand.identifier] ||= cand.metadata
    end
  end

  [scores, metadata_by_id]
end
|
|
100
|
+
|
|
101
|
+
# Rebuild candidates with merged RRF scores.
|
|
102
|
+
#
|
|
103
|
+
# @return [Array<Candidate>]
|
|
104
|
+
# Rebuild candidates with merged RRF scores, best score first.
#
# Uses core Enumerable#to_h (block form) rather than ActiveSupport's
# #index_by so this method works even when ActiveSupport is not loaded
# (same last-wins semantics for duplicate identifiers).
#
# @param candidates [Array<Candidate>] Original candidates
# @param rrf_scores [Hash{Object => Float}] identifier => fused RRF score
# @param metadata_map [Hash] identifier => metadata carried over from originals
# @return [Array<Candidate>]
def rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
  original_by_id = candidates.to_h { |c| [c.identifier, c] }
  rrf_scores.sort_by { |_id, score| -score }.map do |identifier, score|
    original = original_by_id[identifier]
    build_candidate(
      identifier: identifier,
      score: score,
      source: original&.source || :rrf,
      metadata: metadata_map[identifier]
    )
  end
end
|
|
116
|
+
|
|
117
|
+
# Score each candidate across all ranking signals.
#
# @param candidates [Array<Candidate>]
# @param classification [QueryClassifier::Classification]
# @return [Array<Hash>] One entry per candidate with :candidate,
#   :unit (cached metadata record), and :scores per signal
def score_candidates(candidates, classification)
  # One batched metadata fetch instead of a store query per candidate.
  units = @metadata_store.find_batch(candidates.map(&:identifier))

  candidates.map do |cand|
    unit = units[cand.identifier]
    signal_scores = {
      semantic: cand.score.to_f,
      keyword: keyword_score(cand),
      recency: recency_score(unit),
      importance: importance_score(unit),
      type_match: type_match_score(unit, classification),
      diversity: 1.0 # placeholder; adjusted after the initial sort
    }
    # :unit is cached here so apply_diversity_penalty can skip a
    # second metadata lookup.
    { candidate: cand, unit: unit, scores: signal_scores }
  end
end
|
|
143
|
+
|
|
144
|
+
# Compute the weighted total for every scored item and return the
# items ordered by that total, highest first.
#
# Each item gains a :weighted_score key (mutated in place) equal to
# the WEIGHTS-weighted sum of its per-signal scores.
#
# @param scored [Array<Hash>]
# @return [Array<Hash>] Sorted by weighted_score descending
def sorted_by_weighted_score(scored)
  scored.each do |entry|
    entry[:weighted_score] =
      WEIGHTS.sum { |signal, weight| entry[:scores][signal] * weight }
  end

  scored.sort_by { |entry| -entry[:weighted_score] }
end
|
|
157
|
+
|
|
158
|
+
# Keyword match score derived from how many fields matched.
#
# Each matched field adds 0.25, capped at 1.0. Candidates without a
# truthy matched_fields attribute score zero.
#
# @param candidate [Candidate]
# @return [Float] 0.0 to 1.0
def keyword_score(candidate)
  fields = candidate.matched_fields if candidate.respond_to?(:matched_fields)
  return 0.0 unless fields

  [1.0, fields.size * 0.25].min
end
|
|
167
|
+
|
|
168
|
+
# Recency score derived from the git change-frequency metadata.
#
# Hot files score highest, dormant lowest; stable or unknown
# frequencies fall back to a neutral 0.5.
#
# @param unit [Hash, nil] Unit metadata from store
# @return [Float] 0.0 to 1.0
def recency_score(unit)
  return 0.5 unless unit

  frequency = dig_metadata(unit, :git, :change_frequency)
  levels = { hot: 1.0, active: 0.8, dormant: 0.3, new: 0.7 }
  levels.fetch(frequency&.to_sym, 0.5)
end
|
|
184
|
+
|
|
185
|
+
# Importance score derived from PageRank / structural importance
# metadata.
#
# Unknown or absent importance falls back to a neutral 0.5.
#
# @param unit [Hash, nil] Unit metadata from store
# @return [Float] 0.0 to 1.0
def importance_score(unit)
  return 0.5 unless unit

  tiers = { 'high' => 1.0, 'medium' => 0.6, 'low' => 0.3 }
  tiers.fetch(dig_metadata(unit, :importance)&.to_s, 0.5)
end
|
|
200
|
+
|
|
201
|
+
# Type match score — bonus when the result's type matches the query's
# target_type, mild penalty when it does not.
#
# Neutral 0.5 when either the unit or the target type is missing.
#
# @param unit [Hash, nil] Unit metadata from store
# @param classification [QueryClassifier::Classification]
# @return [Float] 0.0 to 1.0
def type_match_score(unit, classification)
  return 0.5 unless unit
  return 0.5 unless classification.target_type

  resolved_type = dig_metadata(unit, :type) || unit[:type]
  resolved_type&.to_sym == classification.target_type ? 1.0 : 0.3
end
|
|
213
|
+
|
|
214
|
+
# Apply a diversity penalty so results don't cluster by type or
# namespace, then re-sort by the adjusted weighted score.
#
# Items whose penalty is nil (no unit metadata) are left untouched.
# Note that a 0.0 penalty is truthy in Ruby, so first-seen items
# still get their :diversity score set (to 1.0).
#
# @param sorted [Array<Hash>] Scored items sorted by weighted_score
# @return [void] Mutates items and re-sorts the array in place
def apply_diversity_penalty(sorted)
  namespace_counts = Hash.new(0)
  type_counts = Hash.new(0)

  sorted.each do |entry|
    penalty = diversity_penalty_for(entry, namespace_counts, type_counts)
    if penalty
      entry[:scores][:diversity] = 1.0 - penalty
      entry[:weighted_score] -= penalty * WEIGHTS[:diversity]
    end
  end

  sorted.sort_by! { |entry| -entry[:weighted_score] }
end
|
|
232
|
+
|
|
233
|
+
# Compute the diversity penalty for one item and bump the seen
# counters for its namespace and type.
#
# Uses the unit cached in item[:unit] so no extra metadata store
# lookup is needed. The penalty grows 0.1 per prior sighting of the
# same namespace or type, capped at 0.5.
#
# @param item [Hash] Scored item with a cached :unit
# @param seen_namespaces [Hash{String=>Integer}] Mutated in place
# @param seen_types [Hash{String=>Integer}] Mutated in place
# @return [Float, nil] Penalty amount, or nil if unit not found
def diversity_penalty_for(item, seen_namespaces, seen_types)
  unit = item[:unit]
  return nil unless unit

  ns = dig_metadata(unit, :namespace) || 'root'
  kind = (dig_metadata(unit, :type) || 'unknown').to_s

  prior_sightings = seen_namespaces[ns] + seen_types[kind]
  seen_namespaces[ns] += 1
  seen_types[kind] += 1

  [prior_sightings * 0.1, 0.5].min
end
|
|
250
|
+
|
|
251
|
+
# Dig into unit metadata stored under the :metadata key of a hash.
#
# With a single key, falls back to the unit's own top-level key when
# the nested metadata lookup comes up falsy. Non-hash units always
# yield nil.
#
# @param unit [Hash, Object] Unit data
# @param keys [Array<Symbol>] Key path
# @return [Object, nil]
def dig_metadata(unit, *keys)
  return nil unless unit.is_a?(Hash)

  nested = unit.dig(:metadata, *keys)
  return nested unless keys.size == 1

  nested || unit[keys.first]
end
|
|
263
|
+
|
|
264
|
+
# Build a Candidate struct compatible with SearchExecutor::Candidate.
#
# @param identifier [String] Unique id of the code unit
# @param score [Float] Fused/weighted relevance score
# @param source [Symbol] Originating retrieval source (e.g. :rrf)
# @param metadata [Hash, nil] Retained candidate metadata
# @return [SearchExecutor::Candidate]
def build_candidate(identifier:, score:, source:, metadata:)
  attrs = {
    identifier: identifier,
    score: score,
    source: source,
    metadata: metadata
  }
  SearchExecutor::Candidate.new(**attrs)
end
|
|
275
|
+
end
|
|
276
|
+
end
|
|
277
|
+
end
|