codebase_index 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +29 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +481 -0
- data/exe/codebase-console-mcp +22 -0
- data/exe/codebase-index-mcp +61 -0
- data/exe/codebase-index-mcp-http +64 -0
- data/exe/codebase-index-mcp-start +58 -0
- data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
- data/lib/codebase_index/ast/method_extractor.rb +76 -0
- data/lib/codebase_index/ast/node.rb +88 -0
- data/lib/codebase_index/ast/parser.rb +653 -0
- data/lib/codebase_index/ast.rb +6 -0
- data/lib/codebase_index/builder.rb +137 -0
- data/lib/codebase_index/chunking/chunk.rb +84 -0
- data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
- data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
- data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
- data/lib/codebase_index/console/audit_logger.rb +75 -0
- data/lib/codebase_index/console/bridge.rb +170 -0
- data/lib/codebase_index/console/confirmation.rb +90 -0
- data/lib/codebase_index/console/connection_manager.rb +173 -0
- data/lib/codebase_index/console/console_response_renderer.rb +78 -0
- data/lib/codebase_index/console/model_validator.rb +81 -0
- data/lib/codebase_index/console/safe_context.rb +82 -0
- data/lib/codebase_index/console/server.rb +557 -0
- data/lib/codebase_index/console/sql_validator.rb +172 -0
- data/lib/codebase_index/console/tools/tier1.rb +118 -0
- data/lib/codebase_index/console/tools/tier2.rb +117 -0
- data/lib/codebase_index/console/tools/tier3.rb +110 -0
- data/lib/codebase_index/console/tools/tier4.rb +79 -0
- data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
- data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
- data/lib/codebase_index/cost_model/estimator.rb +128 -0
- data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
- data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
- data/lib/codebase_index/cost_model.rb +22 -0
- data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
- data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
- data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/codebase_index/db/migrator.rb +71 -0
- data/lib/codebase_index/db/schema_version.rb +73 -0
- data/lib/codebase_index/dependency_graph.rb +227 -0
- data/lib/codebase_index/embedding/indexer.rb +130 -0
- data/lib/codebase_index/embedding/openai.rb +105 -0
- data/lib/codebase_index/embedding/provider.rb +135 -0
- data/lib/codebase_index/embedding/text_preparer.rb +112 -0
- data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
- data/lib/codebase_index/evaluation/evaluator.rb +146 -0
- data/lib/codebase_index/evaluation/metrics.rb +79 -0
- data/lib/codebase_index/evaluation/query_set.rb +148 -0
- data/lib/codebase_index/evaluation/report_generator.rb +90 -0
- data/lib/codebase_index/extracted_unit.rb +145 -0
- data/lib/codebase_index/extractor.rb +956 -0
- data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
- data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
- data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
- data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
- data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
- data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
- data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
- data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
- data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
- data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
- data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
- data/lib/codebase_index/extractors/event_extractor.rb +211 -0
- data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
- data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
- data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
- data/lib/codebase_index/extractors/job_extractor.rb +369 -0
- data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
- data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
- data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
- data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
- data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
- data/lib/codebase_index/extractors/model_extractor.rb +960 -0
- data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
- data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
- data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
- data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
- data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
- data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
- data/lib/codebase_index/extractors/route_extractor.rb +181 -0
- data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
- data/lib/codebase_index/extractors/service_extractor.rb +254 -0
- data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
- data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
- data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
- data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
- data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
- data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
- data/lib/codebase_index/feedback/gap_detector.rb +89 -0
- data/lib/codebase_index/feedback/store.rb +119 -0
- data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
- data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/codebase_index/flow_assembler.rb +290 -0
- data/lib/codebase_index/flow_document.rb +191 -0
- data/lib/codebase_index/flow_precomputer.rb +102 -0
- data/lib/codebase_index/formatting/base.rb +40 -0
- data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
- data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
- data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
- data/lib/codebase_index/formatting/human_adapter.rb +78 -0
- data/lib/codebase_index/graph_analyzer.rb +374 -0
- data/lib/codebase_index/mcp/index_reader.rb +394 -0
- data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
- data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
- data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/codebase_index/mcp/server.rb +935 -0
- data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
- data/lib/codebase_index/model_name_cache.rb +51 -0
- data/lib/codebase_index/notion/client.rb +217 -0
- data/lib/codebase_index/notion/exporter.rb +219 -0
- data/lib/codebase_index/notion/mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
- data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
- data/lib/codebase_index/notion/rate_limiter.rb +68 -0
- data/lib/codebase_index/observability/health_check.rb +81 -0
- data/lib/codebase_index/observability/instrumentation.rb +34 -0
- data/lib/codebase_index/observability/structured_logger.rb +75 -0
- data/lib/codebase_index/operator/error_escalator.rb +81 -0
- data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
- data/lib/codebase_index/operator/status_reporter.rb +80 -0
- data/lib/codebase_index/railtie.rb +26 -0
- data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
- data/lib/codebase_index/resilience/index_validator.rb +185 -0
- data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
- data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
- data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
- data/lib/codebase_index/retrieval/ranker.rb +273 -0
- data/lib/codebase_index/retrieval/search_executor.rb +327 -0
- data/lib/codebase_index/retriever.rb +160 -0
- data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
- data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
- data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
- data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
- data/lib/codebase_index/ruby_analyzer.rb +87 -0
- data/lib/codebase_index/session_tracer/file_store.rb +111 -0
- data/lib/codebase_index/session_tracer/middleware.rb +143 -0
- data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
- data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
- data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
- data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
- data/lib/codebase_index/session_tracer/store.rb +67 -0
- data/lib/codebase_index/storage/graph_store.rb +120 -0
- data/lib/codebase_index/storage/metadata_store.rb +169 -0
- data/lib/codebase_index/storage/pgvector.rb +163 -0
- data/lib/codebase_index/storage/qdrant.rb +172 -0
- data/lib/codebase_index/storage/vector_store.rb +156 -0
- data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
- data/lib/codebase_index/version.rb +5 -0
- data/lib/codebase_index.rb +223 -0
- data/lib/generators/codebase_index/install_generator.rb +32 -0
- data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
- data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
- data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
- data/lib/tasks/codebase_index.rake +583 -0
- data/lib/tasks/codebase_index_evaluation.rake +115 -0
- metadata +252 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
5
|
+
module CodebaseIndex
|
|
6
|
+
module Retrieval
|
|
7
|
+
# Classifies natural language queries to determine retrieval strategy.
|
|
8
|
+
#
|
|
9
|
+
# Uses heuristic pattern matching to determine:
|
|
10
|
+
# - Intent: what the user wants to do
|
|
11
|
+
# - Scope: how broad the search should be
|
|
12
|
+
# - Target type: what kind of code unit to look for
|
|
13
|
+
# - Framework context: whether this is about Rails/gems vs app code
|
|
14
|
+
#
|
|
15
|
+
class QueryClassifier
  # Classification result.
  #
  # Members: +intent+ (one of INTENTS), +scope+ (one of SCOPES),
  # +target_type+ (a TARGET_PATTERNS key, or nil when nothing matched),
  # +framework_context+ (Boolean) and +keywords+ (Array<String>).
  Classification = Struct.new(:intent, :scope, :target_type, :framework_context, :keywords, keyword_init: true)

  INTENTS = %i[understand locate trace debug implement reference compare framework].freeze
  SCOPES = %i[pinpoint focused exploratory comprehensive].freeze

  STOP_WORDS = Set.new(%w[the a an is are was were be been being have has had do does did will would could
                          should may might can shall in on at to for of and or but not with by from as
                          this that these those it its how what when where why who which]).freeze

  # Intent patterns — order matters (first match wins)
  INTENT_PATTERNS = {
    locate: /\b(where|find|which file|locate|look for|search for)\b/i,
    trace: /\b(trace|follow|track|call(s|ed by)|depends on|used by|who calls|what calls)\b/i,
    debug: /\b(bug|error|fix|broken|failing|wrong|issue|problem|crash|exception)\b/i,
    implement: /\b(implement|add|create|build|write|make|generate)\b/i,
    compare: /\b(compare|difference|vs|versus|between|contrast)\b/i,
    # rubocop:disable Layout/LineLength
    framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob)\b/i,
    # rubocop:enable Layout/LineLength
    reference: /\b(show me|what is|what are|list|options for|api|interface|signature)\b/i,
    understand: /\b(how|why|explain|understand|what happens|describe|overview)\b/i
  }.freeze

  # Scope patterns — first match wins; :focused is the fallback.
  SCOPE_PATTERNS = {
    pinpoint: /\b(exactly|specific|this one|just the|only the)\b/i,
    comprehensive: /\b(all|every|entire|whole|complete|everything)\b/i,
    exploratory: /\b(related|around|near|similar|like|associated)\b/i
  }.freeze

  # Target type patterns — first match wins.
  #
  # NOTE(review): because the first match wins, generic words in earlier entries
  # shadow more specific phrases in later ones (e.g. `column` in :model precedes
  # `add.?column` in :migration, `sidekiq` in :job precedes `sidekiq.?cron` in
  # :scheduled_job). The order is left untouched here; confirm whether that is intended.
  TARGET_PATTERNS = {
    model: /\b(model|activerecord|association|schema|table|column|scope|validation)\b/i,
    controller: /\b(controller|action|route|endpoint|api|request|response|filter|callback)\b/i,
    service: /\b(service|interactor|operation|command|use.?case|business.?logic)\b/i,
    job: /\b(job|worker|background|async|sidekiq|queue|perform)\b/i,
    mailer: /\b(mailer|email|notification|send.?mail)\b/i,
    graphql: /\b(graphql|mutation|query|type|resolver|field|argument|schema)\b/i,
    concern: /\b(concern|mixin|module|included|extend)\b/i,
    route: /\b(route|path|url|endpoint|uri|http|get|post|put|patch|delete)\b/i,
    middleware: /\b(middleware|rack|request.?pipeline|before.?action)\b/i,
    # `t\(` must sit outside the trailing \b: "(" followed by a quote has no word
    # boundary, so `t('key')` could never match when it was inside the group.
    i18n: /\b(?:(?:i18n|translation|locale|internationalization|translate)\b|t\()/i,
    pundit_policy: /\b(pundit|authorize|policy|allowed|permitted)\b/i,
    configuration: /\b(config|initializer|environment|setting|configure)\b/i,
    engine: /\b(engine|mountable|mount|railtie|plugin|isolated.?namespace)\b/i,
    view_template: /\b(view|template|partial|render|erb|layout|html)\b/i,
    # rubocop:disable Layout/LineLength
    migration: /\b(migration|migrate|schema.?change|add.?column|remove.?column|create.?table|drop.?table|db.?migrate)\b/i,
    action_cable_channel: /\b(action.?cable|websocket|broadcast|cable.?channel|subscription.?channel|realtime|real.?time)\b/i,
    # `\d+` (not `\d`) so multi-digit intervals such as "every 15 minutes" reach the trailing \b.
    scheduled_job: /\b(schedule[dr]?|recurring|cron|periodic|every\s+\d+|daily|hourly|weekly|solid.?queue.*recur|sidekiq.?cron|whenever)\b/i,
    rake_task: /\b(rake|rake.?task|lib.?tasks?|maintenance.?script|batch.?script)\b/i
    # rubocop:enable Layout/LineLength
  }.freeze

  # Words that mark a query as being about the framework rather than app code.
  FRAMEWORK_CONTEXT_PATTERN =
    /\b(rails|activerecord|actioncontroller|activejob|actionmailer|activesupport|rack|middleware)\b/i.freeze

  # Classify a query string
  #
  # A nil query is treated as an empty string, which yields the defaults
  # (:understand, :focused, nil target type, no framework context, no keywords).
  #
  # @param query [String, nil] Natural language query
  # @return [Classification] Classified query
  def classify(query)
    text = query.to_s

    Classification.new(
      intent: detect_intent(text),
      scope: detect_scope(text),
      target_type: detect_target_type(text),
      framework_context: framework_query?(text),
      keywords: extract_keywords(text)
    )
  end

  private

  # @param query [String]
  # @return [Symbol] First matching INTENT_PATTERNS key, or :understand
  def detect_intent(query)
    first_match(INTENT_PATTERNS, query, :understand)
  end

  # @param query [String]
  # @return [Symbol] First matching SCOPE_PATTERNS key, or :focused
  def detect_scope(query)
    first_match(SCOPE_PATTERNS, query, :focused)
  end

  # @param query [String]
  # @return [Symbol, nil] First matching TARGET_PATTERNS key, or nil when no type is detected
  def detect_target_type(query)
    first_match(TARGET_PATTERNS, query, nil)
  end

  # Return the label of the first pattern (in hash order) that matches the query.
  #
  # @param patterns [Hash{Symbol => Regexp}]
  # @param query [String]
  # @param fallback [Symbol, nil] Value returned when nothing matches
  # @return [Symbol, nil]
  def first_match(patterns, query, fallback)
    hit = patterns.find { |_label, regex| query.match?(regex) }
    hit ? hit.first : fallback
  end

  # @param query [String]
  # @return [Boolean]
  def framework_query?(query)
    query.match?(FRAMEWORK_CONTEXT_PATTERN)
  end

  # Lower-case the query, replace punctuation with spaces, and drop stop words,
  # one-character tokens and duplicates (first occurrence order is kept).
  #
  # @param query [String]
  # @return [Array<String>]
  def extract_keywords(query)
    query.downcase
         # [[:word:]] is Unicode-aware; plain \w is ASCII-only and would strip accented letters.
         .gsub(/[^[:word:]\s]/, ' ')
         .split
         .reject { |word| word.length < 2 || STOP_WORDS.include?(word) }
         .uniq
  end
end
|
|
130
|
+
end
|
|
131
|
+
end
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Retrieval
|
|
5
|
+
# Ranks search candidates using weighted signal scoring and diversity adjustment.
|
|
6
|
+
#
|
|
7
|
+
# Combines multiple ranking signals into a final score:
|
|
8
|
+
# - Semantic similarity from vector search
|
|
9
|
+
# - Keyword match quality
|
|
10
|
+
# - Recency (git change frequency)
|
|
11
|
+
# - Importance (PageRank / structural importance)
|
|
12
|
+
# - Type match (bonus when result type matches query target_type)
|
|
13
|
+
# - Diversity (penalty for too many results of same type/namespace)
|
|
14
|
+
#
|
|
15
|
+
# Before signal scoring, applies Reciprocal Rank Fusion (RRF) when
|
|
16
|
+
# candidates come from multiple retrieval sources.
|
|
17
|
+
#
|
|
18
|
+
# @example
|
|
19
|
+
# ranker = Ranker.new(metadata_store: store)
|
|
20
|
+
# ranked = ranker.rank(candidates, classification: classification)
|
|
21
|
+
#
|
|
22
|
+
class Ranker
  # Signal weights for ranking — sum to 1.0.
  WEIGHTS = {
    semantic: 0.40,
    keyword: 0.20,
    recency: 0.15,
    importance: 0.10,
    type_match: 0.10,
    diversity: 0.05
  }.freeze

  # RRF constant — balances rank position vs. absolute score.
  # Standard value from the original RRF paper (Cormack et al., 2009).
  RRF_K = 60

  # Score used when a signal has no usable data (missing unit or unrecognised value).
  NEUTRAL_SCORE = 0.5

  # Recency score per git change-frequency label; anything else (e.g. stable) is neutral.
  RECENCY_SCORES = { 'hot' => 1.0, 'active' => 0.8, 'new' => 0.7, 'dormant' => 0.3 }.freeze

  # Importance score per importance label; anything else is neutral.
  IMPORTANCE_SCORES = { 'high' => 1.0, 'medium' => 0.6, 'low' => 0.3 }.freeze

  # @param metadata_store [#find] Store that resolves identifiers to unit metadata
  def initialize(metadata_store:)
    @metadata_store = metadata_store
  end

  # Rank candidates by weighted signal scoring with diversity adjustment.
  #
  # When candidates come from more than one source they are first merged with
  # RRF. Fused scores are tiny (at most sources / RRF_K), so the semantic signal
  # is rescaled to 0..1 before weighting; otherwise its 0.40 weight would be
  # drowned out by the metadata signals. The returned candidates keep their raw
  # (RRF or original) score.
  #
  # @param candidates [Array<Candidate>] Search candidates from executor
  # @param classification [QueryClassifier::Classification] Query classification
  # @return [Array<Candidate>] Re-ranked candidates (best first)
  def rank(candidates, classification:)
    return [] if candidates.empty?

    fused = multi_source?(candidates)
    candidates = apply_rrf(candidates) if fused

    scored = score_candidates(candidates, classification)
    normalize_semantic!(scored) if fused

    sorted = sorted_by_weighted_score(scored)
    apply_diversity_penalty(sorted)

    sorted.map { |item| item[:candidate] }
  end

  private

  # Check if candidates come from multiple retrieval sources.
  #
  # @param candidates [Array<Candidate>]
  # @return [Boolean]
  def multi_source?(candidates)
    candidates.map(&:source).uniq.size > 1
  end

  # Apply Reciprocal Rank Fusion across sources.
  #
  # RRF formula: score(d) = sum(1/(k + rank_i(d)))
  # Each source's candidates are ranked independently, then RRF
  # merges ranks into a single score.
  #
  # @param candidates [Array<Candidate>]
  # @return [Array<Candidate>] Merged candidates with RRF scores
  def apply_rrf(candidates)
    rrf_scores, metadata_map = compute_rrf_scores(candidates)
    rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
  end

  # Compute RRF scores across all sources.
  #
  # Ranks are 0-based here (the best candidate of a source contributes 1/RRF_K).
  # A nil score is treated as 0.0 when ordering within a source.
  #
  # @param candidates [Array<Candidate>]
  # @return [Array(Hash, Hash)] [rrf_scores, metadata_map]
  def compute_rrf_scores(candidates)
    rrf_scores = Hash.new(0.0)
    metadata_map = {}

    candidates.group_by(&:source).each_value do |source_candidates|
      ranked = source_candidates.sort_by.with_index { |c, index| [-c.score.to_f, index] }
      ranked.each_with_index do |candidate, rank|
        rrf_scores[candidate.identifier] += 1.0 / (RRF_K + rank)
        # First metadata seen for an identifier wins.
        metadata_map[candidate.identifier] ||= candidate.metadata
      end
    end

    [rrf_scores, metadata_map]
  end

  # Rebuild candidates with merged RRF scores.
  #
  # The rebuilt candidate takes the source of the first input candidate with the
  # same identifier (or :rrf when that source is nil).
  #
  # NOTE(review): only identifier/score/source/metadata are carried over. If the
  # executor's Candidate also exposes matched_fields, that data is not preserved
  # here and keyword_score will be 0.0 for fused candidates — confirm.
  #
  # @param candidates [Array<Candidate>]
  # @param rrf_scores [Hash{Object => Float}]
  # @param metadata_map [Hash]
  # @return [Array<Candidate>]
  def rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
    # Index once instead of scanning the candidate list per identifier.
    first_by_identifier = candidates.each_with_object({}) do |candidate, index|
      index[candidate.identifier] ||= candidate
    end

    ordered = rrf_scores.sort_by.with_index { |(_identifier, score), index| [-score, index] }
    ordered.map do |identifier, score|
      build_candidate(
        identifier: identifier,
        score: score,
        source: first_by_identifier[identifier]&.source || :rrf,
        metadata: metadata_map[identifier]
      )
    end
  end

  # Score each candidate across all signals.
  #
  # @param candidates [Array<Candidate>]
  # @param classification [QueryClassifier::Classification]
  # @return [Array<Hash>] One hash per candidate with :candidate, :unit and :scores
  def score_candidates(candidates, classification)
    candidates.map do |candidate|
      unit = @metadata_store.find(candidate.identifier)

      {
        candidate: candidate,
        unit: unit, # cached to avoid double lookup in apply_diversity_penalty
        scores: {
          semantic: candidate.score.to_f,
          keyword: keyword_score(candidate),
          recency: recency_score(unit),
          importance: importance_score(unit),
          type_match: type_match_score(unit, classification),
          diversity: 1.0 # Adjusted after initial sort
        }
      }
    end
  end

  # Rescale the semantic signal so the best item scores 1.0.
  #
  # @param scored [Array<Hash>] Items from score_candidates (mutated in place)
  # @return [void]
  def normalize_semantic!(scored)
    peak = scored.map { |item| item[:scores][:semantic] }.max
    return unless peak&.positive?

    scored.each { |item| item[:scores][:semantic] /= peak }
  end

  # Calculate weighted score for each item.
  #
  # @param scored [Array<Hash>]
  # @return [Array<Hash>] Sorted by weighted_score descending (ties keep input order)
  def sorted_by_weighted_score(scored)
    scored.each do |item|
      item[:weighted_score] = WEIGHTS.sum { |signal, weight| item[:scores][signal] * weight }
    end

    sort_descending(scored)
  end

  # Sort items by weighted_score, best first, keeping input order on ties.
  #
  # @param items [Array<Hash>]
  # @return [Array<Hash>] New sorted array
  def sort_descending(items)
    items.sort_by.with_index { |item, index| [-item[:weighted_score], index] }
  end

  # Keyword match score based on matched field count.
  #
  # @param candidate [Candidate]
  # @return [Float] 0.0 to 1.0
  def keyword_score(candidate)
    return 0.0 unless candidate.respond_to?(:matched_fields) && candidate.matched_fields

    [candidate.matched_fields.size * 0.25, 1.0].min
  end

  # Recency score based on git change frequency metadata.
  #
  # The label is compared as a string, so symbols and strings both work and
  # unexpected value types fall back to the neutral score instead of raising.
  #
  # @param unit [Hash, Object, nil] Unit metadata from store
  # @return [Float] 0.0 to 1.0
  def recency_score(unit)
    return NEUTRAL_SCORE unless unit

    frequency = dig_metadata(unit, :git, :change_frequency)
    RECENCY_SCORES.fetch(frequency.to_s, NEUTRAL_SCORE)
  end

  # Importance score based on PageRank / structural importance.
  #
  # @param unit [Hash, Object, nil] Unit metadata from store
  # @return [Float] 0.0 to 1.0
  def importance_score(unit)
    return NEUTRAL_SCORE unless unit

    IMPORTANCE_SCORES.fetch(dig_metadata(unit, :importance).to_s, NEUTRAL_SCORE)
  end

  # Type match score — bonus when result type matches query target_type.
  #
  # @param unit [Hash, Object, nil] Unit metadata from store
  # @param classification [QueryClassifier::Classification]
  # @return [Float] 1.0 on match, 0.3 on mismatch, 0.5 when unit or target type is missing
  def type_match_score(unit, classification)
    target = classification.target_type
    return NEUTRAL_SCORE unless unit && target

    dig_metadata(unit, :type).to_s == target.to_s ? 1.0 : 0.3
  end

  # Apply diversity penalty to avoid clustering by type/namespace.
  #
  # Penalties are assigned walking the list in its current (pre-penalty) order,
  # then the list is re-sorted in place.
  #
  # @param sorted [Array<Hash>] Scored items sorted by weighted_score
  # @return [void] Mutates items in place
  def apply_diversity_penalty(sorted)
    seen_namespaces = Hash.new(0)
    seen_types = Hash.new(0)

    sorted.each do |item|
      penalty = diversity_penalty_for(item, seen_namespaces, seen_types)
      next unless penalty

      item[:scores][:diversity] = 1.0 - penalty
      item[:weighted_score] -= penalty * WEIGHTS[:diversity]
    end

    sorted.replace(sort_descending(sorted))
  end

  # Compute diversity penalty for a single item and update seen counts.
  #
  # Uses the unit cached in item[:unit] to avoid a redundant metadata store lookup.
  # The penalty is 0.1 per previously seen item sharing the namespace or the
  # type, capped at 0.5.
  #
  # @param item [Hash] Scored item
  # @param seen_namespaces [Hash{String => Integer}] Running counts (mutated)
  # @param seen_types [Hash{String => Integer}] Running counts (mutated)
  # @return [Float, nil] Penalty amount, or nil if unit not found
  def diversity_penalty_for(item, seen_namespaces, seen_types)
    unit = item[:unit]
    return nil unless unit

    namespace = (dig_metadata(unit, :namespace) || 'root').to_s
    type = (dig_metadata(unit, :type) || 'unknown').to_s

    penalty = [(seen_namespaces[namespace] + seen_types[type]) * 0.1, 0.5].min
    seen_namespaces[namespace] += 1
    seen_types[type] += 1
    penalty
  end

  # Dig into unit metadata, handling both hash and object access.
  #
  # The key path is resolved under the unit's `metadata`. For a single key the
  # lookup falls back to the unit's own top-level field when metadata has no
  # truthy value for it.
  #
  # @param unit [Hash, Object] Unit data
  # @param keys [Array<Symbol>] Key path
  # @return [Object, nil]
  def dig_metadata(unit, *keys)
    return nil if unit.nil? || keys.empty?

    nested = keys.reduce(read_field(unit, :metadata)) do |node, key|
      break nil if node.nil?

      read_field(node, key)
    end
    return nested if nested || keys.size > 1

    read_field(unit, keys.first)
  end

  # Read one field from a hash (symbol key first, then string key) or from an
  # object exposing a reader of that name.
  #
  # @param container [Hash, Object]
  # @param key [Symbol]
  # @return [Object, nil]
  def read_field(container, key)
    if container.is_a?(Hash)
      value = container[key]
      value.nil? ? container[key.to_s] : value
    elsif container.respond_to?(key)
      container.public_send(key)
    end
  end

  # Build a Candidate struct compatible with SearchExecutor::Candidate.
  #
  # @return [Candidate-like Struct]
  def build_candidate(identifier:, score:, source:, metadata:)
    SearchExecutor::Candidate.new(
      identifier: identifier,
      score: score,
      source: source,
      metadata: metadata
    )
  end
end
|
|
272
|
+
end
|
|
273
|
+
end
|