codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set'
4
+
5
module CodebaseIndex
  module Retrieval
    # Classifies natural language queries to determine retrieval strategy.
    #
    # Uses heuristic pattern matching to determine:
    # - Intent: what the user wants to do
    # - Scope: how broad the search should be
    # - Target type: what kind of code unit to look for
    # - Framework context: whether this is about Rails/gems vs app code
    #
    # Every dimension is resolved by walking an ordered pattern table and
    # taking the first entry whose regex matches, so the order of the hash
    # literals below is significant: an earlier entry shadows a later one
    # whenever both match the same query.
    #
    # @example
    #   result = QueryClassifier.new.classify('Where is the User model defined?')
    #   result.intent      # => :locate
    #   result.scope       # => :focused
    #   result.target_type # => :model
    #   result.keywords    # => ["user", "model", "defined"]
    #
    class QueryClassifier
      # Classification result
      Classification = Struct.new(:intent, :scope, :target_type, :framework_context, :keywords, keyword_init: true)

      INTENTS = %i[understand locate trace debug implement reference compare framework].freeze
      SCOPES = %i[pinpoint focused exploratory comprehensive].freeze

      # Words dropped from the keyword list because they carry no search signal.
      STOP_WORDS = Set.new(%w[the a an is are was were be been being have has had do does did will would could
                              should may might can shall in on at to for of and or but not with by from as
                              this that these those it its how what when where why who which]).freeze

      # Intent patterns — order matters (first match wins)
      INTENT_PATTERNS = {
        locate: /\b(where|find|which file|locate|look for|search for)\b/i,
        trace: /\b(trace|follow|track|call(s|ed by)|depends on|used by|who calls|what calls)\b/i,
        debug: /\b(bug|error|fix|broken|failing|wrong|issue|problem|crash|exception)\b/i,
        implement: /\b(implement|add|create|build|write|make|generate)\b/i,
        compare: /\b(compare|difference|vs|versus|between|contrast)\b/i,
        # rubocop:disable Layout/LineLength
        framework: /\b(how does rails|what does rails|rails .+ work|work.+\brails\b|in rails\b|activerecord|actioncontroller|activejob)\b/i,
        # rubocop:enable Layout/LineLength
        reference: /\b(show me|what is|what are|list|options for|api|interface|signature)\b/i,
        understand: /\b(how|why|explain|understand|what happens|describe|overview)\b/i
      }.freeze

      # Scope patterns — order matters (first match wins)
      SCOPE_PATTERNS = {
        pinpoint: /\b(exactly|specific|this one|just the|only the)\b/i,
        comprehensive: /\b(all|every|entire|whole|complete|everything)\b/i,
        exploratory: /\b(related|around|near|similar|like|associated)\b/i
      }.freeze

      # Target type patterns — order matters (first match wins)
      TARGET_PATTERNS = {
        model: /\b(model|activerecord|association|schema|table|column|scope|validation)\b/i,
        controller: /\b(controller|action|route|endpoint|api|request|response|filter|callback)\b/i,
        service: /\b(service|interactor|operation|command|use.?case|business.?logic)\b/i,
        job: /\b(job|worker|background|async|sidekiq|queue|perform)\b/i,
        mailer: /\b(mailer|email|notification|send.?mail)\b/i,
        graphql: /\b(graphql|mutation|query|type|resolver|field|argument|schema)\b/i,
        concern: /\b(concern|mixin|module|included|extend)\b/i,
        route: /\b(route|path|url|endpoint|uri|http|get|post|put|patch|delete)\b/i,
        middleware: /\b(middleware|rack|request.?pipeline|before.?action)\b/i,
        # `t(` is matched outside the \b(...)\b group: a trailing \b after "(" would
        # require a word character to follow, so calls such as t('key') or t(:key)
        # could never match.
        i18n: /\b(i18n|translation|locale|internationalization|translate)\b|\bt\(/i,
        pundit_policy: /\b(pundit|authorize|policy|allowed|permitted)\b/i,
        configuration: /\b(config|initializer|environment|setting|configure)\b/i,
        engine: /\b(engine|mountable|mount|railtie|plugin|isolated.?namespace)\b/i,
        view_template: /\b(view|template|partial|render|erb|layout|html)\b/i,
        # rubocop:disable Layout/LineLength
        migration: /\b(migration|migrate|schema.?change|add.?column|remove.?column|create.?table|drop.?table|db.?migrate)\b/i,
        action_cable_channel: /\b(action.?cable|websocket|broadcast|cable.?channel|subscription.?channel|realtime|real.?time)\b/i,
        scheduled_job: /\b(schedule[dr]?|recurring|cron|periodic|every\s+\d|daily|hourly|weekly|solid.?queue.*recur|sidekiq.?cron|whenever)\b/i,
        rake_task: /\b(rake|rake.?task|lib.?tasks?|maintenance.?script|batch.?script)\b/i
        # rubocop:enable Layout/LineLength
      }.freeze

      # Classify a query string
      #
      # A nil query is treated as an empty string and therefore yields the
      # defaults (:understand / :focused / nil / false / []).
      #
      # @param query [String, nil] Natural language query
      # @return [Classification] Classified query
      def classify(query)
        text = query.to_s

        Classification.new(
          intent: detect_intent(text),
          scope: detect_scope(text),
          target_type: detect_target_type(text),
          framework_context: framework_query?(text),
          keywords: extract_keywords(text)
        )
      end

      private

      # @param query [String]
      # @return [Symbol] First matching intent, :understand when none matches
      def detect_intent(query)
        first_match(INTENT_PATTERNS, query) || :understand
      end

      # @param query [String]
      # @return [Symbol] First matching scope, :focused when none matches
      def detect_scope(query)
        first_match(SCOPE_PATTERNS, query) || :focused
      end

      # @param query [String]
      # @return [Symbol, nil] First matching target type, nil when none matches
      def detect_target_type(query)
        first_match(TARGET_PATTERNS, query)
      end

      # Walk an ordered pattern table and return the key of the first regex
      # that matches the query.
      #
      # @param patterns [Hash{Symbol => Regexp}] Ordered pattern table
      # @param query [String]
      # @return [Symbol, nil]
      def first_match(patterns, query)
        label, _pattern = patterns.find { |_label, pattern| query.match?(pattern) }
        label
      end

      # @param query [String]
      # @return [Boolean]
      def framework_query?(query)
        query.match?(/\b(rails|activerecord|actioncontroller|activejob|actionmailer|activesupport|rack|middleware)\b/i)
      end

      # Lower-case the query, replace punctuation with spaces, and return the
      # unique remaining words that are not stop words and are at least two
      # characters long.
      #
      # Uses the Unicode-aware [[:word:]] class rather than \w (ASCII-only in
      # Ruby) so that non-ASCII letters are kept instead of splitting the word.
      #
      # @param query [String]
      # @return [Array<String>]
      def extract_keywords(query)
        query.downcase
             .gsub(/[^[:word:]\s]/, ' ')
             .split
             .reject { |word| STOP_WORDS.include?(word) || word.length < 2 }
             .uniq
      end
    end
  end
end
@@ -0,0 +1,273 @@
1
+ # frozen_string_literal: true
2
+
3
module CodebaseIndex
  module Retrieval
    # Ranks search candidates using weighted signal scoring and diversity adjustment.
    #
    # Combines multiple ranking signals into a final score:
    # - Semantic: the candidate's retrieval score (normalised after RRF fusion)
    # - Keyword match quality (number of matched fields)
    # - Recency (git change frequency)
    # - Importance (the unit's `importance` label: high / medium / low)
    # - Type match (bonus when result type matches query target_type)
    # - Diversity (penalty for too many results of same type/namespace)
    #
    # When candidates come from more than one retrieval source they are first
    # merged with Reciprocal Rank Fusion (RRF); the weighted signal scoring is
    # then applied to the merged list.
    #
    # @example
    #   ranker = Ranker.new(metadata_store: store)
    #   ranked = ranker.rank(candidates, classification: classification)
    #
    class Ranker
      # Signal weights for ranking — sum to 1.0.
      WEIGHTS = {
        semantic: 0.40,
        keyword: 0.20,
        recency: 0.15,
        importance: 0.10,
        type_match: 0.10,
        diversity: 0.05
      }.freeze

      # RRF constant — balances rank position vs. absolute score.
      # Standard value from the original RRF paper (Cormack et al., 2009).
      RRF_K = 60

      # @param metadata_store [#find] Store that resolves identifiers to unit metadata
      def initialize(metadata_store:)
        @metadata_store = metadata_store
      end

      # Rank candidates by weighted signal scoring with diversity adjustment.
      #
      # With a single source the input candidates are returned re-ordered.
      # With several sources the returned candidates are rebuilt by RRF: one
      # per identifier, carrying the raw fused RRF score.
      #
      # @param candidates [Array<Candidate>, nil] Search candidates from executor
      # @param classification [QueryClassifier::Classification] Query classification
      # @return [Array<Candidate>] Re-ranked candidates (best first)
      def rank(candidates, classification:)
        return [] if candidates.nil? || candidates.empty?

        # Apply RRF if candidates come from multiple sources
        fused = multi_source?(candidates)
        candidates = apply_rrf(candidates) if fused

        # Raw RRF scores top out at (sources / RRF_K), far below the 0..1 range
        # the other signals use; rescale so the relevance signal keeps its weight.
        scale = fused ? relevance_scale(candidates) : 1.0

        scored = score_candidates(candidates, classification, scale)
        sorted = sorted_by_weighted_score(scored)
        apply_diversity_penalty(sorted)

        sorted.map { |item| item[:candidate] }
      end

      private

      # Check if candidates come from multiple retrieval sources.
      #
      # @param candidates [Array<Candidate>]
      # @return [Boolean]
      def multi_source?(candidates)
        candidates.map(&:source).uniq.size > 1
      end

      # Apply Reciprocal Rank Fusion across sources.
      #
      # RRF formula: score(d) = sum(1/(k + rank_i(d)))
      # Each source's candidates are ranked independently, then RRF
      # merges ranks into a single score.
      #
      # @param candidates [Array<Candidate>]
      # @return [Array<Candidate>] Merged candidates with RRF scores, best first
      def apply_rrf(candidates)
        rrf_scores, metadata_map = compute_rrf_scores(candidates)
        rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
      end

      # Compute RRF scores across all sources.
      #
      # Ranks are 0-based here, so the best candidate of each source
      # contributes 1 / RRF_K. The first non-nil metadata seen for an
      # identifier is the one kept.
      #
      # @param candidates [Array<Candidate>]
      # @return [Array(Hash, Hash)] [rrf_scores, metadata_map]
      def compute_rrf_scores(candidates)
        rrf_scores = Hash.new(0.0)
        metadata_map = {}

        candidates.group_by(&:source).each_value do |source_candidates|
          # to_f keeps a nil score from crashing the sort (matches score_candidates).
          ranked = source_candidates.sort_by { |c| -c.score.to_f }
          ranked.each_with_index do |candidate, rank|
            rrf_scores[candidate.identifier] += 1.0 / (RRF_K + rank)
            metadata_map[candidate.identifier] ||= candidate.metadata
          end
        end

        [rrf_scores, metadata_map]
      end

      # Rebuild candidates with merged RRF scores.
      #
      # The source reported for a merged candidate is that of the first input
      # candidate carrying the identifier.
      #
      # NOTE(review): only identifier, score, source and metadata are carried
      # over; any further Candidate attributes (e.g. matched_fields, if the
      # struct defines it) are not — confirm against SearchExecutor::Candidate.
      #
      # @param candidates [Array<Candidate>] Original, un-merged candidates
      # @param rrf_scores [Hash{Object => Float}] Fused score per identifier
      # @param metadata_map [Hash{Object => Object}] Metadata per identifier
      # @return [Array<Candidate>] Sorted by RRF score descending
      def rebuild_rrf_candidates(candidates, rrf_scores, metadata_map)
        # One pass instead of a linear search per identifier.
        first_seen = candidates.each_with_object({}) do |candidate, index|
          index[candidate.identifier] ||= candidate
        end

        rrf_scores.sort_by { |_id, score| -score }.map do |identifier, score|
          build_candidate(
            identifier: identifier,
            score: score,
            source: first_seen[identifier]&.source || :rrf,
            metadata: metadata_map[identifier]
          )
        end
      end

      # Factor that maps the best candidate score onto 1.0.
      #
      # @param candidates [Array<Candidate>]
      # @return [Float] 1 / max score, or 1.0 when there is no positive score
      def relevance_scale(candidates)
        top = candidates.map { |candidate| candidate.score.to_f }.max
        top&.positive? ? 1.0 / top : 1.0
      end

      # Score each candidate across all signals.
      #
      # @param candidates [Array<Candidate>]
      # @param classification [QueryClassifier::Classification]
      # @param relevance_scale [Float] Multiplier applied to the raw candidate
      #   score to form the semantic signal (1.0 leaves it untouched)
      # @return [Array<Hash>] One hash per candidate with :candidate, :unit and :scores
      def score_candidates(candidates, classification, relevance_scale = 1.0)
        candidates.map do |candidate|
          unit = @metadata_store.find(candidate.identifier)

          {
            candidate: candidate,
            unit: unit, # cached to avoid double lookup in apply_diversity_penalty
            scores: {
              semantic: candidate.score.to_f * relevance_scale,
              keyword: keyword_score(candidate),
              recency: recency_score(unit),
              importance: importance_score(unit),
              type_match: type_match_score(unit, classification),
              diversity: 1.0 # Adjusted after initial sort
            }
          }
        end
      end

      # Calculate weighted score for each item.
      #
      # @param scored [Array<Hash>] Items from score_candidates; each gains :weighted_score
      # @return [Array<Hash>] Sorted by weighted_score descending
      def sorted_by_weighted_score(scored)
        scored.each do |item|
          item[:weighted_score] = WEIGHTS.sum do |signal, weight|
            item[:scores][signal] * weight
          end
        end

        scored.sort_by { |item| -item[:weighted_score] }
      end

      # Keyword match score based on matched field count.
      #
      # Each matched field is worth 0.25, capped at 1.0. Candidates that do not
      # expose matched_fields score 0.0.
      #
      # @param candidate [Candidate]
      # @return [Float] 0.0 to 1.0
      def keyword_score(candidate)
        return 0.0 unless candidate.respond_to?(:matched_fields) && candidate.matched_fields

        [candidate.matched_fields.size * 0.25, 1.0].min
      end

      # Recency score based on git change frequency metadata.
      #
      # @param unit [Hash, nil] Unit metadata from store
      # @return [Float] 0.0 to 1.0
      def recency_score(unit)
        return 0.5 unless unit

        frequency = dig_metadata(unit, :git, :change_frequency)
        case frequency&.to_sym
        when :hot then 1.0
        when :active then 0.8
        when :dormant then 0.3
        when :new then 0.7
        else 0.5 # stable or unknown
        end
      end

      # Importance score based on the unit's importance label.
      #
      # @param unit [Hash, nil] Unit metadata from store
      # @return [Float] 0.0 to 1.0
      def importance_score(unit)
        return 0.5 unless unit

        importance = dig_metadata(unit, :importance)
        case importance&.to_s
        when 'high' then 1.0
        when 'medium' then 0.6
        when 'low' then 0.3
        else 0.5
        end
      end

      # Type match score — bonus when result type matches query target_type.
      #
      # Neutral (0.5) when the unit is missing or the query has no target type;
      # otherwise 1.0 on a match and 0.3 on a mismatch or unknown unit type.
      #
      # @param unit [Hash, Object, nil] Unit metadata from store
      # @param classification [QueryClassifier::Classification]
      # @return [Float] 0.0 to 1.0
      def type_match_score(unit, classification)
        return 0.5 unless unit
        return 0.5 unless classification.target_type

        unit_type = dig_metadata(unit, :type)
        # Keep [] access for non-Hash units that support it, but do not raise
        # NoMethodError for units that do not — the other signals degrade
        # gracefully for such units and so should this one.
        unit_type ||= unit[:type] if !unit.is_a?(Hash) && unit.respond_to?(:[])
        unit_type&.to_sym == classification.target_type ? 1.0 : 0.3
      end

      # Apply diversity penalty to avoid clustering by type/namespace.
      #
      # Walks the items best-first, lowers the score of items whose namespace
      # or type has already been seen, then re-sorts in place.
      #
      # @param sorted [Array<Hash>] Scored items sorted by weighted_score
      # @return [void] Mutates items in place
      def apply_diversity_penalty(sorted)
        seen_namespaces = Hash.new(0)
        seen_types = Hash.new(0)

        sorted.each do |item|
          penalty = diversity_penalty_for(item, seen_namespaces, seen_types)
          next unless penalty

          item[:scores][:diversity] = 1.0 - penalty
          item[:weighted_score] -= penalty * WEIGHTS[:diversity]
        end

        sorted.sort_by! { |item| -item[:weighted_score] }
      end

      # Compute diversity penalty for a single item and update seen counts.
      #
      # Uses the unit cached in item[:unit] to avoid a redundant metadata store lookup.
      # The penalty is 0.1 per earlier item sharing the namespace plus 0.1 per
      # earlier item sharing the type, capped at 0.5.
      #
      # @param item [Hash] Scored item with a cached :unit
      # @param seen_namespaces [Hash{Object => Integer}] Running namespace counts (mutated)
      # @param seen_types [Hash{String => Integer}] Running type counts (mutated)
      # @return [Float, nil] Penalty amount, or nil if unit not found
      def diversity_penalty_for(item, seen_namespaces, seen_types)
        unit = item[:unit]
        return nil unless unit

        namespace = dig_metadata(unit, :namespace) || 'root'
        type = (dig_metadata(unit, :type) || 'unknown').to_s

        penalty = [(seen_namespaces[namespace] + seen_types[type]) * 0.1, 0.5].min
        seen_namespaces[namespace] += 1
        seen_types[type] += 1
        penalty
      end

      # Dig into unit metadata.
      #
      # Only Hash units are inspected; any other object yields nil. A single
      # key is looked up under :metadata first and then at the top level of the
      # unit; a multi-key path is looked up under :metadata only.
      #
      # @param unit [Hash, Object] Unit data
      # @param keys [Array<Symbol>] Key path
      # @return [Object, nil]
      def dig_metadata(unit, *keys)
        return nil unless unit.is_a?(Hash)

        nested = unit.dig(:metadata, *keys)
        return nested unless keys.size == 1

        nested || unit[keys.first]
      end

      # Build a Candidate struct compatible with SearchExecutor::Candidate.
      #
      # @return [Candidate-like Struct]
      def build_candidate(identifier:, score:, source:, metadata:)
        SearchExecutor::Candidate.new(
          identifier: identifier,
          score: score,
          source: source,
          metadata: metadata
        )
      end
    end
  end
end