codebase_index 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +60 -0
- data/README.md +95 -300
- data/exe/codebase-index-mcp +3 -31
- data/exe/codebase-index-mcp-http +3 -31
- data/lib/codebase_index/ast/method_extractor.rb +3 -8
- data/lib/codebase_index/ast/node.rb +28 -0
- data/lib/codebase_index/ast/parser.rb +53 -92
- data/lib/codebase_index/builder.rb +67 -4
- data/lib/codebase_index/cache/cache_middleware.rb +199 -0
- data/lib/codebase_index/cache/cache_store.rb +264 -0
- data/lib/codebase_index/cache/redis_cache_store.rb +116 -0
- data/lib/codebase_index/cache/solid_cache_store.rb +111 -0
- data/lib/codebase_index/chunking/semantic_chunker.rb +29 -24
- data/lib/codebase_index/console/adapters/good_job_adapter.rb +7 -40
- data/lib/codebase_index/console/adapters/job_adapter.rb +68 -0
- data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +7 -40
- data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +7 -40
- data/lib/codebase_index/console/bridge.rb +7 -0
- data/lib/codebase_index/console/console_response_renderer.rb +3 -7
- data/lib/codebase_index/console/embedded_executor.rb +2 -1
- data/lib/codebase_index/console/server.rb +1 -4
- data/lib/codebase_index/dependency_graph.rb +28 -19
- data/lib/codebase_index/embedding/indexer.rb +18 -8
- data/lib/codebase_index/embedding/openai.rb +27 -6
- data/lib/codebase_index/embedding/provider.rb +29 -2
- data/lib/codebase_index/evaluation/evaluator.rb +5 -12
- data/lib/codebase_index/extractor.rb +40 -44
- data/lib/codebase_index/extractors/action_cable_extractor.rb +9 -36
- data/lib/codebase_index/extractors/callback_analyzer.rb +22 -8
- data/lib/codebase_index/extractors/controller_extractor.rb +3 -93
- data/lib/codebase_index/extractors/decorator_extractor.rb +7 -14
- data/lib/codebase_index/extractors/engine_extractor.rb +20 -1
- data/lib/codebase_index/extractors/graphql_extractor.rb +4 -29
- data/lib/codebase_index/extractors/job_extractor.rb +11 -6
- data/lib/codebase_index/extractors/lib_extractor.rb +0 -31
- data/lib/codebase_index/extractors/mailer_extractor.rb +15 -85
- data/lib/codebase_index/extractors/manager_extractor.rb +1 -15
- data/lib/codebase_index/extractors/model_extractor.rb +20 -53
- data/lib/codebase_index/extractors/phlex_extractor.rb +8 -8
- data/lib/codebase_index/extractors/policy_extractor.rb +1 -24
- data/lib/codebase_index/extractors/poro_extractor.rb +0 -17
- data/lib/codebase_index/extractors/serializer_extractor.rb +12 -7
- data/lib/codebase_index/extractors/service_extractor.rb +1 -38
- data/lib/codebase_index/extractors/shared_utility_methods.rb +183 -1
- data/lib/codebase_index/extractors/validator_extractor.rb +3 -17
- data/lib/codebase_index/extractors/view_component_extractor.rb +10 -9
- data/lib/codebase_index/filename_utils.rb +32 -0
- data/lib/codebase_index/flow_analysis/operation_extractor.rb +1 -4
- data/lib/codebase_index/formatting/base.rb +0 -10
- data/lib/codebase_index/graph_analyzer.rb +1 -1
- data/lib/codebase_index/mcp/bootstrapper.rb +58 -0
- data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +35 -34
- data/lib/codebase_index/mcp/renderers/plain_renderer.rb +29 -29
- data/lib/codebase_index/mcp/server.rb +59 -68
- data/lib/codebase_index/mcp/tool_response_renderer.rb +23 -0
- data/lib/codebase_index/notion/client.rb +2 -2
- data/lib/codebase_index/notion/mapper.rb +1 -0
- data/lib/codebase_index/notion/mappers/column_mapper.rb +3 -11
- data/lib/codebase_index/notion/mappers/model_mapper.rb +20 -23
- data/lib/codebase_index/notion/mappers/shared.rb +22 -0
- data/lib/codebase_index/observability/health_check.rb +0 -2
- data/lib/codebase_index/observability/structured_logger.rb +12 -30
- data/lib/codebase_index/operator/pipeline_guard.rb +0 -7
- data/lib/codebase_index/resilience/index_validator.rb +3 -21
- data/lib/codebase_index/retrieval/context_assembler.rb +19 -7
- data/lib/codebase_index/retrieval/query_classifier.rb +14 -12
- data/lib/codebase_index/retrieval/ranker.rb +6 -2
- data/lib/codebase_index/retrieval/search_executor.rb +8 -19
- data/lib/codebase_index/retriever.rb +1 -9
- data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +5 -25
- data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +6 -7
- data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +58 -53
- data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +11 -7
- data/lib/codebase_index/session_tracer/file_store.rb +1 -8
- data/lib/codebase_index/session_tracer/redis_store.rb +1 -7
- data/lib/codebase_index/session_tracer/session_flow_assembler.rb +4 -13
- data/lib/codebase_index/session_tracer/solid_cache_store.rb +1 -7
- data/lib/codebase_index/session_tracer/store.rb +14 -0
- data/lib/codebase_index/storage/metadata_store.rb +37 -10
- data/lib/codebase_index/storage/pgvector.rb +37 -5
- data/lib/codebase_index/storage/qdrant.rb +39 -6
- data/lib/codebase_index/storage/vector_store.rb +11 -0
- data/lib/codebase_index/temporal/snapshot_store.rb +14 -10
- data/lib/codebase_index/token_utils.rb +19 -0
- data/lib/codebase_index/version.rb +1 -1
- data/lib/codebase_index.rb +25 -6
- data/lib/tasks/codebase_index.rake +2 -2
- metadata +11 -2
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Console
|
|
5
|
+
module Adapters
|
|
6
|
+
# Base class for job backend adapters.
|
|
7
|
+
#
|
|
8
|
+
# Subclasses implement `self.available?` and a private `prefix` method.
|
|
9
|
+
# The prefix is used to build bridge tool names (e.g., "sidekiq_queue_stats").
|
|
10
|
+
#
|
|
11
|
+
# @example
|
|
12
|
+
# class MyAdapter < JobAdapter
|
|
13
|
+
# def self.available? = !!defined?(::MyQueue)
|
|
14
|
+
# private
|
|
15
|
+
# def prefix = 'my_queue'
|
|
16
|
+
# end
|
|
17
|
+
#
|
|
18
|
+
class JobAdapter
|
|
19
|
+
# Get queue statistics (sizes, latencies).
|
|
20
|
+
#
|
|
21
|
+
# @return [Hash] Bridge request
|
|
22
|
+
def queue_stats
|
|
23
|
+
{ tool: "#{prefix}_queue_stats", params: {} }
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# List recent job failures.
|
|
27
|
+
#
|
|
28
|
+
# @param limit [Integer] Max failures (default: 10, max: 100)
|
|
29
|
+
# @return [Hash] Bridge request
|
|
30
|
+
def recent_failures(limit: 10)
|
|
31
|
+
limit = [limit, 100].min
|
|
32
|
+
{ tool: "#{prefix}_recent_failures", params: { limit: limit } }
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
# Find a job by its ID.
|
|
36
|
+
#
|
|
37
|
+
# @param id [Object] Job ID
|
|
38
|
+
# @return [Hash] Bridge request
|
|
39
|
+
def find_job(id:)
|
|
40
|
+
{ tool: "#{prefix}_find_job", params: { id: id } }
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# List scheduled jobs.
|
|
44
|
+
#
|
|
45
|
+
# @param limit [Integer] Max jobs (default: 20, max: 100)
|
|
46
|
+
# @return [Hash] Bridge request
|
|
47
|
+
def scheduled_jobs(limit: 20)
|
|
48
|
+
limit = [limit, 100].min
|
|
49
|
+
{ tool: "#{prefix}_scheduled_jobs", params: { limit: limit } }
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Retry a failed job.
|
|
53
|
+
#
|
|
54
|
+
# @param id [Object] Job ID
|
|
55
|
+
# @return [Hash] Bridge request
|
|
56
|
+
def retry_job(id:)
|
|
57
|
+
{ tool: "#{prefix}_retry_job", params: { id: id } }
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
private
|
|
61
|
+
|
|
62
|
+
def prefix
|
|
63
|
+
raise NotImplementedError, "#{self.class}#prefix must be implemented"
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative 'job_adapter'
|
|
4
|
+
|
|
3
5
|
module CodebaseIndex
|
|
4
6
|
module Console
|
|
5
7
|
module Adapters
|
|
@@ -12,53 +14,18 @@ module CodebaseIndex
|
|
|
12
14
|
# adapter = SidekiqAdapter.new
|
|
13
15
|
# adapter.queue_stats # => { tool: 'sidekiq_queue_stats', params: {} }
|
|
14
16
|
#
|
|
15
|
-
class SidekiqAdapter
|
|
17
|
+
class SidekiqAdapter < JobAdapter
|
|
16
18
|
# Check if Sidekiq is available in the current environment.
|
|
17
19
|
#
|
|
18
20
|
# @return [Boolean]
|
|
19
21
|
def self.available?
|
|
20
|
-
defined?(::Sidekiq)
|
|
22
|
+
!!defined?(::Sidekiq)
|
|
21
23
|
end
|
|
22
24
|
|
|
23
|
-
|
|
24
|
-
#
|
|
25
|
-
# @return [Hash] Bridge request
|
|
26
|
-
def queue_stats
|
|
27
|
-
{ tool: 'sidekiq_queue_stats', params: {} }
|
|
28
|
-
end
|
|
25
|
+
private
|
|
29
26
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# @param limit [Integer] Max failures (default: 10, max: 100)
|
|
33
|
-
# @return [Hash] Bridge request
|
|
34
|
-
def recent_failures(limit: 10)
|
|
35
|
-
limit = [limit, 100].min
|
|
36
|
-
{ tool: 'sidekiq_recent_failures', params: { limit: limit } }
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
# Find a job by its ID.
|
|
40
|
-
#
|
|
41
|
-
# @param id [String] Sidekiq job ID
|
|
42
|
-
# @return [Hash] Bridge request
|
|
43
|
-
def find_job(id:)
|
|
44
|
-
{ tool: 'sidekiq_find_job', params: { id: id } }
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# List scheduled jobs.
|
|
48
|
-
#
|
|
49
|
-
# @param limit [Integer] Max jobs (default: 20, max: 100)
|
|
50
|
-
# @return [Hash] Bridge request
|
|
51
|
-
def scheduled_jobs(limit: 20)
|
|
52
|
-
limit = [limit, 100].min
|
|
53
|
-
{ tool: 'sidekiq_scheduled_jobs', params: { limit: limit } }
|
|
54
|
-
end
|
|
55
|
-
|
|
56
|
-
# Retry a failed job.
|
|
57
|
-
#
|
|
58
|
-
# @param id [String] Sidekiq job ID
|
|
59
|
-
# @return [Hash] Bridge request
|
|
60
|
-
def retry_job(id:)
|
|
61
|
-
{ tool: 'sidekiq_retry_job', params: { id: id } }
|
|
27
|
+
def prefix
|
|
28
|
+
'sidekiq'
|
|
62
29
|
end
|
|
63
30
|
end
|
|
64
31
|
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative 'job_adapter'
|
|
4
|
+
|
|
3
5
|
module CodebaseIndex
|
|
4
6
|
module Console
|
|
5
7
|
module Adapters
|
|
@@ -12,53 +14,18 @@ module CodebaseIndex
|
|
|
12
14
|
# adapter = SolidQueueAdapter.new
|
|
13
15
|
# adapter.queue_stats # => { tool: 'solid_queue_queue_stats', params: {} }
|
|
14
16
|
#
|
|
15
|
-
class SolidQueueAdapter
|
|
17
|
+
class SolidQueueAdapter < JobAdapter
|
|
16
18
|
# Check if Solid Queue is available in the current environment.
|
|
17
19
|
#
|
|
18
20
|
# @return [Boolean]
|
|
19
21
|
def self.available?
|
|
20
|
-
defined?(::SolidQueue)
|
|
22
|
+
!!defined?(::SolidQueue)
|
|
21
23
|
end
|
|
22
24
|
|
|
23
|
-
|
|
24
|
-
#
|
|
25
|
-
# @return [Hash] Bridge request
|
|
26
|
-
def queue_stats
|
|
27
|
-
{ tool: 'solid_queue_queue_stats', params: {} }
|
|
28
|
-
end
|
|
25
|
+
private
|
|
29
26
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# @param limit [Integer] Max failures (default: 10, max: 100)
|
|
33
|
-
# @return [Hash] Bridge request
|
|
34
|
-
def recent_failures(limit: 10)
|
|
35
|
-
limit = [limit, 100].min
|
|
36
|
-
{ tool: 'solid_queue_recent_failures', params: { limit: limit } }
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
# Find a job by its ID.
|
|
40
|
-
#
|
|
41
|
-
# @param id [Object] Solid Queue job ID
|
|
42
|
-
# @return [Hash] Bridge request
|
|
43
|
-
def find_job(id:)
|
|
44
|
-
{ tool: 'solid_queue_find_job', params: { id: id } }
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# List scheduled jobs.
|
|
48
|
-
#
|
|
49
|
-
# @param limit [Integer] Max jobs (default: 20, max: 100)
|
|
50
|
-
# @return [Hash] Bridge request
|
|
51
|
-
def scheduled_jobs(limit: 20)
|
|
52
|
-
limit = [limit, 100].min
|
|
53
|
-
{ tool: 'solid_queue_scheduled_jobs', params: { limit: limit } }
|
|
54
|
-
end
|
|
55
|
-
|
|
56
|
-
# Retry a failed job.
|
|
57
|
-
#
|
|
58
|
-
# @param id [Object] Solid Queue job ID
|
|
59
|
-
# @return [Hash] Bridge request
|
|
60
|
-
def retry_job(id:)
|
|
61
|
-
{ tool: 'solid_queue_retry_job', params: { id: id } }
|
|
27
|
+
def prefix
|
|
28
|
+
'solid_queue'
|
|
62
29
|
end
|
|
63
30
|
end
|
|
64
31
|
end
|
|
@@ -23,6 +23,8 @@ module CodebaseIndex
|
|
|
23
23
|
#
|
|
24
24
|
class Bridge
|
|
25
25
|
SUPPORTED_TOOLS = %w[count sample find pluck aggregate association_count schema recent status].freeze
|
|
26
|
+
# Alias used by EmbeddedExecutor to avoid duplicating the list.
|
|
27
|
+
TIER1_TOOLS = SUPPORTED_TOOLS
|
|
26
28
|
TOOL_HANDLERS = SUPPORTED_TOOLS.to_h { |t| [t, :"handle_#{t}"] }.freeze
|
|
27
29
|
|
|
28
30
|
# @param input [IO] Input stream (reads JSON-lines)
|
|
@@ -113,6 +115,11 @@ module CodebaseIndex
|
|
|
113
115
|
@model_validator.validate_model!(model)
|
|
114
116
|
end
|
|
115
117
|
|
|
118
|
+
# Stub handlers below return empty/zero data by design.
|
|
119
|
+
# This Bridge class is a protocol scaffold — real execution happens
|
|
120
|
+
# in EmbeddedExecutor (in-process) or a live Rails bridge process.
|
|
121
|
+
# The stubs satisfy the protocol contract for testing and offline use.
|
|
122
|
+
|
|
116
123
|
def handle_count(_params)
|
|
117
124
|
{ 'count' => 0 }
|
|
118
125
|
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative '../mcp/tool_response_renderer'
|
|
4
|
+
require_relative '../mcp/renderers/json_renderer'
|
|
4
5
|
|
|
5
6
|
module CodebaseIndex
|
|
6
7
|
module Console
|
|
@@ -66,13 +67,8 @@ module CodebaseIndex
|
|
|
66
67
|
end
|
|
67
68
|
|
|
68
69
|
# JSON passthrough renderer for backward compatibility.
|
|
69
|
-
#
|
|
70
|
-
class JsonConsoleRenderer < MCP::
|
|
71
|
-
# @param data [Object] Any JSON-serializable data
|
|
72
|
-
# @return [String] Pretty-printed JSON
|
|
73
|
-
def render_default(data)
|
|
74
|
-
JSON.pretty_generate(data)
|
|
75
|
-
end
|
|
70
|
+
# Delegates to MCP::Renderers::JsonRenderer for consistent JSON output.
|
|
71
|
+
class JsonConsoleRenderer < MCP::Renderers::JsonRenderer
|
|
76
72
|
end
|
|
77
73
|
end
|
|
78
74
|
end
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative 'bridge'
|
|
3
4
|
require_relative 'model_validator'
|
|
4
5
|
require_relative 'safe_context'
|
|
5
6
|
|
|
@@ -20,7 +21,7 @@ module CodebaseIndex
|
|
|
20
21
|
class EmbeddedExecutor # rubocop:disable Metrics/ClassLength
|
|
21
22
|
AGGREGATE_FUNCTIONS = %w[sum average minimum maximum].freeze
|
|
22
23
|
|
|
23
|
-
TIER1_TOOLS =
|
|
24
|
+
TIER1_TOOLS = Bridge::TIER1_TOOLS
|
|
24
25
|
|
|
25
26
|
# @param model_validator [ModelValidator] Validates model/column names
|
|
26
27
|
# @param safe_context [SafeContext] Wraps execution in rolled-back transaction
|
|
@@ -567,15 +567,12 @@ module CodebaseIndex
|
|
|
567
567
|
# rubocop:disable Metrics/ParameterLists
|
|
568
568
|
def define_console_tool(server, conn_mgr, name, description, properties:, required: nil,
|
|
569
569
|
safe_ctx: nil, renderer: nil, &tool_block)
|
|
570
|
-
mgr = conn_mgr
|
|
571
|
-
ctx = safe_ctx
|
|
572
|
-
rdr = renderer
|
|
573
570
|
bridge_method = method(:send_to_bridge)
|
|
574
571
|
schema = { properties: properties }
|
|
575
572
|
schema[:required] = required if required&.any?
|
|
576
573
|
server.define_tool(name: name, description: description, input_schema: schema) do |server_context:, **args|
|
|
577
574
|
request = tool_block.call(args)
|
|
578
|
-
bridge_method.call(
|
|
575
|
+
bridge_method.call(conn_mgr, request.transform_keys(&:to_s), safe_ctx, renderer: renderer)
|
|
579
576
|
end
|
|
580
577
|
end
|
|
581
578
|
# rubocop:enable Metrics/ParameterLists
|
|
@@ -24,15 +24,18 @@ module CodebaseIndex
|
|
|
24
24
|
def initialize
|
|
25
25
|
@nodes = {} # identifier => { type:, file_path: }
|
|
26
26
|
@edges = {} # identifier => [dependency identifiers]
|
|
27
|
-
@reverse = {} # identifier =>
|
|
27
|
+
@reverse = {} # identifier => Set of dependent identifiers
|
|
28
28
|
@file_map = {} # file_path => identifier
|
|
29
|
-
@type_index = {} # type =>
|
|
29
|
+
@type_index = {} # type => Set of identifiers
|
|
30
|
+
@to_h = nil
|
|
30
31
|
end
|
|
31
32
|
|
|
32
33
|
# Register a unit in the graph
|
|
33
34
|
#
|
|
34
35
|
# @param unit [ExtractedUnit] The unit to register
|
|
35
36
|
def register(unit)
|
|
37
|
+
@to_h = nil
|
|
38
|
+
|
|
36
39
|
@nodes[unit.identifier] = {
|
|
37
40
|
type: unit.type,
|
|
38
41
|
file_path: unit.file_path,
|
|
@@ -42,14 +45,12 @@ module CodebaseIndex
|
|
|
42
45
|
@edges[unit.identifier] = unit.dependencies.map { |d| d[:target] }
|
|
43
46
|
@file_map[unit.file_path] = unit.identifier if unit.file_path
|
|
44
47
|
|
|
45
|
-
# Type index for filtering
|
|
46
|
-
@type_index[unit.type] ||=
|
|
47
|
-
@type_index[unit.type] << unit.identifier unless @type_index[unit.type].include?(unit.identifier)
|
|
48
|
+
# Type index for filtering (Set-based for O(1) insert)
|
|
49
|
+
(@type_index[unit.type] ||= Set.new).add(unit.identifier)
|
|
48
50
|
|
|
49
|
-
# Build reverse edges
|
|
51
|
+
# Build reverse edges (Set-based for O(1) insert)
|
|
50
52
|
unit.dependencies.each do |dep|
|
|
51
|
-
@reverse[dep[:target]] ||=
|
|
52
|
-
@reverse[dep[:target]] << unit.identifier unless @reverse[dep[:target]].include?(unit.identifier)
|
|
53
|
+
(@reverse[dep[:target]] ||= Set.new).add(unit.identifier)
|
|
53
54
|
end
|
|
54
55
|
end
|
|
55
56
|
|
|
@@ -116,7 +117,7 @@ module CodebaseIndex
|
|
|
116
117
|
# @param identifier [String] Unit identifier
|
|
117
118
|
# @return [Array<String>] List of dependent identifiers
|
|
118
119
|
def dependents_of(identifier)
|
|
119
|
-
@reverse
|
|
120
|
+
@reverse.fetch(identifier, Set.new).to_a
|
|
120
121
|
end
|
|
121
122
|
|
|
122
123
|
# Get all units of a specific type
|
|
@@ -124,7 +125,7 @@ module CodebaseIndex
|
|
|
124
125
|
# @param type [Symbol] Unit type (:model, :controller, etc.)
|
|
125
126
|
# @return [Array<String>] List of unit identifiers
|
|
126
127
|
def units_of_type(type)
|
|
127
|
-
@type_index
|
|
128
|
+
@type_index.fetch(type, Set.new).to_a
|
|
128
129
|
end
|
|
129
130
|
|
|
130
131
|
# Compute PageRank scores for all nodes
|
|
@@ -140,18 +141,19 @@ module CodebaseIndex
|
|
|
140
141
|
n = @nodes.size
|
|
141
142
|
return {} if n.zero?
|
|
142
143
|
|
|
144
|
+
node_ids = @nodes.keys
|
|
143
145
|
base_score = 1.0 / n
|
|
144
|
-
scores =
|
|
146
|
+
scores = node_ids.to_h { |id| [id, base_score] }
|
|
145
147
|
|
|
146
148
|
iterations.times do
|
|
147
149
|
# Collect rank from dangling nodes (no outgoing edges) and redistribute
|
|
148
|
-
dangling_sum =
|
|
150
|
+
dangling_sum = node_ids.sum do |id|
|
|
149
151
|
@edges[id].nil? || @edges[id].empty? ? scores[id] : 0.0
|
|
150
152
|
end
|
|
151
153
|
|
|
152
154
|
new_scores = {}
|
|
153
155
|
|
|
154
|
-
|
|
156
|
+
node_ids.each do |id|
|
|
155
157
|
# Sum contributions from nodes that depend on this one
|
|
156
158
|
incoming = @reverse[id] || []
|
|
157
159
|
rank_sum = incoming.sum do |src|
|
|
@@ -168,22 +170,24 @@ module CodebaseIndex
|
|
|
168
170
|
scores
|
|
169
171
|
end
|
|
170
172
|
|
|
171
|
-
# Serialize graph for persistence
|
|
173
|
+
# Serialize graph for persistence. Memoized — cache is invalidated on register.
|
|
174
|
+
# Returns a dup so callers can't pollute the cached hash.
|
|
172
175
|
#
|
|
173
176
|
# @return [Hash] Complete graph data
|
|
174
177
|
def to_h
|
|
175
|
-
{
|
|
178
|
+
@to_h ||= {
|
|
176
179
|
nodes: @nodes,
|
|
177
180
|
edges: @edges,
|
|
178
|
-
reverse: @reverse,
|
|
181
|
+
reverse: @reverse.transform_values(&:to_a),
|
|
179
182
|
file_map: @file_map,
|
|
180
|
-
type_index: @type_index,
|
|
183
|
+
type_index: @type_index.transform_values(&:to_a),
|
|
181
184
|
stats: {
|
|
182
185
|
node_count: @nodes.size,
|
|
183
186
|
edge_count: @edges.values.sum(&:size),
|
|
184
187
|
types: @type_index.transform_values(&:size)
|
|
185
188
|
}
|
|
186
189
|
}
|
|
190
|
+
@to_h.dup
|
|
187
191
|
end
|
|
188
192
|
|
|
189
193
|
# Load graph from persisted data
|
|
@@ -201,11 +205,16 @@ module CodebaseIndex
|
|
|
201
205
|
graph.instance_variable_set(:@nodes, raw_nodes.transform_values { |v| symbolize_node(v) })
|
|
202
206
|
|
|
203
207
|
graph.instance_variable_set(:@edges, data[:edges] || data['edges'] || {})
|
|
204
|
-
|
|
208
|
+
|
|
209
|
+
raw_reverse = data[:reverse] || data['reverse'] || {}
|
|
210
|
+
graph.instance_variable_set(:@reverse, raw_reverse.transform_values { |v| v.is_a?(Set) ? v : Set.new(v) })
|
|
211
|
+
|
|
205
212
|
graph.instance_variable_set(:@file_map, data[:file_map] || data['file_map'] || {})
|
|
206
213
|
|
|
207
214
|
raw_type_index = data[:type_index] || data['type_index'] || {}
|
|
208
|
-
graph.instance_variable_set(:@type_index, raw_type_index.transform_keys(&:to_sym)
|
|
215
|
+
graph.instance_variable_set(:@type_index, raw_type_index.transform_keys(&:to_sym).transform_values do |v|
|
|
216
|
+
v.is_a?(Set) ? v : Set.new(v)
|
|
217
|
+
end)
|
|
209
218
|
|
|
210
219
|
graph
|
|
211
220
|
end
|
|
@@ -9,12 +9,14 @@ module CodebaseIndex
|
|
|
9
9
|
# generates embeddings, and stores vectors. Supports full and incremental
|
|
10
10
|
# modes with checkpoint-based resumability.
|
|
11
11
|
class Indexer
|
|
12
|
-
|
|
12
|
+
# @param checkpoint_interval [Integer] Save checkpoint every N batches (default: 10)
|
|
13
|
+
def initialize(provider:, text_preparer:, vector_store:, output_dir:, batch_size: 32, checkpoint_interval: 10) # rubocop:disable Metrics/ParameterLists
|
|
13
14
|
@provider = provider
|
|
14
15
|
@text_preparer = text_preparer
|
|
15
16
|
@vector_store = vector_store
|
|
16
17
|
@output_dir = output_dir
|
|
17
18
|
@batch_size = batch_size
|
|
19
|
+
@checkpoint_interval = checkpoint_interval
|
|
18
20
|
end
|
|
19
21
|
|
|
20
22
|
# Index all extracted units (full mode). Returns stats hash.
|
|
@@ -44,12 +46,17 @@ module CodebaseIndex
|
|
|
44
46
|
def process_units(units, incremental:)
|
|
45
47
|
checkpoint = incremental ? load_checkpoint : {}
|
|
46
48
|
stats = { processed: 0, skipped: 0, errors: 0 }
|
|
49
|
+
batch_count = 0
|
|
47
50
|
|
|
48
51
|
units.each_slice(@batch_size) do |batch|
|
|
49
52
|
process_batch(batch, checkpoint, stats, incremental: incremental)
|
|
50
|
-
|
|
53
|
+
batch_count += 1
|
|
54
|
+
save_checkpoint(checkpoint) if (batch_count % @checkpoint_interval).zero?
|
|
51
55
|
end
|
|
52
56
|
|
|
57
|
+
# Always save final checkpoint
|
|
58
|
+
save_checkpoint(checkpoint)
|
|
59
|
+
|
|
53
60
|
stats
|
|
54
61
|
end
|
|
55
62
|
|
|
@@ -98,16 +105,19 @@ module CodebaseIndex
|
|
|
98
105
|
store_vectors(items, vectors, checkpoint, stats)
|
|
99
106
|
rescue StandardError => e
|
|
100
107
|
stats[:errors] += items.size
|
|
101
|
-
stats[:error_messages] ||= []
|
|
102
|
-
stats[:error_messages] << e.message
|
|
103
108
|
raise CodebaseIndex::Error, "Embedding failed: #{e.message}"
|
|
104
109
|
end
|
|
105
110
|
|
|
106
111
|
def store_vectors(items, vectors, checkpoint, stats)
|
|
107
|
-
items.each_with_index do |item, idx|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
112
|
+
entries = items.each_with_index.map do |item, idx|
|
|
113
|
+
{ id: item[:id], vector: vectors[idx],
|
|
114
|
+
metadata: { type: item[:unit_data]['type'], identifier: item[:identifier],
|
|
115
|
+
file_path: item[:unit_data]['file_path'] } }
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
@vector_store.store_batch(entries)
|
|
119
|
+
|
|
120
|
+
items.each do |item|
|
|
111
121
|
checkpoint[item[:identifier]] = item[:source_hash]
|
|
112
122
|
stats[:processed] += 1
|
|
113
123
|
end
|
|
@@ -81,24 +81,45 @@ module CodebaseIndex
|
|
|
81
81
|
# @return [Hash] parsed JSON response
|
|
82
82
|
# @raise [CodebaseIndex::Error] if the API returns a non-success status
|
|
83
83
|
def post_request(body)
|
|
84
|
-
http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
|
|
85
|
-
http.use_ssl = true
|
|
86
|
-
http.open_timeout = 10
|
|
87
|
-
http.read_timeout = 30
|
|
88
|
-
|
|
89
84
|
request = Net::HTTP::Post.new(ENDPOINT.path)
|
|
90
85
|
request['Content-Type'] = 'application/json'
|
|
91
86
|
request['Authorization'] = "Bearer #{@api_key}"
|
|
92
87
|
request.body = body.to_json
|
|
93
88
|
|
|
94
|
-
response =
|
|
89
|
+
response = http_client.request(request)
|
|
90
|
+
|
|
91
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
92
|
+
raise CodebaseIndex::Error, "OpenAI API error: #{response.code} #{response.body}"
|
|
93
|
+
end
|
|
95
94
|
|
|
95
|
+
JSON.parse(response.body)
|
|
96
|
+
rescue Errno::ECONNRESET, Net::OpenTimeout, IOError
|
|
97
|
+
# Connection dropped — reset and retry once
|
|
98
|
+
@http_client = nil
|
|
99
|
+
response = http_client.request(request)
|
|
96
100
|
unless response.is_a?(Net::HTTPSuccess)
|
|
97
101
|
raise CodebaseIndex::Error, "OpenAI API error: #{response.code} #{response.body}"
|
|
98
102
|
end
|
|
99
103
|
|
|
100
104
|
JSON.parse(response.body)
|
|
101
105
|
end
|
|
106
|
+
|
|
107
|
+
# Return a reusable, started HTTP client for the OpenAI API.
|
|
108
|
+
# Calling http.start opens a persistent TCP connection so
|
|
109
|
+
# keep_alive_timeout actually takes effect across requests.
|
|
110
|
+
#
|
|
111
|
+
# @return [Net::HTTP]
|
|
112
|
+
def http_client
|
|
113
|
+
return @http_client if @http_client&.started?
|
|
114
|
+
|
|
115
|
+
http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
|
|
116
|
+
http.use_ssl = true
|
|
117
|
+
http.open_timeout = 10
|
|
118
|
+
http.read_timeout = 30
|
|
119
|
+
http.keep_alive_timeout = 30
|
|
120
|
+
http.start
|
|
121
|
+
@http_client = http
|
|
122
|
+
end
|
|
102
123
|
end
|
|
103
124
|
end
|
|
104
125
|
end
|
|
@@ -118,17 +118,44 @@ module CodebaseIndex
|
|
|
118
118
|
# @return [Hash] parsed JSON response
|
|
119
119
|
# @raise [CodebaseIndex::Error] if the API returns a non-success status
|
|
120
120
|
def post_request(body)
|
|
121
|
-
http = Net::HTTP.new(@uri.host, @uri.port)
|
|
122
121
|
request = Net::HTTP::Post.new(@uri.path, 'Content-Type' => 'application/json')
|
|
123
122
|
request.body = body.to_json
|
|
124
|
-
response =
|
|
123
|
+
response = http_client.request(request)
|
|
124
|
+
|
|
125
|
+
unless response.is_a?(Net::HTTPSuccess)
|
|
126
|
+
raise CodebaseIndex::Error, "Ollama API error: #{response.code} #{response.body}"
|
|
127
|
+
end
|
|
125
128
|
|
|
129
|
+
JSON.parse(response.body)
|
|
130
|
+
rescue Errno::ECONNRESET, Net::OpenTimeout, Net::ReadTimeout, IOError
|
|
131
|
+
# Connection dropped — reset and retry once
|
|
132
|
+
@http_client = nil
|
|
133
|
+
begin
|
|
134
|
+
response = http_client.request(request)
|
|
135
|
+
rescue StandardError => retry_error
|
|
136
|
+
raise CodebaseIndex::Error, "Ollama API error (retry failed): #{retry_error.message}"
|
|
137
|
+
end
|
|
126
138
|
unless response.is_a?(Net::HTTPSuccess)
|
|
127
139
|
raise CodebaseIndex::Error, "Ollama API error: #{response.code} #{response.body}"
|
|
128
140
|
end
|
|
129
141
|
|
|
130
142
|
JSON.parse(response.body)
|
|
131
143
|
end
|
|
144
|
+
|
|
145
|
+
# Return a reusable, started HTTP client for the Ollama API.
|
|
146
|
+
#
|
|
147
|
+
# @return [Net::HTTP]
|
|
148
|
+
def http_client
|
|
149
|
+
return @http_client if @http_client&.started?
|
|
150
|
+
|
|
151
|
+
http = Net::HTTP.new(@uri.host, @uri.port)
|
|
152
|
+
http.use_ssl = @uri.scheme == 'https'
|
|
153
|
+
http.open_timeout = 10
|
|
154
|
+
http.read_timeout = 30
|
|
155
|
+
http.keep_alive_timeout = 30
|
|
156
|
+
http.start
|
|
157
|
+
@http_client = http
|
|
158
|
+
end
|
|
132
159
|
end
|
|
133
160
|
end
|
|
134
161
|
end
|
|
@@ -23,6 +23,8 @@ module CodebaseIndex
|
|
|
23
23
|
# Aggregate report across all queries.
|
|
24
24
|
EvaluationReport = Struct.new(:results, :aggregates, keyword_init: true)
|
|
25
25
|
|
|
26
|
+
METRIC_KEYS = %i[precision_at5 precision_at10 recall mrr context_completeness token_efficiency].freeze
|
|
27
|
+
|
|
26
28
|
# @param retriever [CodebaseIndex::Retriever] Configured retriever instance
|
|
27
29
|
# @param query_set [QuerySet] Set of evaluation queries with ground truth
|
|
28
30
|
# @param budget [Integer] Token budget per query
|
|
@@ -113,10 +115,9 @@ module CodebaseIndex
|
|
|
113
115
|
def compute_aggregates(results)
|
|
114
116
|
return empty_aggregates if results.empty?
|
|
115
117
|
|
|
116
|
-
metric_keys = %i[precision_at5 precision_at10 recall mrr context_completeness token_efficiency]
|
|
117
118
|
aggregates = {}
|
|
118
119
|
|
|
119
|
-
|
|
120
|
+
METRIC_KEYS.each do |key|
|
|
120
121
|
values = results.map { |r| r.scores[key] }
|
|
121
122
|
aggregates[:"mean_#{key}"] = values.sum / values.size.to_f
|
|
122
123
|
end
|
|
@@ -130,16 +131,8 @@ module CodebaseIndex
|
|
|
130
131
|
#
|
|
131
132
|
# @return [Hash]
|
|
132
133
|
def empty_aggregates
|
|
133
|
-
{
|
|
134
|
-
|
|
135
|
-
mean_precision_at10: 0.0,
|
|
136
|
-
mean_recall: 0.0,
|
|
137
|
-
mean_mrr: 0.0,
|
|
138
|
-
mean_context_completeness: 0.0,
|
|
139
|
-
mean_token_efficiency: 0.0,
|
|
140
|
-
total_queries: 0,
|
|
141
|
-
mean_tokens_used: 0.0
|
|
142
|
-
}
|
|
134
|
+
METRIC_KEYS.to_h { |key| [:"mean_#{key}", 0.0] }
|
|
135
|
+
.merge(total_queries: 0, mean_tokens_used: 0.0)
|
|
143
136
|
end
|
|
144
137
|
end
|
|
145
138
|
end
|