codebase_index 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +60 -0
  3. data/README.md +95 -300
  4. data/exe/codebase-index-mcp +3 -31
  5. data/exe/codebase-index-mcp-http +3 -31
  6. data/lib/codebase_index/ast/method_extractor.rb +3 -8
  7. data/lib/codebase_index/ast/node.rb +28 -0
  8. data/lib/codebase_index/ast/parser.rb +53 -92
  9. data/lib/codebase_index/builder.rb +67 -4
  10. data/lib/codebase_index/cache/cache_middleware.rb +199 -0
  11. data/lib/codebase_index/cache/cache_store.rb +264 -0
  12. data/lib/codebase_index/cache/redis_cache_store.rb +116 -0
  13. data/lib/codebase_index/cache/solid_cache_store.rb +111 -0
  14. data/lib/codebase_index/chunking/semantic_chunker.rb +29 -24
  15. data/lib/codebase_index/console/adapters/good_job_adapter.rb +7 -40
  16. data/lib/codebase_index/console/adapters/job_adapter.rb +68 -0
  17. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +7 -40
  18. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +7 -40
  19. data/lib/codebase_index/console/bridge.rb +7 -0
  20. data/lib/codebase_index/console/console_response_renderer.rb +3 -7
  21. data/lib/codebase_index/console/embedded_executor.rb +2 -1
  22. data/lib/codebase_index/console/server.rb +1 -4
  23. data/lib/codebase_index/dependency_graph.rb +28 -19
  24. data/lib/codebase_index/embedding/indexer.rb +18 -8
  25. data/lib/codebase_index/embedding/openai.rb +27 -6
  26. data/lib/codebase_index/embedding/provider.rb +29 -2
  27. data/lib/codebase_index/evaluation/evaluator.rb +5 -12
  28. data/lib/codebase_index/extractor.rb +40 -44
  29. data/lib/codebase_index/extractors/action_cable_extractor.rb +9 -36
  30. data/lib/codebase_index/extractors/callback_analyzer.rb +22 -8
  31. data/lib/codebase_index/extractors/controller_extractor.rb +3 -93
  32. data/lib/codebase_index/extractors/decorator_extractor.rb +7 -14
  33. data/lib/codebase_index/extractors/engine_extractor.rb +20 -1
  34. data/lib/codebase_index/extractors/graphql_extractor.rb +4 -29
  35. data/lib/codebase_index/extractors/job_extractor.rb +11 -6
  36. data/lib/codebase_index/extractors/lib_extractor.rb +0 -31
  37. data/lib/codebase_index/extractors/mailer_extractor.rb +15 -85
  38. data/lib/codebase_index/extractors/manager_extractor.rb +1 -15
  39. data/lib/codebase_index/extractors/model_extractor.rb +20 -53
  40. data/lib/codebase_index/extractors/phlex_extractor.rb +8 -8
  41. data/lib/codebase_index/extractors/policy_extractor.rb +1 -24
  42. data/lib/codebase_index/extractors/poro_extractor.rb +0 -17
  43. data/lib/codebase_index/extractors/serializer_extractor.rb +12 -7
  44. data/lib/codebase_index/extractors/service_extractor.rb +1 -38
  45. data/lib/codebase_index/extractors/shared_utility_methods.rb +183 -1
  46. data/lib/codebase_index/extractors/validator_extractor.rb +3 -17
  47. data/lib/codebase_index/extractors/view_component_extractor.rb +10 -9
  48. data/lib/codebase_index/filename_utils.rb +32 -0
  49. data/lib/codebase_index/flow_analysis/operation_extractor.rb +1 -4
  50. data/lib/codebase_index/formatting/base.rb +0 -10
  51. data/lib/codebase_index/graph_analyzer.rb +1 -1
  52. data/lib/codebase_index/mcp/bootstrapper.rb +58 -0
  53. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +35 -34
  54. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +29 -29
  55. data/lib/codebase_index/mcp/server.rb +59 -68
  56. data/lib/codebase_index/mcp/tool_response_renderer.rb +23 -0
  57. data/lib/codebase_index/notion/client.rb +2 -2
  58. data/lib/codebase_index/notion/mapper.rb +1 -0
  59. data/lib/codebase_index/notion/mappers/column_mapper.rb +3 -11
  60. data/lib/codebase_index/notion/mappers/model_mapper.rb +20 -23
  61. data/lib/codebase_index/notion/mappers/shared.rb +22 -0
  62. data/lib/codebase_index/observability/health_check.rb +0 -2
  63. data/lib/codebase_index/observability/structured_logger.rb +12 -30
  64. data/lib/codebase_index/operator/pipeline_guard.rb +0 -7
  65. data/lib/codebase_index/resilience/index_validator.rb +3 -21
  66. data/lib/codebase_index/retrieval/context_assembler.rb +19 -7
  67. data/lib/codebase_index/retrieval/query_classifier.rb +14 -12
  68. data/lib/codebase_index/retrieval/ranker.rb +6 -2
  69. data/lib/codebase_index/retrieval/search_executor.rb +8 -19
  70. data/lib/codebase_index/retriever.rb +1 -9
  71. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +5 -25
  72. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +6 -7
  73. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +58 -53
  74. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +11 -7
  75. data/lib/codebase_index/session_tracer/file_store.rb +1 -8
  76. data/lib/codebase_index/session_tracer/redis_store.rb +1 -7
  77. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +4 -13
  78. data/lib/codebase_index/session_tracer/solid_cache_store.rb +1 -7
  79. data/lib/codebase_index/session_tracer/store.rb +14 -0
  80. data/lib/codebase_index/storage/metadata_store.rb +37 -10
  81. data/lib/codebase_index/storage/pgvector.rb +37 -5
  82. data/lib/codebase_index/storage/qdrant.rb +39 -6
  83. data/lib/codebase_index/storage/vector_store.rb +11 -0
  84. data/lib/codebase_index/temporal/snapshot_store.rb +14 -10
  85. data/lib/codebase_index/token_utils.rb +19 -0
  86. data/lib/codebase_index/version.rb +1 -1
  87. data/lib/codebase_index.rb +25 -6
  88. data/lib/tasks/codebase_index.rake +2 -2
  89. metadata +11 -2
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Console
5
+ module Adapters
6
+ # Base class for job backend adapters.
7
+ #
8
+ # Subclasses implement `self.available?` and a private `prefix` method.
9
+ # The prefix is used to build bridge tool names (e.g., "sidekiq_queue_stats").
10
+ #
11
+ # @example
12
+ # class MyAdapter < JobAdapter
13
+ # def self.available? = !!defined?(::MyQueue)
14
+ # private
15
+ # def prefix = 'my_queue'
16
+ # end
17
+ #
18
+ class JobAdapter
19
+ # Get queue statistics (sizes, latencies).
20
+ #
21
+ # @return [Hash] Bridge request
22
+ def queue_stats
23
+ { tool: "#{prefix}_queue_stats", params: {} }
24
+ end
25
+
26
+ # List recent job failures.
27
+ #
28
+ # @param limit [Integer] Max failures (default: 10, max: 100)
29
+ # @return [Hash] Bridge request
30
+ def recent_failures(limit: 10)
31
+ limit = [limit, 100].min
32
+ { tool: "#{prefix}_recent_failures", params: { limit: limit } }
33
+ end
34
+
35
+ # Find a job by its ID.
36
+ #
37
+ # @param id [Object] Job ID
38
+ # @return [Hash] Bridge request
39
+ def find_job(id:)
40
+ { tool: "#{prefix}_find_job", params: { id: id } }
41
+ end
42
+
43
+ # List scheduled jobs.
44
+ #
45
+ # @param limit [Integer] Max jobs (default: 20, max: 100)
46
+ # @return [Hash] Bridge request
47
+ def scheduled_jobs(limit: 20)
48
+ limit = [limit, 100].min
49
+ { tool: "#{prefix}_scheduled_jobs", params: { limit: limit } }
50
+ end
51
+
52
+ # Retry a failed job.
53
+ #
54
+ # @param id [Object] Job ID
55
+ # @return [Hash] Bridge request
56
+ def retry_job(id:)
57
+ { tool: "#{prefix}_retry_job", params: { id: id } }
58
+ end
59
+
60
+ private
61
+
62
+ def prefix
63
+ raise NotImplementedError, "#{self.class}#prefix must be implemented"
64
+ end
65
+ end
66
+ end
67
+ end
68
+ end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative 'job_adapter'
4
+
3
5
  module CodebaseIndex
4
6
  module Console
5
7
  module Adapters
@@ -12,53 +14,18 @@ module CodebaseIndex
12
14
  # adapter = SidekiqAdapter.new
13
15
  # adapter.queue_stats # => { tool: 'sidekiq_queue_stats', params: {} }
14
16
  #
15
- class SidekiqAdapter
17
+ class SidekiqAdapter < JobAdapter
16
18
  # Check if Sidekiq is available in the current environment.
17
19
  #
18
20
  # @return [Boolean]
19
21
  def self.available?
20
- defined?(::Sidekiq) ? true : false
22
+ !!defined?(::Sidekiq)
21
23
  end
22
24
 
23
- # Get queue statistics (sizes, latencies).
24
- #
25
- # @return [Hash] Bridge request
26
- def queue_stats
27
- { tool: 'sidekiq_queue_stats', params: {} }
28
- end
25
+ private
29
26
 
30
- # List recent job failures.
31
- #
32
- # @param limit [Integer] Max failures (default: 10, max: 100)
33
- # @return [Hash] Bridge request
34
- def recent_failures(limit: 10)
35
- limit = [limit, 100].min
36
- { tool: 'sidekiq_recent_failures', params: { limit: limit } }
37
- end
38
-
39
- # Find a job by its ID.
40
- #
41
- # @param id [String] Sidekiq job ID
42
- # @return [Hash] Bridge request
43
- def find_job(id:)
44
- { tool: 'sidekiq_find_job', params: { id: id } }
45
- end
46
-
47
- # List scheduled jobs.
48
- #
49
- # @param limit [Integer] Max jobs (default: 20, max: 100)
50
- # @return [Hash] Bridge request
51
- def scheduled_jobs(limit: 20)
52
- limit = [limit, 100].min
53
- { tool: 'sidekiq_scheduled_jobs', params: { limit: limit } }
54
- end
55
-
56
- # Retry a failed job.
57
- #
58
- # @param id [String] Sidekiq job ID
59
- # @return [Hash] Bridge request
60
- def retry_job(id:)
61
- { tool: 'sidekiq_retry_job', params: { id: id } }
27
+ def prefix
28
+ 'sidekiq'
62
29
  end
63
30
  end
64
31
  end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative 'job_adapter'
4
+
3
5
  module CodebaseIndex
4
6
  module Console
5
7
  module Adapters
@@ -12,53 +14,18 @@ module CodebaseIndex
12
14
  # adapter = SolidQueueAdapter.new
13
15
  # adapter.queue_stats # => { tool: 'solid_queue_queue_stats', params: {} }
14
16
  #
15
- class SolidQueueAdapter
17
+ class SolidQueueAdapter < JobAdapter
16
18
  # Check if Solid Queue is available in the current environment.
17
19
  #
18
20
  # @return [Boolean]
19
21
  def self.available?
20
- defined?(::SolidQueue) ? true : false
22
+ !!defined?(::SolidQueue)
21
23
  end
22
24
 
23
- # Get queue statistics (sizes, latencies).
24
- #
25
- # @return [Hash] Bridge request
26
- def queue_stats
27
- { tool: 'solid_queue_queue_stats', params: {} }
28
- end
25
+ private
29
26
 
30
- # List recent job failures.
31
- #
32
- # @param limit [Integer] Max failures (default: 10, max: 100)
33
- # @return [Hash] Bridge request
34
- def recent_failures(limit: 10)
35
- limit = [limit, 100].min
36
- { tool: 'solid_queue_recent_failures', params: { limit: limit } }
37
- end
38
-
39
- # Find a job by its ID.
40
- #
41
- # @param id [Object] Solid Queue job ID
42
- # @return [Hash] Bridge request
43
- def find_job(id:)
44
- { tool: 'solid_queue_find_job', params: { id: id } }
45
- end
46
-
47
- # List scheduled jobs.
48
- #
49
- # @param limit [Integer] Max jobs (default: 20, max: 100)
50
- # @return [Hash] Bridge request
51
- def scheduled_jobs(limit: 20)
52
- limit = [limit, 100].min
53
- { tool: 'solid_queue_scheduled_jobs', params: { limit: limit } }
54
- end
55
-
56
- # Retry a failed job.
57
- #
58
- # @param id [Object] Solid Queue job ID
59
- # @return [Hash] Bridge request
60
- def retry_job(id:)
61
- { tool: 'solid_queue_retry_job', params: { id: id } }
27
+ def prefix
28
+ 'solid_queue'
62
29
  end
63
30
  end
64
31
  end
@@ -23,6 +23,8 @@ module CodebaseIndex
23
23
  #
24
24
  class Bridge
25
25
  SUPPORTED_TOOLS = %w[count sample find pluck aggregate association_count schema recent status].freeze
26
+ # Alias used by EmbeddedExecutor to avoid duplicating the list.
27
+ TIER1_TOOLS = SUPPORTED_TOOLS
26
28
  TOOL_HANDLERS = SUPPORTED_TOOLS.to_h { |t| [t, :"handle_#{t}"] }.freeze
27
29
 
28
30
  # @param input [IO] Input stream (reads JSON-lines)
@@ -113,6 +115,11 @@ module CodebaseIndex
113
115
  @model_validator.validate_model!(model)
114
116
  end
115
117
 
118
+ # Stub handlers below return empty/zero data by design.
119
+ # This Bridge class is a protocol scaffold — real execution happens
120
+ # in EmbeddedExecutor (in-process) or a live Rails bridge process.
121
+ # The stubs satisfy the protocol contract for testing and offline use.
122
+
116
123
  def handle_count(_params)
117
124
  { 'count' => 0 }
118
125
  end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative '../mcp/tool_response_renderer'
4
+ require_relative '../mcp/renderers/json_renderer'
4
5
 
5
6
  module CodebaseIndex
6
7
  module Console
@@ -66,13 +67,8 @@ module CodebaseIndex
66
67
  end
67
68
 
68
69
  # JSON passthrough renderer for backward compatibility.
69
- # Returns JSON.pretty_generate output for any data.
70
- class JsonConsoleRenderer < MCP::ToolResponseRenderer
71
- # @param data [Object] Any JSON-serializable data
72
- # @return [String] Pretty-printed JSON
73
- def render_default(data)
74
- JSON.pretty_generate(data)
75
- end
70
+ # Delegates to MCP::Renderers::JsonRenderer for consistent JSON output.
71
+ class JsonConsoleRenderer < MCP::Renderers::JsonRenderer
76
72
  end
77
73
  end
78
74
  end
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative 'bridge'
3
4
  require_relative 'model_validator'
4
5
  require_relative 'safe_context'
5
6
 
@@ -20,7 +21,7 @@ module CodebaseIndex
20
21
  class EmbeddedExecutor # rubocop:disable Metrics/ClassLength
21
22
  AGGREGATE_FUNCTIONS = %w[sum average minimum maximum].freeze
22
23
 
23
- TIER1_TOOLS = %w[count sample find pluck aggregate association_count schema recent status].freeze
24
+ TIER1_TOOLS = Bridge::TIER1_TOOLS
24
25
 
25
26
  # @param model_validator [ModelValidator] Validates model/column names
26
27
  # @param safe_context [SafeContext] Wraps execution in rolled-back transaction
@@ -567,15 +567,12 @@ module CodebaseIndex
567
567
  # rubocop:disable Metrics/ParameterLists
568
568
  def define_console_tool(server, conn_mgr, name, description, properties:, required: nil,
569
569
  safe_ctx: nil, renderer: nil, &tool_block)
570
- mgr = conn_mgr
571
- ctx = safe_ctx
572
- rdr = renderer
573
570
  bridge_method = method(:send_to_bridge)
574
571
  schema = { properties: properties }
575
572
  schema[:required] = required if required&.any?
576
573
  server.define_tool(name: name, description: description, input_schema: schema) do |server_context:, **args|
577
574
  request = tool_block.call(args)
578
- bridge_method.call(mgr, request.transform_keys(&:to_s), ctx, renderer: rdr)
575
+ bridge_method.call(conn_mgr, request.transform_keys(&:to_s), safe_ctx, renderer: renderer)
579
576
  end
580
577
  end
581
578
  # rubocop:enable Metrics/ParameterLists
@@ -24,15 +24,18 @@ module CodebaseIndex
24
24
  def initialize
25
25
  @nodes = {} # identifier => { type:, file_path: }
26
26
  @edges = {} # identifier => [dependency identifiers]
27
- @reverse = {} # identifier => [dependent identifiers]
27
+ @reverse = {} # identifier => Set of dependent identifiers
28
28
  @file_map = {} # file_path => identifier
29
- @type_index = {} # type => [identifiers]
29
+ @type_index = {} # type => Set of identifiers
30
+ @to_h = nil
30
31
  end
31
32
 
32
33
  # Register a unit in the graph
33
34
  #
34
35
  # @param unit [ExtractedUnit] The unit to register
35
36
  def register(unit)
37
+ @to_h = nil
38
+
36
39
  @nodes[unit.identifier] = {
37
40
  type: unit.type,
38
41
  file_path: unit.file_path,
@@ -42,14 +45,12 @@ module CodebaseIndex
42
45
  @edges[unit.identifier] = unit.dependencies.map { |d| d[:target] }
43
46
  @file_map[unit.file_path] = unit.identifier if unit.file_path
44
47
 
45
- # Type index for filtering
46
- @type_index[unit.type] ||= []
47
- @type_index[unit.type] << unit.identifier unless @type_index[unit.type].include?(unit.identifier)
48
+ # Type index for filtering (Set-based for O(1) insert)
49
+ (@type_index[unit.type] ||= Set.new).add(unit.identifier)
48
50
 
49
- # Build reverse edges
51
+ # Build reverse edges (Set-based for O(1) insert)
50
52
  unit.dependencies.each do |dep|
51
- @reverse[dep[:target]] ||= []
52
- @reverse[dep[:target]] << unit.identifier unless @reverse[dep[:target]].include?(unit.identifier)
53
+ (@reverse[dep[:target]] ||= Set.new).add(unit.identifier)
53
54
  end
54
55
  end
55
56
 
@@ -116,7 +117,7 @@ module CodebaseIndex
116
117
  # @param identifier [String] Unit identifier
117
118
  # @return [Array<String>] List of dependent identifiers
118
119
  def dependents_of(identifier)
119
- @reverse[identifier] || []
120
+ @reverse.fetch(identifier, Set.new).to_a
120
121
  end
121
122
 
122
123
  # Get all units of a specific type
@@ -124,7 +125,7 @@ module CodebaseIndex
124
125
  # @param type [Symbol] Unit type (:model, :controller, etc.)
125
126
  # @return [Array<String>] List of unit identifiers
126
127
  def units_of_type(type)
127
- @type_index[type] || []
128
+ @type_index.fetch(type, Set.new).to_a
128
129
  end
129
130
 
130
131
  # Compute PageRank scores for all nodes
@@ -140,18 +141,19 @@ module CodebaseIndex
140
141
  n = @nodes.size
141
142
  return {} if n.zero?
142
143
 
144
+ node_ids = @nodes.keys
143
145
  base_score = 1.0 / n
144
- scores = @nodes.keys.to_h { |id| [id, base_score] }
146
+ scores = node_ids.to_h { |id| [id, base_score] }
145
147
 
146
148
  iterations.times do
147
149
  # Collect rank from dangling nodes (no outgoing edges) and redistribute
148
- dangling_sum = @nodes.keys.sum do |id|
150
+ dangling_sum = node_ids.sum do |id|
149
151
  @edges[id].nil? || @edges[id].empty? ? scores[id] : 0.0
150
152
  end
151
153
 
152
154
  new_scores = {}
153
155
 
154
- @nodes.each_key do |id|
156
+ node_ids.each do |id|
155
157
  # Sum contributions from nodes that depend on this one
156
158
  incoming = @reverse[id] || []
157
159
  rank_sum = incoming.sum do |src|
@@ -168,22 +170,24 @@ module CodebaseIndex
168
170
  scores
169
171
  end
170
172
 
171
- # Serialize graph for persistence
173
+ # Serialize graph for persistence. Memoized — cache is invalidated on register.
174
+ # Returns a dup so callers can't pollute the cached hash.
172
175
  #
173
176
  # @return [Hash] Complete graph data
174
177
  def to_h
175
- {
178
+ @to_h ||= {
176
179
  nodes: @nodes,
177
180
  edges: @edges,
178
- reverse: @reverse,
181
+ reverse: @reverse.transform_values(&:to_a),
179
182
  file_map: @file_map,
180
- type_index: @type_index,
183
+ type_index: @type_index.transform_values(&:to_a),
181
184
  stats: {
182
185
  node_count: @nodes.size,
183
186
  edge_count: @edges.values.sum(&:size),
184
187
  types: @type_index.transform_values(&:size)
185
188
  }
186
189
  }
190
+ @to_h.dup
187
191
  end
188
192
 
189
193
  # Load graph from persisted data
@@ -201,11 +205,16 @@ module CodebaseIndex
201
205
  graph.instance_variable_set(:@nodes, raw_nodes.transform_values { |v| symbolize_node(v) })
202
206
 
203
207
  graph.instance_variable_set(:@edges, data[:edges] || data['edges'] || {})
204
- graph.instance_variable_set(:@reverse, data[:reverse] || data['reverse'] || {})
208
+
209
+ raw_reverse = data[:reverse] || data['reverse'] || {}
210
+ graph.instance_variable_set(:@reverse, raw_reverse.transform_values { |v| v.is_a?(Set) ? v : Set.new(v) })
211
+
205
212
  graph.instance_variable_set(:@file_map, data[:file_map] || data['file_map'] || {})
206
213
 
207
214
  raw_type_index = data[:type_index] || data['type_index'] || {}
208
- graph.instance_variable_set(:@type_index, raw_type_index.transform_keys(&:to_sym))
215
+ graph.instance_variable_set(:@type_index, raw_type_index.transform_keys(&:to_sym).transform_values do |v|
216
+ v.is_a?(Set) ? v : Set.new(v)
217
+ end)
209
218
 
210
219
  graph
211
220
  end
@@ -9,12 +9,14 @@ module CodebaseIndex
9
9
  # generates embeddings, and stores vectors. Supports full and incremental
10
10
  # modes with checkpoint-based resumability.
11
11
  class Indexer
12
- def initialize(provider:, text_preparer:, vector_store:, output_dir:, batch_size: 32)
12
+ # @param checkpoint_interval [Integer] Save checkpoint every N batches (default: 10)
13
+ def initialize(provider:, text_preparer:, vector_store:, output_dir:, batch_size: 32, checkpoint_interval: 10) # rubocop:disable Metrics/ParameterLists
13
14
  @provider = provider
14
15
  @text_preparer = text_preparer
15
16
  @vector_store = vector_store
16
17
  @output_dir = output_dir
17
18
  @batch_size = batch_size
19
+ @checkpoint_interval = checkpoint_interval
18
20
  end
19
21
 
20
22
  # Index all extracted units (full mode). Returns stats hash.
@@ -44,12 +46,17 @@ module CodebaseIndex
44
46
  def process_units(units, incremental:)
45
47
  checkpoint = incremental ? load_checkpoint : {}
46
48
  stats = { processed: 0, skipped: 0, errors: 0 }
49
+ batch_count = 0
47
50
 
48
51
  units.each_slice(@batch_size) do |batch|
49
52
  process_batch(batch, checkpoint, stats, incremental: incremental)
50
- save_checkpoint(checkpoint)
53
+ batch_count += 1
54
+ save_checkpoint(checkpoint) if (batch_count % @checkpoint_interval).zero?
51
55
  end
52
56
 
57
+ # Always save final checkpoint
58
+ save_checkpoint(checkpoint)
59
+
53
60
  stats
54
61
  end
55
62
 
@@ -98,16 +105,19 @@ module CodebaseIndex
98
105
  store_vectors(items, vectors, checkpoint, stats)
99
106
  rescue StandardError => e
100
107
  stats[:errors] += items.size
101
- stats[:error_messages] ||= []
102
- stats[:error_messages] << e.message
103
108
  raise CodebaseIndex::Error, "Embedding failed: #{e.message}"
104
109
  end
105
110
 
106
111
  def store_vectors(items, vectors, checkpoint, stats)
107
- items.each_with_index do |item, idx|
108
- metadata = { type: item[:unit_data]['type'], identifier: item[:identifier],
109
- file_path: item[:unit_data]['file_path'] }
110
- @vector_store.store(item[:id], vectors[idx], metadata)
112
+ entries = items.each_with_index.map do |item, idx|
113
+ { id: item[:id], vector: vectors[idx],
114
+ metadata: { type: item[:unit_data]['type'], identifier: item[:identifier],
115
+ file_path: item[:unit_data]['file_path'] } }
116
+ end
117
+
118
+ @vector_store.store_batch(entries)
119
+
120
+ items.each do |item|
111
121
  checkpoint[item[:identifier]] = item[:source_hash]
112
122
  stats[:processed] += 1
113
123
  end
@@ -81,24 +81,45 @@ module CodebaseIndex
81
81
  # @return [Hash] parsed JSON response
82
82
  # @raise [CodebaseIndex::Error] if the API returns a non-success status
83
83
  def post_request(body)
84
- http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
85
- http.use_ssl = true
86
- http.open_timeout = 10
87
- http.read_timeout = 30
88
-
89
84
  request = Net::HTTP::Post.new(ENDPOINT.path)
90
85
  request['Content-Type'] = 'application/json'
91
86
  request['Authorization'] = "Bearer #{@api_key}"
92
87
  request.body = body.to_json
93
88
 
94
- response = http.request(request)
89
+ response = http_client.request(request)
90
+
91
+ unless response.is_a?(Net::HTTPSuccess)
92
+ raise CodebaseIndex::Error, "OpenAI API error: #{response.code} #{response.body}"
93
+ end
95
94
 
95
+ JSON.parse(response.body)
96
+ rescue Errno::ECONNRESET, Net::OpenTimeout, IOError
97
+ # Connection dropped — reset and retry once
98
+ @http_client = nil
99
+ response = http_client.request(request)
96
100
  unless response.is_a?(Net::HTTPSuccess)
97
101
  raise CodebaseIndex::Error, "OpenAI API error: #{response.code} #{response.body}"
98
102
  end
99
103
 
100
104
  JSON.parse(response.body)
101
105
  end
106
+
107
+ # Return a reusable, started HTTP client for the OpenAI API.
108
+ # Calling http.start opens a persistent TCP connection so
109
+ # keep_alive_timeout actually takes effect across requests.
110
+ #
111
+ # @return [Net::HTTP]
112
+ def http_client
113
+ return @http_client if @http_client&.started?
114
+
115
+ http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
116
+ http.use_ssl = true
117
+ http.open_timeout = 10
118
+ http.read_timeout = 30
119
+ http.keep_alive_timeout = 30
120
+ http.start
121
+ @http_client = http
122
+ end
102
123
  end
103
124
  end
104
125
  end
@@ -118,17 +118,44 @@ module CodebaseIndex
118
118
  # @return [Hash] parsed JSON response
119
119
  # @raise [CodebaseIndex::Error] if the API returns a non-success status
120
120
  def post_request(body)
121
- http = Net::HTTP.new(@uri.host, @uri.port)
122
121
  request = Net::HTTP::Post.new(@uri.path, 'Content-Type' => 'application/json')
123
122
  request.body = body.to_json
124
- response = http.request(request)
123
+ response = http_client.request(request)
124
+
125
+ unless response.is_a?(Net::HTTPSuccess)
126
+ raise CodebaseIndex::Error, "Ollama API error: #{response.code} #{response.body}"
127
+ end
125
128
 
129
+ JSON.parse(response.body)
130
+ rescue Errno::ECONNRESET, Net::OpenTimeout, Net::ReadTimeout, IOError
131
+ # Connection dropped — reset and retry once
132
+ @http_client = nil
133
+ begin
134
+ response = http_client.request(request)
135
+ rescue StandardError => retry_error
136
+ raise CodebaseIndex::Error, "Ollama API error (retry failed): #{retry_error.message}"
137
+ end
126
138
  unless response.is_a?(Net::HTTPSuccess)
127
139
  raise CodebaseIndex::Error, "Ollama API error: #{response.code} #{response.body}"
128
140
  end
129
141
 
130
142
  JSON.parse(response.body)
131
143
  end
144
+
145
+ # Return a reusable, started HTTP client for the Ollama API.
146
+ #
147
+ # @return [Net::HTTP]
148
+ def http_client
149
+ return @http_client if @http_client&.started?
150
+
151
+ http = Net::HTTP.new(@uri.host, @uri.port)
152
+ http.use_ssl = @uri.scheme == 'https'
153
+ http.open_timeout = 10
154
+ http.read_timeout = 30
155
+ http.keep_alive_timeout = 30
156
+ http.start
157
+ @http_client = http
158
+ end
132
159
  end
133
160
  end
134
161
  end
@@ -23,6 +23,8 @@ module CodebaseIndex
23
23
  # Aggregate report across all queries.
24
24
  EvaluationReport = Struct.new(:results, :aggregates, keyword_init: true)
25
25
 
26
+ METRIC_KEYS = %i[precision_at5 precision_at10 recall mrr context_completeness token_efficiency].freeze
27
+
26
28
  # @param retriever [CodebaseIndex::Retriever] Configured retriever instance
27
29
  # @param query_set [QuerySet] Set of evaluation queries with ground truth
28
30
  # @param budget [Integer] Token budget per query
@@ -113,10 +115,9 @@ module CodebaseIndex
113
115
  def compute_aggregates(results)
114
116
  return empty_aggregates if results.empty?
115
117
 
116
- metric_keys = %i[precision_at5 precision_at10 recall mrr context_completeness token_efficiency]
117
118
  aggregates = {}
118
119
 
119
- metric_keys.each do |key|
120
+ METRIC_KEYS.each do |key|
120
121
  values = results.map { |r| r.scores[key] }
121
122
  aggregates[:"mean_#{key}"] = values.sum / values.size.to_f
122
123
  end
@@ -130,16 +131,8 @@ module CodebaseIndex
130
131
  #
131
132
  # @return [Hash]
132
133
  def empty_aggregates
133
- {
134
- mean_precision_at5: 0.0,
135
- mean_precision_at10: 0.0,
136
- mean_recall: 0.0,
137
- mean_mrr: 0.0,
138
- mean_context_completeness: 0.0,
139
- mean_token_efficiency: 0.0,
140
- total_queries: 0,
141
- mean_tokens_used: 0.0
142
- }
134
+ METRIC_KEYS.to_h { |key| [:"mean_#{key}", 0.0] }
135
+ .merge(total_queries: 0, mean_tokens_used: 0.0)
143
136
  end
144
137
  end
145
138
  end