codebase_index 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +60 -0
  3. data/README.md +95 -300
  4. data/exe/codebase-index-mcp +3 -31
  5. data/exe/codebase-index-mcp-http +3 -31
  6. data/lib/codebase_index/ast/method_extractor.rb +3 -8
  7. data/lib/codebase_index/ast/node.rb +28 -0
  8. data/lib/codebase_index/ast/parser.rb +53 -92
  9. data/lib/codebase_index/builder.rb +67 -4
  10. data/lib/codebase_index/cache/cache_middleware.rb +199 -0
  11. data/lib/codebase_index/cache/cache_store.rb +264 -0
  12. data/lib/codebase_index/cache/redis_cache_store.rb +116 -0
  13. data/lib/codebase_index/cache/solid_cache_store.rb +111 -0
  14. data/lib/codebase_index/chunking/semantic_chunker.rb +29 -24
  15. data/lib/codebase_index/console/adapters/good_job_adapter.rb +7 -40
  16. data/lib/codebase_index/console/adapters/job_adapter.rb +68 -0
  17. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +7 -40
  18. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +7 -40
  19. data/lib/codebase_index/console/bridge.rb +7 -0
  20. data/lib/codebase_index/console/console_response_renderer.rb +3 -7
  21. data/lib/codebase_index/console/embedded_executor.rb +2 -1
  22. data/lib/codebase_index/console/server.rb +1 -4
  23. data/lib/codebase_index/dependency_graph.rb +28 -19
  24. data/lib/codebase_index/embedding/indexer.rb +18 -8
  25. data/lib/codebase_index/embedding/openai.rb +27 -6
  26. data/lib/codebase_index/embedding/provider.rb +29 -2
  27. data/lib/codebase_index/evaluation/evaluator.rb +5 -12
  28. data/lib/codebase_index/extractor.rb +40 -44
  29. data/lib/codebase_index/extractors/action_cable_extractor.rb +9 -36
  30. data/lib/codebase_index/extractors/callback_analyzer.rb +22 -8
  31. data/lib/codebase_index/extractors/controller_extractor.rb +3 -93
  32. data/lib/codebase_index/extractors/decorator_extractor.rb +7 -14
  33. data/lib/codebase_index/extractors/engine_extractor.rb +20 -1
  34. data/lib/codebase_index/extractors/graphql_extractor.rb +4 -29
  35. data/lib/codebase_index/extractors/job_extractor.rb +11 -6
  36. data/lib/codebase_index/extractors/lib_extractor.rb +0 -31
  37. data/lib/codebase_index/extractors/mailer_extractor.rb +15 -85
  38. data/lib/codebase_index/extractors/manager_extractor.rb +1 -15
  39. data/lib/codebase_index/extractors/model_extractor.rb +20 -53
  40. data/lib/codebase_index/extractors/phlex_extractor.rb +8 -8
  41. data/lib/codebase_index/extractors/policy_extractor.rb +1 -24
  42. data/lib/codebase_index/extractors/poro_extractor.rb +0 -17
  43. data/lib/codebase_index/extractors/serializer_extractor.rb +12 -7
  44. data/lib/codebase_index/extractors/service_extractor.rb +1 -38
  45. data/lib/codebase_index/extractors/shared_utility_methods.rb +183 -1
  46. data/lib/codebase_index/extractors/validator_extractor.rb +3 -17
  47. data/lib/codebase_index/extractors/view_component_extractor.rb +10 -9
  48. data/lib/codebase_index/filename_utils.rb +32 -0
  49. data/lib/codebase_index/flow_analysis/operation_extractor.rb +1 -4
  50. data/lib/codebase_index/formatting/base.rb +0 -10
  51. data/lib/codebase_index/graph_analyzer.rb +1 -1
  52. data/lib/codebase_index/mcp/bootstrapper.rb +58 -0
  53. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +35 -34
  54. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +29 -29
  55. data/lib/codebase_index/mcp/server.rb +59 -68
  56. data/lib/codebase_index/mcp/tool_response_renderer.rb +23 -0
  57. data/lib/codebase_index/notion/client.rb +2 -2
  58. data/lib/codebase_index/notion/mapper.rb +1 -0
  59. data/lib/codebase_index/notion/mappers/column_mapper.rb +3 -11
  60. data/lib/codebase_index/notion/mappers/model_mapper.rb +20 -23
  61. data/lib/codebase_index/notion/mappers/shared.rb +22 -0
  62. data/lib/codebase_index/observability/health_check.rb +0 -2
  63. data/lib/codebase_index/observability/structured_logger.rb +12 -30
  64. data/lib/codebase_index/operator/pipeline_guard.rb +0 -7
  65. data/lib/codebase_index/resilience/index_validator.rb +3 -21
  66. data/lib/codebase_index/retrieval/context_assembler.rb +19 -7
  67. data/lib/codebase_index/retrieval/query_classifier.rb +14 -12
  68. data/lib/codebase_index/retrieval/ranker.rb +6 -2
  69. data/lib/codebase_index/retrieval/search_executor.rb +8 -19
  70. data/lib/codebase_index/retriever.rb +1 -9
  71. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +5 -25
  72. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +6 -7
  73. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +58 -53
  74. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +11 -7
  75. data/lib/codebase_index/session_tracer/file_store.rb +1 -8
  76. data/lib/codebase_index/session_tracer/redis_store.rb +1 -7
  77. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +4 -13
  78. data/lib/codebase_index/session_tracer/solid_cache_store.rb +1 -7
  79. data/lib/codebase_index/session_tracer/store.rb +14 -0
  80. data/lib/codebase_index/storage/metadata_store.rb +37 -10
  81. data/lib/codebase_index/storage/pgvector.rb +37 -5
  82. data/lib/codebase_index/storage/qdrant.rb +39 -6
  83. data/lib/codebase_index/storage/vector_store.rb +11 -0
  84. data/lib/codebase_index/temporal/snapshot_store.rb +14 -10
  85. data/lib/codebase_index/token_utils.rb +19 -0
  86. data/lib/codebase_index/version.rb +1 -1
  87. data/lib/codebase_index.rb +25 -6
  88. data/lib/tasks/codebase_index.rake +2 -2
  89. metadata +11 -2
@@ -16,40 +16,12 @@ require_relative '../lib/codebase_index'
16
16
  require_relative '../lib/codebase_index/dependency_graph'
17
17
  require_relative '../lib/codebase_index/graph_analyzer'
18
18
  require_relative '../lib/codebase_index/mcp/server'
19
+ require_relative '../lib/codebase_index/mcp/bootstrapper'
19
20
  require_relative '../lib/codebase_index/embedding/text_preparer'
20
21
  require_relative '../lib/codebase_index/embedding/indexer'
21
22
 
22
- index_dir = ARGV[0] || ENV['CODEBASE_INDEX_DIR'] || Dir.pwd
23
-
24
- unless Dir.exist?(index_dir)
25
- warn "Error: Index directory does not exist: #{index_dir}"
26
- exit 1
27
- end
28
-
29
- unless File.exist?(File.join(index_dir, 'manifest.json'))
30
- warn "Error: No manifest.json found in: #{index_dir}"
31
- warn 'Run `bundle exec rake codebase_index:extract` in your Rails app first.'
32
- exit 1
33
- end
34
-
35
- # Attempt to build a retriever for semantic search.
36
- # Auto-configures from environment variables when no explicit configuration exists.
37
- retriever = begin
38
- config = CodebaseIndex.configuration
39
-
40
- if !config.embedding_provider && ENV.fetch('OPENAI_API_KEY', nil)
41
- config.vector_store = :in_memory
42
- config.metadata_store = :in_memory
43
- config.graph_store = :in_memory
44
- config.embedding_provider = :openai
45
- config.embedding_options = { api_key: ENV.fetch('OPENAI_API_KEY', nil) }
46
- end
47
-
48
- CodebaseIndex::Builder.new(config).build_retriever if config.embedding_provider
49
- rescue StandardError => e
50
- warn "Note: Semantic search unavailable (#{e.message}). Using pattern-based search only."
51
- nil
52
- end
23
+ index_dir = CodebaseIndex::MCP::Bootstrapper.resolve_index_dir(ARGV)
24
+ retriever = CodebaseIndex::MCP::Bootstrapper.build_retriever
53
25
 
54
26
  port = (ENV['PORT'] || 9292).to_i
55
27
  host = ENV['HOST'] || 'localhost'
@@ -16,6 +16,8 @@ module CodebaseIndex
16
16
  # # => "def create\n @user = User.find(params[:id])\nend\n"
17
17
  #
18
18
  class MethodExtractor
19
+ include SourceSpan
20
+
19
21
  # @param parser [Ast::Parser, nil] Parser instance (creates default if nil)
20
22
  def initialize(parser: nil)
21
23
  @parser = parser || Parser.new
@@ -62,14 +64,7 @@ module CodebaseIndex
62
64
  return node.source if node.source
63
65
 
64
66
  # Fallback: extract by line range
65
- return nil unless node.line && node.end_line
66
-
67
- lines = source.lines
68
- start_idx = node.line - 1
69
- end_idx = node.end_line - 1
70
- return nil if start_idx.negative? || end_idx >= lines.length
71
-
72
- lines[start_idx..end_idx].join
67
+ extract_source_span(source, node.line, node.end_line)
73
68
  end
74
69
  end
75
70
  end
@@ -84,5 +84,33 @@ module CodebaseIndex
84
84
  end
85
85
  end
86
86
  end
87
+
88
+ # Mixin for line-range source extraction, shared across Parser, MethodExtractor,
89
+ # and ClassAnalyzer.
90
+ #
91
+ # @example
92
+ # include Ast::SourceSpan
93
+ # extract_source_span(source, node.line, node.end_line)
94
+ #
95
+ module SourceSpan
96
+ private
97
+
98
+ # Extract source lines for a 1-based start/end line range.
99
+ #
100
+ # @param source [String] Full source text
101
+ # @param start_line [Integer, nil] 1-based start line
102
+ # @param end_line [Integer, nil] 1-based end line
103
+ # @return [String, nil] Extracted lines joined, or nil if out of range
104
+ def extract_source_span(source, start_line, end_line)
105
+ return nil unless start_line && end_line
106
+
107
+ lines = source.lines
108
+ start_idx = start_line - 1
109
+ end_idx = end_line - 1
110
+ return nil if start_idx.negative? || end_idx >= lines.length
111
+
112
+ lines[start_idx..end_idx].join
113
+ end
114
+ end
87
115
  end
88
116
  end
@@ -15,6 +15,8 @@ module CodebaseIndex
15
15
  # root.find_all(:def).first.method_name #=> "bar"
16
16
  #
17
17
  class Parser
18
+ include SourceSpan
19
+
18
20
  # Parse Ruby source into a normalized AST.
19
21
  #
20
22
  # @param source [String] Ruby source code
@@ -26,8 +28,6 @@ module CodebaseIndex
26
28
  else
27
29
  parse_with_parser_gem(source)
28
30
  end
29
- rescue CodebaseIndex::ExtractionError
30
- raise
31
31
  rescue StandardError => e
32
32
  raise CodebaseIndex::ExtractionError, "Failed to parse source: #{e.message}"
33
33
  end
@@ -36,15 +36,12 @@ module CodebaseIndex
36
36
  #
37
37
  # @return [Boolean]
38
38
  def prism_available?
39
- if @prism_available.nil?
40
- begin
41
- require 'prism'
42
- @prism_available = defined?(Prism) ? true : false
43
- rescue LoadError
44
- @prism_available = false
45
- end
46
- end
47
- @prism_available
39
+ return @prism_available unless @prism_available.nil?
40
+
41
+ require 'prism'
42
+ @prism_available = true
43
+ rescue LoadError
44
+ @prism_available = false
48
45
  end
49
46
 
50
47
  private
@@ -105,10 +102,8 @@ module CodebaseIndex
105
102
  )
106
103
  when Prism::ConstantPathNode
107
104
  convert_prism_constant_path(prism_node, source)
108
- when Prism::IfNode
105
+ when Prism::IfNode, Prism::UnlessNode
109
106
  convert_prism_if(prism_node, source)
110
- when Prism::UnlessNode
111
- convert_prism_unless(prism_node, source)
112
107
  when Prism::CaseNode
113
108
  convert_prism_case(prism_node, source)
114
109
  when Prism::BeginNode
@@ -223,17 +218,7 @@ module CodebaseIndex
223
218
  def convert_prism_class(prism_node, source)
224
219
  name_node = convert_prism_node(prism_node.constant_path, source)
225
220
  superclass = prism_node.superclass ? convert_prism_node(prism_node.superclass, source) : nil
226
- body_children = if prism_node.body
227
- if prism_node.body.is_a?(Prism::StatementsNode)
228
- prism_node.body.body.map do |c|
229
- convert_prism_node(c, source)
230
- end
231
- else
232
- [convert_prism_node(prism_node.body, source)]
233
- end
234
- else
235
- []
236
- end
221
+ body_children = extract_prism_body_children(prism_node, source)
237
222
 
238
223
  children = [name_node, superclass] + body_children
239
224
 
@@ -248,17 +233,7 @@ module CodebaseIndex
248
233
 
249
234
  def convert_prism_module(prism_node, source)
250
235
  name_node = convert_prism_node(prism_node.constant_path, source)
251
- body_children = if prism_node.body
252
- if prism_node.body.is_a?(Prism::StatementsNode)
253
- prism_node.body.body.map do |c|
254
- convert_prism_node(c, source)
255
- end
256
- else
257
- [convert_prism_node(prism_node.body, source)]
258
- end
259
- else
260
- []
261
- end
236
+ body_children = extract_prism_body_children(prism_node, source)
262
237
 
263
238
  children = [name_node] + body_children
264
239
 
@@ -272,15 +247,7 @@ module CodebaseIndex
272
247
  end
273
248
 
274
249
  def convert_prism_def(prism_node, source)
275
- body_children = if prism_node.body
276
- if prism_node.body.is_a?(Prism::StatementsNode)
277
- prism_node.body.body.map { |c| convert_prism_node(c, source) }
278
- else
279
- [convert_prism_node(prism_node.body, source)]
280
- end
281
- else
282
- []
283
- end
250
+ body_children = extract_prism_body_children(prism_node, source)
284
251
 
285
252
  is_class_method = prism_node.respond_to?(:receiver) && prism_node.receiver
286
253
  receiver_text = if is_class_method
@@ -346,13 +313,14 @@ module CodebaseIndex
346
313
 
347
314
  def convert_prism_constant_path(prism_node, _source)
348
315
  parent_text = (extract_const_path_text(prism_node.parent) if prism_node.parent)
316
+ const_name = prism_node.respond_to?(:name) ? prism_node.name.to_s : prism_node.child.name.to_s
349
317
 
350
318
  Node.new(
351
319
  type: :const,
352
320
  children: [],
353
321
  line: line_for_prism(prism_node),
354
322
  receiver: parent_text,
355
- method_name: prism_node.name.to_s
323
+ method_name: const_name
356
324
  )
357
325
  end
358
326
 
@@ -364,23 +332,8 @@ module CodebaseIndex
364
332
  end
365
333
 
366
334
  then_body = prism_node.statements ? convert_prism_node(prism_node.statements, source) : nil
367
- else_body = prism_node.subsequent ? convert_prism_node(prism_node.subsequent, source) : nil
368
-
369
- Node.new(
370
- type: :if,
371
- children: [condition, then_body, else_body].compact,
372
- line: line_for_prism(prism_node),
373
- end_line: end_line_for_prism(prism_node),
374
- source: condition_source
375
- )
376
- end
377
-
378
- def convert_prism_unless(prism_node, source)
379
- condition = convert_prism_node(prism_node.predicate, source)
380
- condition_source = extract_prism_source_text(prism_node.predicate, source)
381
-
382
- then_body = prism_node.statements ? convert_prism_node(prism_node.statements, source) : nil
383
- else_body = prism_node.else_clause ? convert_prism_node(prism_node.else_clause, source) : nil
335
+ else_clause = prism_else_clause(prism_node)
336
+ else_body = else_clause ? convert_prism_node(else_clause, source) : nil
384
337
 
385
338
  Node.new(
386
339
  type: :if,
@@ -395,10 +348,21 @@ module CodebaseIndex
395
348
  children = []
396
349
  children << convert_prism_node(prism_node.predicate, source) if prism_node.predicate
397
350
  prism_node.conditions.each { |c| children << convert_prism_node(c, source) }
398
- children << convert_prism_node(prism_node.else_clause, source) if prism_node.else_clause
351
+ else_clause = prism_else_clause(prism_node)
352
+ children << convert_prism_node(else_clause, source) if else_clause
399
353
  Node.new(type: :case, children: children, line: line_for_prism(prism_node))
400
354
  end
401
355
 
356
+ def extract_prism_body_children(prism_node, source)
357
+ return [] unless prism_node.body
358
+
359
+ if prism_node.body.is_a?(Prism::StatementsNode)
360
+ prism_node.body.body.map { |c| convert_prism_node(c, source) }
361
+ else
362
+ [convert_prism_node(prism_node.body, source)]
363
+ end
364
+ end
365
+
402
366
  def convert_prism_children(statements_node, source)
403
367
  return [] unless statements_node
404
368
 
@@ -410,12 +374,19 @@ module CodebaseIndex
410
374
  end
411
375
 
412
376
  def extract_prism_generic_children(prism_node, source)
413
- children = []
414
- prism_node.child_nodes.compact.each do |child|
415
- converted = convert_prism_node(child, source)
416
- children << converted if converted
377
+ prism_node.child_nodes.compact.filter_map { |child| convert_prism_node(child, source) }
378
+ end
379
+
380
+ # Portable accessor for the else/consequent clause of if/unless/case nodes.
381
+ # Prism < 1.0 uses :consequent, Prism >= 1.0 uses :else_clause/:subsequent.
382
+ def prism_else_clause(node)
383
+ if node.respond_to?(:consequent)
384
+ node.consequent
385
+ elsif node.respond_to?(:else_clause)
386
+ node.else_clause
387
+ elsif node.respond_to?(:subsequent)
388
+ node.subsequent
417
389
  end
418
- children
419
390
  end
420
391
 
421
392
  def line_for_prism(node)
@@ -427,12 +398,7 @@ module CodebaseIndex
427
398
  end
428
399
 
429
400
  def extract_prism_source_span(node, source)
430
- lines = source.lines
431
- start_idx = node.location.start_line - 1
432
- end_idx = node.location.end_line - 1
433
- return nil if start_idx.negative? || end_idx >= lines.length
434
-
435
- lines[start_idx..end_idx].join
401
+ extract_source_span(source, node.location.start_line, node.location.end_line)
436
402
  end
437
403
 
438
404
  def extract_prism_source_text(node, source)
@@ -463,17 +429,13 @@ module CodebaseIndex
463
429
  node.name.to_s
464
430
  when Prism::ConstantPathNode
465
431
  parent = node.parent ? extract_const_path_text(node.parent) : nil
466
- [parent, node.name.to_s].compact.join('::')
432
+ const_name = node.respond_to?(:name) ? node.name.to_s : node.child.name.to_s
433
+ [parent, const_name].compact.join('::')
467
434
  end
468
435
  end
469
436
 
470
437
  def extract_const_name(node)
471
- case node
472
- when Prism::ConstantReadNode
473
- node.name.to_s
474
- when Prism::ConstantPathNode
475
- extract_const_path_text(node)
476
- end
438
+ extract_const_path_text(node)
477
439
  end
478
440
 
479
441
  # ── Parser gem fallback ──────────────────────────────────────────────
@@ -490,7 +452,7 @@ module CodebaseIndex
490
452
  name_node = convert_parser_node(parser_node.children[0], source)
491
453
  superclass = parser_node.children[1] ? convert_parser_node(parser_node.children[1], source) : nil
492
454
  body = parser_node.children[2] ? convert_parser_node(parser_node.children[2], source) : nil
493
- body_children = body&.type == :begin ? body.children : [body].compact
455
+ body_children = parser_body_children(body)
494
456
  children = [name_node, superclass] + body_children
495
457
  Node.new(
496
458
  type: :class,
@@ -502,7 +464,7 @@ module CodebaseIndex
502
464
  when :module
503
465
  name_node = convert_parser_node(parser_node.children[0], source)
504
466
  body = parser_node.children[1] ? convert_parser_node(parser_node.children[1], source) : nil
505
- body_children = body&.type == :begin ? body.children : [body].compact
467
+ body_children = parser_body_children(body)
506
468
  children = [name_node] + body_children
507
469
  Node.new(
508
470
  type: :module,
@@ -513,7 +475,7 @@ module CodebaseIndex
513
475
  )
514
476
  when :def
515
477
  body = parser_node.children[2] ? convert_parser_node(parser_node.children[2], source) : nil
516
- body_children = body&.type == :begin ? body.children : [body].compact
478
+ body_children = parser_body_children(body)
517
479
  Node.new(
518
480
  type: :def,
519
481
  children: body_children,
@@ -524,7 +486,7 @@ module CodebaseIndex
524
486
  )
525
487
  when :defs
526
488
  body = parser_node.children[3] ? convert_parser_node(parser_node.children[3], source) : nil
527
- body_children = body&.type == :begin ? body.children : [body].compact
489
+ body_children = parser_body_children(body)
528
490
  receiver = parser_node.children[0].type == :self ? 'self' : parser_node.children[0].to_s
529
491
  Node.new(
530
492
  type: :defs,
@@ -605,13 +567,12 @@ module CodebaseIndex
605
567
  end
606
568
  end
607
569
 
608
- def extract_parser_source_span(node, source)
609
- lines = source.lines
610
- start_idx = node.loc.line - 1
611
- end_idx = node.loc.expression.last_line - 1
612
- return nil if start_idx.negative? || end_idx >= lines.length
570
+ def parser_body_children(body_node)
571
+ body_node&.type == :begin ? body_node.children : [body_node].compact
572
+ end
613
573
 
614
- lines[start_idx..end_idx].join
574
+ def extract_parser_source_span(node, source)
575
+ extract_source_span(source, node.loc.line, node.loc.expression.last_line)
615
576
  end
616
577
 
617
578
  def extract_parser_source_text(node, source)
@@ -26,7 +26,7 @@ module CodebaseIndex
26
26
  # config.vector_store_options = { url: ENV['QDRANT_URL'], collection: 'myapp' }
27
27
  # end
28
28
  #
29
- class Builder
29
+ class Builder # rubocop:disable Metrics/ClassLength
30
30
  # Named presets mapping to default adapter types.
31
31
  #
32
32
  # :local — fully local, no external services required
@@ -74,14 +74,25 @@ module CodebaseIndex
74
74
 
75
75
  # Build a {Retriever} wired with adapters from the configuration.
76
76
  #
77
- # @return [Retriever] A fully instantiated, wired retriever
77
+ # When `cache_enabled` is true, the embedding provider is wrapped with
78
+ # {Cache::CachedEmbeddingProvider} and the retriever is wrapped with
79
+ # {Cache::CachedRetriever} for transparent caching of expensive operations.
80
+ #
81
+ # @return [Retriever, Cache::CachedRetriever] A fully wired retriever
78
82
  def build_retriever
79
- Retriever.new(
83
+ provider = build_embedding_provider
84
+ cache = build_cache_store
85
+
86
+ provider = wrap_with_embedding_cache(provider, cache) if cache
87
+
88
+ retriever = Retriever.new(
80
89
  vector_store: build_vector_store,
81
90
  metadata_store: build_metadata_store,
82
91
  graph_store: build_graph_store,
83
- embedding_provider: build_embedding_provider
92
+ embedding_provider: provider
84
93
  )
94
+
95
+ cache ? wrap_with_retriever_cache(retriever, cache) : retriever
85
96
  end
86
97
 
87
98
  # Instantiate the vector store adapter specified by the configuration.
@@ -133,5 +144,57 @@ module CodebaseIndex
133
144
  else raise ArgumentError, "Unknown graph_store: #{@config.graph_store}"
134
145
  end
135
146
  end
147
+
148
+ # Build a cache store from configuration, or nil if caching is disabled.
149
+ #
150
+ # @return [Cache::CacheStore, nil]
151
+ def build_cache_store
152
+ return nil unless @config.cache_enabled
153
+
154
+ opts = @config.cache_options || {}
155
+
156
+ case @config.cache_store
157
+ when :memory
158
+ Cache::InMemory.new(max_entries: opts.fetch(:max_entries, 500))
159
+ when :redis
160
+ require_relative 'cache/redis_cache_store'
161
+ Cache::RedisCacheStore.new(redis: opts.fetch(:redis), default_ttl: opts[:default_ttl])
162
+ when :solid_cache
163
+ require_relative 'cache/solid_cache_store'
164
+ Cache::SolidCacheStore.new(cache: opts.fetch(:cache), default_ttl: opts[:default_ttl])
165
+ when Cache::CacheStore
166
+ @config.cache_store
167
+ else
168
+ raise ArgumentError, "Unknown cache_store: #{@config.cache_store}"
169
+ end
170
+ end
171
+
172
+ # Wrap an embedding provider with caching.
173
+ #
174
+ # @param provider [Embedding::Provider::Interface]
175
+ # @param cache [Cache::CacheStore]
176
+ # @return [Cache::CachedEmbeddingProvider]
177
+ def wrap_with_embedding_cache(provider, cache)
178
+ ttls = (@config.cache_options || {}).fetch(:ttl, {})
179
+ Cache::CachedEmbeddingProvider.new(
180
+ provider: provider,
181
+ cache_store: cache,
182
+ ttl: ttls.fetch(:embeddings, Cache::DEFAULT_TTLS[:embeddings])
183
+ )
184
+ end
185
+
186
+ # Wrap a retriever with caching.
187
+ #
188
+ # @param retriever [Retriever]
189
+ # @param cache [Cache::CacheStore]
190
+ # @return [Cache::CachedRetriever]
191
+ def wrap_with_retriever_cache(retriever, cache)
192
+ ttls = (@config.cache_options || {}).fetch(:ttl, {})
193
+ Cache::CachedRetriever.new(
194
+ retriever: retriever,
195
+ cache_store: cache,
196
+ context_ttl: ttls.fetch(:context, Cache::DEFAULT_TTLS[:context])
197
+ )
198
+ end
136
199
  end
137
200
  end
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digest'
4
+ require_relative 'cache_store'
5
+
6
+ module CodebaseIndex
7
+ module Cache
8
+ # Decorator that wraps an embedding provider with cache-through logic.
9
+ #
10
+ # Implements the same {Embedding::Provider::Interface} so it can be
11
+ # injected transparently in place of the real provider. On cache hit,
12
+ # the expensive API call (OpenAI, Ollama) is skipped entirely.
13
+ #
14
+ # @example
15
+ # real_provider = Embedding::Provider::OpenAI.new(api_key: key)
16
+ # cached = CachedEmbeddingProvider.new(provider: real_provider, cache_store: store)
17
+ # cached.embed("How does User work?") # API call + cache write
18
+ # cached.embed("How does User work?") # cache hit, no API call
19
+ #
20
+ class CachedEmbeddingProvider
21
+ include Embedding::Provider::Interface
22
+
23
+ # @param provider [Embedding::Provider::Interface] The real embedding provider
24
+ # @param cache_store [CacheStore] Cache backend instance
25
+ # @param ttl [Integer] TTL for cached embeddings in seconds
26
+ def initialize(provider:, cache_store:, ttl: DEFAULT_TTLS[:embeddings])
27
+ @provider = provider
28
+ @cache_store = cache_store
29
+ @ttl = ttl
30
+ end
31
+
32
+ # Embed a single text, returning a cached vector when available.
33
+ #
34
+ # @param text [String] Text to embed
35
+ # @return [Array<Float>] Embedding vector
36
+ def embed(text)
37
+ key = embedding_key(text)
38
+ @cache_store.fetch(key, ttl: @ttl) { @provider.embed(text) }
39
+ end
40
+
41
+ # Embed a batch of texts, using cached vectors for any previously seen texts.
42
+ #
43
+ # Only texts that are not already cached are sent to the real provider.
44
+ # Results are merged back in original order.
45
+ #
46
+ # @param texts [Array<String>] Texts to embed
47
+ # @return [Array<Array<Float>>] Embedding vectors (same order as input)
48
+ def embed_batch(texts)
49
+ results, misses, miss_indices = partition_cached(texts)
50
+
51
+ if misses.any?
52
+ fresh_vectors = @provider.embed_batch(misses)
53
+ misses.each_with_index do |text, i|
54
+ results[miss_indices[i]] = fresh_vectors[i]
55
+ begin
56
+ @cache_store.write(embedding_key(text), fresh_vectors[i], ttl: @ttl)
57
+ rescue StandardError => e
58
+ warn("[CodebaseIndex] CachedEmbeddingProvider cache write failed: #{e.message}")
59
+ end
60
+ end
61
+ end
62
+
63
+ results
64
+ end
65
+
66
+ # Delegate dimensions to the underlying provider.
67
+ #
68
+ # @return [Integer]
69
+ def dimensions
70
+ @provider.dimensions
71
+ end
72
+
73
+ # Delegate model_name to the underlying provider.
74
+ #
75
+ # @return [String]
76
+ def model_name
77
+ @provider.model_name
78
+ end
79
+
80
+ private
81
+
82
+ # Split texts into cached hits and uncached misses.
83
+ #
84
+ # @param texts [Array<String>]
85
+ # @return [Array(Array, Array<String>, Array<Integer>)]
86
+ def partition_cached(texts)
87
+ results = Array.new(texts.size)
88
+ misses = []
89
+ miss_indices = []
90
+
91
+ texts.each_with_index do |text, idx|
92
+ cached = @cache_store.read(embedding_key(text))
93
+ if cached
94
+ results[idx] = cached
95
+ else
96
+ misses << text
97
+ miss_indices << idx
98
+ end
99
+ end
100
+
101
+ [results, misses, miss_indices]
102
+ end
103
+
104
+ # Build a cache key for an embedding text.
105
+ #
106
+ # @param text [String]
107
+ # @return [String]
108
+ def embedding_key(text)
109
+ Cache.cache_key(:embeddings, Digest::SHA256.hexdigest(text))
110
+ end
111
+ end
112
+
113
+ # Decorator that wraps a {Retriever} with result caching.
114
+ #
115
+ # Caches the full formatted context output (the most token-expensive artifact)
116
+ # keyed by query + budget. Also caches the structural context overview
117
+ # separately with a longer TTL.
118
+ #
119
+ # @example
120
+ # retriever = CodebaseIndex::Retriever.new(...)
121
+ # cached = CachedRetriever.new(retriever: retriever, cache_store: store)
122
+ # cached.retrieve("How does User work?") # full pipeline + cache
123
+ # cached.retrieve("How does User work?") # instant cache hit
124
+ #
125
+ class CachedRetriever
126
+ # @param retriever [Retriever] The real retriever instance
127
+ # @param cache_store [CacheStore] Cache backend instance
128
+ # @param context_ttl [Integer] TTL for formatted context results
129
+ def initialize(retriever:, cache_store:, context_ttl: DEFAULT_TTLS[:context])
130
+ @retriever = retriever
131
+ @cache_store = cache_store
132
+ @context_ttl = context_ttl
133
+ end
134
+
135
+ # Execute the retrieval pipeline with context-level caching.
136
+ #
137
+ # On cache hit, returns a RetrievalResult reconstructed from cached data
138
+ # without running any pipeline stages. On miss, delegates to the real
139
+ # retriever and caches the serializable parts of the result.
140
+ #
141
+ # @param query [String] Natural language query
142
+ # @param budget [Integer] Token budget
143
+ # @return [Retriever::RetrievalResult]
144
+ def retrieve(query, budget: 8000)
145
+ key = context_key(query, budget)
146
+ cached = @cache_store.read(key)
147
+
148
+ if cached
149
+ return Retriever::RetrievalResult.new(
150
+ context: cached['context'],
151
+ sources: cached['sources'],
152
+ classification: nil,
153
+ strategy: cached['strategy']&.to_sym,
154
+ tokens_used: cached['tokens_used'],
155
+ budget: budget,
156
+ trace: nil
157
+ )
158
+ end
159
+
160
+ result = @retriever.retrieve(query, budget: budget)
161
+
162
+ begin
163
+ @cache_store.write(key, serialize_result(result), ttl: @context_ttl)
164
+ rescue StandardError => e
165
+ warn("[CodebaseIndex] CachedRetriever cache write failed: #{e.message}")
166
+ end
167
+
168
+ result
169
+ end
170
+
171
+ private
172
+
173
+ # Build a cache key for a context result.
174
+ #
175
+ # @param query [String]
176
+ # @param budget [Integer]
177
+ # @return [String]
178
+ def context_key(query, budget)
179
+ Cache.cache_key(:context, query, budget.to_s)
180
+ end
181
+
182
+ # Serialize a RetrievalResult to a JSON-safe hash.
183
+ #
184
+ # Only caches the fields needed to reconstruct a useful result:
185
+ # context string, sources list, strategy, and token count.
186
+ #
187
+ # @param result [Retriever::RetrievalResult]
188
+ # @return [Hash]
189
+ def serialize_result(result)
190
+ {
191
+ 'context' => result.context,
192
+ 'sources' => result.sources,
193
+ 'strategy' => result.strategy&.to_s,
194
+ 'tokens_used' => result.tokens_used
195
+ }
196
+ end
197
+ end
198
+ end
199
+ end