codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
# Rails integration hook for CodebaseIndex.
#
# Registers the gem's rake tasks with the host application and, when
# `session_tracer_enabled` is set on the CodebaseIndex configuration,
# appends the session tracer middleware to the application's middleware stack.
class Railtie < Rails::Railtie
  rake_tasks { load File.expand_path('../tasks/codebase_index.rake', __dir__) }

  initializer 'codebase_index.session_tracer' do |app|
    settings = CodebaseIndex.configuration
    next unless settings.session_tracer_enabled

    # Required lazily so the middleware file is only loaded when tracing is on.
    require 'codebase_index/session_tracer/middleware'

    tracer_options = {
      store: settings.session_store,
      session_id_proc: settings.session_id_proc,
      exclude_paths: settings.session_exclude_paths
    }
    app.middleware.use(CodebaseIndex::SessionTracer::Middleware, **tracer_options)
  end
end
26
+ end
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Resilience
5
# Error raised by {CircuitBreaker#call} when the circuit is open and the
# call is rejected without running the block.
#
# @example Falling back when the circuit is open
#   begin
#     breaker.call { provider.embed(text) }
#   rescue CircuitOpenError
#     use_cached_result(text)
#   end
class CircuitOpenError < CodebaseIndex::Error
end
14
+
15
# Circuit breaker pattern for protecting external service calls.
#
# Tracks consecutive failures and transitions between three states:
# - **:closed** — normal operation, calls pass through
# - **:open** — too many failures, calls are rejected immediately
# - **:half_open** — testing recovery; exactly one probe call is allowed
#   through, and every other call is rejected until that probe finishes
#
# A successful call closes the circuit and clears the failure count. A failed
# call increments the count and opens the circuit once the threshold is hit.
#
# @example Basic usage
#   breaker = CircuitBreaker.new(threshold: 5, reset_timeout: 60)
#   result = breaker.call { external_service.request }
#
# @example With fallback logic
#   breaker = CircuitBreaker.new(threshold: 3, reset_timeout: 30)
#   begin
#     breaker.call { api.embed(text) }
#   rescue CircuitOpenError
#     # Service is down, use fallback
#   end
class CircuitBreaker
  # @return [Symbol] Current state — :closed, :open, or :half_open
  attr_reader :state

  # @param threshold [Integer] Number of consecutive failures before opening the circuit
  # @param reset_timeout [Numeric] Seconds to wait before transitioning from open to half_open
  def initialize(threshold: 5, reset_timeout: 60)
    @threshold = threshold
    @reset_timeout = reset_timeout
    @state = :closed
    @failure_count = 0
    @last_failure_time = nil
    @probe_in_flight = false
    @mutex = Mutex.new
  end

  # Execute a block through the circuit breaker.
  #
  # The mutex is only held while inspecting or updating breaker state; the
  # block itself runs outside the lock so slow calls do not serialize callers.
  #
  # @yield The block to execute
  # @return [Object] The return value of the block
  # @raise [ArgumentError] if no block is given
  # @raise [CircuitOpenError] if the circuit is open and the timeout has not
  #   elapsed, or if it is half-open and a recovery probe is already running
  # @raise [StandardError] re-raises any error from the block
  def call(&block)
    # A missing block is a caller bug, not a service failure — reject it
    # before it can be counted against the circuit.
    raise ArgumentError, 'CircuitBreaker#call requires a block' unless block

    probing = @mutex.synchronize { admit_call }

    begin
      result = block.call
    rescue CircuitOpenError
      # Raised by the block itself (e.g. a nested breaker); not counted as a failure.
      raise
    rescue StandardError
      @mutex.synchronize { record_failure }
      raise
    end

    @mutex.synchronize { reset! }
    result
  ensure
    # Always release the probe slot, even on non-StandardError exits, so the
    # breaker can never get stuck rejecting calls in :half_open.
    @mutex.synchronize { @probe_in_flight = false } if probing
  end

  private

  # Decide whether a call may proceed. Must be called with the mutex held.
  #
  # @return [Boolean] true when the admitted call is the half-open recovery probe
  # @raise [CircuitOpenError] when the call must be rejected
  def admit_call
    case @state
    when :open
      unless reset_timeout_elapsed?
        raise CircuitOpenError, "Circuit breaker is open (#{@failure_count} failures)"
      end

      @state = :half_open
      @probe_in_flight = true
    when :half_open
      if @probe_in_flight
        raise CircuitOpenError, 'Circuit breaker is half-open (recovery probe in progress)'
      end

      @probe_in_flight = true
    else
      false
    end
  end

  # @return [Boolean] whether reset_timeout seconds have passed since the last failure
  def reset_timeout_elapsed?
    monotonic_now - @last_failure_time >= @reset_timeout
  end

  # Monotonic timestamp in seconds; unlike Time.now it is unaffected by
  # system clock adjustments, so elapsed-time checks cannot go backwards.
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end

  # Record a failure and potentially open the circuit.
  def record_failure
    @failure_count += 1
    @last_failure_time = monotonic_now
    @state = :open if @failure_count >= @threshold
  end

  # Reset the circuit breaker to closed state with zero failures.
  def reset!
    @state = :closed
    @failure_count = 0
    @last_failure_time = nil
  end
end
98
+ end
99
+ end
@@ -0,0 +1,185 @@
1
+ # frozen_string_literal: true
2
+
3
require 'digest'
require 'json'
require 'set'
5
+
6
+ module CodebaseIndex
7
+ module Resilience
8
# Validates the integrity of a codebase index output directory.
#
# Checks that:
# - Each type directory has a valid `_index.json` (a JSON array of entries,
#   each with a string `identifier`)
# - All files referenced in the index exist on disk
# - Content hashes (source_hash) match the actual source_code
# - No stale unit files exist that aren't listed in the index
#
# Malformed JSON and malformed entries are reported as errors in the
# returned report rather than raised.
#
# @example
#   validator = IndexValidator.new(index_dir: "tmp/codebase_index")
#   report = validator.validate
#   puts report.errors if !report.valid?
class IndexValidator
  # Report produced by {#validate}.
  #
  # @!attribute [r] valid?
  #   @return [Boolean] true if no errors were found
  # @!attribute [r] warnings
  #   @return [Array<String>] non-fatal issues (e.g., stale files)
  # @!attribute [r] errors
  #   @return [Array<String>] fatal integrity issues
  ValidationReport = Struct.new(:valid?, :warnings, :errors, keyword_init: true)

  # @param index_dir [String] Path to the codebase index output directory
  def initialize(index_dir:)
    @index_dir = index_dir
  end

  # Validate the index directory and return a report.
  #
  # @return [ValidationReport] the validation results
  def validate
    warnings = []
    errors = []

    unless Dir.exist?(@index_dir)
      errors << "Index directory does not exist: #{@index_dir}"
      return ValidationReport.new(valid?: false, warnings: warnings, errors: errors)
    end

    type_directories.each do |type_dir|
      validate_type_directory(type_dir, warnings, errors)
    end

    ValidationReport.new(valid?: errors.empty?, warnings: warnings, errors: errors)
  end

  private

  # List the immediate subdirectories of the index directory.
  # Sorted so the report order does not depend on filesystem ordering.
  #
  # @return [Array<String>] Paths of the type directories
  def type_directories
    Dir.children(@index_dir).sort.filter_map do |name|
      full_path = File.join(@index_dir, name)
      full_path if File.directory?(full_path)
    end
  end

  # Validate a single type directory (e.g., models/, controllers/).
  #
  # @param type_dir [String] Path to the type directory
  # @param warnings [Array<String>] Accumulated warnings
  # @param errors [Array<String>] Accumulated errors
  def validate_type_directory(type_dir, warnings, errors)
    type_name = File.basename(type_dir)
    index_path = File.join(type_dir, '_index.json')

    unless File.exist?(index_path)
      errors << "Missing _index.json in #{type_name}/"
      return
    end

    index_entries = load_index_entries(index_path, type_name, errors)
    # Without a readable index there is nothing to compare unit files against.
    return unless index_entries

    indexed_identifiers = Set.new

    index_entries.each do |entry|
      identifier = entry.is_a?(Hash) ? entry['identifier'] : nil
      unless identifier.is_a?(String) && !identifier.empty?
        errors << "Invalid entry in #{type_name}/_index.json: missing identifier"
        next
      end

      indexed_identifiers << identifier
      validate_index_entry(type_dir, type_name, identifier, errors)
    end

    check_stale_files(type_dir, type_name, indexed_identifiers, warnings)
  end

  # Read and parse an `_index.json`, recording an error instead of raising
  # when the file is not valid JSON or is not an array.
  #
  # @param index_path [String] Path to the `_index.json` file
  # @param type_name [String] Name of the type (for error messages)
  # @param errors [Array<String>] Accumulated errors
  # @return [Array, nil] The index entries, or nil if the index is unusable
  def load_index_entries(index_path, type_name, errors)
    entries = JSON.parse(File.read(index_path))
    return entries if entries.is_a?(Array)

    errors << "Invalid _index.json in #{type_name}/: expected a JSON array of entries"
    nil
  rescue JSON::ParserError => e
    errors << "Invalid JSON in #{type_name}/_index.json: #{e.message}"
    nil
  end

  # Validate that a single index entry has a corresponding unit file with correct hash.
  #
  # @param type_dir [String] Path to the type directory
  # @param type_name [String] Name of the type (for error messages)
  # @param identifier [String] The unit identifier from the index
  # @param errors [Array<String>] Accumulated errors
  def validate_index_entry(type_dir, type_name, identifier, errors)
    unit_file = find_unit_file(type_dir, identifier)

    unless unit_file
      errors << "Missing unit file for #{identifier} in #{type_name}/"
      return
    end

    validate_content_hash(unit_file, identifier, errors)
  end

  # Find the JSON file for a given identifier in a type directory.
  #
  # @param type_dir [String] Path to the type directory
  # @param identifier [String] The unit identifier
  # @return [String, nil] Path to the unit file, or nil if not found
  def find_unit_file(type_dir, identifier)
    # Try collision-safe first (current format), then legacy safe_filename, then exact match
    candidate_filenames(identifier)
      .map { |filename| File.join(type_dir, filename) }
      .find { |path| File.exist?(path) }
  end

  # All filenames under which a unit with this identifier may be stored,
  # in lookup priority order.
  #
  # @param identifier [String] The unit identifier
  # @return [Array<String>]
  def candidate_filenames(identifier)
    [collision_safe_filename(identifier), safe_filename(identifier), "#{identifier}.json"]
  end

  # Validate that the source_hash in a unit file matches the actual source_code.
  # Units missing either field are skipped; unparseable or malformed unit
  # files are recorded as errors.
  #
  # @param unit_file [String] Path to the unit JSON file
  # @param identifier [String] The unit identifier (for error messages)
  # @param errors [Array<String>] Accumulated errors
  def validate_content_hash(unit_file, identifier, errors)
    data = JSON.parse(File.read(unit_file))
    unless data.is_a?(Hash)
      errors << "Invalid unit file for #{identifier}: expected a JSON object"
      return
    end

    source_code = data['source_code']
    stored_hash = data['source_hash']
    return unless source_code && stored_hash

    unless source_code.is_a?(String) && stored_hash.is_a?(String)
      errors << "Malformed source_code/source_hash for #{identifier}: expected strings"
      return
    end

    expected_hash = Digest::SHA256.hexdigest(source_code)
    return if stored_hash == expected_hash

    errors << "Content hash mismatch for #{identifier}: expected #{expected_hash[0..7]}..., " \
              "got #{stored_hash[0..7]}..."
  rescue JSON::ParserError => e
    errors << "Invalid JSON in unit file for #{identifier}: #{e.message}"
  end

  # Check for unit files that exist on disk but aren't referenced in the index.
  #
  # @param type_dir [String] Path to the type directory
  # @param type_name [String] Name of the type (for warning messages)
  # @param indexed_identifiers [Set<String>] Identifiers listed in the index
  # @param warnings [Array<String>] Accumulated warnings
  def check_stale_files(type_dir, type_name, indexed_identifiers, warnings)
    # Build a set of expected filenames from indexed identifiers (both current and legacy formats)
    expected_filenames = Set.new(['_index.json'])
    indexed_identifiers.each { |id| expected_filenames.merge(candidate_filenames(id)) }

    # `base:` keeps the directory path out of the glob pattern, so special
    # characters in the path are not interpreted as glob syntax.
    Dir.glob('*.json', base: type_dir).sort.each do |basename|
      next if expected_filenames.include?(basename)

      warnings << "Stale file not in index: #{type_name}/#{basename}"
    end
  end

  # Convert an identifier to a safe filename (legacy format, mirrors Extractor#safe_filename).
  #
  # @param identifier [String] The unit identifier (e.g., "Admin::UsersController")
  # @return [String] A filesystem-safe filename (e.g., "Admin__UsersController.json")
  def safe_filename(identifier)
    "#{normalized_basename(identifier)}.json"
  end

  # Convert an identifier to a collision-safe filename (current format).
  # Mirrors {Extractor#collision_safe_filename} — appends a short SHA256 digest
  # to disambiguate identifiers that normalize to the same safe_filename.
  #
  # @param identifier [String] The unit identifier
  # @return [String] Collision-safe filename (e.g., "Admin__UsersController_a1b2c3d4.json")
  def collision_safe_filename(identifier)
    digest = Digest::SHA256.hexdigest(identifier)[0, 8]
    "#{normalized_basename(identifier)}_#{digest}.json"
  end

  # Replace namespace separators with "__" and any other character outside
  # [a-zA-Z0-9_-] with "_".
  #
  # @param identifier [String] The unit identifier
  # @return [String] The normalized name without extension
  def normalized_basename(identifier)
    identifier.gsub('::', '__').gsub(/[^a-zA-Z0-9_-]/, '_')
  end
end
184
+ end
185
+ end
@@ -0,0 +1,108 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative '../embedding/provider'
4
+ require_relative 'circuit_breaker'
5
+
6
+ module CodebaseIndex
7
+ module Resilience
8
# Decorates an embedding provider with retries and an optional circuit breaker.
#
# Failed calls are retried with exponential backoff. When a circuit breaker
# is supplied, every embed call is routed through it, and a
# {CircuitOpenError} is re-raised immediately instead of being retried.
#
# @example Without circuit breaker
#   retryable = RetryableProvider.new(provider: ollama_provider, max_retries: 3)
#   vector = retryable.embed("some text")
#
# @example With circuit breaker
#   breaker = CircuitBreaker.new(threshold: 5, reset_timeout: 60)
#   retryable = RetryableProvider.new(
#     provider: ollama_provider,
#     max_retries: 3,
#     circuit_breaker: breaker
#   )
#   vector = retryable.embed("some text")
class RetryableProvider
  include CodebaseIndex::Embedding::Provider::Interface

  # @param provider [#embed, #embed_batch, #dimensions, #model_name] The wrapped embedding provider
  # @param max_retries [Integer] How many times a failed call is retried
  # @param circuit_breaker [CircuitBreaker, nil] Breaker to route calls through, if any
  def initialize(provider:, max_retries: 3, circuit_breaker: nil)
    @provider = provider
    @max_retries = max_retries
    @circuit_breaker = circuit_breaker
  end

  # Embed one text, retrying on failure.
  #
  # @param text [String] the text to embed
  # @return [Array<Float>] the embedding vector
  # @raise [CircuitOpenError] if the circuit breaker is open
  # @raise [StandardError] if all retries are exhausted
  def embed(text)
    with_retries do
      call_provider { @provider.embed(text) }
    end
  end

  # Embed several texts in one provider call, retrying on failure.
  #
  # @param texts [Array<String>] the texts to embed
  # @return [Array<Array<Float>>] array of embedding vectors
  # @raise [CircuitOpenError] if the circuit breaker is open
  # @raise [StandardError] if all retries are exhausted
  def embed_batch(texts)
    with_retries do
      call_provider { @provider.embed_batch(texts) }
    end
  end

  # Dimensionality of the wrapped provider's vectors.
  #
  # @return [Integer] number of dimensions
  def dimensions
    @provider.dimensions
  end

  # Model name reported by the wrapped provider.
  #
  # @return [String] model name
  def model_name
    @provider.model_name
  end

  private

  # Run the block, retrying up to @max_retries times with exponential
  # backoff (0.2s, 0.4s, 0.8s, ...).
  #
  # @yield The block to execute
  # @return [Object] The return value of the block
  # @raise [CircuitOpenError] immediately without retrying
  # @raise [StandardError] the last error if all retries are exhausted
  def with_retries
    failures = 0
    begin
      yield
    rescue CircuitOpenError
      raise
    rescue StandardError
      failures += 1
      raise if failures > @max_retries

      sleep((2**failures) * 0.1)
      retry
    end
  end

  # Invoke the block directly, or via the circuit breaker when one is configured.
  #
  # @yield The block to execute
  # @return [Object] The return value of the block
  def call_provider(&block)
    return block.call unless @circuit_breaker

    @circuit_breaker.call(&block)
  end
end
107
+ end
108
+ end
@@ -0,0 +1,249 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Retrieval
5
# Transforms ranked search candidates into a token-budgeted context string
# for LLM consumption.
#
# Budgeting works in two steps:
# - Structural: an optional codebase overview, truncated to 10% of the budget
# - The tokens left after the structural section are split between
#   Primary (direct query results), Supporting (graph-expansion results) and
#   Framework (rails_source / gem_source units): 55% / 25% / 20% when the
#   query has framework context, otherwise 65% / 35% / 0%
#
# Each candidate is rendered in exactly one section: when the framework
# section is active, framework units go there and are left out of the
# primary and supporting sections.
#
# @example
#   assembler = ContextAssembler.new(metadata_store: store)
#   result = assembler.assemble(candidates: ranked, classification: cls)
#   result.context      # => "## User (model)\n..."
#   result.tokens_used  # => 4200
#   result.sections     # => [:primary, :supporting]
#
class ContextAssembler
  DEFAULT_BUDGET = 8000 # tokens

  BUDGET_ALLOCATION = {
    structural: 0.10,
    primary: 0.50,
    supporting: 0.25,
    framework: 0.15
  }.freeze

  # Minimum token count for a section to be worth including.
  MIN_USEFUL_TOKENS = 200

  # @param metadata_store [#find] Store that resolves identifiers to unit data
  # @param budget [Integer] Total token budget
  def initialize(metadata_store:, budget: DEFAULT_BUDGET)
    @metadata_store = metadata_store
    @budget = budget
  end

  # Assemble context from ranked candidates within token budget.
  #
  # @param candidates [Array<Candidate>] Ranked search candidates
  # @param classification [QueryClassifier::Classification] Query classification
  # @param structural_context [String, nil] Optional codebase overview text
  # @param budget [Integer, nil] Override token budget; falls back to @budget
  # @return [AssembledContext] Token-budgeted context with source attribution
  def assemble(candidates:, classification:, structural_context: nil, budget: nil)
    effective_budget = budget || @budget
    sections = []
    sources = []

    # 1. Structural context (always first if provided)
    tokens_used = add_structural_section(sections, structural_context, 0, effective_budget)

    # 2. Compute per-section budgets from remaining tokens
    budgets = compute_section_budgets(effective_budget - tokens_used, classification)

    # 3. Primary, supporting, and framework sections (in that order)
    grouped = partition_candidates(candidates, framework_enabled: budgets[:framework].positive?)
    %i[primary supporting framework].each do |section_name|
      add_candidate_section(sections, sources, section_name, grouped[section_name], budgets[section_name])
    end

    build_result(sections, sources, effective_budget)
  end

  private

  # Assign every candidate to exactly one section so the same unit is never
  # rendered twice in the assembled context.
  #
  # @param candidates [Array<Candidate>] All ranked candidates
  # @param framework_enabled [Boolean] Whether a framework section will be emitted
  # @return [Hash<Symbol, Array<Candidate>>] Candidates keyed by section name
  def partition_candidates(candidates, framework_enabled:)
    framework, application =
      if framework_enabled
        candidates.partition { |c| framework_candidate?(c) }
      else
        # No framework section: framework units stay in primary/supporting.
        [[], candidates]
      end
    supporting, primary = application.partition { |c| c.source == :graph_expansion }

    { primary: primary, supporting: supporting, framework: framework }
  end

  # Add structural context section if non-blank text was provided.
  #
  # @return [Integer] Updated tokens_used count
  def add_structural_section(sections, structural_context, tokens_used, effective_budget)
    # A blank overview would only add an empty section and a dangling separator.
    return tokens_used if structural_context.nil? || structural_context.strip.empty?

    budget = (effective_budget * BUDGET_ALLOCATION[:structural]).to_i
    text = truncate_to_budget(structural_context, budget)
    sections << { section: :structural, content: text }
    tokens_used + estimate_tokens(text)
  end

  # Add a candidate-based section if candidates produce content.
  #
  # @return [void]
  def add_candidate_section(sections, sources, section_name, candidates, budget)
    return if candidates.empty?

    content, section_sources = assemble_section(candidates, budget)
    return if content.empty?

    sections << { section: section_name, content: content }
    sources.concat(section_sources)
  end

  # Compute token budgets for primary/supporting/framework sections.
  #
  # @param remaining [Integer] Tokens available after structural
  # @param classification [QueryClassifier::Classification]
  # @return [Hash<Symbol, Integer>]
  def compute_section_budgets(remaining, classification)
    if classification.framework_context
      {
        primary: (remaining * 0.55).to_i,
        supporting: (remaining * 0.25).to_i,
        framework: (remaining * 0.20).to_i
      }
    else
      {
        primary: (remaining * 0.65).to_i,
        supporting: (remaining * 0.35).to_i,
        framework: 0
      }
    end
  end

  # Assemble content for a single section within a token budget.
  # Candidates are taken in descending score order until one no longer fits.
  #
  # @param candidates [Array<Candidate>] Candidates for this section
  # @param budget [Integer] Token budget for this section
  # @return [Array(String, Array<Hash>)] Content string and source attributions
  def assemble_section(candidates, budget)
    content_parts = []
    sources = []
    tokens_used = 0

    candidates.sort_by { |c| -c.score }.each do |candidate|
      tokens_used = append_candidate(content_parts, sources, candidate, budget, tokens_used)
      break if tokens_used.nil?
    end

    [content_parts.join("\n\n"), sources]
  end

  # Append a single candidate to the section.
  #
  # @return [Integer, nil] Updated tokens_used, or nil to stop filling the section
  def append_candidate(parts, sources, candidate, budget, tokens_used)
    unit = @metadata_store.find(candidate.identifier)
    # Unknown identifiers are skipped without consuming budget.
    return tokens_used unless unit

    text = format_unit(unit, candidate)
    tokens = estimate_tokens(text)
    remaining = budget - tokens_used

    if tokens <= remaining
      parts << text
      sources << build_source_attribution(candidate, unit)
      return tokens_used + tokens
    end

    # The unit does not fit: include a truncated copy only if the leftover
    # space is large enough to be useful, then stop either way.
    if remaining > MIN_USEFUL_TOKENS
      parts << truncate_to_budget(text, remaining)
      sources << build_source_attribution(candidate, unit, truncated: true)
    end
    nil
  end

  # Format a unit for inclusion in context.
  #
  # @param unit [Hash] Unit data from metadata store (symbol or string keys)
  # @param _candidate [Candidate] The search candidate (currently unused)
  # @return [String]
  def format_unit(unit, _candidate)
    identifier = unit[:identifier] || unit['identifier']
    type = unit[:type] || unit['type']
    file_path = unit[:file_path] || unit['file_path']
    source = unit[:source_code] || unit['source_code'] || ''

    <<~UNIT.strip
      ## #{identifier} (#{type})
      File: #{file_path}

      #{source}
    UNIT
  end

  # Build source attribution hash for a candidate.
  #
  # @return [Hash]
  def build_source_attribution(candidate, unit, truncated: false)
    attribution = {
      identifier: candidate.identifier,
      type: unit[:type] || unit['type'],
      score: candidate.score,
      file_path: unit[:file_path] || unit['file_path']
    }
    attribution[:truncated] = true if truncated
    attribution
  end

  # Check if a candidate is framework source.
  #
  # @param candidate [Candidate]
  # @return [Boolean]
  def framework_candidate?(candidate)
    metadata = candidate.metadata
    return false unless metadata

    type = metadata[:type] || metadata['type']
    %w[rails_source gem_source].include?(type.to_s)
  end

  # Truncate text to fit within a token budget.
  #
  # @param text [String]
  # @param token_budget [Integer]
  # @return [String]
  def truncate_to_budget(text, token_budget)
    return text if estimate_tokens(text) <= token_budget

    # Estimate target character count with 10% safety margin
    target_chars = (token_budget * 4.0 * 0.9).to_i
    "#{text[0...target_chars]}\n... [truncated]"
  end

  # Estimate token count using the project convention (4 characters per token).
  #
  # @param text [String]
  # @return [Integer]
  def estimate_tokens(text)
    (text.length / 4.0).ceil
  end

  # Build the final AssembledContext result.
  #
  # @param sections [Array<Hash>] Assembled sections
  # @param sources [Array<Hash>] Source attributions
  # @param effective_budget [Integer] The budget actually used for assembly
  # @return [AssembledContext]
  def build_result(sections, sources, effective_budget)
    context = sections.map { |s| s[:content] }.join("\n\n---\n\n")
    AssembledContext.new(
      context: context,
      tokens_used: estimate_tokens(context),
      budget: effective_budget,
      sources: sources.uniq,
      sections: sections.map { |s| s[:section] }
    )
  end
end
245
+
246
# Value object returned by {ContextAssembler#assemble}: the assembled context
# text, its estimated token count, the budget that was applied, the source
# attributions, and the names of the sections that were included.
AssembledContext = Struct.new(
  :context,
  :tokens_used,
  :budget,
  :sources,
  :sections,
  keyword_init: true
)
248
+ end
249
+ end