codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'json'
5
+
6
+ module CodebaseIndex
7
+ module Coordination
8
+ class LockError < CodebaseIndex::Error; end
9
+
10
# File-based lock that keeps two pipeline runs from executing concurrently.
#
# Acquiring the lock atomically creates "<lock_dir>/<name>.lock" containing a
# JSON payload (PID, timestamp, lock name). A lock file whose mtime is older
# than the stale timeout is treated as abandoned and is replaced on the next
# call to #acquire.
#
# An instance only ever deletes a lock file it wrote itself: if the file was
# replaced in the meantime (e.g. another process took over after the stale
# timeout elapsed), #release leaves it alone.
#
# @example
#   lock = PipelineLock.new(lock_dir: '/tmp', name: 'extraction')
#   lock.with_lock do
#     # extraction runs here
#   end
#
class PipelineLock
  DEFAULT_STALE_TIMEOUT = 3600 # 1 hour

  # @param lock_dir [String] Directory for lock files (created on demand)
  # @param name [String] Lock name (used as filename prefix)
  # @param stale_timeout [Integer] Seconds after which a lock is considered stale
  def initialize(lock_dir:, name:, stale_timeout: DEFAULT_STALE_TIMEOUT)
    @lock_dir = lock_dir
    @name = name
    @stale_timeout = stale_timeout
    @lock_path = File.join(lock_dir, "#{name}.lock")
    @held = false
    @payload = nil
  end

  # Attempt to acquire the lock without blocking.
  #
  # @return [Boolean] true if lock acquired, false if already held
  def acquire
    FileUtils.mkdir_p(@lock_dir)

    # Stale detection is separate from (and not atomic with) creation below.
    if File.exist?(@lock_path)
      return false unless stale?

      FileUtils.rm_f(@lock_path)
    end

    payload = lock_content
    # File::EXCL makes creation atomic: it fails if the file already exists.
    File.open(@lock_path, File::WRONLY | File::CREAT | File::EXCL) { |f| f.write(payload) }

    # Remember exactly what we wrote so ownership can be verified later.
    @payload = payload
    @held = true
    true
  rescue Errno::EEXIST
    false
  end

  # Release the lock. The lock file is removed only if it is still the one
  # this instance created; a file written by someone else is left in place.
  #
  # @return [void]
  def release
    FileUtils.rm_f(@lock_path) if owned?
    @payload = nil
    @held = false
  end

  # Execute a block while holding the lock. The lock is released when the
  # block finishes, whether it returns normally or raises.
  #
  # @yield Block to execute
  # @return [Object] Return value of the block
  # @raise [ArgumentError] if no block is given
  # @raise [LockError] if lock cannot be acquired
  def with_lock(&block)
    # Fail before touching the filesystem rather than after taking the lock.
    raise ArgumentError, 'with_lock requires a block' unless block
    raise LockError, "Cannot acquire lock '#{@name}' — another process is running" unless acquire

    begin
      block.call
    ensure
      release
    end
  end

  # Whether the lock is currently held by this instance, i.e. it acquired the
  # lock and the lock file on disk is still the one it wrote.
  #
  # @return [Boolean]
  def locked?
    owned?
  end

  private

  # Check if the existing lock file is stale (mtime older than the timeout).
  #
  # @return [Boolean]
  def stale?
    return false unless File.exist?(@lock_path)

    age = Time.now - File.mtime(@lock_path)
    age > @stale_timeout
  rescue Errno::ENOENT
    # The file vanished between the existence check and the mtime read.
    true
  end

  # Whether the lock file on disk is byte-for-byte the payload this instance
  # wrote when it acquired the lock.
  #
  # @return [Boolean]
  def owned?
    return false unless @held

    # Binary comparison avoids false mismatches caused by the default
    # external encoding when the lock name contains non-ASCII characters.
    File.binread(@lock_path) == @payload.b
  rescue Errno::ENOENT
    false
  end

  # @return [String] Lock file content (JSON with PID and timestamp)
  def lock_content
    # ISO 8601 timestamp with numeric UTC offset, built with core strftime so
    # the stdlib 'time' extension does not have to be loaded.
    locked_at = Time.now.strftime('%Y-%m-%dT%H:%M:%S%:z')
    JSON.generate(pid: Process.pid, locked_at: locked_at, name: @name)
  end
end
108
+ end
109
+ end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module CostModel
5
# Calculates embedding costs for full-index, incremental, and query-time
# scenarios using the per-million-token price looked up via
# ProviderPricing.cost_per_million.
#
# The model assumes a constant 450 tokens per chunk and 100 tokens per query
# (e.g. 500 units × 2.5 chunks = 1250 chunks × 450 = 562.5K tokens).
#
# @example
#   calc = EmbeddingCost.new(provider: :openai_small)
#   calc.full_index_cost(units: 500, chunk_multiplier: 2.5) # => 0.01125
#
class EmbeddingCost
  # Average tokens per chunk after hierarchical chunking with context prefix.
  TOKENS_PER_CHUNK = 450

  # Average tokens per retrieval query.
  TOKENS_PER_QUERY = 100

  # @param provider [Symbol] Embedding provider key from ProviderPricing
  def initialize(provider:)
    @cost_per_million = ProviderPricing.cost_per_million(provider)
  end

  # Cost to embed the full codebase index.
  #
  # @param units [Integer] Number of extracted units
  # @param chunk_multiplier [Float] Average chunks per unit (default 2.5)
  # @return [Float] Cost in USD
  def full_index_cost(units:, chunk_multiplier: 2.5)
    token_cost(total_tokens(units, chunk_multiplier))
  end

  # Cost to re-embed changed units from a single merge.
  #
  # @param changed_units [Integer] Number of units changed (default 5)
  # @param chunk_multiplier [Float] Average chunks per unit (default 2.5)
  # @return [Float] Cost in USD
  def incremental_cost(changed_units: 5, chunk_multiplier: 2.5)
    token_cost(total_tokens(changed_units, chunk_multiplier))
  end

  # Monthly cost for query-time embedding, assuming a 30-day month.
  #
  # @param daily_queries [Integer] Number of queries per day
  # @return [Float] Cost in USD per month
  def monthly_query_cost(daily_queries:)
    token_cost(daily_queries * 30 * TOKENS_PER_QUERY)
  end

  # Yearly embedding cost from incremental re-indexing.
  #
  # @param merges_per_year [Integer] Number of merges per year (default 2400)
  # @param changed_units_per_merge [Integer] Units changed per merge (default 5)
  # @param chunk_multiplier [Float] Average chunks per unit (default 2.5)
  # @return [Float] Cost in USD per year
  def yearly_incremental_cost(merges_per_year: 2400, changed_units_per_merge: 5, chunk_multiplier: 2.5)
    per_merge = total_tokens(changed_units_per_merge, chunk_multiplier)
    token_cost(per_merge * merges_per_year)
  end

  # Total tokens for a given number of units and chunk multiplier. The chunk
  # count is rounded up to a whole chunk.
  #
  # @param units [Integer] Number of units
  # @param chunk_multiplier [Float] Chunks per unit
  # @return [Integer] Total embedding tokens
  def total_tokens(units, chunk_multiplier)
    # Strip binary floating-point noise before taking the ceiling: 10 * 1.1
    # evaluates to 11.000000000000002, which would otherwise count 12 chunks.
    chunks = (units * chunk_multiplier).round(9).ceil
    chunks * TOKENS_PER_CHUNK
  end

  private

  # Convert token count to cost in USD.
  #
  # @param tokens [Numeric] Number of tokens
  # @return [Float] Cost in USD
  def token_cost(tokens)
    (tokens.to_f / 1_000_000) * @cost_per_million
  end
end
87
+ end
88
+ end
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module CostModel
5
# Unified cost estimator that combines embedding, storage, and query costs
# into a single breakdown for a given configuration. Embedding figures are
# delegated to EmbeddingCost and storage figures to StorageCost.
#
# @example
#   estimate = Estimator.new(
#     units: 500,
#     chunk_multiplier: 2.5,
#     embedding_provider: :openai_small,
#     dimensions: 1536,
#     daily_queries: 100
#   )
#   estimate.full_index_cost     # => 0.01125
#   estimate.monthly_query_cost  # => 0.006
#   estimate.storage_bytes       # => 9_985_000
#   estimate.to_h                # => { full_index_cost: ..., ... }
#
class Estimator
  # @return [Integer] Number of extracted units
  attr_reader :units

  # @return [Float] Average chunks per unit
  attr_reader :chunk_multiplier

  # @return [Symbol] Embedding provider key
  attr_reader :embedding_provider

  # @return [Integer] Embedding vector dimensions
  attr_reader :dimensions

  # @return [Integer] Number of retrieval queries per day
  attr_reader :daily_queries

  # @param units [Integer] Number of extracted units
  # @param embedding_provider [Symbol] Provider key from ProviderPricing
  # @param chunk_multiplier [Float] Average chunks per unit (default 2.5)
  # @param dimensions [Integer, nil] Vector dimensions (defaults to provider default)
  # @param daily_queries [Integer] Retrieval queries per day (default 100)
  def initialize(units:, embedding_provider:, chunk_multiplier: 2.5, dimensions: nil, daily_queries: 100)
    @units = units
    @chunk_multiplier = chunk_multiplier
    @embedding_provider = embedding_provider
    @dimensions = dimensions || ProviderPricing.default_dimensions(embedding_provider)
    @daily_queries = daily_queries

    @embedding_cost = EmbeddingCost.new(provider: embedding_provider)
    @storage_cost = StorageCost.new(dimensions: @dimensions)
  end

  # Cost to embed the full codebase index.
  #
  # @return [Float] Cost in USD
  def full_index_cost
    @embedding_cost.full_index_cost(units: units, chunk_multiplier: chunk_multiplier)
  end

  # Cost to re-embed a single merge (default 5 changed units).
  #
  # @param changed_units [Integer] Units changed per merge (default 5)
  # @return [Float] Cost in USD
  def incremental_per_merge_cost(changed_units: 5)
    @embedding_cost.incremental_cost(changed_units: changed_units, chunk_multiplier: chunk_multiplier)
  end

  # Monthly cost for query-time embedding.
  #
  # @return [Float] Cost in USD per month
  def monthly_query_cost
    @embedding_cost.monthly_query_cost(daily_queries: daily_queries)
  end

  # Yearly embedding cost from incremental re-indexing.
  #
  # @param merges_per_year [Integer] Merges per year (default 2400)
  # @param changed_units_per_merge [Integer] Units changed per merge (default 5)
  # @return [Float] Cost in USD per year
  def yearly_incremental_cost(merges_per_year: 2400, changed_units_per_merge: 5)
    @embedding_cost.yearly_incremental_cost(
      merges_per_year: merges_per_year,
      changed_units_per_merge: changed_units_per_merge,
      chunk_multiplier: chunk_multiplier
    )
  end

  # Total number of chunks for the codebase, rounded up to a whole chunk.
  # The value is computed once and cached.
  #
  # @return [Integer]
  def total_chunks
    # Strip binary floating-point noise before taking the ceiling: 10 * 1.1
    # evaluates to 11.000000000000002, which would otherwise count 12 chunks.
    @total_chunks ||= (units * chunk_multiplier).round(9).ceil
  end

  # Total storage in bytes for vector data.
  #
  # @return [Integer]
  def storage_bytes
    @storage_cost.storage_bytes(chunks: total_chunks)
  end

  # Total storage in megabytes for vector data.
  #
  # @return [Float]
  def storage_mb
    @storage_cost.storage_mb(chunks: total_chunks)
  end

  # Full cost breakdown as a Hash: every computed figure (using the default
  # per-merge and per-year assumptions) followed by the input configuration.
  #
  # @return [Hash{Symbol => Numeric}]
  def to_h
    {
      full_index_cost: full_index_cost,
      incremental_per_merge_cost: incremental_per_merge_cost,
      monthly_query_cost: monthly_query_cost,
      yearly_incremental_cost: yearly_incremental_cost,
      storage_bytes: storage_bytes,
      storage_mb: storage_mb,
      total_chunks: total_chunks,
      units: units,
      chunk_multiplier: chunk_multiplier,
      embedding_provider: embedding_provider,
      dimensions: dimensions,
      daily_queries: daily_queries
    }
  end
end
127
+ end
128
+ end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module CostModel
5
# Static (frozen) pricing and dimension tables for the supported embedding
# providers, each identified by a Symbol key.
#
# Costs are expressed in US dollars per 1 million tokens.
#
# @example
#   ProviderPricing.cost_per_million(:openai_small)  # => 0.02
#   ProviderPricing.providers                         # => [:openai_small, ...]
#
module ProviderPricing
  # Cost per 1 million tokens, in USD.
  #
  # @return [Hash{Symbol => Float}]
  COSTS_PER_MILLION_TOKENS = {
    openai_small: 0.02,
    openai_large: 0.13,
    voyage_code3: 0.06,
    ollama: 0.00
  }.freeze

  # Default embedding dimensions per provider.
  #
  # @return [Hash{Symbol => Integer}]
  DEFAULT_DIMENSIONS = {
    openai_small: 1536,
    openai_large: 3072,
    voyage_code3: 1024,
    ollama: 768
  }.freeze

  class << self
    # Look up the cost per 1M tokens for a provider.
    #
    # @param provider [Symbol] Provider key (e.g. :openai_small)
    # @return [Float] Cost in USD per 1M tokens
    # @raise [ArgumentError] if provider is unknown
    def cost_per_million(provider)
      lookup(COSTS_PER_MILLION_TOKENS, provider)
    end

    # Look up the default dimensions for a provider.
    #
    # @param provider [Symbol] Provider key
    # @return [Integer] Default embedding dimensions
    # @raise [ArgumentError] if provider is unknown
    def default_dimensions(provider)
      lookup(DEFAULT_DIMENSIONS, provider)
    end

    # List all known provider keys, in the order of the pricing table.
    #
    # @return [Array<Symbol>]
    def providers
      COSTS_PER_MILLION_TOKENS.keys
    end

    private

    # Read one provider's entry from a table, or raise the shared
    # "unknown provider" error listing the valid keys.
    #
    # @param table [Hash{Symbol => Numeric}] Table to read from
    # @param provider [Symbol] Provider key
    # @return [Numeric]
    # @raise [ArgumentError] if provider is not a key of the table
    def lookup(table, provider)
      return table[provider] if table.key?(provider)

      raise ArgumentError, "Unknown embedding provider: #{provider.inspect}. " \
                           "Valid providers: #{providers.join(', ')}"
    end
  end
end
66
+ end
67
+ end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module CostModel
5
# Estimates vector storage size from the embedding dimensions and the number
# of chunks.
#
# Bytes per vector = dimensions × 4 (float32) × 1.3 metadata overhead,
# rounded up to a whole byte per vector before multiplying by the chunk count.
#
# @example
#   calc = StorageCost.new(dimensions: 1536)
#   calc.bytes_per_vector             # => 7988
#   calc.storage_bytes(chunks: 1250)  # => 9_985_000
#   calc.storage_mb(chunks: 1250)     # => 9.52
#
class StorageCost
  # Bytes per float32 value.
  BYTES_PER_FLOAT = 4

  # Metadata overhead multiplier (JSONB payload, indexes, etc.).
  METADATA_OVERHEAD = 1.3

  # @param dimensions [Integer] Embedding vector dimensions
  def initialize(dimensions:)
    @dimensions = dimensions
  end

  # Bytes per vector including metadata overhead, rounded up. Computed on
  # first use and cached for the lifetime of the instance.
  #
  # @return [Integer]
  def bytes_per_vector
    return @bytes_per_vector if @bytes_per_vector

    with_overhead = @dimensions * BYTES_PER_FLOAT * METADATA_OVERHEAD
    @bytes_per_vector = with_overhead.ceil
  end

  # Total storage in bytes for a given number of chunks.
  #
  # @param chunks [Integer] Total number of chunks (units × chunk_multiplier)
  # @return [Integer]
  def storage_bytes(chunks:)
    per_vector = bytes_per_vector
    chunks * per_vector
  end

  # Total storage in megabytes (1 MB = 1024 × 1024 bytes) for a given number
  # of chunks.
  #
  # @param chunks [Integer] Total number of chunks
  # @return [Float] Storage in MB, rounded to 2 decimal places
  def storage_mb(chunks:)
    bytes_per_mb = 1024 * 1024
    total_bytes = storage_bytes(chunks: chunks)
    (total_bytes.to_f / bytes_per_mb).round(2)
  end
end
51
+ end
52
+ end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'cost_model/provider_pricing'
4
+ require_relative 'cost_model/embedding_cost'
5
+ require_relative 'cost_model/storage_cost'
6
+ require_relative 'cost_model/estimator'
7
+
8
module CodebaseIndex
  # Namespace for the cost-modeling classes loaded by this file's
  # require_relative lines: provider pricing, embedding cost, storage cost,
  # and the combined estimator.
  #
  # @example
  #   estimate = CodebaseIndex::CostModel::Estimator.new(
  #     units: 500,
  #     embedding_provider: :openai_small
  #   )
  #   estimate.full_index_cost     # => 0.01125
  #   estimate.monthly_query_cost  # => 0.006
  #
  module CostModel
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Db
5
+ module Migrations
6
# Migration 1: creates the codebase_units table, which stores extracted unit
# metadata (type, identifier, namespace, file path, source, hashes), together
# with lookup indexes on unit_type and file_path.
module CreateUnits
  VERSION = 1

  # Runs the three DDL statements in order. Each uses IF NOT EXISTS, so
  # objects that already exist are left untouched.
  #
  # @param connection [Object] Database connection responding to #execute
  # @return [void]
  def self.up(connection) # rubocop:disable Metrics/MethodLength
    units_table = <<~SQL
      CREATE TABLE IF NOT EXISTS codebase_units (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        unit_type TEXT NOT NULL,
        identifier TEXT NOT NULL,
        namespace TEXT,
        file_path TEXT NOT NULL,
        source_code TEXT,
        source_hash TEXT,
        metadata TEXT,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        updated_at TEXT NOT NULL DEFAULT (datetime('now')),
        UNIQUE(identifier)
      )
    SQL
    type_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_codebase_units_type ON codebase_units(unit_type)
    SQL
    file_path_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_codebase_units_file_path ON codebase_units(file_path)
    SQL

    # Table first, then its indexes; the last execute's result is returned.
    [units_table, type_index, file_path_index].map { |ddl| connection.execute(ddl) }.last
  end
end
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Db
5
+ module Migrations
6
# Migration 2: creates the codebase_edges table, which stores relationships
# between units as (source_id, target_id) pairs referencing codebase_units,
# together with indexes on each end of the edge.
module CreateEdges
  VERSION = 2

  # Runs the three DDL statements in order. Each uses IF NOT EXISTS, so
  # objects that already exist are left untouched.
  #
  # @param connection [Object] Database connection responding to #execute
  # @return [void]
  def self.up(connection)
    edges_table = <<~SQL
      CREATE TABLE IF NOT EXISTS codebase_edges (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source_id INTEGER NOT NULL,
        target_id INTEGER NOT NULL,
        relationship TEXT NOT NULL,
        via TEXT,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        FOREIGN KEY (source_id) REFERENCES codebase_units(id),
        FOREIGN KEY (target_id) REFERENCES codebase_units(id)
      )
    SQL
    source_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_codebase_edges_source ON codebase_edges(source_id)
    SQL
    target_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_codebase_edges_target ON codebase_edges(target_id)
    SQL

    # Table first, then its indexes; the last execute's result is returned.
    [edges_table, source_index, target_index].map { |ddl| connection.execute(ddl) }.last
  end
end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Db
5
+ module Migrations
6
# Migration 3: creates the codebase_embeddings table for vector embeddings.
# The embedding itself is kept in a TEXT column (JSON array) for database
# portability; pgvector users should use the pgvector generator for native
# vector columns instead.
module CreateEmbeddings
  VERSION = 3

  # Runs the three DDL statements in order. Each uses IF NOT EXISTS, so
  # objects that already exist are left untouched.
  #
  # @param connection [Object] Database connection responding to #execute
  # @return [void]
  def self.up(connection)
    embeddings_table = <<~SQL
      CREATE TABLE IF NOT EXISTS codebase_embeddings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        unit_id INTEGER NOT NULL,
        chunk_type TEXT,
        embedding TEXT NOT NULL,
        content_hash TEXT NOT NULL,
        dimensions INTEGER NOT NULL,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        FOREIGN KEY (unit_id) REFERENCES codebase_units(id)
      )
    SQL
    unit_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_codebase_embeddings_unit ON codebase_embeddings(unit_id)
    SQL
    hash_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_codebase_embeddings_hash ON codebase_embeddings(content_hash)
    SQL

    # Table first, then its indexes; the last execute's result is returned.
    [embeddings_table, unit_index, hash_index].map { |ddl| connection.execute(ddl) }.last
  end
end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Db
5
+ module Migrations
6
# Migration 4: creates the codebase_snapshots table for temporal index
# tracking.
#
# Each row represents one extraction run anchored to a git commit SHA
# (unique per table) and stores aggregate stats plus added/modified/deleted
# unit counts relative to the previous snapshot.
module CreateSnapshots
  VERSION = 4

  # Runs the three DDL statements in order. Each uses IF NOT EXISTS, so
  # objects that already exist are left untouched.
  #
  # @param connection [Object] Database connection responding to #execute
  # @return [void]
  def self.up(connection) # rubocop:disable Metrics/MethodLength
    snapshots_table = <<~SQL
      CREATE TABLE IF NOT EXISTS codebase_snapshots (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        git_sha TEXT NOT NULL,
        git_branch TEXT,
        extracted_at TEXT NOT NULL,
        rails_version TEXT,
        ruby_version TEXT,
        total_units INTEGER NOT NULL DEFAULT 0,
        unit_counts TEXT,
        gemfile_lock_sha TEXT,
        schema_sha TEXT,
        units_added INTEGER DEFAULT 0,
        units_modified INTEGER DEFAULT 0,
        units_deleted INTEGER DEFAULT 0,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        UNIQUE(git_sha)
      )
    SQL
    extracted_at_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_snapshots_extracted_at ON codebase_snapshots(extracted_at)
    SQL
    branch_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_snapshots_branch ON codebase_snapshots(git_branch)
    SQL

    # Table first, then its indexes; the last execute's result is returned.
    [snapshots_table, extracted_at_index, branch_index].map { |ddl| connection.execute(ddl) }.last
  end
end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Db
5
+ module Migrations
6
# Migration 5: creates the codebase_snapshot_units table for per-unit
# temporal tracking.
#
# Each row links a unit (by identifier) to a snapshot and stores content
# hashes (source, metadata, dependencies) so diffs can be computed without
# duplicating full source code. A unit appears at most once per snapshot.
module CreateSnapshotUnits
  VERSION = 5

  # Runs the three DDL statements in order. Each uses IF NOT EXISTS, so
  # objects that already exist are left untouched.
  #
  # @param connection [Object] Database connection responding to #execute
  # @return [void]
  def self.up(connection)
    snapshot_units_table = <<~SQL
      CREATE TABLE IF NOT EXISTS codebase_snapshot_units (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        snapshot_id INTEGER NOT NULL,
        identifier TEXT NOT NULL,
        unit_type TEXT NOT NULL,
        source_hash TEXT,
        metadata_hash TEXT,
        dependencies_hash TEXT,
        created_at TEXT NOT NULL DEFAULT (datetime('now')),
        FOREIGN KEY (snapshot_id) REFERENCES codebase_snapshots(id),
        UNIQUE(snapshot_id, identifier)
      )
    SQL
    identifier_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_snapshot_units_identifier ON codebase_snapshot_units(identifier)
    SQL
    snapshot_index = <<~SQL
      CREATE INDEX IF NOT EXISTS idx_snapshot_units_snapshot ON codebase_snapshot_units(snapshot_id)
    SQL

    # Table first, then its indexes; the last execute's result is returned.
    [snapshot_units_table, identifier_index, snapshot_index].map { |ddl| connection.execute(ddl) }.last
  end
end
38
+ end
39
+ end
40
+ end