codebase_index 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +29 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +481 -0
- data/exe/codebase-console-mcp +22 -0
- data/exe/codebase-index-mcp +61 -0
- data/exe/codebase-index-mcp-http +64 -0
- data/exe/codebase-index-mcp-start +58 -0
- data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
- data/lib/codebase_index/ast/method_extractor.rb +76 -0
- data/lib/codebase_index/ast/node.rb +88 -0
- data/lib/codebase_index/ast/parser.rb +653 -0
- data/lib/codebase_index/ast.rb +6 -0
- data/lib/codebase_index/builder.rb +137 -0
- data/lib/codebase_index/chunking/chunk.rb +84 -0
- data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
- data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
- data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
- data/lib/codebase_index/console/audit_logger.rb +75 -0
- data/lib/codebase_index/console/bridge.rb +170 -0
- data/lib/codebase_index/console/confirmation.rb +90 -0
- data/lib/codebase_index/console/connection_manager.rb +173 -0
- data/lib/codebase_index/console/console_response_renderer.rb +78 -0
- data/lib/codebase_index/console/model_validator.rb +81 -0
- data/lib/codebase_index/console/safe_context.rb +82 -0
- data/lib/codebase_index/console/server.rb +557 -0
- data/lib/codebase_index/console/sql_validator.rb +172 -0
- data/lib/codebase_index/console/tools/tier1.rb +118 -0
- data/lib/codebase_index/console/tools/tier2.rb +117 -0
- data/lib/codebase_index/console/tools/tier3.rb +110 -0
- data/lib/codebase_index/console/tools/tier4.rb +79 -0
- data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
- data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
- data/lib/codebase_index/cost_model/estimator.rb +128 -0
- data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
- data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
- data/lib/codebase_index/cost_model.rb +22 -0
- data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
- data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
- data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/codebase_index/db/migrator.rb +71 -0
- data/lib/codebase_index/db/schema_version.rb +73 -0
- data/lib/codebase_index/dependency_graph.rb +227 -0
- data/lib/codebase_index/embedding/indexer.rb +130 -0
- data/lib/codebase_index/embedding/openai.rb +105 -0
- data/lib/codebase_index/embedding/provider.rb +135 -0
- data/lib/codebase_index/embedding/text_preparer.rb +112 -0
- data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
- data/lib/codebase_index/evaluation/evaluator.rb +146 -0
- data/lib/codebase_index/evaluation/metrics.rb +79 -0
- data/lib/codebase_index/evaluation/query_set.rb +148 -0
- data/lib/codebase_index/evaluation/report_generator.rb +90 -0
- data/lib/codebase_index/extracted_unit.rb +145 -0
- data/lib/codebase_index/extractor.rb +956 -0
- data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
- data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
- data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
- data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
- data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
- data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
- data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
- data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
- data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
- data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
- data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
- data/lib/codebase_index/extractors/event_extractor.rb +211 -0
- data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
- data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
- data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
- data/lib/codebase_index/extractors/job_extractor.rb +369 -0
- data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
- data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
- data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
- data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
- data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
- data/lib/codebase_index/extractors/model_extractor.rb +960 -0
- data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
- data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
- data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
- data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
- data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
- data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
- data/lib/codebase_index/extractors/route_extractor.rb +181 -0
- data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
- data/lib/codebase_index/extractors/service_extractor.rb +254 -0
- data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
- data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
- data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
- data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
- data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
- data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
- data/lib/codebase_index/feedback/gap_detector.rb +89 -0
- data/lib/codebase_index/feedback/store.rb +119 -0
- data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
- data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/codebase_index/flow_assembler.rb +290 -0
- data/lib/codebase_index/flow_document.rb +191 -0
- data/lib/codebase_index/flow_precomputer.rb +102 -0
- data/lib/codebase_index/formatting/base.rb +40 -0
- data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
- data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
- data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
- data/lib/codebase_index/formatting/human_adapter.rb +78 -0
- data/lib/codebase_index/graph_analyzer.rb +374 -0
- data/lib/codebase_index/mcp/index_reader.rb +394 -0
- data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
- data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
- data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/codebase_index/mcp/server.rb +935 -0
- data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
- data/lib/codebase_index/model_name_cache.rb +51 -0
- data/lib/codebase_index/notion/client.rb +217 -0
- data/lib/codebase_index/notion/exporter.rb +219 -0
- data/lib/codebase_index/notion/mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
- data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
- data/lib/codebase_index/notion/rate_limiter.rb +68 -0
- data/lib/codebase_index/observability/health_check.rb +81 -0
- data/lib/codebase_index/observability/instrumentation.rb +34 -0
- data/lib/codebase_index/observability/structured_logger.rb +75 -0
- data/lib/codebase_index/operator/error_escalator.rb +81 -0
- data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
- data/lib/codebase_index/operator/status_reporter.rb +80 -0
- data/lib/codebase_index/railtie.rb +26 -0
- data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
- data/lib/codebase_index/resilience/index_validator.rb +185 -0
- data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
- data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
- data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
- data/lib/codebase_index/retrieval/ranker.rb +273 -0
- data/lib/codebase_index/retrieval/search_executor.rb +327 -0
- data/lib/codebase_index/retriever.rb +160 -0
- data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
- data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
- data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
- data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
- data/lib/codebase_index/ruby_analyzer.rb +87 -0
- data/lib/codebase_index/session_tracer/file_store.rb +111 -0
- data/lib/codebase_index/session_tracer/middleware.rb +143 -0
- data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
- data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
- data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
- data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
- data/lib/codebase_index/session_tracer/store.rb +67 -0
- data/lib/codebase_index/storage/graph_store.rb +120 -0
- data/lib/codebase_index/storage/metadata_store.rb +169 -0
- data/lib/codebase_index/storage/pgvector.rb +163 -0
- data/lib/codebase_index/storage/qdrant.rb +172 -0
- data/lib/codebase_index/storage/vector_store.rb +156 -0
- data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
- data/lib/codebase_index/version.rb +5 -0
- data/lib/codebase_index.rb +223 -0
- data/lib/generators/codebase_index/install_generator.rb +32 -0
- data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
- data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
- data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
- data/lib/tasks/codebase_index.rake +583 -0
- data/lib/tasks/codebase_index_evaluation.rake +115 -0
- metadata +252 -0
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'fileutils'
require 'json'
require 'time'
|
|
5
|
+
|
|
6
|
+
module CodebaseIndex
|
|
7
|
+
module Coordination
|
|
8
|
+
# Raised by PipelineLock#with_lock when the lock cannot be acquired.
class LockError < CodebaseIndex::Error; end
|
|
9
|
+
|
|
10
|
+
# File-based lock for preventing concurrent pipeline operations.
#
# Creates a lock file containing the owning PID, a timestamp and the lock
# name. A lock file whose mtime is older than +stale_timeout+ seconds is
# treated as abandoned (e.g. left behind by a crashed process) and is
# replaced on the next {#acquire}.
#
# @example
#   lock = PipelineLock.new(lock_dir: '/tmp', name: 'extraction')
#   lock.with_lock do
#     # extraction runs here
#   end
#
class PipelineLock
  DEFAULT_STALE_TIMEOUT = 3600 # 1 hour

  # @param lock_dir [String] Directory for lock files (created on acquire if missing)
  # @param name [String] Lock name (used as filename prefix)
  # @param stale_timeout [Integer] Seconds after which a lock is considered stale
  def initialize(lock_dir:, name:, stale_timeout: DEFAULT_STALE_TIMEOUT)
    @lock_dir = lock_dir
    @name = name
    @stale_timeout = stale_timeout
    @lock_path = File.join(lock_dir, "#{name}.lock")
    @held = false
  end

  # Attempt to acquire the lock.
  #
  # NOTE: the stale check/removal is a separate step from the atomic
  # creation below, so two processes racing over the same stale file can,
  # in a narrow window, remove each other's freshly created lock.
  #
  # @return [Boolean] true if lock acquired, false if already held
  def acquire
    FileUtils.mkdir_p(@lock_dir)

    if File.exist?(@lock_path)
      return false unless stale?

      # Remove stale lock
      FileUtils.rm_f(@lock_path)
    end

    # Build the payload before creating the file: if this raises, no
    # ownerless lock file is left behind to block other processes.
    payload = lock_content

    # Atomic lock creation: File::EXCL ensures this fails if file already exists
    File.open(@lock_path, File::WRONLY | File::CREAT | File::EXCL) do |f|
      begin
        f.write(payload)
      rescue StandardError
        # We created the file but could not finish writing it; drop it so
        # the lock is not stuck until the stale timeout expires.
        FileUtils.rm_f(@lock_path)
        raise
      end
    end
    @held = true
    true
  rescue Errno::EEXIST
    # Another process created the lock between our check and our open.
    false
  end

  # Release the lock. Only removes the file when this instance acquired it.
  #
  # @return [void]
  def release
    FileUtils.rm_f(@lock_path) if @held
    @held = false
  end

  # Execute a block while holding the lock. The lock is released when the
  # block finishes, whether it returns normally or raises.
  #
  # @yield Block to execute
  # @return [Object] Return value of the block
  # @raise [ArgumentError] if no block is given (checked before locking)
  # @raise [LockError] if lock cannot be acquired
  def with_lock(&block)
    raise ArgumentError, 'with_lock requires a block' unless block
    raise LockError, "Cannot acquire lock '#{@name}' — another process is running" unless acquire

    begin
      block.call
    ensure
      release
    end
  end

  # Whether the lock is currently held by this instance.
  #
  # @return [Boolean]
  def locked?
    @held && File.exist?(@lock_path)
  end

  private

  # Check if the existing lock file is stale, i.e. its mtime is more than
  # +stale_timeout+ seconds in the past.
  #
  # @return [Boolean]
  def stale?
    return false unless File.exist?(@lock_path)

    age = Time.now - File.mtime(@lock_path)
    age > @stale_timeout
  rescue Errno::ENOENT
    # The file vanished between the existence check and the mtime read.
    true
  end

  # @return [String] Lock file content (JSON with PID and timestamp)
  def lock_content
    JSON.generate(pid: Process.pid, locked_at: Time.now.iso8601, name: @name)
  end
end
|
|
108
|
+
end
|
|
109
|
+
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module CostModel
|
|
5
|
+
# Calculates embedding costs for full-index, incremental, and query-time
|
|
6
|
+
# scenarios using the token-based pricing from {ProviderPricing}.
|
|
7
|
+
#
|
|
8
|
+
# The cost model uses a constant of 450 tokens per chunk, derived from the
|
|
9
|
+
# BACKEND_MATRIX.md tables (e.g. 500 units × 2.5 chunks = 1250 chunks × 450 = 562K tokens).
|
|
10
|
+
#
|
|
11
|
+
# @example
|
|
12
|
+
# calc = EmbeddingCost.new(provider: :openai_small)
|
|
13
|
+
# calc.full_index_cost(units: 500, chunk_multiplier: 2.5) # => 0.01125
|
|
14
|
+
#
|
|
15
|
+
class EmbeddingCost
  # Average tokens per chunk after hierarchical chunking with context prefix.
  TOKENS_PER_CHUNK = 450

  # Average tokens per retrieval query.
  TOKENS_PER_QUERY = 100

  # Looks up the per-million-token price once, at construction time.
  #
  # @param provider [Symbol] Embedding provider key from {ProviderPricing}
  def initialize(provider:)
    @cost_per_million = ProviderPricing.cost_per_million(provider)
  end

  # Price of embedding every chunk of the codebase index.
  #
  # @param units [Integer] Number of extracted units
  # @param chunk_multiplier [Float] Average chunks per unit (default 2.5)
  # @return [Float] Cost in USD
  def full_index_cost(units:, chunk_multiplier: 2.5)
    token_cost(total_tokens(units, chunk_multiplier))
  end

  # Price of re-embedding the units touched by one merge.
  #
  # @param changed_units [Integer] Number of units changed (default 5)
  # @param chunk_multiplier [Float] Average chunks per unit (default 2.5)
  # @return [Float] Cost in USD
  def incremental_cost(changed_units: 5, chunk_multiplier: 2.5)
    token_cost(total_tokens(changed_units, chunk_multiplier))
  end

  # Price of embedding retrieval queries over a 30-day month.
  #
  # @param daily_queries [Integer] Number of queries per day
  # @return [Float] Cost in USD per month
  def monthly_query_cost(daily_queries:)
    token_cost(daily_queries * 30 * TOKENS_PER_QUERY)
  end

  # Price of a year's worth of incremental re-indexing.
  #
  # @param merges_per_year [Integer] Number of merges per year (default 2400)
  # @param changed_units_per_merge [Integer] Units changed per merge (default 5)
  # @param chunk_multiplier [Float] Average chunks per unit (default 2.5)
  # @return [Float] Cost in USD per year
  def yearly_incremental_cost(merges_per_year: 2400, changed_units_per_merge: 5, chunk_multiplier: 2.5)
    per_merge = total_tokens(changed_units_per_merge, chunk_multiplier)
    token_cost(per_merge * merges_per_year)
  end

  # Embedding tokens needed for +units+ units: the chunk count is rounded
  # up to a whole number before multiplying by {TOKENS_PER_CHUNK}.
  #
  # @param units [Integer] Number of units
  # @param chunk_multiplier [Float] Chunks per unit
  # @return [Integer] Total embedding tokens
  def total_tokens(units, chunk_multiplier)
    (units * chunk_multiplier).ceil * TOKENS_PER_CHUNK
  end

  private

  # Translate a token count into dollars using the provider's price.
  #
  # @param tokens [Numeric] Number of tokens
  # @return [Float] Cost in USD
  def token_cost(tokens)
    millions = tokens.to_f / 1_000_000
    millions * @cost_per_million
  end
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module CostModel
|
|
5
|
+
# Unified cost estimator that combines embedding, storage, and query costs
# into a single breakdown for a given configuration.
#
# @example
#   estimate = Estimator.new(
#     units: 500,
#     chunk_multiplier: 2.5,
#     embedding_provider: :openai_small,
#     dimensions: 1536,
#     daily_queries: 100
#   )
#   estimate.full_index_cost     # => 0.01125
#   estimate.monthly_query_cost  # => 0.006
#   estimate.storage_bytes       # => 9_985_000
#   estimate.to_h                # => { full_index_cost: ..., ... }
#
class Estimator
  # @return [Integer] Number of extracted units
  attr_reader :units

  # @return [Float] Average chunks per unit
  attr_reader :chunk_multiplier

  # @return [Symbol] Embedding provider key
  attr_reader :embedding_provider

  # @return [Integer] Embedding vector dimensions
  attr_reader :dimensions

  # @return [Integer] Number of retrieval queries per day
  attr_reader :daily_queries

  # @param units [Integer] Number of extracted units
  # @param chunk_multiplier [Float] Average chunks per unit (default 2.5)
  # @param embedding_provider [Symbol] Provider key from {ProviderPricing}
  # @param dimensions [Integer, nil] Vector dimensions (defaults to provider default)
  # @param daily_queries [Integer] Retrieval queries per day (default 100)
  def initialize(units:, embedding_provider:, chunk_multiplier: 2.5, dimensions: nil, daily_queries: 100)
    @units = units
    @chunk_multiplier = chunk_multiplier
    @embedding_provider = embedding_provider
    @dimensions = dimensions || ProviderPricing.default_dimensions(embedding_provider)
    @daily_queries = daily_queries

    @embedding_cost = EmbeddingCost.new(provider: embedding_provider)
    @storage_cost = StorageCost.new(dimensions: @dimensions)
  end

  # Cost to embed the full codebase index.
  #
  # @return [Float] Cost in USD
  def full_index_cost
    @embedding_cost.full_index_cost(units: units, chunk_multiplier: chunk_multiplier)
  end

  # Cost to re-embed a single merge (default 5 changed units).
  #
  # @param changed_units [Integer] Units changed per merge (default 5)
  # @return [Float] Cost in USD
  def incremental_per_merge_cost(changed_units: 5)
    @embedding_cost.incremental_cost(changed_units: changed_units, chunk_multiplier: chunk_multiplier)
  end

  # Monthly cost for query-time embedding.
  #
  # @return [Float] Cost in USD per month
  def monthly_query_cost
    @embedding_cost.monthly_query_cost(daily_queries: daily_queries)
  end

  # Yearly embedding cost from incremental re-indexing.
  #
  # @param merges_per_year [Integer] Merges per year (default 2400)
  # @param changed_units_per_merge [Integer] Units changed per merge (default 5),
  #   mirroring the +changed_units+ option of {#incremental_per_merge_cost}
  # @return [Float] Cost in USD per year
  def yearly_incremental_cost(merges_per_year: 2400, changed_units_per_merge: 5)
    @embedding_cost.yearly_incremental_cost(
      merges_per_year: merges_per_year,
      changed_units_per_merge: changed_units_per_merge,
      chunk_multiplier: chunk_multiplier
    )
  end

  # Total number of chunks for the codebase (units × chunk_multiplier,
  # rounded up). Memoized after the first call.
  #
  # @return [Integer]
  def total_chunks
    @total_chunks ||= (units * chunk_multiplier).ceil
  end

  # Total storage in bytes for vector data.
  #
  # @return [Integer]
  def storage_bytes
    @storage_cost.storage_bytes(chunks: total_chunks)
  end

  # Total storage in megabytes for vector data.
  #
  # @return [Float]
  def storage_mb
    @storage_cost.storage_mb(chunks: total_chunks)
  end

  # Full cost breakdown as a Hash. Per-merge and yearly figures use the
  # default merge assumptions of their respective methods.
  #
  # @return [Hash{Symbol => Numeric}]
  def to_h
    {
      full_index_cost: full_index_cost,
      incremental_per_merge_cost: incremental_per_merge_cost,
      monthly_query_cost: monthly_query_cost,
      yearly_incremental_cost: yearly_incremental_cost,
      storage_bytes: storage_bytes,
      storage_mb: storage_mb,
      total_chunks: total_chunks,
      units: units,
      chunk_multiplier: chunk_multiplier,
      embedding_provider: embedding_provider,
      dimensions: dimensions,
      daily_queries: daily_queries
    }
  end
end
|
|
127
|
+
end
|
|
128
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module CostModel
|
|
5
|
+
# Frozen pricing data for embedding providers.
|
|
6
|
+
#
|
|
7
|
+
# Costs are expressed as dollars per 1 million tokens, sourced from
|
|
8
|
+
# BACKEND_MATRIX.md. Each provider is identified by a Symbol key.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# ProviderPricing.cost_per_million(:openai_small) # => 0.02
|
|
12
|
+
# ProviderPricing.providers # => [:openai_small, ...]
|
|
13
|
+
#
|
|
14
|
+
module ProviderPricing
  # USD price of 1 million tokens, keyed by provider.
  #
  # @return [Hash{Symbol => Float}]
  COSTS_PER_MILLION_TOKENS = {
    openai_small: 0.02,
    openai_large: 0.13,
    voyage_code3: 0.06,
    ollama: 0.00
  }.freeze

  # Embedding vector size used when the caller does not specify one.
  #
  # @return [Hash{Symbol => Integer}]
  DEFAULT_DIMENSIONS = {
    openai_small: 1536,
    openai_large: 3072,
    voyage_code3: 1024,
    ollama: 768
  }.freeze

  class << self
    # Price per 1M tokens for +provider+.
    #
    # @param provider [Symbol] Provider key (e.g. :openai_small)
    # @return [Float] Cost in USD per 1M tokens
    # @raise [ArgumentError] if provider is unknown
    def cost_per_million(provider)
      lookup(COSTS_PER_MILLION_TOKENS, provider)
    end

    # Default embedding dimensions for +provider+.
    #
    # @param provider [Symbol] Provider key
    # @return [Integer] Default embedding dimensions
    # @raise [ArgumentError] if provider is unknown
    def default_dimensions(provider)
      lookup(DEFAULT_DIMENSIONS, provider)
    end

    # Every provider key that has pricing data.
    #
    # @return [Array<Symbol>]
    def providers
      COSTS_PER_MILLION_TOKENS.keys
    end

    private

    # Read +provider+ from +table+, raising a descriptive error that lists
    # the valid keys when it is absent.
    def lookup(table, provider)
      return table[provider] if table.key?(provider)

      raise ArgumentError, "Unknown embedding provider: #{provider.inspect}. " \
                           "Valid providers: #{providers.join(', ')}"
    end
  end
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module CostModel
|
|
5
|
+
# Calculates vector storage requirements based on embedding dimensions
|
|
6
|
+
# and chunk count.
|
|
7
|
+
#
|
|
8
|
+
# Bytes per vector = dimensions × 4 (float32), with a 1.3× metadata
|
|
9
|
+
# overhead factor applied per BACKEND_MATRIX.md.
|
|
10
|
+
#
|
|
11
|
+
# @example
|
|
12
|
+
# calc = StorageCost.new(dimensions: 1536)
|
|
13
|
+
# calc.storage_bytes(chunks: 1250) # => 9_985_000
|
|
14
|
+
# calc.storage_mb(chunks: 1250) # => 9.52
|
|
15
|
+
#
|
|
16
|
+
class StorageCost
  # Size of one float32 component, in bytes.
  BYTES_PER_FLOAT = 4

  # Multiplier covering metadata stored alongside each vector
  # (JSONB payload, indexes, etc.).
  METADATA_OVERHEAD = 1.3

  # @param dimensions [Integer] Embedding vector dimensions
  def initialize(dimensions:)
    @dimensions = dimensions
  end

  # Size of a single stored vector with the metadata overhead applied,
  # rounded up to a whole byte. Computed once and cached.
  #
  # @return [Integer]
  def bytes_per_vector
    return @bytes_per_vector if @bytes_per_vector

    with_overhead = @dimensions * BYTES_PER_FLOAT * METADATA_OVERHEAD
    @bytes_per_vector = with_overhead.ceil
  end

  # Bytes needed to store +chunks+ vectors.
  #
  # @param chunks [Integer] Total number of chunks (units × chunk_multiplier)
  # @return [Integer]
  def storage_bytes(chunks:)
    chunks * bytes_per_vector
  end

  # Same as {#storage_bytes}, expressed in megabytes (1 MB = 1024 × 1024 bytes).
  #
  # @param chunks [Integer] Total number of chunks
  # @return [Float] Storage in MB, rounded to 2 decimal places
  def storage_mb(chunks:)
    bytes = storage_bytes(chunks: chunks)
    megabytes = bytes.to_f / (1024 * 1024)
    megabytes.round(2)
  end
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'cost_model/provider_pricing'
|
|
4
|
+
require_relative 'cost_model/embedding_cost'
|
|
5
|
+
require_relative 'cost_model/storage_cost'
|
|
6
|
+
require_relative 'cost_model/estimator'
|
|
7
|
+
|
|
8
|
+
module CodebaseIndex
|
|
9
|
+
# Cost modeling for embedding, storage, and query costs across different
|
|
10
|
+
# backend configurations. Based on the cost analysis in BACKEND_MATRIX.md.
|
|
11
|
+
#
|
|
12
|
+
# @example
|
|
13
|
+
# estimate = CodebaseIndex::CostModel::Estimator.new(
|
|
14
|
+
# units: 500,
|
|
15
|
+
# embedding_provider: :openai_small
|
|
16
|
+
# )
|
|
17
|
+
# estimate.full_index_cost # => 0.01125
|
|
18
|
+
# estimate.monthly_query_cost # => 0.006
|
|
19
|
+
#
|
|
20
|
+
module CostModel; end
|
|
22
|
+
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Db
|
|
5
|
+
module Migrations
|
|
6
|
+
# Creates the codebase_units table for storing extracted unit metadata.
|
|
7
|
+
module CreateUnits
  VERSION = 1

  # DDL for the codebase_units table.
  # NOTE(review): AUTOINCREMENT and datetime('now') look SQLite-specific —
  # confirm against the databases this migration is expected to run on.
  TABLE_SQL = <<~SQL
    CREATE TABLE IF NOT EXISTS codebase_units (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    unit_type TEXT NOT NULL,
    identifier TEXT NOT NULL,
    namespace TEXT,
    file_path TEXT NOT NULL,
    source_code TEXT,
    source_hash TEXT,
    metadata TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    UNIQUE(identifier)
    )
  SQL

  # DDL for the index on codebase_units(unit_type).
  TYPE_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_codebase_units_type ON codebase_units(unit_type)
  SQL

  # DDL for the index on codebase_units(file_path).
  FILE_PATH_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_codebase_units_file_path ON codebase_units(file_path)
  SQL

  private_constant :TABLE_SQL, :TYPE_INDEX_SQL, :FILE_PATH_INDEX_SQL

  # Runs the table DDL and then both index DDLs, in that order, through
  # +connection.execute+. Every statement uses IF NOT EXISTS.
  #
  # @param connection [Object] Database connection responding to +execute+
  # @return [void]
  def self.up(connection)
    connection.execute(TABLE_SQL)
    connection.execute(TYPE_INDEX_SQL)
    connection.execute(FILE_PATH_INDEX_SQL)
  end
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Db
|
|
5
|
+
module Migrations
|
|
6
|
+
# Creates the codebase_edges table for storing unit relationships.
|
|
7
|
+
module CreateEdges
  VERSION = 2

  # DDL for the codebase_edges table; both endpoints reference codebase_units(id).
  TABLE_SQL = <<~SQL
    CREATE TABLE IF NOT EXISTS codebase_edges (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_id INTEGER NOT NULL,
    target_id INTEGER NOT NULL,
    relationship TEXT NOT NULL,
    via TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    FOREIGN KEY (source_id) REFERENCES codebase_units(id),
    FOREIGN KEY (target_id) REFERENCES codebase_units(id)
    )
  SQL

  # DDL for the index on codebase_edges(source_id).
  SOURCE_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_codebase_edges_source ON codebase_edges(source_id)
  SQL

  # DDL for the index on codebase_edges(target_id).
  TARGET_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_codebase_edges_target ON codebase_edges(target_id)
  SQL

  private_constant :TABLE_SQL, :SOURCE_INDEX_SQL, :TARGET_INDEX_SQL

  # Runs the table DDL and then both index DDLs, in that order, through
  # +connection.execute+. Every statement uses IF NOT EXISTS.
  #
  # @param connection [Object] Database connection responding to +execute+
  # @return [void]
  def self.up(connection)
    connection.execute(TABLE_SQL)
    connection.execute(SOURCE_INDEX_SQL)
    connection.execute(TARGET_INDEX_SQL)
  end
end
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Db
|
|
5
|
+
module Migrations
|
|
6
|
+
# Creates the codebase_embeddings table for storing vector embeddings.
|
|
7
|
+
# Uses TEXT for embedding storage (JSON array) for database portability.
|
|
8
|
+
# Pgvector users should use the pgvector generator for native vector columns.
|
|
9
|
+
module CreateEmbeddings
  VERSION = 3

  # DDL for the codebase_embeddings table; the embedding itself is a TEXT column.
  TABLE_SQL = <<~SQL
    CREATE TABLE IF NOT EXISTS codebase_embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    unit_id INTEGER NOT NULL,
    chunk_type TEXT,
    embedding TEXT NOT NULL,
    content_hash TEXT NOT NULL,
    dimensions INTEGER NOT NULL,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    FOREIGN KEY (unit_id) REFERENCES codebase_units(id)
    )
  SQL

  # DDL for the index on codebase_embeddings(unit_id).
  UNIT_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_codebase_embeddings_unit ON codebase_embeddings(unit_id)
  SQL

  # DDL for the index on codebase_embeddings(content_hash).
  HASH_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_codebase_embeddings_hash ON codebase_embeddings(content_hash)
  SQL

  private_constant :TABLE_SQL, :UNIT_INDEX_SQL, :HASH_INDEX_SQL

  # Runs the table DDL and then both index DDLs, in that order, through
  # +connection.execute+. Every statement uses IF NOT EXISTS.
  #
  # @param connection [Object] Database connection responding to +execute+
  # @return [void]
  def self.up(connection)
    connection.execute(TABLE_SQL)
    connection.execute(UNIT_INDEX_SQL)
    connection.execute(HASH_INDEX_SQL)
  end
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Db
|
|
5
|
+
module Migrations
|
|
6
|
+
# Creates the codebase_snapshots table for temporal index tracking.
|
|
7
|
+
#
|
|
8
|
+
# Each row represents one extraction run anchored to a git commit SHA.
|
|
9
|
+
# Stores aggregate stats and diff counts vs. the previous snapshot.
|
|
10
|
+
module CreateSnapshots
  VERSION = 4

  # DDL for the codebase_snapshots table; git_sha is unique per row.
  TABLE_SQL = <<~SQL
    CREATE TABLE IF NOT EXISTS codebase_snapshots (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    git_sha TEXT NOT NULL,
    git_branch TEXT,
    extracted_at TEXT NOT NULL,
    rails_version TEXT,
    ruby_version TEXT,
    total_units INTEGER NOT NULL DEFAULT 0,
    unit_counts TEXT,
    gemfile_lock_sha TEXT,
    schema_sha TEXT,
    units_added INTEGER DEFAULT 0,
    units_modified INTEGER DEFAULT 0,
    units_deleted INTEGER DEFAULT 0,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    UNIQUE(git_sha)
    )
  SQL

  # DDL for the index on codebase_snapshots(extracted_at).
  EXTRACTED_AT_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_snapshots_extracted_at ON codebase_snapshots(extracted_at)
  SQL

  # DDL for the index on codebase_snapshots(git_branch).
  BRANCH_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_snapshots_branch ON codebase_snapshots(git_branch)
  SQL

  private_constant :TABLE_SQL, :EXTRACTED_AT_INDEX_SQL, :BRANCH_INDEX_SQL

  # Runs the table DDL and then both index DDLs, in that order, through
  # +connection.execute+. Every statement uses IF NOT EXISTS.
  #
  # @param connection [Object] Database connection responding to +execute+
  # @return [void]
  def self.up(connection)
    connection.execute(TABLE_SQL)
    connection.execute(EXTRACTED_AT_INDEX_SQL)
    connection.execute(BRANCH_INDEX_SQL)
  end
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module CodebaseIndex
|
|
4
|
+
module Db
|
|
5
|
+
module Migrations
|
|
6
|
+
# Creates the codebase_snapshot_units table for per-unit temporal tracking.
|
|
7
|
+
#
|
|
8
|
+
# Each row links a unit (by identifier) to a snapshot, storing content hashes
|
|
9
|
+
# for efficient diff computation without duplicating full source code.
|
|
10
|
+
module CreateSnapshotUnits
  VERSION = 5

  # DDL for the codebase_snapshot_units table; (snapshot_id, identifier) is unique.
  TABLE_SQL = <<~SQL
    CREATE TABLE IF NOT EXISTS codebase_snapshot_units (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    snapshot_id INTEGER NOT NULL,
    identifier TEXT NOT NULL,
    unit_type TEXT NOT NULL,
    source_hash TEXT,
    metadata_hash TEXT,
    dependencies_hash TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    FOREIGN KEY (snapshot_id) REFERENCES codebase_snapshots(id),
    UNIQUE(snapshot_id, identifier)
    )
  SQL

  # DDL for the index on codebase_snapshot_units(identifier).
  IDENTIFIER_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_snapshot_units_identifier ON codebase_snapshot_units(identifier)
  SQL

  # DDL for the index on codebase_snapshot_units(snapshot_id).
  SNAPSHOT_INDEX_SQL = <<~SQL
    CREATE INDEX IF NOT EXISTS idx_snapshot_units_snapshot ON codebase_snapshot_units(snapshot_id)
  SQL

  private_constant :TABLE_SQL, :IDENTIFIER_INDEX_SQL, :SNAPSHOT_INDEX_SQL

  # Runs the table DDL and then both index DDLs, in that order, through
  # +connection.execute+. Every statement uses IF NOT EXISTS.
  #
  # @param connection [Object] Database connection responding to +execute+
  # @return [void]
  def self.up(connection)
    connection.execute(TABLE_SQL)
    connection.execute(IDENTIFIER_INDEX_SQL)
    connection.execute(SNAPSHOT_INDEX_SQL)
  end
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|