codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'schema_version'
4
+ require_relative 'migrations/001_create_units'
5
+ require_relative 'migrations/002_create_edges'
6
+ require_relative 'migrations/003_create_embeddings'
7
+ require_relative 'migrations/004_create_snapshots'
8
+ require_relative 'migrations/005_create_snapshot_units'
9
+
10
+ module CodebaseIndex
11
+ module Db
12
+ # Runs schema migrations against a database connection.
13
+ #
14
+ # Tracks applied migrations via {SchemaVersion} and only runs pending ones.
15
+ # Migrations are defined as modules in `db/migrations/` with a VERSION
16
+ # constant and a `.up(connection)` class method.
17
+ #
18
+ # @example
19
+ # db = SQLite3::Database.new('codebase_index.db')
20
+ # migrator = Migrator.new(connection: db)
21
+ # migrator.migrate! # => [1, 2, 3, 4, 5] (fresh database; one version per entry in MIGRATIONS, assuming they are numbered 1-5)
22
+ #
23
class Migrator
  # Migration modules in the order they are applied. Each one is used
  # through its VERSION constant and its `.up(connection)` method.
  MIGRATIONS = [
    Migrations::CreateUnits,
    Migrations::CreateEdges,
    Migrations::CreateEmbeddings,
    Migrations::CreateSnapshots,
    Migrations::CreateSnapshotUnits
  ].freeze

  attr_reader :schema_version

  # Build a migrator and make sure the version-tracking table is present.
  #
  # @param connection [Object] Database connection supporting #execute
  def initialize(connection:)
    @connection = connection
    @schema_version = SchemaVersion.new(connection: connection)
    @schema_version.ensure_table!
  end

  # Apply every migration that has not been recorded yet, in MIGRATIONS
  # order. A migration's version is recorded right after its `up` step
  # returns.
  #
  # @return [Array<Integer>] Versions applied by this call (empty when nothing was pending)
  def migrate!
    pending_migrations.map do |migration|
      migration.up(@connection)
      version = migration::VERSION
      @schema_version.record_version(version)
      version
    end
  end

  # Versions that are not yet recorded as applied, in MIGRATIONS order.
  #
  # @return [Array<Integer>]
  def pending_versions
    pending_migrations.map { |migration| migration::VERSION }
  end

  private

  # @return [Array<Module>] Migration modules whose VERSION is not recorded yet
  def pending_migrations
    recorded = @schema_version.applied_versions
    MIGRATIONS.select { |migration| !recorded.include?(migration::VERSION) }
  end
end
70
+ end
71
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Db
5
+ # Tracks which schema migrations have been applied.
6
+ #
7
+ # Uses a simple `codebase_index_schema_migrations` table with a single
8
+ # `version` column. Works with any database connection that supports
9
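+ # `execute` and returns array or hash rows. Written for SQLite3: the SQL below uses SQLite's `INSERT OR IGNORE` and `datetime('now')`, so pg / mysql2 connections need a compatible dialect.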
10
+ #
11
+ # @example
12
+ # db = SQLite3::Database.new('codebase_index.db')
13
+ # sv = SchemaVersion.new(connection: db)
14
+ # sv.ensure_table!
15
+ # sv.current_version # => 0
16
+ # sv.record_version(1)
17
+ # sv.current_version # => 1
18
+ #
19
class SchemaVersion
  TABLE_NAME = 'codebase_index_schema_migrations'

  # @param connection [Object] Database connection supporting #execute
  def initialize(connection:)
    @connection = connection
  end

  # Create the schema migrations table if it does not exist.
  #
  # The table has an integer `version` primary key and an `applied_at`
  # text column that defaults to the insertion time.
  #
  # @return [void]
  def ensure_table!
    @connection.execute(<<~SQL)
      CREATE TABLE IF NOT EXISTS #{TABLE_NAME} (
        version INTEGER PRIMARY KEY NOT NULL,
        applied_at TEXT NOT NULL DEFAULT (datetime('now'))
      )
    SQL
  end

  # List all applied migration version numbers, sorted ascending.
  #
  # Rows may come back as arrays or hashes, and some drivers hand integer
  # columns back as strings; every row is normalized to an Integer so the
  # result can be compared against Integer migration versions.
  #
  # @return [Array<Integer>]
  # @raise [ArgumentError] if a driver returns a non-numeric version string
  def applied_versions
    rows = @connection.execute("SELECT version FROM #{TABLE_NAME} ORDER BY version ASC")
    rows.map { |row| version_from(row) }
  end

  # Record a migration version as applied. The statement is an
  # `INSERT OR IGNORE`, so recording the same version twice is a no-op.
  #
  # @param version [Integer] The migration version number
  # @return [void]
  def record_version(version)
    @connection.execute(
      "INSERT OR IGNORE INTO #{TABLE_NAME} (version) VALUES (?)", [version]
    )
  end

  # Check whether a version has been applied.
  #
  # @param version [Integer]
  # @return [Boolean]
  def applied?(version)
    applied_versions.include?(version)
  end

  # The highest applied version, or 0 if none.
  #
  # @return [Integer]
  def current_version
    applied_versions.last || 0
  end

  private

  # Pull the version out of one result row and coerce it to an Integer.
  #
  # @param row [Array, Hash, Object] A row as returned by the connection
  # @return [Integer, nil] nil only when the row carries no version value
  def version_from(row)
    raw = case row
          when Array then row.first
          when Hash then row.fetch('version') { row[:version] }
          else row['version']
          end

    # Base 10 is explicit so a zero-padded value such as "008" is not read as octal.
    raw.is_a?(String) ? Integer(raw, 10) : raw
  end
end
72
+ end
73
+ end
@@ -0,0 +1,227 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set'
4
+ require 'json'
5
+
6
+ module CodebaseIndex
7
+ # DependencyGraph tracks relationships between code units for:
8
+ # 1. Understanding what depends on what
9
+ # 2. Computing "blast radius" for incremental re-indexing
10
+ # 3. Enabling graph-based retrieval queries
11
+ #
12
+ # The graph is bidirectional - we track both what a unit depends on
13
+ # and what depends on that unit (reverse edges).
14
+ #
15
+ # @example Building and querying the graph
16
+ # graph = DependencyGraph.new
17
+ # graph.register(user_model_unit)
18
+ # graph.register(user_service_unit)
19
+ #
20
+ # # Find everything affected by a change to user.rb
21
+ # affected = graph.affected_by(["app/models/user.rb"])
22
+ #
23
class DependencyGraph
  def initialize
    @nodes = {}      # identifier => { type:, file_path:, namespace: }
    @edges = {}      # identifier => [dependency identifiers]
    @reverse = {}    # identifier => [dependent identifiers]
    @file_map = {}   # file_path => identifier
    @type_index = {} # type => [identifiers]
  end

  # Register a unit in the graph.
  #
  # Registering an identifier that is already present replaces the earlier
  # registration: reverse edges, type-index entries and file-map entries
  # that only the old registration justified are removed first, so
  # {#dependents_of} and {#affected_by} never report relationships that
  # no longer exist.
  #
  # @param unit [ExtractedUnit] The unit to register; must respond to
  #   #identifier, #type, #file_path, #namespace and #dependencies
  #   (hashes with a :target key)
  def register(unit)
    identifier = unit.identifier
    targets = unit.dependencies.map { |dep| dep[:target] }

    forget_stale_entries(identifier, unit, targets)

    @nodes[identifier] = {
      type: unit.type,
      file_path: unit.file_path,
      namespace: unit.namespace
    }
    @edges[identifier] = targets
    @file_map[unit.file_path] = identifier if unit.file_path

    # Type index for filtering
    typed = (@type_index[unit.type] ||= [])
    typed << identifier unless typed.include?(identifier)

    # Reverse edges: target => units that depend on it
    targets.each do |target|
      dependents = (@reverse[target] ||= [])
      dependents << identifier unless dependents.include?(identifier)
    end
  end

  # Find all units affected by changes to given files.
  # Uses BFS over the reverse edges to find transitive dependents.
  #
  # @param changed_files [Array<String>] List of changed file paths
  # @param max_depth [Integer] Maximum traversal depth (nil for unlimited)
  # @return [Array<String>] List of affected unit identifiers, starting with
  #   the units that live in the changed files
  def affected_by(changed_files, max_depth: nil)
    directly_changed = changed_files.filter_map { |path| @file_map[path] }

    affected = Set.new(directly_changed)
    queue = directly_changed.map { |id| [id, 0] } # [identifier, depth]

    until queue.empty?
      current, depth = queue.shift
      # Nodes sitting at the depth limit are kept but not expanded further.
      next if max_depth && depth >= max_depth

      dependents_of(current).each do |dependent|
        # Set#add? returns nil when the element was already present.
        queue.push([dependent, depth + 1]) if affected.add?(dependent)
      end
    end

    affected.to_a
  end

  # Check if a node exists in the graph by exact identifier.
  #
  # @param identifier [String] Unit identifier to check
  # @return [Boolean] true if the node exists
  def node_exists?(identifier)
    @nodes.key?(identifier)
  end

  # Find a node by suffix matching (e.g., "Update" matches "Order::Update").
  #
  # When multiple nodes share the same suffix, the first match wins.
  # Suffix matching requires a "::" separator — bare identifiers (no namespace)
  # are not matched by this method; use {#node_exists?} for exact lookups.
  #
  # @param suffix [String] The suffix to match against
  # @return [String, nil] The first matching identifier, or nil
  def find_node_by_suffix(suffix)
    target_suffix = "::#{suffix}"
    @nodes.keys.find { |id| id.end_with?(target_suffix) }
  end

  # Get direct dependencies of a unit.
  #
  # @param identifier [String] Unit identifier
  # @return [Array<String>] List of dependency identifiers
  def dependencies_of(identifier)
    @edges[identifier] || []
  end

  # Get direct dependents of a unit (what depends on it).
  #
  # @param identifier [String] Unit identifier
  # @return [Array<String>] List of dependent identifiers
  def dependents_of(identifier)
    @reverse[identifier] || []
  end

  # Get all units of a specific type.
  #
  # @param type [Symbol] Unit type (:model, :controller, etc.)
  # @return [Array<String>] List of unit identifiers
  def units_of_type(type)
    @type_index[type] || []
  end

  # Compute PageRank scores for all nodes.
  #
  # Uses the reverse edges (dependents) as the link structure: a node
  # with many dependents gets a higher score. This matches Aider's insight
  # that structural importance correlates with retrieval relevance.
  #
  # @param damping [Float] Damping factor (default: 0.85)
  # @param iterations [Integer] Number of iterations (default: 20)
  # @return [Hash<String, Float>] Identifier => PageRank score
  def pagerank(damping: 0.85, iterations: 20)
    n = @nodes.size
    return {} if n.zero?

    base_score = 1.0 / n
    scores = @nodes.keys.to_h { |id| [id, base_score] }

    iterations.times do
      # Collect rank from dangling nodes (no outgoing edges) and redistribute
      dangling_sum = @nodes.keys.sum do |id|
        @edges[id].nil? || @edges[id].empty? ? scores[id] : 0.0
      end

      new_scores = {}

      @nodes.each_key do |id|
        # Sum contributions from nodes that depend on this one; each
        # dependent splits its score evenly over its outgoing edges.
        incoming = @reverse[id] || []
        rank_sum = incoming.sum do |src|
          out_degree = (@edges[src] || []).size
          out_degree.positive? ? scores[src] / out_degree : 0.0
        end

        new_scores[id] = ((1.0 - damping) / n) + (damping * (rank_sum + (dangling_sum / n)))
      end

      scores = new_scores
    end

    scores
  end

  # Serialize graph for persistence.
  #
  # @return [Hash] Complete graph data plus a :stats summary
  #   (node count, edge count, units per type)
  def to_h
    {
      nodes: @nodes,
      edges: @edges,
      reverse: @reverse,
      file_map: @file_map,
      type_index: @type_index,
      stats: {
        node_count: @nodes.size,
        edge_count: @edges.values.sum(&:size),
        types: @type_index.transform_values(&:size)
      }
    }
  end

  # Load graph from persisted data.
  #
  # After JSON round-trip all keys become strings. This method normalizes
  # them back to the expected types: node values use symbol keys (:type,
  # :file_path, :namespace), and type_index uses symbol keys for types.
  #
  # @param data [Hash] Previously serialized graph data (string or symbol top-level keys)
  # @return [DependencyGraph] Restored graph
  def self.from_h(data)
    graph = new

    raw_nodes = data[:nodes] || data['nodes'] || {}
    graph.instance_variable_set(:@nodes, raw_nodes.transform_values { |v| symbolize_node(v) })

    graph.instance_variable_set(:@edges, data[:edges] || data['edges'] || {})
    graph.instance_variable_set(:@reverse, data[:reverse] || data['reverse'] || {})
    graph.instance_variable_set(:@file_map, data[:file_map] || data['file_map'] || {})

    raw_type_index = data[:type_index] || data['type_index'] || {}
    graph.instance_variable_set(:@type_index, raw_type_index.transform_keys(&:to_sym))

    graph
  end

  # Normalize a node hash to use symbol keys.
  #
  # @param node [Hash] Node data with string or symbol keys
  # @return [Hash] Node data with symbol keys (non-hash input is returned untouched)
  def self.symbolize_node(node)
    return node unless node.is_a?(Hash)

    {
      type: (node[:type] || node['type'])&.to_sym,
      file_path: node[:file_path] || node['file_path'],
      namespace: node[:namespace] || node['namespace']
    }
  end

  private

  # Remove index entries left behind by an earlier registration of
  # +identifier+ that the new registration no longer justifies.
  #
  # @param identifier [String] Identifier being (re-)registered
  # @param unit [ExtractedUnit] The new registration
  # @param targets [Array<String>] Dependency targets of the new registration
  # @return [void]
  def forget_stale_entries(identifier, unit, targets)
    # Reverse edges for targets the unit no longer depends on.
    ((@edges[identifier] || []) - targets).each do |target|
      dependents = @reverse[target]
      next unless dependents

      dependents.delete(identifier)
      @reverse.delete(target) if dependents.empty?
    end

    previous = @nodes[identifier]
    return unless previous.is_a?(Hash)

    forget_stale_type(identifier, previous[:type], unit.type)

    old_path = previous[:file_path]
    stale_path = old_path && old_path != unit.file_path && @file_map[old_path] == identifier
    @file_map.delete(old_path) if stale_path
  end

  # Drop +identifier+ from the type index of its previous type when the
  # type changed.
  #
  # @return [void]
  def forget_stale_type(identifier, old_type, new_type)
    return if old_type == new_type

    typed = @type_index[old_type]
    return unless typed

    typed.delete(identifier)
    @type_index.delete(old_type) if typed.empty?
  end
end
227
+ end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'digest'
5
+
6
+ module CodebaseIndex
7
+ module Embedding
8
+ # Orchestrates the indexing pipeline: reads extracted units, prepares text,
9
+ # generates embeddings, and stores vectors. Supports full and incremental
10
+ # modes with checkpoint-based resumability.
11
class Indexer
  # Name of the file inside output_dir that maps unit identifier => source_hash.
  CHECKPOINT_FILE = 'checkpoint.json'

  # @param provider [#embed_batch] Embedding provider
  # @param text_preparer [#prepare, #prepare_chunks] Turns a unit into embeddable text
  # @param vector_store [#store] Receives (id, vector, metadata)
  # @param output_dir [String] Directory holding the extracted unit JSON files
  # @param batch_size [Integer] Number of units handled per embedding batch
  def initialize(provider:, text_preparer:, vector_store:, output_dir:, batch_size: 32)
    @provider = provider
    @text_preparer = text_preparer
    @vector_store = vector_store
    @output_dir = output_dir
    @batch_size = batch_size
  end

  # Index all extracted units (full mode). Returns stats hash.
  #
  # Starts from an empty checkpoint, so every unit is embedded again.
  #
  # @return [Hash] Stats with :processed, :skipped, :errors counts
  # @raise [CodebaseIndex::Error] if embedding or storing a batch fails
  def index_all
    process_units(load_units, incremental: false)
  end

  # Index only changed units (incremental mode). Returns stats hash.
  #
  # A unit is skipped when its source_hash equals the one stored in the
  # checkpoint. Units without a source_hash are always re-indexed because
  # there is nothing to compare.
  #
  # @return [Hash] Stats with :processed, :skipped, :errors counts
  # @raise [CodebaseIndex::Error] if embedding or storing a batch fails
  def index_incremental
    process_units(load_units, incremental: true)
  end

  private

  # Read every JSON file below output_dir that looks like an extracted unit.
  # The checkpoint file, unparseable files, and JSON documents that are not
  # a hash carrying an 'identifier' are ignored.
  #
  # @return [Array<Hash>] Parsed unit documents
  def load_units
    Dir.glob(File.join(@output_dir, '**', '*.json')).filter_map do |path|
      next if File.basename(path) == CHECKPOINT_FILE

      data = JSON.parse(File.read(path))
      data if unit_document?(data)
    rescue JSON::ParserError
      nil
    end
  end

  # @param data [Object] A parsed JSON document
  # @return [Boolean] true when the document can be treated as a unit
  def unit_document?(data)
    data.is_a?(Hash) && !data['identifier'].nil?
  end

  # Work through the units batch by batch, persisting the checkpoint after
  # each batch so an interrupted run keeps the progress made so far.
  def process_units(units, incremental:)
    checkpoint = incremental ? load_checkpoint : {}
    stats = { processed: 0, skipped: 0, errors: 0 }

    units.each_slice(@batch_size) do |batch|
      process_batch(batch, checkpoint, stats, incremental: incremental)
      save_checkpoint(checkpoint)
    end

    stats
  end

  def process_batch(batch, checkpoint, stats, incremental:)
    to_embed = []

    batch.each do |unit_data|
      if incremental && unchanged?(unit_data, checkpoint)
        stats[:skipped] += 1
      else
        collect_embed_items(unit_data, to_embed)
      end
    end

    embed_and_store(to_embed, checkpoint, stats)
  end

  # A unit counts as unchanged only when it has a source_hash and that hash
  # matches the checkpoint. Without the nil guard a unit lacking a hash
  # would compare nil == nil against a missing checkpoint entry and never
  # be indexed.
  def unchanged?(unit_data, checkpoint)
    source_hash = unit_data['source_hash']
    !source_hash.nil? && checkpoint[unit_data['identifier']] == source_hash
  end

  # Append one embed item per prepared text. Multi-text units get
  # "<identifier>#chunk_<n>" ids; single-text units use the identifier itself.
  def collect_embed_items(unit_data, items)
    texts = prepare_texts(unit_data)
    identifier = unit_data['identifier']
    chunked = texts.length > 1

    texts.each_with_index do |text, idx|
      items << { id: chunked ? "#{identifier}#chunk_#{idx}" : identifier,
                 text: text, unit_data: unit_data,
                 source_hash: unit_data['source_hash'], identifier: identifier }
    end
  end

  def prepare_texts(unit_data)
    unit = build_unit(unit_data)
    unit.chunks&.any? ? @text_preparer.prepare_chunks(unit) : [@text_preparer.prepare(unit)]
  end

  # Rebuild an ExtractedUnit from its JSON form. Dependency and chunk hashes
  # get symbol keys, matching how in-memory units are read elsewhere
  # (e.g. dependency hashes accessed via :target).
  def build_unit(data)
    unit = ExtractedUnit.new(type: data['type']&.to_sym, identifier: data['identifier'],
                             file_path: data['file_path'])
    unit.namespace = data['namespace']
    unit.source_code = data['source_code']
    unit.dependencies = symbolize_entries(data['dependencies'])
    unit.chunks = symbolize_entries(data['chunks'])
    unit
  end

  # @param entries [Array<Hash>, nil] String-keyed hashes from JSON
  # @return [Array] The entries with symbol keys ([] for nil)
  def symbolize_entries(entries)
    (entries || []).map { |entry| entry.is_a?(Hash) ? entry.transform_keys(&:to_sym) : entry }
  end

  # Embed the collected texts in one provider call and store the vectors.
  # Any failure is counted in stats and re-raised as CodebaseIndex::Error.
  def embed_and_store(items, checkpoint, stats)
    return if items.empty?

    vectors = @provider.embed_batch(items.map { |item| item[:text] })
    store_vectors(items, vectors, checkpoint, stats)
  rescue StandardError => e
    stats[:errors] += items.size
    stats[:error_messages] ||= []
    stats[:error_messages] << e.message
    raise CodebaseIndex::Error, "Embedding failed: #{e.message}"
  end

  def store_vectors(items, vectors, checkpoint, stats)
    items.each_with_index do |item, idx|
      metadata = { type: item[:unit_data]['type'], identifier: item[:identifier],
                   file_path: item[:unit_data]['file_path'] }
      @vector_store.store(item[:id], vectors[idx], metadata)
      checkpoint[item[:identifier]] = item[:source_hash]
      stats[:processed] += 1
    end
  end

  # @return [Hash] identifier => source_hash; empty when the checkpoint file
  #   is missing, unparseable, or not a JSON object
  def load_checkpoint
    path = checkpoint_path
    return {} unless File.exist?(path)

    parsed = JSON.parse(File.read(path))
    parsed.is_a?(Hash) ? parsed : {}
  rescue JSON::ParserError
    {}
  end

  def save_checkpoint(checkpoint)
    File.write(checkpoint_path, JSON.generate(checkpoint))
  end

  def checkpoint_path
    File.join(@output_dir, CHECKPOINT_FILE)
  end
end
129
+ end
130
+ end
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'json'
5
+
6
+ module CodebaseIndex
7
+ module Embedding
8
+ module Provider
9
+ # OpenAI adapter for cloud embeddings via the OpenAI HTTP API.
10
+ #
11
+ # Uses the `/v1/embeddings` endpoint to generate embeddings. Requires a valid
12
+ # OpenAI API key.
13
+ #
14
+ # @example
15
+ # provider = CodebaseIndex::Embedding::Provider::OpenAI.new(api_key: ENV['OPENAI_API_KEY'])
16
+ # vector = provider.embed("class User < ApplicationRecord; end")
17
+ # vectors = provider.embed_batch(["text1", "text2"])
18
class OpenAI
  include Interface

  ENDPOINT = URI('https://api.openai.com/v1/embeddings')
  DEFAULT_MODEL = 'text-embedding-3-small'
  # Known vector sizes, used to answer #dimensions without an API call.
  DIMENSIONS = {
    'text-embedding-3-small' => 1536,
    'text-embedding-3-large' => 3072
  }.freeze

  # @param api_key [String] OpenAI API key
  # @param model [String] OpenAI embedding model name (default: text-embedding-3-small)
  def initialize(api_key:, model: DEFAULT_MODEL)
    @api_key = api_key
    @model = model
  end

  # Embed a single text string.
  #
  # @param text [String] the text to embed
  # @return [Array<Float>] the embedding vector
  # @raise [CodebaseIndex::Error] if the API returns an error
  def embed(text)
    response = post_request({ model: @model, input: text })
    response['data'].first['embedding']
  end

  # Embed multiple texts in a single request.
  #
  # Sorts results by the index field to guarantee ordering matches input.
  # An empty array is answered locally with [] — there is nothing to embed,
  # and sending an empty input would only produce an API error.
  #
  # @param texts [Array<String>] the texts to embed
  # @return [Array<Array<Float>>] array of embedding vectors
  # @raise [CodebaseIndex::Error] if the API returns an error
  def embed_batch(texts)
    return [] if texts.is_a?(Array) && texts.empty?

    response = post_request({ model: @model, input: texts })
    response['data']
      .sort_by { |item| item['index'] }
      .map { |item| item['embedding'] }
  end

  # Return the dimensionality of vectors produced by this model.
  #
  # Uses the known dimensions for standard models, falling back to a
  # test embedding (one API request) for unknown models.
  #
  # @return [Integer] number of dimensions
  def dimensions
    DIMENSIONS[@model] || embed('test').length
  end

  # Return the model name.
  #
  # @return [String] the OpenAI model name
  def model_name
    @model
  end

  private

  # Send a POST request to the OpenAI embeddings API.
  #
  # Uses a fresh HTTPS connection per call with a 10s open timeout and a
  # 30s read timeout.
  #
  # @param body [Hash] request body
  # @return [Hash] parsed JSON response
  # @raise [CodebaseIndex::Error] if the API returns a non-success status
  def post_request(body)
    http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
    http.use_ssl = true
    http.open_timeout = 10
    http.read_timeout = 30

    request = Net::HTTP::Post.new(ENDPOINT.path)
    request['Content-Type'] = 'application/json'
    request['Authorization'] = "Bearer #{@api_key}"
    request.body = body.to_json

    response = http.request(request)

    unless response.is_a?(Net::HTTPSuccess)
      raise CodebaseIndex::Error, "OpenAI API error: #{response.code} #{response.body}"
    end

    JSON.parse(response.body)
  end
end
103
+ end
104
+ end
105
+ end