codebase_index 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. checksums.yaml +4 -4
  2. data/lib/codebase_index.rb +3 -243
  3. metadata +28 -223
  4. data/CHANGELOG.md +0 -89
  5. data/CODE_OF_CONDUCT.md +0 -83
  6. data/CONTRIBUTING.md +0 -65
  7. data/LICENSE.txt +0 -21
  8. data/README.md +0 -325
  9. data/exe/codebase-console +0 -59
  10. data/exe/codebase-console-mcp +0 -22
  11. data/exe/codebase-index-mcp +0 -34
  12. data/exe/codebase-index-mcp-http +0 -37
  13. data/exe/codebase-index-mcp-start +0 -58
  14. data/lib/codebase_index/ast/call_site_extractor.rb +0 -106
  15. data/lib/codebase_index/ast/method_extractor.rb +0 -71
  16. data/lib/codebase_index/ast/node.rb +0 -116
  17. data/lib/codebase_index/ast/parser.rb +0 -614
  18. data/lib/codebase_index/ast.rb +0 -6
  19. data/lib/codebase_index/builder.rb +0 -200
  20. data/lib/codebase_index/cache/cache_middleware.rb +0 -199
  21. data/lib/codebase_index/cache/cache_store.rb +0 -264
  22. data/lib/codebase_index/cache/redis_cache_store.rb +0 -116
  23. data/lib/codebase_index/cache/solid_cache_store.rb +0 -111
  24. data/lib/codebase_index/chunking/chunk.rb +0 -84
  25. data/lib/codebase_index/chunking/semantic_chunker.rb +0 -295
  26. data/lib/codebase_index/console/adapters/cache_adapter.rb +0 -58
  27. data/lib/codebase_index/console/adapters/good_job_adapter.rb +0 -33
  28. data/lib/codebase_index/console/adapters/job_adapter.rb +0 -68
  29. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +0 -33
  30. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +0 -33
  31. data/lib/codebase_index/console/audit_logger.rb +0 -75
  32. data/lib/codebase_index/console/bridge.rb +0 -177
  33. data/lib/codebase_index/console/confirmation.rb +0 -90
  34. data/lib/codebase_index/console/connection_manager.rb +0 -173
  35. data/lib/codebase_index/console/console_response_renderer.rb +0 -74
  36. data/lib/codebase_index/console/embedded_executor.rb +0 -373
  37. data/lib/codebase_index/console/model_validator.rb +0 -81
  38. data/lib/codebase_index/console/rack_middleware.rb +0 -87
  39. data/lib/codebase_index/console/safe_context.rb +0 -82
  40. data/lib/codebase_index/console/server.rb +0 -612
  41. data/lib/codebase_index/console/sql_validator.rb +0 -172
  42. data/lib/codebase_index/console/tools/tier1.rb +0 -118
  43. data/lib/codebase_index/console/tools/tier2.rb +0 -117
  44. data/lib/codebase_index/console/tools/tier3.rb +0 -110
  45. data/lib/codebase_index/console/tools/tier4.rb +0 -79
  46. data/lib/codebase_index/coordination/pipeline_lock.rb +0 -109
  47. data/lib/codebase_index/cost_model/embedding_cost.rb +0 -88
  48. data/lib/codebase_index/cost_model/estimator.rb +0 -128
  49. data/lib/codebase_index/cost_model/provider_pricing.rb +0 -67
  50. data/lib/codebase_index/cost_model/storage_cost.rb +0 -52
  51. data/lib/codebase_index/cost_model.rb +0 -22
  52. data/lib/codebase_index/db/migrations/001_create_units.rb +0 -38
  53. data/lib/codebase_index/db/migrations/002_create_edges.rb +0 -35
  54. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +0 -37
  55. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +0 -45
  56. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +0 -40
  57. data/lib/codebase_index/db/migrator.rb +0 -71
  58. data/lib/codebase_index/db/schema_version.rb +0 -73
  59. data/lib/codebase_index/dependency_graph.rb +0 -236
  60. data/lib/codebase_index/embedding/indexer.rb +0 -140
  61. data/lib/codebase_index/embedding/openai.rb +0 -126
  62. data/lib/codebase_index/embedding/provider.rb +0 -162
  63. data/lib/codebase_index/embedding/text_preparer.rb +0 -112
  64. data/lib/codebase_index/evaluation/baseline_runner.rb +0 -115
  65. data/lib/codebase_index/evaluation/evaluator.rb +0 -139
  66. data/lib/codebase_index/evaluation/metrics.rb +0 -79
  67. data/lib/codebase_index/evaluation/query_set.rb +0 -148
  68. data/lib/codebase_index/evaluation/report_generator.rb +0 -90
  69. data/lib/codebase_index/extracted_unit.rb +0 -145
  70. data/lib/codebase_index/extractor.rb +0 -1028
  71. data/lib/codebase_index/extractors/action_cable_extractor.rb +0 -201
  72. data/lib/codebase_index/extractors/ast_source_extraction.rb +0 -46
  73. data/lib/codebase_index/extractors/behavioral_profile.rb +0 -309
  74. data/lib/codebase_index/extractors/caching_extractor.rb +0 -261
  75. data/lib/codebase_index/extractors/callback_analyzer.rb +0 -246
  76. data/lib/codebase_index/extractors/concern_extractor.rb +0 -292
  77. data/lib/codebase_index/extractors/configuration_extractor.rb +0 -219
  78. data/lib/codebase_index/extractors/controller_extractor.rb +0 -404
  79. data/lib/codebase_index/extractors/database_view_extractor.rb +0 -278
  80. data/lib/codebase_index/extractors/decorator_extractor.rb +0 -253
  81. data/lib/codebase_index/extractors/engine_extractor.rb +0 -223
  82. data/lib/codebase_index/extractors/event_extractor.rb +0 -211
  83. data/lib/codebase_index/extractors/factory_extractor.rb +0 -289
  84. data/lib/codebase_index/extractors/graphql_extractor.rb +0 -892
  85. data/lib/codebase_index/extractors/i18n_extractor.rb +0 -117
  86. data/lib/codebase_index/extractors/job_extractor.rb +0 -374
  87. data/lib/codebase_index/extractors/lib_extractor.rb +0 -218
  88. data/lib/codebase_index/extractors/mailer_extractor.rb +0 -269
  89. data/lib/codebase_index/extractors/manager_extractor.rb +0 -188
  90. data/lib/codebase_index/extractors/middleware_extractor.rb +0 -133
  91. data/lib/codebase_index/extractors/migration_extractor.rb +0 -469
  92. data/lib/codebase_index/extractors/model_extractor.rb +0 -988
  93. data/lib/codebase_index/extractors/phlex_extractor.rb +0 -252
  94. data/lib/codebase_index/extractors/policy_extractor.rb +0 -191
  95. data/lib/codebase_index/extractors/poro_extractor.rb +0 -229
  96. data/lib/codebase_index/extractors/pundit_extractor.rb +0 -223
  97. data/lib/codebase_index/extractors/rails_source_extractor.rb +0 -473
  98. data/lib/codebase_index/extractors/rake_task_extractor.rb +0 -343
  99. data/lib/codebase_index/extractors/route_extractor.rb +0 -181
  100. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +0 -331
  101. data/lib/codebase_index/extractors/serializer_extractor.rb +0 -339
  102. data/lib/codebase_index/extractors/service_extractor.rb +0 -217
  103. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +0 -91
  104. data/lib/codebase_index/extractors/shared_utility_methods.rb +0 -281
  105. data/lib/codebase_index/extractors/state_machine_extractor.rb +0 -398
  106. data/lib/codebase_index/extractors/test_mapping_extractor.rb +0 -225
  107. data/lib/codebase_index/extractors/validator_extractor.rb +0 -211
  108. data/lib/codebase_index/extractors/view_component_extractor.rb +0 -311
  109. data/lib/codebase_index/extractors/view_template_extractor.rb +0 -261
  110. data/lib/codebase_index/feedback/gap_detector.rb +0 -89
  111. data/lib/codebase_index/feedback/store.rb +0 -119
  112. data/lib/codebase_index/filename_utils.rb +0 -32
  113. data/lib/codebase_index/flow_analysis/operation_extractor.rb +0 -206
  114. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +0 -154
  115. data/lib/codebase_index/flow_assembler.rb +0 -290
  116. data/lib/codebase_index/flow_document.rb +0 -191
  117. data/lib/codebase_index/flow_precomputer.rb +0 -102
  118. data/lib/codebase_index/formatting/base.rb +0 -30
  119. data/lib/codebase_index/formatting/claude_adapter.rb +0 -98
  120. data/lib/codebase_index/formatting/generic_adapter.rb +0 -56
  121. data/lib/codebase_index/formatting/gpt_adapter.rb +0 -64
  122. data/lib/codebase_index/formatting/human_adapter.rb +0 -78
  123. data/lib/codebase_index/graph_analyzer.rb +0 -374
  124. data/lib/codebase_index/mcp/bootstrapper.rb +0 -96
  125. data/lib/codebase_index/mcp/index_reader.rb +0 -394
  126. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +0 -81
  127. data/lib/codebase_index/mcp/renderers/json_renderer.rb +0 -17
  128. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +0 -353
  129. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +0 -240
  130. data/lib/codebase_index/mcp/server.rb +0 -961
  131. data/lib/codebase_index/mcp/tool_response_renderer.rb +0 -85
  132. data/lib/codebase_index/model_name_cache.rb +0 -51
  133. data/lib/codebase_index/notion/client.rb +0 -217
  134. data/lib/codebase_index/notion/exporter.rb +0 -219
  135. data/lib/codebase_index/notion/mapper.rb +0 -40
  136. data/lib/codebase_index/notion/mappers/column_mapper.rb +0 -57
  137. data/lib/codebase_index/notion/mappers/migration_mapper.rb +0 -39
  138. data/lib/codebase_index/notion/mappers/model_mapper.rb +0 -161
  139. data/lib/codebase_index/notion/mappers/shared.rb +0 -22
  140. data/lib/codebase_index/notion/rate_limiter.rb +0 -68
  141. data/lib/codebase_index/observability/health_check.rb +0 -79
  142. data/lib/codebase_index/observability/instrumentation.rb +0 -34
  143. data/lib/codebase_index/observability/structured_logger.rb +0 -57
  144. data/lib/codebase_index/operator/error_escalator.rb +0 -81
  145. data/lib/codebase_index/operator/pipeline_guard.rb +0 -92
  146. data/lib/codebase_index/operator/status_reporter.rb +0 -80
  147. data/lib/codebase_index/railtie.rb +0 -38
  148. data/lib/codebase_index/resilience/circuit_breaker.rb +0 -99
  149. data/lib/codebase_index/resilience/index_validator.rb +0 -167
  150. data/lib/codebase_index/resilience/retryable_provider.rb +0 -108
  151. data/lib/codebase_index/retrieval/context_assembler.rb +0 -261
  152. data/lib/codebase_index/retrieval/query_classifier.rb +0 -133
  153. data/lib/codebase_index/retrieval/ranker.rb +0 -277
  154. data/lib/codebase_index/retrieval/search_executor.rb +0 -316
  155. data/lib/codebase_index/retriever.rb +0 -152
  156. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +0 -170
  157. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +0 -77
  158. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +0 -18
  159. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +0 -280
  160. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +0 -143
  161. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +0 -143
  162. data/lib/codebase_index/ruby_analyzer.rb +0 -87
  163. data/lib/codebase_index/session_tracer/file_store.rb +0 -104
  164. data/lib/codebase_index/session_tracer/middleware.rb +0 -143
  165. data/lib/codebase_index/session_tracer/redis_store.rb +0 -106
  166. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +0 -254
  167. data/lib/codebase_index/session_tracer/session_flow_document.rb +0 -223
  168. data/lib/codebase_index/session_tracer/solid_cache_store.rb +0 -139
  169. data/lib/codebase_index/session_tracer/store.rb +0 -81
  170. data/lib/codebase_index/storage/graph_store.rb +0 -120
  171. data/lib/codebase_index/storage/metadata_store.rb +0 -196
  172. data/lib/codebase_index/storage/pgvector.rb +0 -195
  173. data/lib/codebase_index/storage/qdrant.rb +0 -205
  174. data/lib/codebase_index/storage/vector_store.rb +0 -167
  175. data/lib/codebase_index/temporal/json_snapshot_store.rb +0 -245
  176. data/lib/codebase_index/temporal/snapshot_store.rb +0 -345
  177. data/lib/codebase_index/token_utils.rb +0 -19
  178. data/lib/codebase_index/version.rb +0 -5
  179. data/lib/generators/codebase_index/install_generator.rb +0 -32
  180. data/lib/generators/codebase_index/pgvector_generator.rb +0 -37
  181. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +0 -15
  182. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +0 -43
  183. data/lib/tasks/codebase_index.rake +0 -597
  184. data/lib/tasks/codebase_index_evaluation.rake +0 -115
@@ -1,71 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require_relative 'schema_version'
4
- require_relative 'migrations/001_create_units'
5
- require_relative 'migrations/002_create_edges'
6
- require_relative 'migrations/003_create_embeddings'
7
- require_relative 'migrations/004_create_snapshots'
8
- require_relative 'migrations/005_create_snapshot_units'
9
-
10
- module CodebaseIndex
11
- module Db
# Applies the bundled schema migrations to a database connection.
#
# Which migrations already ran is tracked through {SchemaVersion}; only the
# ones not yet recorded are executed. Every entry in MIGRATIONS is expected
# to expose a VERSION constant and respond to `.up(connection)`.
#
# @example
#   db = SQLite3::Database.new('codebase_index.db')
#   migrator = Migrator.new(connection: db)
#   migrator.migrate! # => [1, 2, 3]
#
class Migrator
  # Known migrations, listed in the order they are applied.
  MIGRATIONS = [
    Migrations::CreateUnits,
    Migrations::CreateEdges,
    Migrations::CreateEmbeddings,
    Migrations::CreateSnapshots,
    Migrations::CreateSnapshotUnits
  ].freeze

  attr_reader :schema_version

  # Builds the migrator and makes sure the version-tracking table exists.
  #
  # @param connection [Object] Database connection supporting #execute
  def initialize(connection:)
    @connection = connection
    @schema_version = SchemaVersion.new(connection: connection).tap(&:ensure_table!)
  end

  # Run every migration that has not been recorded yet, recording each one
  # right after its `.up` call returns.
  #
  # @return [Array<Integer>] Version numbers of newly applied migrations
  def migrate!
    pending_migrations.map do |migration|
      migration.up(@connection)
      @schema_version.record_version(migration::VERSION)
      migration::VERSION
    end
  end

  # Version numbers of the migrations that still have to run.
  #
  # @return [Array<Integer>]
  def pending_versions
    pending_migrations.map { |migration| migration::VERSION }
  end

  private

  # Migrations whose VERSION is not among the recorded versions.
  #
  # @return [Array<Module>] Pending migration modules
  def pending_migrations
    recorded = @schema_version.applied_versions
    MIGRATIONS.reject { |migration| recorded.include?(migration::VERSION) }
  end
end
70
- end
71
- end
@@ -1,73 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module CodebaseIndex
4
- module Db
# Tracks which schema migrations have been applied.
#
# Uses a simple `codebase_index_schema_migrations` table keyed by an integer
# `version` column, plus an `applied_at` timestamp. The connection must
# respond to `#execute(sql, binds = nil)` and return rows either as arrays
# or as hashes keyed by column name.
#
# NOTE: the statements issued here (`INSERT OR IGNORE`, `datetime('now')`,
# `?` placeholders) are SQLite dialect. PostgreSQL and MySQL do not accept
# them as written, so a connection for those databases has to translate.
#
# @example
#   db = SQLite3::Database.new('codebase_index.db')
#   sv = SchemaVersion.new(connection: db)
#   sv.ensure_table!
#   sv.current_version # => 0
#   sv.record_version(1)
#   sv.current_version # => 1
#
class SchemaVersion
  TABLE_NAME = 'codebase_index_schema_migrations'

  # @param connection [Object] Database connection supporting #execute
  def initialize(connection:)
    @connection = connection
  end

  # Create the schema migrations table if it does not exist.
  #
  # @return [void]
  def ensure_table!
    @connection.execute(<<~SQL)
      CREATE TABLE IF NOT EXISTS #{TABLE_NAME} (
        version INTEGER PRIMARY KEY NOT NULL,
        applied_at TEXT NOT NULL DEFAULT (datetime('now'))
      )
    SQL
  end

  # List all applied migration version numbers, sorted ascending.
  #
  # Values are coerced to Integer because some drivers hand back column
  # values as strings; without the coercion `"1"` would never match the
  # Integer VERSION constants and every migration would look pending.
  #
  # @return [Array<Integer>]
  # @raise [ArgumentError, TypeError] if a row carries a non-numeric version
  def applied_versions
    rows = @connection.execute("SELECT version FROM #{TABLE_NAME} ORDER BY version ASC")
    rows.map { |row| Integer(row.is_a?(Array) ? row.first : row['version']) }
  end

  # Record a migration version as applied. Recording the same version twice
  # is a no-op (`INSERT OR IGNORE`).
  #
  # @param version [Integer] The migration version number
  # @return [void]
  def record_version(version)
    @connection.execute(
      "INSERT OR IGNORE INTO #{TABLE_NAME} (version) VALUES (?)", [version]
    )
  end

  # Check whether a version has been applied.
  #
  # @param version [Integer]
  # @return [Boolean]
  def applied?(version)
    applied_versions.include?(version)
  end

  # The highest applied version, or 0 if none.
  #
  # @return [Integer]
  def current_version
    applied_versions.last || 0
  end
end
72
- end
73
- end
@@ -1,236 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'set'
4
- require 'json'
5
-
6
- module CodebaseIndex
# DependencyGraph tracks relationships between code units for:
# 1. Understanding what depends on what
# 2. Computing "blast radius" for incremental re-indexing
# 3. Enabling graph-based retrieval queries
#
# The graph is bidirectional - we track both what a unit depends on
# and what depends on that unit (reverse edges).
#
# @example Building and querying the graph
#   graph = DependencyGraph.new
#   graph.register(user_model_unit)
#   graph.register(user_service_unit)
#
#   # Find everything affected by a change to user.rb
#   affected = graph.affected_by(["app/models/user.rb"])
#
class DependencyGraph
  def initialize
    @nodes = {}      # identifier => { type:, file_path:, namespace: }
    @edges = {}      # identifier => [dependency identifiers]
    @reverse = {}    # identifier => Set of dependent identifiers
    @file_map = {}   # file_path => identifier
    @type_index = {} # type => Set of identifiers
    @to_h = nil      # memoized serialization, reset by #register
  end

  # Register a unit in the graph.
  #
  # Registering an identifier that is already present replaces the earlier
  # registration: its old reverse edges, type-index membership and file-map
  # entry are dropped first so stale relationships do not linger.
  #
  # The file map holds one identifier per path; when several units share a
  # file_path, the most recently registered one wins.
  #
  # @param unit [ExtractedUnit] The unit to register (must respond to
  #   #identifier, #type, #file_path, #namespace and #dependencies, where each
  #   dependency is a hash with a :target key)
  def register(unit)
    @to_h = nil
    identifier = unit.identifier
    discard_registration(identifier) if @nodes.key?(identifier)

    @nodes[identifier] = {
      type: unit.type,
      file_path: unit.file_path,
      namespace: unit.namespace
    }

    targets = unit.dependencies.map { |dep| dep[:target] }
    @edges[identifier] = targets
    @file_map[unit.file_path] = identifier if unit.file_path

    # Type index for filtering
    (@type_index[unit.type] ||= Set.new).add(identifier)

    # Reverse edges: every target learns that this unit depends on it
    targets.each { |target| (@reverse[target] ||= Set.new).add(identifier) }
  end

  # Find all units affected by changes to given files.
  # Uses BFS over reverse edges to find transitive dependents.
  #
  # @param changed_files [Array<String>] List of changed file paths
  # @param max_depth [Integer] Maximum traversal depth (nil for unlimited)
  # @return [Array<String>] List of affected unit identifiers
  def affected_by(changed_files, max_depth: nil)
    directly_changed = changed_files.filter_map { |path| @file_map[path] }

    affected = Set.new(directly_changed)
    queue = directly_changed.map { |id| [id, 0] } # [identifier, depth]

    until queue.empty?
      current, depth = queue.shift
      # Nodes sitting at the depth limit are reported but not expanded further.
      next if max_depth && depth >= max_depth

      (@reverse[current] || []).each do |dependent|
        # Set#add? returns nil when the element was already present.
        queue.push([dependent, depth + 1]) if affected.add?(dependent)
      end
    end

    affected.to_a
  end

  # Check if a node exists in the graph by exact identifier.
  #
  # @param identifier [String] Unit identifier to check
  # @return [Boolean] true if the node exists
  def node_exists?(identifier)
    @nodes.key?(identifier)
  end

  # Find a node by suffix matching (e.g., "Update" matches "Order::Update").
  #
  # When multiple nodes share the same suffix, the first match wins.
  # Suffix matching requires a "::" separator - bare identifiers (no namespace)
  # are not matched by this method; use {#node_exists?} for exact lookups.
  #
  # @param suffix [String] The suffix to match against
  # @return [String, nil] The first matching identifier, or nil
  def find_node_by_suffix(suffix)
    target_suffix = "::#{suffix}"
    @nodes.each_key.find { |id| id.end_with?(target_suffix) }
  end

  # Get direct dependencies of a unit
  #
  # @param identifier [String] Unit identifier
  # @return [Array<String>] List of dependency identifiers
  def dependencies_of(identifier)
    @edges[identifier] || []
  end

  # Get direct dependents of a unit (what depends on it)
  #
  # @param identifier [String] Unit identifier
  # @return [Array<String>] List of dependent identifiers
  def dependents_of(identifier)
    @reverse.fetch(identifier, Set.new).to_a
  end

  # Get all units of a specific type
  #
  # @param type [Symbol] Unit type (:model, :controller, etc.)
  # @return [Array<String>] List of unit identifiers
  def units_of_type(type)
    @type_index.fetch(type, Set.new).to_a
  end

  # Compute PageRank scores for all nodes.
  #
  # Uses the reverse edges (dependents) as the link structure: a node
  # with many dependents gets a higher score. Rank held by nodes without
  # outgoing edges is spread evenly over all nodes on each iteration.
  #
  # @param damping [Float] Damping factor (default: 0.85)
  # @param iterations [Integer] Number of iterations (default: 20)
  # @return [Hash<String, Float>] Identifier => PageRank score
  def pagerank(damping: 0.85, iterations: 20)
    n = @nodes.size
    return {} if n.zero?

    node_ids = @nodes.keys
    base_score = 1.0 / n
    scores = node_ids.to_h { |id| [id, base_score] }

    iterations.times do
      # Collect rank from dangling nodes (no outgoing edges) and redistribute
      dangling_sum = node_ids.sum do |id|
        @edges[id].nil? || @edges[id].empty? ? scores[id] : 0.0
      end

      new_scores = {}

      node_ids.each do |id|
        # Sum contributions from nodes that depend on this one
        incoming = @reverse[id] || []
        rank_sum = incoming.sum do |src|
          out_degree = (@edges[src] || []).size
          out_degree.positive? ? scores[src] / out_degree : 0.0
        end

        new_scores[id] = ((1.0 - damping) / n) + (damping * (rank_sum + (dangling_sum / n)))
      end

      scores = new_scores
    end

    scores
  end

  # Serialize graph for persistence. Memoized - the cache is invalidated on
  # register.
  #
  # The returned hash is a shallow copy: callers may add or replace top-level
  # keys without touching the cache, but the nested collections (e.g. :nodes,
  # :edges, :file_map) are shared with the graph and must be treated as
  # read-only.
  #
  # @return [Hash] Complete graph data
  def to_h
    @to_h ||= {
      nodes: @nodes,
      edges: @edges,
      reverse: @reverse.transform_values(&:to_a),
      file_map: @file_map,
      type_index: @type_index.transform_values(&:to_a),
      stats: {
        node_count: @nodes.size,
        edge_count: @edges.values.sum(&:size),
        types: @type_index.transform_values(&:size)
      }
    }
    @to_h.dup
  end

  # Load graph from persisted data
  #
  # After JSON round-trip all keys become strings. This method normalizes
  # them back to the expected types: node values use symbol keys (:type,
  # :file_path, :namespace), and type_index uses symbol keys for types.
  #
  # @param data [Hash] Previously serialized graph data
  # @return [DependencyGraph] Restored graph
  def self.from_h(data)
    # Accept both symbol and string top-level keys; a missing section is empty.
    section = ->(key) { data[key] || data[key.to_s] || {} }
    as_set = ->(members) { members.is_a?(Set) ? members : Set.new(members) }

    graph = new
    graph.instance_variable_set(:@nodes, section.call(:nodes).transform_values { |node| symbolize_node(node) })
    graph.instance_variable_set(:@edges, section.call(:edges))
    graph.instance_variable_set(:@reverse, section.call(:reverse).transform_values(&as_set))
    graph.instance_variable_set(:@file_map, section.call(:file_map))
    graph.instance_variable_set(
      :@type_index,
      section.call(:type_index).transform_keys(&:to_sym).transform_values(&as_set)
    )
    graph
  end

  # Normalize a node hash to use symbol keys
  #
  # @param node [Hash] Node data with string or symbol keys
  # @return [Hash] Node data with symbol keys
  def self.symbolize_node(node)
    return node unless node.is_a?(Hash)

    {
      type: (node[:type] || node['type'])&.to_sym,
      file_path: node[:file_path] || node['file_path'],
      namespace: node[:namespace] || node['namespace']
    }
  end

  private

  # Remove every trace of an earlier registration of +identifier+ from the
  # reverse edges, the type index and the file map.
  #
  # @param identifier [String] Identifier that is about to be re-registered
  # @return [void]
  def discard_registration(identifier)
    previous = @nodes[identifier]

    Array(@edges[identifier]).each do |target|
      dependents = @reverse[target]
      next unless dependents

      dependents.delete(identifier)
      @reverse.delete(target) if dependents.empty?
    end

    return unless previous.is_a?(Hash)

    members = @type_index[previous[:type]]
    if members
      members.delete(identifier)
      @type_index.delete(previous[:type]) if members.empty?
    end

    # Only drop the path when it still points at this unit; another unit
    # may have claimed the same file since.
    stale_path = previous[:file_path]
    @file_map.delete(stale_path) if stale_path && @file_map[stale_path] == identifier
  end
end
236
- end
@@ -1,140 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'json'
4
- require 'digest'
5
-
6
- module CodebaseIndex
7
- module Embedding
# Orchestrates the indexing pipeline: reads extracted units, prepares text,
# generates embeddings, and stores vectors. Supports full and incremental
# modes with checkpoint-based resumability.
#
# The checkpoint (`checkpoint.json` inside the output directory) maps unit
# identifiers to the `source_hash` that was last embedded for them.
class Indexer
  CHECKPOINT_FILE = 'checkpoint.json'

  # @param provider [#embed_batch] Embedding provider
  # @param text_preparer [#prepare, #prepare_chunks] Turns units into text
  # @param vector_store [#store_batch] Destination for the vectors
  # @param output_dir [String] Directory holding the extracted unit JSON files
  # @param batch_size [Integer] Units per embedding batch (default: 32)
  # @param checkpoint_interval [Integer] Save checkpoint every N batches (default: 10)
  # rubocop:disable Metrics/ParameterLists
  def initialize(provider:, text_preparer:, vector_store:, output_dir:, batch_size: 32, checkpoint_interval: 10)
    @provider = provider
    @text_preparer = text_preparer
    @vector_store = vector_store
    @output_dir = output_dir
    @batch_size = batch_size
    @checkpoint_interval = checkpoint_interval
  end
  # rubocop:enable Metrics/ParameterLists

  # Index all extracted units (full mode). Returns stats hash.
  # @return [Hash] Stats with :processed, :skipped, :errors counts
  def index_all
    process_units(load_units, incremental: false)
  end

  # Index only changed units (incremental mode). Returns stats hash.
  # @return [Hash] Stats with :processed, :skipped, :errors counts
  def index_incremental
    process_units(load_units, incremental: true)
  end

  private

  # Read every unit JSON document below the output directory.
  #
  # @return [Array<Hash>] Parsed unit documents
  def load_units
    Dir.glob(File.join(@output_dir, '**', '*.json')).filter_map do |path|
      next if File.basename(path) == CHECKPOINT_FILE

      read_unit(path)
    end
  end

  # Parse one file, returning nil for anything that is not a unit document:
  # malformed JSON, non-object documents, or objects without an identifier
  # (which could neither be embedded under a usable id nor checkpointed).
  #
  # @param path [String]
  # @return [Hash, nil]
  def read_unit(path)
    data = JSON.parse(File.read(path))
    data if data.is_a?(Hash) && data['identifier']
  rescue JSON::ParserError
    nil
  end

  # Run all batches, saving the checkpoint every @checkpoint_interval batches
  # and once more at the end. The final save sits in `ensure` so progress
  # made before a failing batch is kept for the next incremental run.
  #
  # @param units [Array<Hash>]
  # @param incremental [Boolean] Skip units whose source_hash is unchanged
  # @return [Hash] Stats with :processed, :skipped, :errors counts
  def process_units(units, incremental:)
    checkpoint = incremental ? load_checkpoint : {}
    stats = { processed: 0, skipped: 0, errors: 0 }

    begin
      units.each_slice(@batch_size).with_index(1) do |batch, batch_number|
        process_batch(batch, checkpoint, stats, incremental: incremental)
        save_checkpoint(checkpoint) if (batch_number % @checkpoint_interval).zero?
      end
    ensure
      save_checkpoint(checkpoint)
    end

    stats
  end

  # Embed and store the units of one batch, skipping unchanged ones in
  # incremental mode.
  def process_batch(batch, checkpoint, stats, incremental:)
    to_embed = batch.each_with_object([]) do |unit_data, items|
      if incremental && checkpoint[unit_data['identifier']] == unit_data['source_hash']
        stats[:skipped] += 1
        next
      end
      collect_embed_items(unit_data, items)
    end

    embed_and_store(to_embed, checkpoint, stats)
  end

  # Append one embed item per prepared text. Units yielding several texts
  # get ids of the form "<identifier>#chunk_<n>".
  def collect_embed_items(unit_data, items)
    texts = prepare_texts(unit_data)
    identifier = unit_data['identifier']
    chunked = texts.length > 1

    texts.each_with_index do |text, idx|
      items << { id: chunked ? "#{identifier}#chunk_#{idx}" : identifier,
                 text: text, unit_data: unit_data,
                 source_hash: unit_data['source_hash'], identifier: identifier }
    end
  end

  # @return [Array<String>] One text per chunk, or a single text for the unit
  def prepare_texts(unit_data)
    unit = build_unit(unit_data)
    unit.chunks&.any? ? @text_preparer.prepare_chunks(unit) : [@text_preparer.prepare(unit)]
  end

  # Rebuild an ExtractedUnit from its JSON representation.
  def build_unit(data)
    unit = ExtractedUnit.new(type: data['type']&.to_sym, identifier: data['identifier'],
                             file_path: data['file_path'])
    unit.namespace = data['namespace']
    unit.source_code = data['source_code']
    unit.dependencies = data['dependencies'] || []
    unit.chunks = (data['chunks'] || []).map { |chunk| chunk.transform_keys(&:to_sym) }
    unit
  end

  # Embed the item texts in one provider call and store the vectors.
  #
  # @raise [CodebaseIndex::Error] when embedding or storing fails; the
  #   failed items are counted in stats[:errors] first
  def embed_and_store(items, checkpoint, stats)
    return if items.empty?

    vectors = @provider.embed_batch(items.map { |item| item[:text] })
    store_vectors(items, vectors, checkpoint, stats)
  rescue StandardError => e
    stats[:errors] += items.size
    raise CodebaseIndex::Error, "Embedding failed: #{e.message}"
  end

  # Store the vectors and, once that has succeeded, mark the items as done.
  # stats[:processed] counts embed items, so a chunked unit counts once per
  # chunk.
  def store_vectors(items, vectors, checkpoint, stats)
    entries = items.each_with_index.map do |item, idx|
      { id: item[:id], vector: vectors[idx],
        metadata: { type: item[:unit_data]['type'], identifier: item[:identifier],
                    file_path: item[:unit_data]['file_path'] } }
    end

    @vector_store.store_batch(entries)

    items.each do |item|
      checkpoint[item[:identifier]] = item[:source_hash]
      stats[:processed] += 1
    end
  end

  # @return [Hash] identifier => source_hash; empty when the file is missing,
  #   malformed, or does not contain a JSON object
  def load_checkpoint
    path = checkpoint_path
    return {} unless File.exist?(path)

    data = JSON.parse(File.read(path))
    data.is_a?(Hash) ? data : {}
  rescue JSON::ParserError
    {}
  end

  # Write the checkpoint through a temporary file and rename it into place,
  # so an interrupted write cannot leave a truncated checkpoint behind.
  def save_checkpoint(checkpoint)
    staging = "#{checkpoint_path}.tmp"
    File.write(staging, JSON.generate(checkpoint))
    File.rename(staging, checkpoint_path)
  end

  def checkpoint_path
    File.join(@output_dir, CHECKPOINT_FILE)
  end
end
139
- end
140
- end
@@ -1,126 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'net/http'
4
- require 'json'
5
-
6
- module CodebaseIndex
7
- module Embedding
8
- module Provider
# OpenAI adapter for cloud embeddings via the OpenAI HTTP API.
#
# Uses the `/v1/embeddings` endpoint to generate embeddings. Requires a valid
# OpenAI API key. A single keep-alive HTTPS connection is reused across
# requests and re-opened once when it turns out to have been dropped.
#
# @example
#   provider = CodebaseIndex::Embedding::Provider::OpenAI.new(api_key: ENV['OPENAI_API_KEY'])
#   vector = provider.embed("class User < ApplicationRecord; end")
#   vectors = provider.embed_batch(["text1", "text2"])
class OpenAI
  include Interface

  ENDPOINT = URI('https://api.openai.com/v1/embeddings')
  DEFAULT_MODEL = 'text-embedding-3-small'
  DIMENSIONS = {
    'text-embedding-3-small' => 1536,
    'text-embedding-3-large' => 3072
  }.freeze

  # @param api_key [String] OpenAI API key
  # @param model [String] OpenAI embedding model name (default: text-embedding-3-small)
  def initialize(api_key:, model: DEFAULT_MODEL)
    @api_key = api_key
    @model = model
  end

  # Embed a single text string.
  #
  # @param text [String] the text to embed
  # @return [Array<Float>] the embedding vector
  # @raise [CodebaseIndex::Error] if the API returns an error
  def embed(text)
    response = post_request({ model: @model, input: text })
    response['data'].first['embedding']
  end

  # Embed multiple texts in a single request.
  #
  # Sorts results by the index field to guarantee ordering matches input.
  # An empty input list yields an empty result without calling the API.
  #
  # @param texts [Array<String>] the texts to embed
  # @return [Array<Array<Float>>] array of embedding vectors
  # @raise [CodebaseIndex::Error] if the API returns an error
  def embed_batch(texts)
    return [] if texts.empty?

    response = post_request({ model: @model, input: texts })
    response['data']
      .sort_by { |item| item['index'] }
      .map { |item| item['embedding'] }
  end

  # Return the dimensionality of vectors produced by this model.
  #
  # Uses the known dimensions for standard models, falling back to a
  # test embedding (one API call) for unknown models.
  #
  # @return [Integer] number of dimensions
  def dimensions
    DIMENSIONS[@model] || embed('test').length
  end

  # Return the model name.
  #
  # @return [String] the OpenAI model name
  def model_name
    @model
  end

  private

  # Send a POST request to the OpenAI embeddings API.
  #
  # When the connection turns out to be dead (reset, open timeout, closed
  # stream) the client is discarded and the request is sent once more on a
  # fresh connection; a failure of that second attempt propagates.
  #
  # @param body [Hash] request body
  # @return [Hash] parsed JSON response
  # @raise [CodebaseIndex::Error] if the API returns a non-success status
  def post_request(body)
    request = build_request(body)

    response =
      begin
        http_client.request(request)
      rescue Errno::ECONNRESET, Net::OpenTimeout, IOError
        # Connection dropped - reset and retry once
        reset_http_client
        http_client.request(request)
      end

    parse_response(response)
  end

  # Build the authenticated JSON POST for the embeddings endpoint.
  #
  # @param body [Hash] request body
  # @return [Net::HTTP::Post]
  def build_request(body)
    request = Net::HTTP::Post.new(ENDPOINT.path)
    request['Content-Type'] = 'application/json'
    request['Authorization'] = "Bearer #{@api_key}"
    request.body = body.to_json
    request
  end

  # Turn an HTTP response into parsed JSON, raising on non-2xx statuses.
  #
  # @param response [Net::HTTPResponse]
  # @return [Hash] parsed JSON response
  # @raise [CodebaseIndex::Error] if the API returns a non-success status
  def parse_response(response)
    unless response.is_a?(Net::HTTPSuccess)
      raise CodebaseIndex::Error, "OpenAI API error: #{response.code} #{response.body}"
    end

    JSON.parse(response.body)
  end

  # Drop the cached client, closing its socket first when it is still open
  # so the old connection is not leaked. Closing is best-effort: the
  # connection is already considered broken at this point.
  #
  # @return [void]
  def reset_http_client
    @http_client.finish if @http_client&.started?
  rescue IOError, SystemCallError
    nil
  ensure
    @http_client = nil
  end

  # Return a reusable, started HTTP client for the OpenAI API.
  # Calling http.start opens a persistent TCP connection so
  # keep_alive_timeout actually takes effect across requests.
  #
  # @return [Net::HTTP]
  def http_client
    return @http_client if @http_client&.started?

    http = Net::HTTP.new(ENDPOINT.host, ENDPOINT.port)
    http.use_ssl = true
    http.open_timeout = 10
    http.read_timeout = 30
    http.keep_alive_timeout = 30
    http.start
    @http_client = http
  end
end
124
- end
125
- end
126
- end