codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
@@ -0,0 +1,156 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CodebaseIndex
4
+ module Storage
5
+ # VectorStore provides an interface for storing and searching embedding vectors.
6
+ #
7
+ # All vector store adapters must include the {Interface} module and implement
8
+ # its methods. The {InMemory} adapter is provided for development and testing.
9
+ #
10
+ # @example Using the in-memory adapter
11
+ # store = CodebaseIndex::Storage::VectorStore::InMemory.new
12
+ # store.store("User", [0.1, 0.2, 0.3], { type: "model" })
13
+ # results = store.search([0.1, 0.2, 0.3], limit: 5)
14
+ #
15
module VectorStore
  # Interface that all vector store adapters must implement.
  #
  # Every method here raises NotImplementedError; adapters include this
  # module and override each method.
  module Interface
    # Store a vector with associated metadata.
    #
    # @param id [String] Unique identifier for the vector
    # @param vector [Array<Float>] The embedding vector
    # @param metadata [Hash] Optional metadata to store alongside the vector
    # @raise [NotImplementedError] if not implemented by adapter
    def store(id, vector, metadata = {})
      raise NotImplementedError
    end

    # Search for similar vectors using cosine similarity.
    #
    # @param query_vector [Array<Float>] The query embedding vector
    # @param limit [Integer] Maximum number of results to return
    # @param filters [Hash] Optional metadata filters to apply
    # @return [Array<SearchResult>] Results sorted by descending similarity
    # @raise [NotImplementedError] if not implemented by adapter
    def search(query_vector, limit: 10, filters: {})
      raise NotImplementedError
    end

    # Delete a vector by ID.
    #
    # @param id [String] The identifier to delete
    # @raise [NotImplementedError] if not implemented by adapter
    def delete(id)
      raise NotImplementedError
    end

    # Delete vectors matching metadata filters.
    #
    # @param filters [Hash] Metadata key-value pairs to match
    # @raise [NotImplementedError] if not implemented by adapter
    def delete_by_filter(filters)
      raise NotImplementedError
    end

    # Return the number of stored vectors.
    #
    # @return [Integer] Total count
    # @raise [NotImplementedError] if not implemented by adapter
    def count
      raise NotImplementedError
    end
  end

  # Value object representing a single search result.
  SearchResult = Struct.new(:id, :score, :metadata, keyword_init: true)

  # In-memory vector store using hash storage and cosine similarity.
  #
  # Suitable for development and testing. Not intended for production use
  # with large datasets: every search scores every candidate entry.
  #
  # @example
  #   store = InMemory.new
  #   store.store("doc1", [1.0, 0.0], { type: "model" })
  #   store.store("doc2", [0.0, 1.0], { type: "service" })
  #   store.search([1.0, 0.0], limit: 1)
  #   # => [#<SearchResult id="doc1", score=1.0, metadata={type: "model"}>]
  #
  class InMemory
    include Interface

    def initialize
      @entries = {} # id => { vector:, metadata: }
    end

    # Store (or overwrite) the entry for +id+.
    #
    # @see Interface#store
    def store(id, vector, metadata = {})
      @entries[id] = { vector: vector, metadata: metadata }
    end

    # Score every entry that passes +filters+ against +query_vector+ and
    # return the best +limit+ results.
    #
    # @see Interface#search
    # @raise [ArgumentError] if a candidate vector's dimension differs from the query's
    def search(query_vector, limit: 10, filters: {})
      scored = filter_entries(filters).map do |id, entry|
        SearchResult.new(
          id: id,
          score: cosine_similarity(query_vector, entry[:vector]),
          metadata: entry[:metadata]
        )
      end

      scored.sort_by { |result| -result.score }.first(limit)
    end

    # @see Interface#delete
    def delete(id)
      @entries.delete(id)
    end

    # Remove every entry whose metadata matches all +filters+.
    #
    # An empty +filters+ hash matches every entry, so it clears the store.
    #
    # @see Interface#delete_by_filter
    def delete_by_filter(filters)
      @entries.reject! { |_id, entry| matches_filters?(entry, filters) }
    end

    # @see Interface#count
    def count
      @entries.size
    end

    private

    # Filter entries by metadata key-value pairs.
    #
    # @param filters [Hash] Metadata filters
    # @return [Hash] Filtered entries (all entries when filters is empty)
    def filter_entries(filters)
      return @entries if filters.empty?

      @entries.select { |_id, entry| matches_filters?(entry, filters) }
    end

    # Whether an entry's metadata satisfies every filter pair.
    #
    # Entries stored with nil metadata are treated as having no metadata
    # instead of raising NoMethodError.
    #
    # @param entry [Hash] Stored entry ({ vector:, metadata: })
    # @param filters [Hash] Metadata key-value pairs to match
    # @return [Boolean]
    def matches_filters?(entry, filters)
      metadata = entry[:metadata] || {}
      filters.all? { |key, value| metadata[key] == value }
    end

    # Compute cosine similarity between two vectors.
    #
    # @param vec_a [Array<Float>] First vector
    # @param vec_b [Array<Float>] Second vector
    # @return [Float] Cosine similarity between -1.0 and 1.0 (0.0 when either vector has zero magnitude)
    # @raise [ArgumentError] if vectors have different dimensions
    def cosine_similarity(vec_a, vec_b)
      unless vec_a.length == vec_b.length
        raise ArgumentError,
              "Vector dimension mismatch (#{vec_a.length} vs #{vec_b.length})"
      end

      dot = vec_a.zip(vec_b).sum { |x, y| x * y }
      mag_a = Math.sqrt(vec_a.sum { |x| x**2 })
      mag_b = Math.sqrt(vec_b.sum { |x| x**2 })

      return 0.0 if mag_a.zero? || mag_b.zero?

      similarity = dot / (mag_a * mag_b)

      # Floating-point rounding can push the quotient marginally outside the
      # mathematical range; pin it to the documented bounds. Plain comparisons
      # are used so a NaN result passes through unchanged instead of raising.
      return 1.0 if similarity > 1.0
      return -1.0 if similarity < -1.0

      similarity
    end
  end
end
155
+ end
156
+ end
@@ -0,0 +1,341 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'time'
5
+
6
+ module CodebaseIndex
7
+ module Temporal
8
+ # SnapshotStore captures and queries temporal snapshots of extraction runs.
9
+ #
10
+ # Each snapshot is anchored to a git commit SHA and stores per-unit content
11
+ # hashes for efficient diff computation. Full source is not duplicated —
12
+ # only hashes of source, metadata, and dependencies are stored per snapshot.
13
+ #
14
+ # @example Capturing a snapshot
15
+ # store = SnapshotStore.new(connection: db)
16
+ # store.capture(manifest, unit_hashes)
17
+ #
18
+ # @example Comparing snapshots
19
+ # diff = store.diff("abc123", "def456")
20
+ # diff[:added] # => [{ identifier: "NewModel", ... }]
21
+ # diff[:modified] # => [{ identifier: "User", ... }]
22
+ # diff[:deleted] # => [{ identifier: "OldService", ... }]
23
+ #
24
class SnapshotStore # rubocop:disable Metrics/ClassLength
  # Per-unit hash columns; a difference in any of them marks a unit as modified.
  HASH_FIELDS = %i[source_hash metadata_hash dependencies_hash].freeze
  private_constant :HASH_FIELDS

  # @param connection [Object] Database connection supporting #execute,
  #   #get_first_row and #get_first_value, returning rows addressable by
  #   column name (e.g. row['git_sha'])
  def initialize(connection:)
    @db = connection
  end

  # Capture a snapshot after extraction completes.
  #
  # Stores the manifest metadata and per-unit content hashes, then computes
  # diff stats vs. the most recent *other* snapshot. Re-capturing a SHA that
  # already exists replaces its snapshot row and its unit rows.
  #
  # @param manifest [Hash] The manifest data (string or symbol keys)
  # @param unit_hashes [Array<Hash>] Per-unit content hashes
  # @return [Hash, nil] Snapshot record with diff stats, or nil when the
  #   manifest has no git_sha
  def capture(manifest, unit_hashes)
    git_sha = mget(manifest, 'git_sha')
    return nil unless git_sha

    # Exclude the SHA being captured: otherwise re-capturing the latest
    # snapshot would be diffed against itself and always report 0/0/0.
    previous = find_latest(excluding: git_sha)

    # Remember the id of any existing row for this SHA *before* the upsert.
    # INSERT OR REPLACE removes the conflicting row and inserts a new one, so
    # the id looked up afterwards may differ and the old unit rows would
    # otherwise be left behind. (Schema not visible here — deleting by both
    # ids is harmless if the id is preserved or rows cascade.)
    stale_id = fetch_snapshot_id(git_sha)
    upsert_snapshot(manifest, git_sha, unit_hashes.size)

    snapshot_id = fetch_snapshot_id(git_sha)
    [stale_id, snapshot_id].compact.uniq.each { |id| delete_snapshot_units(id) }
    insert_unit_hashes(snapshot_id, unit_hashes)

    update_diff_stats(snapshot_id, previous)
    find(git_sha)
  end

  # List snapshots, optionally filtered by branch.
  #
  # @param limit [Integer] Max results (default 20)
  # @param branch [String, nil] Filter by branch name
  # @return [Array<Hash>] Snapshot summaries sorted by extracted_at descending
  def list(limit: 20, branch: nil)
    rows = if branch
             @db.execute(
               'SELECT * FROM codebase_snapshots WHERE git_branch = ? ORDER BY extracted_at DESC LIMIT ?',
               [branch, limit]
             )
           else
             @db.execute(
               'SELECT * FROM codebase_snapshots ORDER BY extracted_at DESC LIMIT ?',
               [limit]
             )
           end

    rows.map { |row| row_to_hash(row) }
  end

  # Find a specific snapshot by git SHA.
  #
  # @param git_sha [String]
  # @return [Hash, nil] Snapshot metadata or nil if not found
  def find(git_sha)
    row = @db.get_first_row('SELECT * FROM codebase_snapshots WHERE git_sha = ?', [git_sha])
    return nil unless row

    row_to_hash(row)
  end

  # Compute diff between two snapshots.
  #
  # Returns an all-empty diff when either SHA has no snapshot.
  #
  # @param sha_a [String] Before snapshot git SHA
  # @param sha_b [String] After snapshot git SHA
  # @return [Hash] {added: [...], modified: [...], deleted: [...]}
  def diff(sha_a, sha_b)
    id_a = fetch_snapshot_id(sha_a)
    id_b = fetch_snapshot_id(sha_b)

    return { added: [], modified: [], deleted: [] } unless id_a && id_b

    compute_diff(load_snapshot_units(id_a), load_snapshot_units(id_b))
  end

  # History of a single unit across snapshots, newest first.
  #
  # Each entry's +changed+ flag compares its source hash with the next older
  # snapshot containing the unit. One row beyond +limit+ is fetched so the
  # last returned entry is compared against its real predecessor instead of
  # being reported as a first appearance just because the list was truncated.
  #
  # @param identifier [String] Unit identifier
  # @param limit [Integer] Max snapshots to return (default 20)
  # @return [Array<Hash>] Entries with git_sha, extracted_at, source_hash, changed flag
  def unit_history(identifier, limit: 20)
    # A negative limit is passed through untouched (SQLite treats it as
    # "no limit"), so no lookahead row or truncation is needed.
    bounded = limit >= 0
    rows = @db.execute(<<~SQL, [identifier, bounded ? limit + 1 : limit])
      SELECT su.source_hash, su.metadata_hash, su.dependencies_hash, su.unit_type,
             s.git_sha, s.extracted_at, s.git_branch
      FROM codebase_snapshot_units su
      JOIN codebase_snapshots s ON s.id = su.snapshot_id
      WHERE su.identifier = ?
      ORDER BY s.extracted_at DESC
      LIMIT ?
    SQL

    entries = mark_changed_entries(rows.map { |row| history_entry_from_row(row) })
    bounded ? entries.first(limit) : entries
  end

  private

  # Build a history entry hash from a database row.
  #
  # @param row [Hash]
  # @return [Hash]
  def history_entry_from_row(row)
    {
      git_sha: row['git_sha'],
      extracted_at: row['extracted_at'],
      git_branch: row['git_branch'],
      unit_type: row['unit_type'],
      source_hash: row['source_hash'],
      metadata_hash: row['metadata_hash'],
      dependencies_hash: row['dependencies_hash']
    }
  end

  # Set the +changed+ flag on history entries (ordered newest first) by
  # comparing each source hash with the next older entry's. Only the source
  # hash is compared; metadata and dependency hashes are ignored here.
  #
  # @param entries [Array<Hash>]
  # @return [Array<Hash>] The same array, mutated in place
  def mark_changed_entries(entries)
    entries.each_with_index do |entry, index|
      older = entries[index + 1]
      # No older entry means this is the first appearance, which counts as changed.
      entry[:changed] = older.nil? || entry[:source_hash] != older[:source_hash]
    end
    entries
  end

  # Get a value from a hash that may have string or symbol keys.
  #
  # The string key wins when it holds a truthy value; a nil or false value
  # falls through to the symbol key.
  #
  # @param hash [Hash]
  # @param key [String]
  # @return [Object, nil]
  def mget(hash, key)
    hash[key] || hash[key.to_sym]
  end

  # Insert or replace the snapshot row from manifest data.
  #
  # Missing manifest values fall back to: the current time for extracted_at,
  # +default_total+ for total_units, and an empty object for counts.
  #
  # @param manifest [Hash]
  # @param git_sha [String]
  # @param default_total [Integer]
  # @return [void]
  def upsert_snapshot(manifest, git_sha, default_total)
    params = [
      git_sha,
      mget(manifest, 'git_branch'),
      mget(manifest, 'extracted_at') || Time.now.iso8601,
      mget(manifest, 'rails_version'),
      mget(manifest, 'ruby_version'),
      mget(manifest, 'total_units') || default_total,
      JSON.generate(mget(manifest, 'counts') || {}),
      mget(manifest, 'gemfile_lock_sha'),
      mget(manifest, 'schema_sha')
    ]
    @db.execute(<<~SQL, params)
      INSERT OR REPLACE INTO codebase_snapshots
        (git_sha, git_branch, extracted_at, rails_version, ruby_version,
         total_units, unit_counts, gemfile_lock_sha, schema_sha)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    SQL
  end

  # Update a snapshot's diff stats vs. a previous snapshot.
  #
  # @param snapshot_id [Integer]
  # @param previous [Hash, nil]
  # @return [void]
  def update_diff_stats(snapshot_id, previous)
    diff_stats = compute_diff_stats(snapshot_id, previous)
    @db.execute(
      'UPDATE codebase_snapshots SET units_added = ?, units_modified = ?, units_deleted = ? WHERE id = ?',
      [diff_stats[:added], diff_stats[:modified], diff_stats[:deleted], snapshot_id]
    )
  end

  # Find the most recent snapshot by extracted_at.
  #
  # @param excluding [String, nil] git SHA to leave out of the search
  # @return [Hash, nil]
  def find_latest(excluding: nil)
    row = if excluding
            @db.get_first_row(
              'SELECT * FROM codebase_snapshots WHERE git_sha != ? ORDER BY extracted_at DESC LIMIT 1',
              [excluding]
            )
          else
            @db.get_first_row('SELECT * FROM codebase_snapshots ORDER BY extracted_at DESC LIMIT 1')
          end
    return nil unless row

    row_to_hash(row)
  end

  # Fetch a snapshot's ID by git SHA.
  #
  # @param git_sha [String]
  # @return [Integer, nil]
  def fetch_snapshot_id(git_sha)
    @db.get_first_value('SELECT id FROM codebase_snapshots WHERE git_sha = ?', [git_sha])
  end

  # Remove all per-unit hash rows belonging to a snapshot.
  #
  # @param snapshot_id [Integer]
  # @return [void]
  def delete_snapshot_units(snapshot_id)
    @db.execute('DELETE FROM codebase_snapshot_units WHERE snapshot_id = ?', [snapshot_id])
  end

  # Insert per-unit hash records for a snapshot.
  #
  # Unit hashes may use string or symbol keys; the unit's +type+ is stored
  # as a string in the unit_type column.
  #
  # @param snapshot_id [Integer]
  # @param unit_hashes [Array<Hash>]
  # @return [void]
  def insert_unit_hashes(snapshot_id, unit_hashes)
    sql = <<~SQL
      INSERT INTO codebase_snapshot_units
        (snapshot_id, identifier, unit_type, source_hash, metadata_hash, dependencies_hash)
      VALUES (?, ?, ?, ?, ?, ?)
    SQL

    unit_hashes.each do |unit|
      @db.execute(sql, [
                    snapshot_id,
                    mget(unit, 'identifier'),
                    mget(unit, 'type').to_s,
                    mget(unit, 'source_hash'),
                    mget(unit, 'metadata_hash'),
                    mget(unit, 'dependencies_hash')
                  ])
    end
  end

  # Load all unit records for a snapshot as a hash keyed by identifier.
  #
  # @param snapshot_id [Integer]
  # @return [Hash{String => Hash}]
  def load_snapshot_units(snapshot_id)
    sql = <<~SQL
      SELECT identifier, unit_type, source_hash, metadata_hash, dependencies_hash
      FROM codebase_snapshot_units WHERE snapshot_id = ?
    SQL
    rows = @db.execute(sql, [snapshot_id])

    rows.to_h do |row|
      [row['identifier'], {
        unit_type: row['unit_type'],
        source_hash: row['source_hash'],
        metadata_hash: row['metadata_hash'],
        dependencies_hash: row['dependencies_hash']
      }]
    end
  end

  # Compute diff between two sets of unit hashes.
  #
  # A unit present in both sets is "modified" when any of its source,
  # metadata or dependency hashes differ. Added/modified entries follow the
  # order of +units_b+; deleted entries follow the order of +units_a+.
  #
  # @param units_a [Hash{String => Hash}] Before
  # @param units_b [Hash{String => Hash}] After
  # @return [Hash] {added: [...], modified: [...], deleted: [...]}
  def compute_diff(units_a, units_b)
    added = []
    modified = []

    units_b.each do |identifier, data_b|
      summary = { identifier: identifier, unit_type: data_b[:unit_type] }
      if !units_a.key?(identifier)
        added << summary
      elsif HASH_FIELDS.any? { |field| units_a[identifier][field] != data_b[field] }
        modified << summary
      end
    end

    deleted = units_a
              .reject { |identifier, _data| units_b.key?(identifier) }
              .map { |identifier, data_a| { identifier: identifier, unit_type: data_a[:unit_type] } }

    { added: added, modified: modified, deleted: deleted }
  end

  # Compute aggregate diff stats (counts only) for a snapshot vs. a previous one.
  #
  # Returns all zeros when there is no previous snapshot or it can no longer
  # be found by its git SHA.
  #
  # @param current_snapshot_id [Integer]
  # @param previous_snapshot [Hash, nil]
  # @return [Hash] {added:, modified:, deleted:}
  def compute_diff_stats(current_snapshot_id, previous_snapshot)
    empty = { added: 0, modified: 0, deleted: 0 }
    return empty unless previous_snapshot

    prev_id = fetch_snapshot_id(previous_snapshot[:git_sha])
    return empty unless prev_id

    compute_diff(load_snapshot_units(prev_id), load_snapshot_units(current_snapshot_id))
      .transform_values(&:size)
  end

  # Convert a database row to a normalized hash.
  #
  # unit_counts is stored as JSON text and is parsed here; a NULL column
  # becomes an empty hash.
  #
  # @param row [Hash] SQLite3 result row
  # @return [Hash]
  def row_to_hash(row)
    {
      id: row['id'],
      git_sha: row['git_sha'],
      git_branch: row['git_branch'],
      extracted_at: row['extracted_at'],
      rails_version: row['rails_version'],
      ruby_version: row['ruby_version'],
      total_units: row['total_units'],
      unit_counts: row['unit_counts'] ? JSON.parse(row['unit_counts']) : {},
      gemfile_lock_sha: row['gemfile_lock_sha'],
      schema_sha: row['schema_sha'],
      units_added: row['units_added'],
      units_modified: row['units_modified'],
      units_deleted: row['units_deleted']
    }
  end
end
340
+ end
341
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module CodebaseIndex
  # Released gem version string.
  VERSION = '0.1.0'
end