codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
@@ -0,0 +1,956 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'digest'
5
+ require 'fileutils'
6
+ require 'open3'
7
+ require 'pathname'
8
+ require 'set'
9
+
10
+ require_relative 'extracted_unit'
11
+ require_relative 'dependency_graph'
12
+ require_relative 'extractors/model_extractor'
13
+ require_relative 'extractors/controller_extractor'
14
+ require_relative 'extractors/phlex_extractor'
15
+ require_relative 'extractors/service_extractor'
16
+ require_relative 'extractors/job_extractor'
17
+ require_relative 'extractors/mailer_extractor'
18
+ require_relative 'extractors/graphql_extractor'
19
+ require_relative 'extractors/serializer_extractor'
20
+ require_relative 'extractors/rails_source_extractor'
21
+ require_relative 'extractors/view_component_extractor'
22
+ require_relative 'extractors/manager_extractor'
23
+ require_relative 'extractors/policy_extractor'
24
+ require_relative 'extractors/validator_extractor'
25
+ require_relative 'extractors/concern_extractor'
26
+ require_relative 'extractors/route_extractor'
27
+ require_relative 'extractors/middleware_extractor'
28
+ require_relative 'extractors/i18n_extractor'
29
+ require_relative 'extractors/pundit_extractor'
30
+ require_relative 'extractors/configuration_extractor'
31
+ require_relative 'extractors/engine_extractor'
32
+ require_relative 'extractors/view_template_extractor'
33
+ require_relative 'extractors/migration_extractor'
34
+ require_relative 'extractors/action_cable_extractor'
35
+ require_relative 'extractors/scheduled_job_extractor'
36
+ require_relative 'extractors/rake_task_extractor'
37
+ require_relative 'extractors/state_machine_extractor'
38
+ require_relative 'extractors/event_extractor'
39
+ require_relative 'extractors/decorator_extractor'
40
+ require_relative 'extractors/database_view_extractor'
41
+ require_relative 'extractors/caching_extractor'
42
+ require_relative 'extractors/factory_extractor'
43
+ require_relative 'extractors/test_mapping_extractor'
44
+ require_relative 'extractors/poro_extractor'
45
+ require_relative 'extractors/lib_extractor'
46
+ require_relative 'graph_analyzer'
47
+ require_relative 'model_name_cache'
48
+ require_relative 'flow_precomputer'
49
+
50
+ module CodebaseIndex
51
+ # Extractor is the main orchestrator for codebase extraction.
52
+ #
53
+ # It coordinates all individual extractors, builds the dependency graph,
54
+ # enriches with git data, and outputs structured JSON for the indexing pipeline.
55
+ #
56
+ # @example Full extraction
57
+ # extractor = Extractor.new(output_dir: "tmp/codebase_index")
58
+ # results = extractor.extract_all
59
+ #
60
+ # @example Incremental extraction (for CI)
61
+ # extractor = Extractor.new
62
+ # extractor.extract_changed(["app/models/user.rb", "app/services/checkout.rb"])
63
+ #
64
+ class Extractor
65
# Subdirectories of app/ whose classes are needed for extraction.
# eager_load_extraction_directories walks this list as the fallback when
# Rails.application.eager_load! fails (e.g., NameError from graphql/).
EXTRACTION_DIRECTORIES = %w[
  models controllers services jobs mailers components
  interactors operations commands use_cases
  serializers decorators blueprinters
  managers policies validators
  channels presenters form_objects
].freeze
89
+
90
# Registry of extractor classes keyed by plural type symbol. During a full
# extraction each class is instantiated and asked to extract_all, and one
# output subdirectory is created per key.
EXTRACTORS = {
  models:                Extractors::ModelExtractor,
  controllers:           Extractors::ControllerExtractor,
  graphql:               Extractors::GraphQLExtractor,
  components:            Extractors::PhlexExtractor,
  view_components:       Extractors::ViewComponentExtractor,
  services:              Extractors::ServiceExtractor,
  jobs:                  Extractors::JobExtractor,
  mailers:               Extractors::MailerExtractor,
  serializers:           Extractors::SerializerExtractor,
  managers:              Extractors::ManagerExtractor,
  policies:              Extractors::PolicyExtractor,
  validators:            Extractors::ValidatorExtractor,
  concerns:              Extractors::ConcernExtractor,
  routes:                Extractors::RouteExtractor,
  middleware:            Extractors::MiddlewareExtractor,
  i18n:                  Extractors::I18nExtractor,
  pundit_policies:       Extractors::PunditExtractor,
  configurations:        Extractors::ConfigurationExtractor,
  engines:               Extractors::EngineExtractor,
  view_templates:        Extractors::ViewTemplateExtractor,
  migrations:            Extractors::MigrationExtractor,
  action_cable_channels: Extractors::ActionCableExtractor,
  scheduled_jobs:        Extractors::ScheduledJobExtractor,
  rake_tasks:            Extractors::RakeTaskExtractor,
  state_machines:        Extractors::StateMachineExtractor,
  events:                Extractors::EventExtractor,
  decorators:            Extractors::DecoratorExtractor,
  database_views:        Extractors::DatabaseViewExtractor,
  caching:               Extractors::CachingExtractor,
  factories:             Extractors::FactoryExtractor,
  test_mappings:         Extractors::TestMappingExtractor,
  rails_source:          Extractors::RailsSourceExtractor,
  poros:                 Extractors::PoroExtractor,
  libs:                  Extractors::LibExtractor
}.freeze
126
+
127
# Lookup from the singular unit type (as stored in ExtractedUnit/graph nodes)
# to the plural key under which its extractor is registered in EXTRACTORS.
# All four GraphQL unit types share the single :graphql key.
#
# @return [Hash{Symbol => Symbol}]
TYPE_TO_EXTRACTOR_KEY = {
  model: :models, controller: :controllers, service: :services,
  component: :components, view_component: :view_components,
  job: :jobs, mailer: :mailers,
  graphql_type: :graphql, graphql_mutation: :graphql,
  graphql_resolver: :graphql, graphql_query: :graphql,
  serializer: :serializers, manager: :managers, policy: :policies,
  validator: :validators, concern: :concerns, route: :routes,
  middleware: :middleware, i18n: :i18n, pundit_policy: :pundit_policies,
  configuration: :configurations, engine: :engines,
  view_template: :view_templates, migration: :migrations,
  action_cable_channel: :action_cable_channels,
  scheduled_job: :scheduled_jobs, rake_task: :rake_tasks,
  state_machine: :state_machines, event: :events, decorator: :decorators,
  database_view: :database_views, caching: :caching, factory: :factories,
  test_mapping: :test_mappings, rails_source: :rails_source,
  poro: :poros, lib: :libs
}.freeze
170
+
171
# Unit types handled by class-based extractor methods (constantize + call),
# mapped to the name of the method to invoke.
CLASS_BASED = {
  model: :extract_model,
  controller: :extract_controller,
  component: :extract_component,
  view_component: :extract_component,
  mailer: :extract_mailer,
  action_cable_channel: :extract_channel
}.freeze
177
+
178
# Unit types handled by file-based extractor methods (pass file_path),
# mapped to the name of the method to invoke.
FILE_BASED = {
  service: :extract_service_file,
  job: :extract_job_file,
  serializer: :extract_serializer_file,
  manager: :extract_manager_file,
  policy: :extract_policy_file,
  validator: :extract_validator_file,
  concern: :extract_concern_file,
  i18n: :extract_i18n_file,
  pundit_policy: :extract_pundit_file,
  configuration: :extract_configuration_file,
  view_template: :extract_view_template_file,
  migration: :extract_migration_file,
  rake_task: :extract_rake_file,
  decorator: :extract_decorator_file,
  database_view: :extract_view_file,
  caching: :extract_caching_file,
  test_mapping: :extract_test_file,
  poro: :extract_poro_file,
  lib: :extract_lib_file
}.freeze
197
+
198
# Every GraphQL unit type; all of them go through the same extractor method.
GRAPHQL_TYPES = %i[
  graphql_type
  graphql_mutation
  graphql_resolver
  graphql_query
].freeze
200
+
201
attr_reader :output_dir, :dependency_graph

# Build an extractor with an empty dependency graph and no results.
#
# @param output_dir [String, Pathname, nil] Directory that output is written
#   under; when nil, tmp/codebase_index beneath Rails.root is used
def initialize(output_dir: nil)
  destination = output_dir || Rails.root.join('tmp/codebase_index')

  @output_dir = Pathname.new(destination)
  @dependency_graph = DependencyGraph.new
  @results = {}
end
208
+
209
+ # ══════════════════════════════════════════════════════════════════════
210
+ # Full Extraction
211
+ # ══════════════════════════════════════════════════════════════════════
212
+
213
# Run the complete extraction pipeline over the codebase.
#
# Phases, in order: extract units, deduplicate, rebuild the dependency graph,
# resolve dependents, analyze the graph, optionally precompute request flows,
# enrich with git data, normalize file paths, and write every output file.
#
# @return [Hash] Results keyed by extractor type
def extract_all
  setup_output_directory
  ModelNameCache.reset!

  # Every extractor introspects loaded classes, so eager load once up front.
  safe_eager_load!

  # Phase 1: Extract all units
  concurrent = CodebaseIndex.configuration.concurrent_extraction
  concurrent ? extract_all_concurrent : extract_all_sequential

  # Phase 1.5: Deduplicate results
  Rails.logger.info '[CodebaseIndex] Deduplicating results...'
  deduplicate_results

  # Phase 1 registered every unit, duplicates included, and DependencyGraph
  # has no remove/unregister API — so start over from the deduped results.
  @dependency_graph = DependencyGraph.new
  @results.each_value do |units|
    units.each { |unit| @dependency_graph.register(unit) }
  end

  # Phase 2: Resolve dependents (reverse dependencies)
  Rails.logger.info '[CodebaseIndex] Resolving dependents...'
  resolve_dependents

  # Phase 3: Graph analysis (PageRank, structural metrics)
  Rails.logger.info '[CodebaseIndex] Analyzing dependency graph...'
  analyzer = GraphAnalyzer.new(@dependency_graph)
  @graph_analysis = analyzer.analyze

  # Phase 3.5: Precompute request flows (opt-in)
  if CodebaseIndex.configuration.precompute_flows
    Rails.logger.info '[CodebaseIndex] Precomputing request flows...'
    precompute_flows
  end

  # Phase 4: Enrich with git data
  Rails.logger.info '[CodebaseIndex] Enriching with git data...'
  enrich_with_git_data

  # Phase 4.5: Normalize file_path to relative paths
  Rails.logger.info '[CodebaseIndex] Normalizing file paths...'
  normalize_file_paths

  # Phase 5: Write output
  Rails.logger.info '[CodebaseIndex] Writing output...'
  write_results
  write_dependency_graph
  write_graph_analysis
  write_manifest
  write_structural_summary

  log_summary

  @results
end
273
+
274
+ # ══════════════════════════════════════════════════════════════════════
275
+ # Incremental Extraction
276
+ # ══════════════════════════════════════════════════════════════════════
277
+
278
# Re-extract only the units affected by a set of changed files.
# Intended for incremental indexing in CI.
#
# The previously written dependency_graph.json is loaded when present, the
# affected units are re-extracted, the type indexes they touch are
# regenerated, and the graph, manifest, and structural summary are rewritten.
#
# @param changed_files [Array<String>] List of changed file paths
# @return [Array<String>] List of re-extracted unit identifiers
def extract_changed(changed_files)
  # Load existing graph
  graph_path = @output_dir.join('dependency_graph.json')
  if graph_path.exist?
    serialized_graph = JSON.parse(File.read(graph_path))
    @dependency_graph = DependencyGraph.from_h(serialized_graph)
  end

  ModelNameCache.reset!

  # Eager load so that newly-added classes can be discovered.
  safe_eager_load!

  # git diff yields relative paths; file_map stores absolute ones.
  absolute_files = changed_files.map do |file|
    next file if Pathname.new(file).absolute?

    Rails.root.join(file).to_s
  end

  # Compute affected units
  affected_ids = @dependency_graph.affected_by(absolute_files)
  Rails.logger.info "[CodebaseIndex] #{changed_files.size} changed files affect #{affected_ids.size} units"

  # Re-extract affected units, collecting the types they belong to
  affected_types = Set.new
  affected_ids.each { |unit_id| re_extract_unit(unit_id, affected_types: affected_types) }

  # Regenerate type indexes for affected types
  affected_types.each { |type_key| regenerate_type_index(type_key) }

  # Update graph, manifest, and summary
  write_dependency_graph
  write_manifest
  write_structural_summary

  affected_ids
end
320
+
321
+ private
322
+
323
+ # ──────────────────────────────────────────────────────────────────────
324
+ # Eager Loading
325
+ # ──────────────────────────────────────────────────────────────────────
326
+
327
# Try a full eager_load!; on NameError, load directory by directory instead.
#
# One NameError (e.g., app/graphql/ referencing an uninstalled gem) aborts
# eager_load! as a whole. Zeitwerk processes dirs alphabetically, so
# graphql/ before models/ means models never load. The fallback loads only
# the directories that extraction actually needs.
def safe_eager_load!
  begin
    Rails.application.eager_load!
  rescue NameError => error
    Rails.logger.warn "[CodebaseIndex] eager_load! hit NameError: #{error.message}"
    Rails.logger.warn '[CodebaseIndex] Falling back to per-directory eager loading'
    eager_load_extraction_directories
  end
end
340
+
341
# Eager load each extraction-relevant app/ subdirectory on its own.
# Zeitwerk's eager_load_dir is used when the loader responds to it
# (Rails 7.1+/Zeitwerk 2.6+); otherwise every .rb file beneath the directory
# is required individually. NameError/LoadError are logged as warnings and do
# not stop the remaining directories (or files) from loading.
def eager_load_extraction_directories
  main_loader = Rails.autoloaders.main

  EXTRACTION_DIRECTORIES.each do |subdir|
    app_dir = Rails.root.join('app', subdir)
    next unless app_dir.exist?

    begin
      if main_loader.respond_to?(:eager_load_dir)
        main_loader.eager_load_dir(app_dir.to_s)
      else
        ruby_files = Dir.glob(app_dir.join('**/*.rb'))
        ruby_files.each do |file|
          begin
            require file
          rescue NameError, LoadError => file_error
            Rails.logger.warn "[CodebaseIndex] Skipped #{file}: #{file_error.message}"
          end
        end
      end
    rescue NameError, LoadError => dir_error
      Rails.logger.warn "[CodebaseIndex] Failed to eager load app/#{subdir}/: #{dir_error.message}"
    end
  end
end
366
+
367
+ # ──────────────────────────────────────────────────────────────────────
368
+ # Extraction Strategies
369
+ # ──────────────────────────────────────────────────────────────────────
370
+
371
# Run every registered extractor one after another, storing each result under
# its type and registering the units in the dependency graph as they arrive.
def extract_all_sequential
  EXTRACTORS.each_pair do |type, extractor_class|
    Rails.logger.info "[CodebaseIndex] Extracting #{type}..."
    started_at = Time.current

    units = extractor_class.new.extract_all
    @results[type] = units

    duration = (Time.current - started_at).round(2)
    Rails.logger.info "[CodebaseIndex] Extracted #{units.size} #{type} in #{duration}s"

    # Register in dependency graph
    units.each { |unit| @dependency_graph.register(unit) }
  end
end
388
+
389
# Run every extractor in a thread of its own, then register results in order.
#
# Thread safety notes:
# - ModelNameCache is warmed before any thread starts (avoids ||= race)
# - Each thread builds its own extractor instance (no shared mutable state)
# - Results are stored through a Mutex-protected Hash
# - DependencyGraph registration happens sequentially, after all joins
#
# An extractor that raises StandardError is logged and contributes an empty
# result for its type.
def extract_all_concurrent
  # Warm the lazily memoized caches. Without this, several threads calling
  # model_names at once could each trigger compute_model_names.
  ModelNameCache.model_names
  ModelNameCache.model_names_regex

  results_lock = Mutex.new
  workers = EXTRACTORS.map do |type, extractor_class|
    Thread.new do
      begin
        Rails.logger.info "[CodebaseIndex] [Thread] Extracting #{type}..."
        started_at = Time.current

        units = extractor_class.new.extract_all

        duration = (Time.current - started_at).round(2)
        Rails.logger.info "[CodebaseIndex] [Thread] Extracted #{units.size} #{type} in #{duration}s"

        results_lock.synchronize { @results[type] = units }
      rescue StandardError => e
        Rails.logger.error "[CodebaseIndex] [Thread] #{type} failed: #{e.message}"
        results_lock.synchronize { @results[type] = [] }
      end
    end
  end

  workers.each(&:join)

  # Register into dependency graph sequentially — DependencyGraph is not thread-safe
  EXTRACTORS.each_key do |type|
    units = @results[type] || []
    units.each { |unit| @dependency_graph.register(unit) }
  end
end
429
+
430
+ # ──────────────────────────────────────────────────────────────────────
431
+ # Setup
432
+ # ──────────────────────────────────────────────────────────────────────
433
+
434
# Create the output directory plus one subdirectory per EXTRACTORS key.
def setup_output_directory
  FileUtils.mkdir_p(@output_dir)

  EXTRACTORS.each_key do |type|
    type_dir = @output_dir.join(type.to_s)
    FileUtils.mkdir_p(type_dir)
  end
end
440
+
441
+ # ──────────────────────────────────────────────────────────────────────
442
+ # Dependency Resolution
443
+ # ──────────────────────────────────────────────────────────────────────
444
+
445
# Fill in reverse dependencies: for each dependency whose :target matches the
# identifier of an extracted unit, append the depending unit's type and
# identifier to that target's dependents list. Targets that match no unit are
# ignored.
def resolve_dependents
  all_units = @results.values.flatten
  units_by_id = all_units.to_h { |unit| [unit.identifier, unit] }

  all_units.each do |source|
    source.dependencies.each do |dependency|
      target = units_by_id[dependency[:target]]
      next unless target

      target.dependents ||= []
      target.dependents << { type: source.type, identifier: source.identifier }
    end
  end
end
462
+
463
# Drop units that repeat an identifier within the same type; the first
# occurrence wins. Duplicates arise when multiple extractors produce the same
# unit (e.g., engine-mounted routes duplicating app routes). Without dedup,
# downstream phases would produce inflated counts, duplicate _index.json
# entries, and last-writer-wins file overwrites. A warning is logged for each
# type that lost at least one unit.
def deduplicate_results
  @results.each_pair do |type, units|
    unique_units = units.uniq(&:identifier)
    duplicate_count = units.size - unique_units.size

    if duplicate_count.positive?
      Rails.logger.warn "[CodebaseIndex] Deduplicated #{type}: dropped #{duplicate_count} duplicate(s)"
    end

    @results[type] = unique_units
  end
end
477
+
478
+ # ──────────────────────────────────────────────────────────────────────
479
+ # Flow Precomputation
480
+ # ──────────────────────────────────────────────────────────────────────
481
+
482
# Precompute request flows over all extracted units and log how many were
# produced. Best-effort: any StandardError is logged and swallowed.
def precompute_flows
  precomputer = FlowPrecomputer.new(
    units: @results.values.flatten,
    graph: @dependency_graph,
    output_dir: @output_dir.to_s
  )
  flow_map = precomputer.precompute
  Rails.logger.info "[CodebaseIndex] Precomputed #{flow_map.size} request flows"
rescue StandardError => error
  Rails.logger.error "[CodebaseIndex] Flow precomputation failed: #{error.message}"
end
490
+
491
+ # ──────────────────────────────────────────────────────────────────────
492
+ # Git Enrichment
493
+ # ──────────────────────────────────────────────────────────────────────
494
+
495
# Attach git metadata to every extracted unit that has a file path.
#
# Does nothing when git is unavailable. Units stored under the :rails_source
# and :gem_source result keys are skipped. Git data is fetched in one batch
# for all files that exist on disk and stored under unit.metadata[:git] when
# an entry for the unit's path is returned.
#
# Paths are made relative by removing the Rails.root prefix only when the
# path starts with it; a path that merely contains the root string further
# in is left untouched instead of being mangled.
def enrich_with_git_data
  return unless git_available?

  skipped_types = %i[rails_source gem_source]
  root_prefix = "#{Rails.root}/"

  # Collect all file paths that need git data
  file_paths = []
  @results.each do |type, units|
    next if skipped_types.include?(type)

    units.each do |unit|
      path = unit.file_path
      file_paths << path if path && File.exist?(path)
    end
  end

  # Many units share a file; request each file's history once.
  git_data = batch_git_data(file_paths.uniq)

  # Assign results to units
  @results.each do |type, units|
    next if skipped_types.include?(type)

    units.each do |unit|
      next unless unit.file_path

      relative = unit.file_path.delete_prefix(root_prefix)
      git_info = git_data[relative]
      unit.metadata[:git] = git_info if git_info
    end
  end
end
524
+
525
# Rewrite every unit's file_path as a path relative to Rails.root.
#
# Extractors set file_path via source_location, which returns absolute paths.
# Normalizing gives consistent relative paths (e.g., "app/models/user.rb")
# across all environments (local, Docker, CI) where Rails.root differs.
#
# Must run after enrich_with_git_data, which needs absolute paths for
# File.exist? checks and git log commands.
def normalize_file_paths
  @results.each_value do |units|
    units.each do |unit|
      relative_path = normalize_file_path(unit.file_path)
      unit.file_path = relative_path
    end
  end
end
540
+
541
# Convert a path under Rails.root into a path relative to it.
#
# @param path [String, nil] Absolute or relative file path
# @return [String, nil] Relative path, or the original value if already relative,
#   nil, or not under Rails.root (e.g., a gem path)
def normalize_file_path(path)
  return path unless path

  root_prefix = Rails.root.to_s
  root_prefix = "#{root_prefix}/" unless root_prefix.end_with?('/')
  return path unless path.start_with?(root_prefix)

  path.delete_prefix(root_prefix)
end
553
+
554
# Report whether `git rev-parse --git-dir` succeeds.
#
# The answer is computed once and cached in @git_available (including a
# false answer); any StandardError raised while spawning git counts as
# "not available".
#
# @return [Boolean]
def git_available?
  unless defined?(@git_available)
    @git_available =
      begin
        Open3.capture2('git', 'rev-parse', '--git-dir').last.success?
      rescue StandardError
        false
      end
  end

  @git_available
end
564
+
565
+ # Safe git command execution — no shell interpolation
566
+ #
567
+ # @param args [Array<String>] Git command arguments
568
+ # @return [String] Command output (empty string on failure)
569
def run_git(*args)
  # Arguments are passed as an argv array, so no shell is involved.
  stdout, exit_status = Open3.capture2('git', *args)
  return '' unless exit_status.success?

  stdout.strip
rescue StandardError
  # Covers a missing git binary as well as any other spawn failure.
  ''
end
575
+
576
+ # Batch-fetch git data for all file paths, issuing one `git log` call per slice of 500 paths.
577
+ #
578
+ # @param file_paths [Array<String>] Absolute file paths
579
+ # @return [Hash{String => Hash}] Keyed by relative path
580
def batch_git_data(file_paths)
  return {} if file_paths.empty?

  root = "#{Rails.root}/"
  # Several units can share one file. Without uniq, a path that lands in two
  # different slices would have its commits appended once per slice.
  relative_paths = file_paths.map { |f| f.sub(root, '') }.uniq
  path_set = relative_paths.to_set
  result = relative_paths.to_h { |rp| [rp, {}] }

  relative_paths.each_slice(500) do |batch|
    # --relative makes git print names relative to the working directory
    # (the same form as the pathspecs) instead of relative to the repository
    # root, so the names still match when the app is a subdirectory of the repo.
    log_output = run_git(
      'log', '--all', '--name-only', '--relative',
      '--format=__COMMIT__%H|||%an|||%cI|||%s',
      '--since=365 days ago',
      '--', *batch
    )
    parse_git_log_output(log_output, path_set, result)
  end

  ninety_days_ago = (Time.current - 90.days).iso8601
  result.each do |relative_path, data|
    result[relative_path] = build_file_metadata(data, ninety_days_ago)
  end

  result
end
605
+
606
+ # Parse git log output line-by-line, populating result with per-file commit data.
607
# @param log_output [String] output of git log with the __COMMIT__ header format
# @param path_set [Set<String>] file names that should be recorded
# @param result [Hash{String => Hash}] accumulator, mutated in place
def parse_git_log_output(log_output, path_set, result)
  commit = nil

  log_output.each_line do |raw_line|
    text = raw_line.strip
    next if text.empty?

    # Header line: starts a new commit; the lines after it name its files.
    if text.start_with?('__COMMIT__')
      sha, author, date, message = text.delete_prefix('__COMMIT__').split('|||', 4)
      commit = { sha: sha, author: author, date: date, message: message }
      next
    end

    # File line: ignored until a commit header was seen, or when untracked.
    next unless commit && path_set.include?(text)

    entry = (result[text] ||= {})

    # The first commit seen for a file supplies last_modified / last_author.
    unless entry[:last_modified]
      entry[:last_modified] = commit[:date]
      entry[:last_author] = commit[:author]
    end

    entry[:commits] ||= []
    entry[:commits] << commit

    entry[:contributors] ||= Hash.new(0)
    entry[:contributors][commit[:author]] += 1
  end
end
628
+
629
+ # Classify how frequently a file changes based on commit counts.
630
# @param total_count [Integer] number of commits recorded for the file
# @param recent_count [Integer] how many of those fall in the recent window
# @return [Symbol] :new, :hot, :active, :stable or :dormant
def classify_change_frequency(total_count, recent_count)
  # Files with barely any history are "new" regardless of recent activity.
  return :new if total_count <= 2

  # Highest threshold first; the first one reached decides the label.
  thresholds = [[10, :hot], [3, :active], [1, :stable]]
  _, label = thresholds.find { |minimum, _| recent_count >= minimum }
  label || :dormant
end
643
+
644
+ # Build final metadata hash from raw commit data.
645
# @param data [Hash] raw per-file data with optional :commits, :contributors,
#   :last_modified and :last_author keys
# @param ninety_days_ago [String, Time] cutoff for "recent" commits; ISO-8601
#   strings are parsed so the comparison is made on instants, not on text
# @return [Hash] summary with :last_modified, :last_author, :commit_count,
#   :contributors (top 5), :recent_commits (first 5) and :change_frequency
def build_file_metadata(data, ninety_days_ago)
  all_commits = data[:commits] || []
  contributor_counts = data[:contributors] || {}

  # Commit dates carry each committer's own UTC offset, so comparing the
  # ISO-8601 strings lexicographically misclassifies commits near the cutoff.
  cutoff = parse_git_timestamp(ninety_days_ago)
  recent_count = all_commits.count do |c|
    committed_at = parse_git_timestamp(c[:date])
    cutoff && committed_at && committed_at > cutoff
  end

  {
    last_modified: data[:last_modified],
    last_author: data[:last_author],
    commit_count: all_commits.size,
    contributors: contributor_counts
      .sort_by { |_, count| -count }
      .first(5)
      .map { |name, count| { name: name, commits: count } },
    recent_commits: all_commits.first(5).map do |c|
      # slice keeps this independent of ActiveSupport's String#first.
      { sha: c[:sha]&.slice(0, 8), message: c[:message], date: c[:date], author: c[:author] }
    end,
    change_frequency: classify_change_frequency(all_commits.size, recent_count)
  }
end

# Convert an ISO-8601 string into a Time; non-strings pass through unchanged.
#
# @param value [String, Time, nil]
# @return [Time, nil] nil when the string is not valid ISO-8601
def parse_git_timestamp(value)
  return value unless value.is_a?(String)

  Time.iso8601(value)
rescue ArgumentError
  nil
end
664
+
665
+ # ──────────────────────────────────────────────────────────────────────
666
+ # Output Writers
667
+ # ──────────────────────────────────────────────────────────────────────
668
+
669
# Write one JSON file per unit plus an `_index.json` per type.
#
# Files go to @output_dir/<type>/; unit file names come from
# collision_safe_filename and all payloads are rendered by json_serialize.
#
# @return [Hash] @results
def write_results
  @results.each do |type, units|
    type_dir = @output_dir.join(type.to_s)

    units.each do |unit|
      target = type_dir.join(collision_safe_filename(unit.identifier))
      File.write(target, json_serialize(unit.to_h))
    end

    # Compact per-type listing so lookups need not open every unit file.
    index_rows = units.map do |unit|
      {
        identifier: unit.identifier,
        file_path: unit.file_path,
        namespace: unit.namespace,
        estimated_tokens: unit.estimated_tokens,
        chunk_count: unit.chunks.size
      }
    end

    File.write(type_dir.join('_index.json'), json_serialize(index_rows))
  end
end
697
+
698
# Serialize the dependency graph, with its pagerank scores added under
# :pagerank, to @output_dir/dependency_graph.json.
def write_dependency_graph
  payload = @dependency_graph.to_h
  payload.store(:pagerank, @dependency_graph.pagerank)

  destination = @output_dir.join('dependency_graph.json')
  File.write(destination, json_serialize(payload))
end
707
+
708
# Write @graph_analysis to @output_dir/graph_analysis.json.
# Does nothing when no analysis has been stored.
def write_graph_analysis
  analysis = @graph_analysis
  return unless analysis

  File.write(@output_dir.join('graph_analysis.json'), json_serialize(analysis))
end
716
+
717
# Write @output_dir/manifest.json describing this extraction run:
# timestamp, Rails/Ruby versions, per-type and total counts, the current
# git commit and branch, and digests used for change detection.
def write_manifest
  unit_lists = @results.values

  environment = {
    extracted_at: Time.current.iso8601,
    rails_version: Rails.version,
    ruby_version: RUBY_VERSION
  }

  totals = {
    counts: @results.transform_values(&:size),
    total_units: unit_lists.sum(&:size),
    total_chunks: unit_lists.flatten.sum { |unit| unit.chunks.size }
  }

  # run_git yields '' on failure; presence turns that into nil.
  git_info = {
    git_sha: run_git('rev-parse', 'HEAD').presence,
    git_branch: run_git('rev-parse', '--abbrev-ref', 'HEAD').presence
  }

  change_detection = {
    gemfile_lock_sha: gemfile_lock_sha,
    schema_sha: schema_sha
  }

  manifest = environment.merge(totals, git_info, change_detection)
  File.write(@output_dir.join('manifest.json'), json_serialize(manifest))
end
744
+
745
+ # Write a compact TOC-style summary of extracted units.
746
+ #
747
+ # Produces a SUMMARY.md under 8K tokens (~24KB) by listing one line per
748
+ # category with count and top-5 namespace breakdown, rather than enumerating
749
+ # every unit. Per-unit detail is available in the per-category _index.json files.
750
+ #
751
+ # @return [void]
752
def write_structural_summary
  return if @results.empty?

  unit_lists = @results.values
  total_units = unit_lists.sum(&:size)
  # A unit without chunks still counts as one chunk in the summary.
  total_chunks = unit_lists.flatten.sum { |unit| [unit.chunks.size, 1].max }
  category_count = @results.count { |_, units| units.any? }

  lines = [
    '# Codebase Index Summary',
    "Generated: #{Time.current.iso8601}",
    "Rails #{Rails.version} / Ruby #{RUBY_VERSION}",
    "Units: #{total_units} | Chunks: #{total_chunks} | Categories: #{category_count}",
    ''
  ]

  # One heading per non-empty category with its five largest namespaces.
  @results.each do |type, units|
    next if units.empty?

    lines << "## #{type.to_s.titleize} (#{units.size})"

    top_namespaces = units
                     .map { |unit| unit.namespace.nil? || unit.namespace.empty? ? '(root)' : unit.namespace }
                     .tally
                     .sort_by { |_, count| -count }
                     .first(5)

    breakdown = top_namespaces.map { |ns, count| "#{ns} #{count}" }
    lines << "Namespaces: #{breakdown.join(', ')}" unless breakdown.empty?
    lines << ''
  end

  lines << '## Dependency Overview'
  lines << ''

  stats = @dependency_graph.to_h[:stats]
  if stats
    lines << "- Total nodes: #{stats[:node_count]}"
    lines << "- Total edges: #{stats[:edge_count]}"
  end

  if @graph_analysis
    # Only hubs with more than 20 dependents are worth a mention.
    big_hubs = @graph_analysis[:hubs]&.select { |hub| hub[:dependent_count] > 20 }
    if big_hubs&.any?
      lines << "- Hub nodes (>20 dependents): #{big_hubs.map { |hub| hub[:identifier] }.join(', ')}"
    end
  end

  lines << ''

  File.write(@output_dir.join('SUMMARY.md'), lines.join("\n"))
end
807
+
808
# Rebuild `_index.json` for one type from the unit JSON files already on disk.
#
# @param type_key [Symbol, String] name of the type directory under @output_dir
# @return [Integer, nil] result of File.write, or nil when the directory is absent
def regenerate_type_index(type_key)
  type_dir = @output_dir.join(type_key.to_s)
  return unless type_dir.directory?

  # Glob relative to the directory (base:) so that glob metacharacters in
  # the output path itself ("[", "{", "*") are not treated as a pattern,
  # and sort so the index order does not depend on the platform.
  unit_files = Dir.glob('*.json', base: type_dir.to_s).sort

  index = unit_files.filter_map do |file_name|
    # The index file lives beside the unit files; never index it.
    next if file_name == '_index.json'

    data = JSON.parse(File.read(type_dir.join(file_name)))
    {
      identifier: data['identifier'],
      file_path: data['file_path'],
      namespace: data['namespace'],
      estimated_tokens: data['estimated_tokens'],
      chunk_count: (data['chunks'] || []).size
    }
  end

  File.write(
    type_dir.join('_index.json'),
    json_serialize(index)
  )
end
831
+
832
+ # ──────────────────────────────────────────────────────────────────────
833
+ # Helpers
834
+ # ──────────────────────────────────────────────────────────────────────
835
+
836
# SHA-256 hex digest of Gemfile.lock under Rails.root.
#
# @return [String, nil] nil when the lockfile does not exist
def gemfile_lock_sha
  lockfile = Rails.root.join('Gemfile.lock')
  Digest::SHA256.file(lockfile).hexdigest if lockfile.exist?
end
842
+
843
# SHA-256 hex digest of the database schema dump, used for change detection.
#
# db/schema.rb is preferred; db/structure.sql is used as a fallback so that
# applications dumping their schema as SQL also get a digest instead of nil.
#
# @return [String, nil] nil when neither schema file exists
def schema_sha
  schema_path = %w[db/schema.rb db/structure.sql]
                .map { |relative| Rails.root.join(relative) }
                .find(&:exist?)
  return nil unless schema_path

  Digest::SHA256.file(schema_path).hexdigest
end
849
+
850
+ # Generate a safe JSON filename from a unit identifier.
851
+ #
852
+ # @param identifier [String] Unit identifier (e.g., "Admin::UsersController")
853
+ # @return [String] Safe filename (e.g., "Admin__UsersController.json")
854
def safe_filename(identifier)
  # Namespace separators first, then every remaining character outside
  # letters, digits, underscore and hyphen.
  flattened = identifier.gsub('::', '__')
  sanitized = flattened.gsub(/[^a-zA-Z0-9_-]/, '_')
  "#{sanitized}.json"
end
857
+
858
+ # Generate a collision-safe JSON filename by appending a short digest.
859
+ # Unlike safe_filename, this guarantees distinct filenames even when two
860
+ # identifiers differ only in characters that safe_filename normalizes
861
+ # (e.g., "GET /foo/bar" vs "GET /foo_bar" both become "GET__foo_bar.json").
862
+ #
863
+ # @param identifier [String] Unit identifier
864
+ # @return [String] Collision-safe filename (e.g., "GET__foo_bar_a1b2c3d4.json")
865
def collision_safe_filename(identifier)
  sanitized = identifier.gsub('::', '__').gsub(/[^a-zA-Z0-9_-]/, '_')
  # The digest is taken from the untouched identifier, so names that
  # sanitize to the same base still end up in different files.
  short_digest = ::Digest::SHA256.hexdigest(identifier).slice(0, 8)
  [sanitized, '_', short_digest, '.json'].join
end
870
+
871
# Render data as JSON, pretty-printed when
# CodebaseIndex.configuration.pretty_json is truthy and compact otherwise.
#
# @param data [Object] JSON-serializable value
# @return [String]
def json_serialize(data)
  generator = CodebaseIndex.configuration.pretty_json ? :pretty_generate : :generate
  JSON.public_send(generator, data)
end
878
+
879
# Log an end-of-run report through Rails.logger at info level: a banner,
# one line per type with its unit count, the overall unit and chunk
# totals, and the output directory.
def log_summary
  unit_total = @results.values.sum(&:size)
  chunk_total = @results.values.flatten.sum { |unit| unit.chunks.size }

  banner = '[CodebaseIndex] ═══════════════════════════════════════════'
  per_type = @results.map { |type, units| "[CodebaseIndex] #{type}: #{units.size} units" }

  messages = [
    banner,
    '[CodebaseIndex] Extraction Complete',
    banner,
    *per_type,
    '[CodebaseIndex] ───────────────────────────────────────────',
    "[CodebaseIndex] Total: #{unit_total} units, #{chunk_total} chunks",
    "[CodebaseIndex] Output: #{@output_dir}",
    banner
  ]

  messages.map { |message| Rails.logger.info message }.last
end
894
+
895
+ # ──────────────────────────────────────────────────────────────────────
896
+ # Incremental Re-extraction
897
+ # ──────────────────────────────────────────────────────────────────────
898
+
899
# Re-extract a single unit and overwrite its JSON file.
#
# The unit's type and source file are looked up in the dependency graph.
# Framework units (ids starting with "rails/" or "gems/") are skipped, as
# are units whose node, source file, or extractor cannot be found.
#
# @param unit_id [String] identifier of the unit to refresh
# @param affected_types [#add, nil] optional collector that receives the
#   extractor key of every type that was rewritten
# @return [void]
def re_extract_unit(unit_id, affected_types: nil)
  # Framework source only changes on version updates
  if unit_id.start_with?('rails/', 'gems/')
    Rails.logger.debug "[CodebaseIndex] Skipping framework re-extraction for #{unit_id}"
    return
  end

  # Find the unit's type from the graph
  node = @dependency_graph.to_h[:nodes][unit_id]
  return unless node

  type = node[:type]&.to_sym
  file_path = node[:file_path]
  return unless file_path

  # Graph paths may be relative to Rails.root; resolve them against it
  # rather than the process working directory. Absolute paths are unchanged.
  file_path = Rails.root.join(file_path).to_s
  return unless File.exist?(file_path)

  # Re-extract based on type
  extractor_key = TYPE_TO_EXTRACTOR_KEY[type]
  return unless extractor_key

  extractor = EXTRACTORS[extractor_key]&.new
  return unless extractor

  unit = if (extract_method = CLASS_BASED[type])
           # Only constantize ids that look like a constant path.
           klass = if unit_id.match?(/\A[A-Z][A-Za-z0-9_:]*\z/)
                     begin
                       unit_id.constantize
                     rescue StandardError
                       nil
                     end
                   end
           extractor.public_send(extract_method, klass) if klass
         elsif (extract_method = FILE_BASED[type])
           extractor.public_send(extract_method, file_path)
         elsif GRAPHQL_TYPES.include?(type)
           extractor.extract_graphql_file(file_path)
         end

  return unless unit

  # Update dependency graph
  @dependency_graph.register(unit)

  # Track which type was affected
  affected_types&.add(extractor_key)

  # Store the Rails.root-relative path so the rewritten file stays
  # consistent with units written after normalize_file_paths.
  unit.file_path = normalize_file_path(unit.file_path)

  # Write updated unit
  type_dir = @output_dir.join(extractor_key.to_s)

  File.write(
    type_dir.join(collision_safe_filename(unit.identifier)),
    json_serialize(unit.to_h)
  )

  Rails.logger.info "[CodebaseIndex] Re-extracted #{unit_id}"
end
955
+ end
956
+ end