codebase_index 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +29 -0
- data/CODE_OF_CONDUCT.md +83 -0
- data/CONTRIBUTING.md +65 -0
- data/LICENSE.txt +21 -0
- data/README.md +481 -0
- data/exe/codebase-console-mcp +22 -0
- data/exe/codebase-index-mcp +61 -0
- data/exe/codebase-index-mcp-http +64 -0
- data/exe/codebase-index-mcp-start +58 -0
- data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
- data/lib/codebase_index/ast/method_extractor.rb +76 -0
- data/lib/codebase_index/ast/node.rb +88 -0
- data/lib/codebase_index/ast/parser.rb +653 -0
- data/lib/codebase_index/ast.rb +6 -0
- data/lib/codebase_index/builder.rb +137 -0
- data/lib/codebase_index/chunking/chunk.rb +84 -0
- data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
- data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
- data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
- data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
- data/lib/codebase_index/console/audit_logger.rb +75 -0
- data/lib/codebase_index/console/bridge.rb +170 -0
- data/lib/codebase_index/console/confirmation.rb +90 -0
- data/lib/codebase_index/console/connection_manager.rb +173 -0
- data/lib/codebase_index/console/console_response_renderer.rb +78 -0
- data/lib/codebase_index/console/model_validator.rb +81 -0
- data/lib/codebase_index/console/safe_context.rb +82 -0
- data/lib/codebase_index/console/server.rb +557 -0
- data/lib/codebase_index/console/sql_validator.rb +172 -0
- data/lib/codebase_index/console/tools/tier1.rb +118 -0
- data/lib/codebase_index/console/tools/tier2.rb +117 -0
- data/lib/codebase_index/console/tools/tier3.rb +110 -0
- data/lib/codebase_index/console/tools/tier4.rb +79 -0
- data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
- data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
- data/lib/codebase_index/cost_model/estimator.rb +128 -0
- data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
- data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
- data/lib/codebase_index/cost_model.rb +22 -0
- data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
- data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
- data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
- data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
- data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
- data/lib/codebase_index/db/migrator.rb +71 -0
- data/lib/codebase_index/db/schema_version.rb +73 -0
- data/lib/codebase_index/dependency_graph.rb +227 -0
- data/lib/codebase_index/embedding/indexer.rb +130 -0
- data/lib/codebase_index/embedding/openai.rb +105 -0
- data/lib/codebase_index/embedding/provider.rb +135 -0
- data/lib/codebase_index/embedding/text_preparer.rb +112 -0
- data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
- data/lib/codebase_index/evaluation/evaluator.rb +146 -0
- data/lib/codebase_index/evaluation/metrics.rb +79 -0
- data/lib/codebase_index/evaluation/query_set.rb +148 -0
- data/lib/codebase_index/evaluation/report_generator.rb +90 -0
- data/lib/codebase_index/extracted_unit.rb +145 -0
- data/lib/codebase_index/extractor.rb +956 -0
- data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
- data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
- data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
- data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
- data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
- data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
- data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
- data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
- data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
- data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
- data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
- data/lib/codebase_index/extractors/event_extractor.rb +211 -0
- data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
- data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
- data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
- data/lib/codebase_index/extractors/job_extractor.rb +369 -0
- data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
- data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
- data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
- data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
- data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
- data/lib/codebase_index/extractors/model_extractor.rb +960 -0
- data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
- data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
- data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
- data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
- data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
- data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
- data/lib/codebase_index/extractors/route_extractor.rb +181 -0
- data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
- data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
- data/lib/codebase_index/extractors/service_extractor.rb +254 -0
- data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
- data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
- data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
- data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
- data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
- data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
- data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
- data/lib/codebase_index/feedback/gap_detector.rb +89 -0
- data/lib/codebase_index/feedback/store.rb +119 -0
- data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
- data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
- data/lib/codebase_index/flow_assembler.rb +290 -0
- data/lib/codebase_index/flow_document.rb +191 -0
- data/lib/codebase_index/flow_precomputer.rb +102 -0
- data/lib/codebase_index/formatting/base.rb +40 -0
- data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
- data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
- data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
- data/lib/codebase_index/formatting/human_adapter.rb +78 -0
- data/lib/codebase_index/graph_analyzer.rb +374 -0
- data/lib/codebase_index/mcp/index_reader.rb +394 -0
- data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
- data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
- data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
- data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
- data/lib/codebase_index/mcp/server.rb +935 -0
- data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
- data/lib/codebase_index/model_name_cache.rb +51 -0
- data/lib/codebase_index/notion/client.rb +217 -0
- data/lib/codebase_index/notion/exporter.rb +219 -0
- data/lib/codebase_index/notion/mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
- data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
- data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
- data/lib/codebase_index/notion/rate_limiter.rb +68 -0
- data/lib/codebase_index/observability/health_check.rb +81 -0
- data/lib/codebase_index/observability/instrumentation.rb +34 -0
- data/lib/codebase_index/observability/structured_logger.rb +75 -0
- data/lib/codebase_index/operator/error_escalator.rb +81 -0
- data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
- data/lib/codebase_index/operator/status_reporter.rb +80 -0
- data/lib/codebase_index/railtie.rb +26 -0
- data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
- data/lib/codebase_index/resilience/index_validator.rb +185 -0
- data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
- data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
- data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
- data/lib/codebase_index/retrieval/ranker.rb +273 -0
- data/lib/codebase_index/retrieval/search_executor.rb +327 -0
- data/lib/codebase_index/retriever.rb +160 -0
- data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
- data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
- data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
- data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
- data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
- data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
- data/lib/codebase_index/ruby_analyzer.rb +87 -0
- data/lib/codebase_index/session_tracer/file_store.rb +111 -0
- data/lib/codebase_index/session_tracer/middleware.rb +143 -0
- data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
- data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
- data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
- data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
- data/lib/codebase_index/session_tracer/store.rb +67 -0
- data/lib/codebase_index/storage/graph_store.rb +120 -0
- data/lib/codebase_index/storage/metadata_store.rb +169 -0
- data/lib/codebase_index/storage/pgvector.rb +163 -0
- data/lib/codebase_index/storage/qdrant.rb +172 -0
- data/lib/codebase_index/storage/vector_store.rb +156 -0
- data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
- data/lib/codebase_index/version.rb +5 -0
- data/lib/codebase_index.rb +223 -0
- data/lib/generators/codebase_index/install_generator.rb +32 -0
- data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
- data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
- data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
- data/lib/tasks/codebase_index.rake +583 -0
- data/lib/tasks/codebase_index_evaluation.rake +115 -0
- metadata +252 -0
|
@@ -0,0 +1,956 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'digest'
|
|
5
|
+
require 'fileutils'
|
|
6
|
+
require 'open3'
|
|
7
|
+
require 'pathname'
|
|
8
|
+
require 'set'
|
|
9
|
+
|
|
10
|
+
require_relative 'extracted_unit'
|
|
11
|
+
require_relative 'dependency_graph'
|
|
12
|
+
require_relative 'extractors/model_extractor'
|
|
13
|
+
require_relative 'extractors/controller_extractor'
|
|
14
|
+
require_relative 'extractors/phlex_extractor'
|
|
15
|
+
require_relative 'extractors/service_extractor'
|
|
16
|
+
require_relative 'extractors/job_extractor'
|
|
17
|
+
require_relative 'extractors/mailer_extractor'
|
|
18
|
+
require_relative 'extractors/graphql_extractor'
|
|
19
|
+
require_relative 'extractors/serializer_extractor'
|
|
20
|
+
require_relative 'extractors/rails_source_extractor'
|
|
21
|
+
require_relative 'extractors/view_component_extractor'
|
|
22
|
+
require_relative 'extractors/manager_extractor'
|
|
23
|
+
require_relative 'extractors/policy_extractor'
|
|
24
|
+
require_relative 'extractors/validator_extractor'
|
|
25
|
+
require_relative 'extractors/concern_extractor'
|
|
26
|
+
require_relative 'extractors/route_extractor'
|
|
27
|
+
require_relative 'extractors/middleware_extractor'
|
|
28
|
+
require_relative 'extractors/i18n_extractor'
|
|
29
|
+
require_relative 'extractors/pundit_extractor'
|
|
30
|
+
require_relative 'extractors/configuration_extractor'
|
|
31
|
+
require_relative 'extractors/engine_extractor'
|
|
32
|
+
require_relative 'extractors/view_template_extractor'
|
|
33
|
+
require_relative 'extractors/migration_extractor'
|
|
34
|
+
require_relative 'extractors/action_cable_extractor'
|
|
35
|
+
require_relative 'extractors/scheduled_job_extractor'
|
|
36
|
+
require_relative 'extractors/rake_task_extractor'
|
|
37
|
+
require_relative 'extractors/state_machine_extractor'
|
|
38
|
+
require_relative 'extractors/event_extractor'
|
|
39
|
+
require_relative 'extractors/decorator_extractor'
|
|
40
|
+
require_relative 'extractors/database_view_extractor'
|
|
41
|
+
require_relative 'extractors/caching_extractor'
|
|
42
|
+
require_relative 'extractors/factory_extractor'
|
|
43
|
+
require_relative 'extractors/test_mapping_extractor'
|
|
44
|
+
require_relative 'extractors/poro_extractor'
|
|
45
|
+
require_relative 'extractors/lib_extractor'
|
|
46
|
+
require_relative 'graph_analyzer'
|
|
47
|
+
require_relative 'model_name_cache'
|
|
48
|
+
require_relative 'flow_precomputer'
|
|
49
|
+
|
|
50
|
+
module CodebaseIndex
|
|
51
|
+
# Extractor is the main orchestrator for codebase extraction.
|
|
52
|
+
#
|
|
53
|
+
# It coordinates all individual extractors, builds the dependency graph,
|
|
54
|
+
# enriches with git data, and outputs structured JSON for the indexing pipeline.
|
|
55
|
+
#
|
|
56
|
+
# @example Full extraction
|
|
57
|
+
# extractor = Extractor.new(output_dir: "tmp/codebase_index")
|
|
58
|
+
# results = extractor.extract_all
|
|
59
|
+
#
|
|
60
|
+
# @example Incremental extraction (for CI)
|
|
61
|
+
# extractor = Extractor.new
|
|
62
|
+
# extractor.extract_changed(["app/models/user.rb", "app/services/checkout.rb"])
|
|
63
|
+
#
|
|
64
|
+
class Extractor
|
|
65
|
+
# Directories under app/ that contain classes we need to extract.
|
|
66
|
+
# Used by eager_load_extraction_directories as a fallback when
|
|
67
|
+
# Rails.application.eager_load! fails (e.g., NameError from graphql/).
|
|
68
|
+
# Each name is resolved against app/ (Rails.root.join('app', name)) by
# eager_load_extraction_directories; names without a matching directory are skipped there.
EXTRACTION_DIRECTORIES = %w[
  models controllers services jobs mailers
  components interactors operations commands use_cases
  serializers decorators blueprinters managers policies
  validators channels presenters form_objects
].freeze
|
|
89
|
+
|
|
90
|
+
# Registry of extractor classes keyed by plural type.
#
# Each key is also used as the name of that type's output subdirectory
# (see setup_output_directory), and insertion order is the order in which
# extract_all_sequential runs the extractors — do not reorder casually.
EXTRACTORS = {
  models:                Extractors::ModelExtractor,
  controllers:           Extractors::ControllerExtractor,
  graphql:               Extractors::GraphQLExtractor,
  components:            Extractors::PhlexExtractor,
  view_components:       Extractors::ViewComponentExtractor,
  services:              Extractors::ServiceExtractor,
  jobs:                  Extractors::JobExtractor,
  mailers:               Extractors::MailerExtractor,
  serializers:           Extractors::SerializerExtractor,
  managers:              Extractors::ManagerExtractor,
  policies:              Extractors::PolicyExtractor,
  validators:            Extractors::ValidatorExtractor,
  concerns:              Extractors::ConcernExtractor,
  routes:                Extractors::RouteExtractor,
  middleware:            Extractors::MiddlewareExtractor,
  i18n:                  Extractors::I18nExtractor,
  pundit_policies:       Extractors::PunditExtractor,
  configurations:        Extractors::ConfigurationExtractor,
  engines:               Extractors::EngineExtractor,
  view_templates:        Extractors::ViewTemplateExtractor,
  migrations:            Extractors::MigrationExtractor,
  action_cable_channels: Extractors::ActionCableExtractor,
  scheduled_jobs:        Extractors::ScheduledJobExtractor,
  rake_tasks:            Extractors::RakeTaskExtractor,
  state_machines:        Extractors::StateMachineExtractor,
  events:                Extractors::EventExtractor,
  decorators:            Extractors::DecoratorExtractor,
  database_views:        Extractors::DatabaseViewExtractor,
  caching:               Extractors::CachingExtractor,
  factories:             Extractors::FactoryExtractor,
  test_mappings:         Extractors::TestMappingExtractor,
  rails_source:          Extractors::RailsSourceExtractor,
  poros:                 Extractors::PoroExtractor,
  libs:                  Extractors::LibExtractor
}.freeze
|
|
126
|
+
|
|
127
|
+
# Maps singular unit types (as stored in ExtractedUnit/graph nodes)
|
|
128
|
+
# to the plural keys used in the EXTRACTORS constant.
|
|
129
|
+
#
|
|
130
|
+
# @return [Hash{Symbol => Symbol}]
|
|
131
|
+
TYPE_TO_EXTRACTOR_KEY = {
  model:                :models,
  controller:           :controllers,
  service:              :services,
  component:            :components,
  view_component:       :view_components,
  job:                  :jobs,
  mailer:               :mailers,
  # Every GraphQL unit type shares the single :graphql extractor key.
  graphql_type:         :graphql,
  graphql_mutation:     :graphql,
  graphql_resolver:     :graphql,
  graphql_query:        :graphql,
  serializer:           :serializers,
  manager:              :managers,
  policy:               :policies,
  validator:            :validators,
  concern:              :concerns,
  route:                :routes,
  middleware:           :middleware,
  i18n:                 :i18n,
  pundit_policy:        :pundit_policies,
  configuration:        :configurations,
  engine:               :engines,
  view_template:        :view_templates,
  migration:            :migrations,
  action_cable_channel: :action_cable_channels,
  scheduled_job:        :scheduled_jobs,
  rake_task:            :rake_tasks,
  state_machine:        :state_machines,
  event:                :events,
  decorator:            :decorators,
  database_view:        :database_views,
  caching:              :caching,
  factory:              :factories,
  test_mapping:         :test_mappings,
  rails_source:         :rails_source,
  poro:                 :poros,
  lib:                  :libs
}.freeze
|
|
170
|
+
|
|
171
|
+
# Maps unit types to class-based extractor methods (constantize + call).
|
|
172
|
+
CLASS_BASED = {
  model:                :extract_model,
  controller:           :extract_controller,
  # Both component flavours map to the same method name.
  component:            :extract_component,
  view_component:       :extract_component,
  mailer:               :extract_mailer,
  action_cable_channel: :extract_channel
}.freeze
|
|
177
|
+
|
|
178
|
+
# Maps unit types to file-based extractor methods (pass file_path).
|
|
179
|
+
FILE_BASED = {
  service:       :extract_service_file,
  job:           :extract_job_file,
  serializer:    :extract_serializer_file,
  manager:       :extract_manager_file,
  policy:        :extract_policy_file,
  validator:     :extract_validator_file,
  concern:       :extract_concern_file,
  i18n:          :extract_i18n_file,
  pundit_policy: :extract_pundit_file,
  configuration: :extract_configuration_file,
  view_template: :extract_view_template_file,
  migration:     :extract_migration_file,
  rake_task:     :extract_rake_file,
  decorator:     :extract_decorator_file,
  database_view: :extract_view_file,
  caching:       :extract_caching_file,
  test_mapping:  :extract_test_file,
  poro:          :extract_poro_file,
  lib:           :extract_lib_file
}.freeze
|
|
197
|
+
|
|
198
|
+
# GraphQL types all use the same extractor method.
|
|
199
|
+
GRAPHQL_TYPES = %i[
  graphql_type
  graphql_mutation
  graphql_resolver
  graphql_query
].freeze
|
|
200
|
+
|
|
201
|
+
attr_reader :output_dir, :dependency_graph
|
|
202
|
+
|
|
203
|
+
# Set up an extractor with an empty dependency graph and no results yet.
#
# @param output_dir [String, Pathname, nil] where extraction output is written;
#   when nil, tmp/codebase_index under Rails.root is used
def initialize(output_dir: nil)
  destination = output_dir || Rails.root.join('tmp/codebase_index')

  @output_dir = Pathname.new(destination)
  @dependency_graph = DependencyGraph.new
  @results = {}
end
|
|
208
|
+
|
|
209
|
+
# ══════════════════════════════════════════════════════════════════════
|
|
210
|
+
# Full Extraction
|
|
211
|
+
# ══════════════════════════════════════════════════════════════════════
|
|
212
|
+
|
|
213
|
+
# Perform full extraction of the codebase
|
|
214
|
+
#
|
|
215
|
+
# @return [Hash] Results keyed by extractor type
|
|
216
|
+
def extract_all
  setup_output_directory
  ModelNameCache.reset!

  # Load application classes a single time up front; the extractors run afterwards.
  safe_eager_load!

  # Phase 1: run every extractor, threaded or serial depending on configuration.
  use_threads = CodebaseIndex.configuration.concurrent_extraction
  use_threads ? extract_all_concurrent : extract_all_sequential

  # Phase 1.5: drop units that repeat an identifier within a type.
  Rails.logger.info '[CodebaseIndex] Deduplicating results...'
  deduplicate_results

  # Phase 1 registered every unit, duplicates included, so the graph is rebuilt
  # from scratch using only the units that survived deduplication.
  @dependency_graph = DependencyGraph.new
  @results.each_value do |units|
    units.each do |unit|
      @dependency_graph.register(unit)
    end
  end

  # Phase 2: fill in reverse dependencies.
  Rails.logger.info '[CodebaseIndex] Resolving dependents...'
  resolve_dependents

  # Phase 3: analyze the rebuilt graph and keep the result for the writers below.
  Rails.logger.info '[CodebaseIndex] Analyzing dependency graph...'
  @graph_analysis = GraphAnalyzer.new(@dependency_graph).analyze

  # Phase 3.5: request-flow precomputation runs only when the configuration enables it.
  if CodebaseIndex.configuration.precompute_flows
    Rails.logger.info '[CodebaseIndex] Precomputing request flows...'
    precompute_flows
  end

  # Phase 4: attach git metadata to units.
  Rails.logger.info '[CodebaseIndex] Enriching with git data...'
  enrich_with_git_data

  # Phase 4.5: rewrite unit file paths as relative paths.
  Rails.logger.info '[CodebaseIndex] Normalizing file paths...'
  normalize_file_paths

  # Phase 5: persist everything.
  Rails.logger.info '[CodebaseIndex] Writing output...'
  write_results
  write_dependency_graph
  write_graph_analysis
  write_manifest
  write_structural_summary

  log_summary

  @results
end
|
|
273
|
+
|
|
274
|
+
# ══════════════════════════════════════════════════════════════════════
|
|
275
|
+
# Incremental Extraction
|
|
276
|
+
# ══════════════════════════════════════════════════════════════════════
|
|
277
|
+
|
|
278
|
+
# Extract only units affected by changed files
|
|
279
|
+
# Used for incremental indexing in CI
|
|
280
|
+
#
|
|
281
|
+
# @param changed_files [Array<String>] List of changed file paths
|
|
282
|
+
# @return [Array<String>] List of re-extracted unit identifiers
|
|
283
|
+
def extract_changed(changed_files)
  # Start from the graph persisted by an earlier run, when the file is present;
  # otherwise the graph already held by this instance is used as-is.
  persisted_graph = @output_dir.join('dependency_graph.json')
  if persisted_graph.exist?
    @dependency_graph = DependencyGraph.from_h(JSON.parse(File.read(persisted_graph)))
  end

  ModelNameCache.reset!

  # Load classes so that files added since the last run can be discovered.
  safe_eager_load!

  # Relative paths are anchored at Rails.root; absolute paths pass through untouched.
  absolute_files = changed_files.map do |path|
    next path if Pathname.new(path).absolute?

    Rails.root.join(path).to_s
  end

  # Ask the graph which units the changed files touch.
  affected_ids = @dependency_graph.affected_by(absolute_files)
  Rails.logger.info "[CodebaseIndex] #{changed_files.size} changed files affect #{affected_ids.size} units"

  # Re-extract each affected unit. The set is handed to re_extract_unit,
  # which presumably records the type keys it touches (helper not visible here).
  affected_types = Set.new
  affected_ids.each { |unit_id| re_extract_unit(unit_id, affected_types: affected_types) }

  # Rebuild the per-type index for every type collected above.
  affected_types.each { |type_key| regenerate_type_index(type_key) }

  # Persist the updated graph, manifest, and structural summary.
  write_dependency_graph
  write_manifest
  write_structural_summary

  affected_ids
end
|
|
320
|
+
|
|
321
|
+
private
|
|
322
|
+
|
|
323
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
324
|
+
# Eager Loading
|
|
325
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
326
|
+
|
|
327
|
+
# Attempt eager_load!, falling back to per-directory loading on NameError.
|
|
328
|
+
#
|
|
329
|
+
# A single NameError (e.g., app/graphql/ referencing an uninstalled gem)
|
|
330
|
+
# aborts eager_load! entirely. Zeitwerk processes dirs alphabetically,
|
|
331
|
+
# so graphql/ before models/ means models never load. The fallback
|
|
332
|
+
# loads only the directories we actually need for extraction.
|
|
333
|
+
def safe_eager_load!
  Rails.application.eager_load!
rescue NameError => error
  # Only NameError triggers the fallback; any other exception propagates.
  logger = Rails.logger
  logger.warn "[CodebaseIndex] eager_load! hit NameError: #{error.message}"
  logger.warn '[CodebaseIndex] Falling back to per-directory eager loading'
  eager_load_extraction_directories
end
|
|
340
|
+
|
|
341
|
+
# Load classes from each extraction-relevant app/ subdirectory individually.
|
|
342
|
+
# Uses Zeitwerk's eager_load_dir when available (Rails 7.1+/Zeitwerk 2.6+),
|
|
343
|
+
# otherwise falls back to Dir.glob + require.
|
|
344
|
+
def eager_load_extraction_directories
  autoloader = Rails.autoloaders.main

  EXTRACTION_DIRECTORIES.each do |name|
    app_dir = Rails.root.join('app', name)
    next unless app_dir.exist?

    begin
      if autoloader.respond_to?(:eager_load_dir)
        autoloader.eager_load_dir(app_dir.to_s)
      else
        # No eager_load_dir on this loader: require files one at a time so a
        # single broken file is logged and skipped instead of ending the directory.
        Dir.glob(app_dir.join('**/*.rb')).each do |file|
          begin
            require file
          rescue NameError, LoadError => e
            Rails.logger.warn "[CodebaseIndex] Skipped #{file}: #{e.message}"
          end
        end
      end
    rescue NameError, LoadError => e
      # A failure here abandons only this directory; the loop moves on to the next one.
      Rails.logger.warn "[CodebaseIndex] Failed to eager load app/#{name}/: #{e.message}"
    end
  end
end
|
|
366
|
+
|
|
367
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
368
|
+
# Extraction Strategies
|
|
369
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
370
|
+
|
|
371
|
+
# Run every registered extractor in turn on the calling thread.
#
# For each type the extracted units are stored in @results and registered in
# the dependency graph. There is no rescue here, so an exception raised by any
# extractor propagates to the caller and stops the remaining extractors.
def extract_all_sequential
  EXTRACTORS.each_pair do |type, extractor_class|
    Rails.logger.info "[CodebaseIndex] Extracting #{type}..."
    started_at = Time.current

    extracted = extractor_class.new.extract_all
    @results[type] = extracted

    duration = Time.current - started_at
    Rails.logger.info "[CodebaseIndex] Extracted #{extracted.size} #{type} in #{duration.round(2)}s"

    # Make the freshly extracted units known to the dependency graph.
    extracted.each do |unit|
      @dependency_graph.register(unit)
    end
  end
end
|
|
388
|
+
|
|
389
|
+
# Run each extractor in its own thread, then register results sequentially.
|
|
390
|
+
#
|
|
391
|
+
# Thread safety notes:
|
|
392
|
+
# - ModelNameCache is pre-computed before threads start (avoids ||= race)
|
|
393
|
+
# - Each thread gets its own extractor instance (no shared mutable state)
|
|
394
|
+
# - Results collected via Mutex-protected Hash
|
|
395
|
+
# - DependencyGraph registration is sequential (post-join)
|
|
396
|
+
def extract_all_concurrent
  # Touch both ModelNameCache lookups on this thread before any worker starts,
  # so the workers do not race to compute them lazily.
  ModelNameCache.model_names
  ModelNameCache.model_names_regex

  lock = Mutex.new
  workers = EXTRACTORS.map do |type, extractor_class|
    Thread.new(type, extractor_class) do |unit_type, klass|
      begin
        Rails.logger.info "[CodebaseIndex] [Thread] Extracting #{unit_type}..."
        started_at = Time.current

        extracted = klass.new.extract_all

        duration = Time.current - started_at
        Rails.logger.info "[CodebaseIndex] [Thread] Extracted #{extracted.size} #{unit_type} in #{duration.round(2)}s"

        lock.synchronize { @results[unit_type] = extracted }
      rescue StandardError => error
        # A failing extractor is logged and recorded as an empty result;
        # the other workers keep running.
        Rails.logger.error "[CodebaseIndex] [Thread] #{unit_type} failed: #{error.message}"
        lock.synchronize { @results[unit_type] = [] }
      end
    end
  end

  workers.each(&:join)

  # Graph registration happens serially, after every worker has finished,
  # so the graph is only ever touched from the calling thread.
  EXTRACTORS.each_key do |type|
    collected = @results[type] || []
    collected.each { |unit| @dependency_graph.register(unit) }
  end
end
|
|
429
|
+
|
|
430
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
431
|
+
# Setup
|
|
432
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
433
|
+
|
|
434
|
+
# Create the output directory plus one subdirectory per EXTRACTORS key.
# mkdir_p leaves directories that already exist untouched.
def setup_output_directory
  type_dirs = EXTRACTORS.keys.map { |type| @output_dir.join(type.to_s) }

  [@output_dir, *type_dirs].each { |dir| FileUtils.mkdir_p(dir) }
end
|
|
440
|
+
|
|
441
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
442
|
+
# Dependency Resolution
|
|
443
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
444
|
+
|
|
445
|
+
# Populate reverse dependencies: for every dependency whose :target matches the
# identifier of an extracted unit, append { type:, identifier: } of the depending
# unit to the target's dependents list. Targets with no matching unit are ignored.
def resolve_dependents
  units = @results.values.flatten
  by_identifier = units.index_by(&:identifier)

  units.each do |source|
    source.dependencies.each do |dependency|
      target = by_identifier[dependency[:target]]
      next unless target

      target.dependents ||= []
      target.dependents << { type: source.type, identifier: source.identifier }
    end
  end
end
|
|
462
|
+
|
|
463
|
+
# Remove duplicate units (same identifier) within each type, keeping the first occurrence.
|
|
464
|
+
# Duplicates arise when multiple extractors produce the same unit (e.g., engine-mounted
|
|
465
|
+
# routes duplicating app routes). Without dedup, downstream phases would produce inflated
|
|
466
|
+
# counts, duplicate _index.json entries, and last-writer-wins file overwrites.
|
|
467
|
+
def deduplicate_results
  @results.each_key do |type|
    collected = @results[type]
    # uniq keeps the first unit seen for each identifier.
    unique_units = collected.uniq { |unit| unit.identifier }
    removed = collected.size - unique_units.size

    if removed.positive?
      Rails.logger.warn "[CodebaseIndex] Deduplicated #{type}: dropped #{removed} duplicate(s)"
    end

    # Re-assigning an existing key is safe while iterating the hash.
    @results[type] = unique_units
  end
end
|
|
477
|
+
|
|
478
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
479
|
+
# Flow Precomputation
|
|
480
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
481
|
+
|
|
482
|
+
# Run FlowPrecomputer over every extracted unit and log how many flows it produced.
#
# Best-effort step: any StandardError raised while building or running the
# precomputer is logged at error level and swallowed.
def precompute_flows
  precomputer = FlowPrecomputer.new(
    units: @results.values.flatten,
    graph: @dependency_graph,
    output_dir: @output_dir.to_s
  )
  flows = precomputer.precompute
  Rails.logger.info "[CodebaseIndex] Precomputed #{flows.size} request flows"
rescue StandardError => e
  Rails.logger.error "[CodebaseIndex] Flow precomputation failed: #{e.message}"
end
|
|
490
|
+
|
|
491
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
492
|
+
# Git Enrichment
|
|
493
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
494
|
+
|
|
495
|
+
# Attach git history metadata to every non-framework unit.
#
# Does nothing when git is unavailable. Units under the :rails_source and
# :gem_source types are skipped. For the remaining units, the files that
# exist on disk are collected, their history is fetched in one batched
# call, and the per-file result is stored in `unit.metadata[:git]`.
#
# Several units often live in the same file, so the path list is
# de-duplicated before it is handed to batch_git_data. The Rails.root
# prefix is removed with delete_prefix (anchored at the start) rather than
# a first-occurrence substitution, so a path outside Rails.root that merely
# contains the root string is left untouched.
#
# @return [Hash, nil] @results, or nil when git is unavailable
def enrich_with_git_data
  return unless git_available?

  framework_types = %i[rails_source gem_source]
  units = @results.reject { |type, _units| framework_types.include?(type) }.values.flatten

  # Collect each existing file once, however many units point at it.
  file_paths = units.filter_map do |unit|
    unit.file_path if unit.file_path && File.exist?(unit.file_path)
  end.uniq

  # Batch-fetch all git data in minimal subprocess calls
  git_data = batch_git_data(file_paths)
  root = "#{Rails.root}/"

  # Assign results to units (git data is keyed by path relative to Rails.root)
  units.each do |unit|
    next unless unit.file_path

    relative = unit.file_path.delete_prefix(root)
    unit.metadata[:git] = git_data[relative] if git_data[relative]
  end

  @results
end
|
|
524
|
+
|
|
525
|
+
# Rewrite every unit's file_path as a path relative to Rails.root.
#
# Extractors record absolute paths; converting them here keeps the output
# stable across environments (local, Docker, CI) where Rails.root differs.
#
# Must run after enrich_with_git_data, which still needs the absolute
# paths for its File.exist? checks.
def normalize_file_paths
  @results.each_value do |units|
    units.each { |unit| unit.file_path = normalize_file_path(unit.file_path) }
  end
end
|
|
540
|
+
|
|
541
|
+
# Convert a path under Rails.root into a path relative to it.
#
# @param path [String, nil] Absolute or relative file path
# @return [String, nil] The path with the "<Rails.root>/" prefix removed; the
#   input is returned unchanged when it is nil, already relative, or not
#   located under Rails.root (e.g., a gem path)
def normalize_file_path(path)
  return path unless path

  prefix = Rails.root.to_s
  prefix = "#{prefix}/" unless prefix.end_with?('/')
  return path unless path.start_with?(prefix)

  path.sub(prefix, '')
end
|
|
553
|
+
|
|
554
|
+
# Whether `git rev-parse --git-dir` succeeds in the current process.
#
# The answer is computed once and memoized in @git_available (including a
# false result). Any StandardError while spawning git counts as "not available".
#
# @return [Boolean]
def git_available?
  unless defined?(@git_available)
    @git_available =
      begin
        _stdout, exit_status = Open3.capture2('git', 'rev-parse', '--git-dir')
        exit_status.success?
      rescue StandardError
        false
      end
  end

  @git_available
end
|
|
564
|
+
|
|
565
|
+
# Run a git subcommand without going through a shell.
#
# Arguments are passed to Open3.capture2 as separate argv entries, so no
# shell interpolation takes place.
#
# @param args [Array<String>] Git command arguments
# @return [String] Stripped stdout, or an empty string when git exits
#   non-zero or a StandardError is raised while running it
def run_git(*args)
  stdout, exit_status = Open3.capture2('git', *args)
  return '' unless exit_status.success?

  stdout.strip
rescue StandardError
  ''
end
|
|
575
|
+
|
|
576
|
+
# Batch-fetch git history for many files, one `git log` call per slice of
# 500 paths.
#
# Paths are made relative to Rails.root and de-duplicated before slicing.
# Without the de-duplication, a path that shows up in two different slices
# would have its commits parsed (and counted) twice.
#
# @param file_paths [Array<String>] Absolute file paths
# @return [Hash{String => Hash}] Keyed by relative path; each value is the
#   metadata hash returned by build_file_metadata
def batch_git_data(file_paths)
  return {} if file_paths.empty?

  root = "#{Rails.root}/"
  # delete_prefix only strips a leading root, unlike a first-occurrence sub.
  relative_paths = file_paths.map { |path| path.delete_prefix(root) }.uniq
  path_set = relative_paths.to_set # built once, shared by every slice
  result = relative_paths.to_h { |relative_path| [relative_path, {}] }

  relative_paths.each_slice(500) do |batch|
    log_output = run_git(
      'log', '--all', '--name-only',
      '--format=__COMMIT__%H|||%an|||%cI|||%s',
      '--since=365 days ago',
      '--', *batch
    )
    parse_git_log_output(log_output, path_set, result)
  end

  # Replace the raw per-file commit data with the final metadata hash.
  ninety_days_ago = (Time.current - 90.days).iso8601
  result.each do |relative_path, data|
    result[relative_path] = build_file_metadata(data, ninety_days_ago)
  end

  result
end
|
|
605
|
+
|
|
606
|
+
# Walk `git log --name-only` output and accumulate per-file commit data.
#
# Lines starting with "__COMMIT__" open a new commit; the rest of the line
# is split on "|||" into sha, author, date and message (at most four
# parts, so the message may itself contain "|||"). Every other non-blank
# line is treated as a file name and recorded only when it is a member of
# path_set and a commit header has already been seen.
#
# For each matching file, `result[path]` gains:
#   :last_modified / :last_author - from the first commit seen for the file
#   :commits                      - commit hashes, in log order
#   :contributors                 - author => number of commits
#
# @param log_output [String] Raw git log output
# @param path_set [Set<String>] Relative paths of interest
# @param result [Hash{String => Hash}] Accumulator, mutated in place
def parse_git_log_output(log_output, path_set, result)
  commit = nil
  marker = '__COMMIT__'

  log_output.each_line do |raw_line|
    text = raw_line.strip
    next if text.empty?

    if text.start_with?(marker)
      sha, author, date, message = text.sub(marker, '').split('|||', 4)
      commit = { sha: sha, author: author, date: date, message: message }
      next
    end

    next unless commit && path_set.include?(text)

    file_entry = (result[text] ||= {})
    unless file_entry[:last_modified]
      file_entry[:last_modified] = commit[:date]
      file_entry[:last_author] = commit[:author]
    end

    file_entry[:commits] ||= []
    file_entry[:commits] << commit

    file_entry[:contributors] ||= Hash.new(0)
    file_entry[:contributors][commit[:author]] += 1
  end
end
|
|
628
|
+
|
|
629
|
+
# Bucket a file by how often it changes.
#
# Rules are checked in order; the first match wins:
#   total_count <= 2    => :new
#   recent_count >= 10  => :hot
#   recent_count >= 3   => :active
#   recent_count >= 1   => :stable
#   otherwise           => :dormant
#
# @param total_count [Integer] Number of commits seen for the file
# @param recent_count [Integer] Number of those commits counted as recent
# @return [Symbol]
def classify_change_frequency(total_count, recent_count)
  return :new if total_count <= 2
  return :hot if recent_count >= 10
  return :active if recent_count >= 3
  return :stable if recent_count >= 1

  :dormant
end
|
|
643
|
+
|
|
644
|
+
# Build the final per-file metadata hash from raw commit data.
#
# Commit dates come from git's %cI format and carry the committer's UTC
# offset, while the cutoff is rendered in the application's zone. The two
# are therefore parsed and compared as instants; comparing the raw strings
# would misclassify commits near the cutoff whenever the offsets differ.
# If either value cannot be parsed as ISO-8601, the comparison falls back
# to the plain string ordering.
#
# @param data [Hash] Raw data with optional :commits, :contributors,
#   :last_modified and :last_author keys
# @param ninety_days_ago [String] ISO-8601 cutoff for "recent" commits
# @return [Hash] :last_modified, :last_author, :commit_count, :contributors
#   (top 5 by commit count), :recent_commits (first 5, sha cut to 8 chars)
#   and :change_frequency
def build_file_metadata(data, ninety_days_ago)
  require 'time' # Time.iso8601 lives in the stdlib "time" extension; no-op once loaded

  all_commits = data[:commits] || []
  contributor_counts = data[:contributors] || {}

  parse_stamp = lambda do |stamp|
    Time.iso8601(stamp)
  rescue ArgumentError, TypeError
    nil
  end
  cutoff = parse_stamp.call(ninety_days_ago)

  recent_count = all_commits.count do |c|
    next false unless c[:date]

    committed_at = cutoff && parse_stamp.call(c[:date])
    committed_at ? committed_at > cutoff : c[:date] > ninety_days_ago
  end

  {
    last_modified: data[:last_modified],
    last_author: data[:last_author],
    commit_count: all_commits.size,
    contributors: contributor_counts
      .sort_by { |_, count| -count }
      .first(5)
      .map { |name, count| { name: name, commits: count } },
    recent_commits: all_commits.first(5).map do |c|
      # slice is core Ruby; same result as the ActiveSupport-only String#first
      { sha: c[:sha]&.slice(0, 8), message: c[:message], date: c[:date], author: c[:author] }
    end,
    change_frequency: classify_change_frequency(all_commits.size, recent_count)
  }
end
|
|
664
|
+
|
|
665
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
666
|
+
# Output Writers
|
|
667
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
668
|
+
|
|
669
|
+
# Write one JSON file per unit plus a per-type _index.json.
#
# Unit files go to <output_dir>/<type>/ under a collision-safe filename.
# The index lists identifier, file_path, namespace, estimated_tokens and
# chunk_count for each unit of the type, for fast lookups.
def write_results
  @results.each do |type, units|
    type_dir = @output_dir.join(type.to_s)

    units.each do |unit|
      unit_path = type_dir.join(collision_safe_filename(unit.identifier))
      File.write(unit_path, json_serialize(unit.to_h))
    end

    index_rows = units.map do |unit|
      {
        identifier: unit.identifier,
        file_path: unit.file_path,
        namespace: unit.namespace,
        estimated_tokens: unit.estimated_tokens,
        chunk_count: unit.chunks.size
      }
    end

    File.write(type_dir.join('_index.json'), json_serialize(index_rows))
  end
end
|
|
697
|
+
|
|
698
|
+
# Serialize the dependency graph, with PageRank scores added under
# :pagerank, to <output_dir>/dependency_graph.json.
def write_dependency_graph
  # The :pagerank key is added to the very hash that to_h returned.
  payload = @dependency_graph.to_h.tap do |graph|
    graph[:pagerank] = @dependency_graph.pagerank
  end

  File.write(@output_dir.join('dependency_graph.json'), json_serialize(payload))
end
|
|
707
|
+
|
|
708
|
+
# Write @graph_analysis to <output_dir>/graph_analysis.json.
# Does nothing when no analysis has been computed.
def write_graph_analysis
  return unless @graph_analysis

  destination = @output_dir.join('graph_analysis.json')
  File.write(destination, json_serialize(@graph_analysis))
end
|
|
716
|
+
|
|
717
|
+
# Write <output_dir>/manifest.json describing this extraction run.
#
# The manifest records when the run happened, the Rails and Ruby versions,
# unit counts per type and in total, the total chunk count, the current git
# SHA and branch (nil when unavailable), and SHA-256 digests of
# Gemfile.lock and db/schema.rb for change detection.
def write_manifest
  manifest = {}

  manifest[:extracted_at] = Time.current.iso8601
  manifest[:rails_version] = Rails.version
  manifest[:ruby_version] = RUBY_VERSION

  # Counts by type
  manifest[:counts] = @results.transform_values(&:size)

  # Total stats
  manifest[:total_units] = @results.values.sum(&:size)
  manifest[:total_chunks] = @results.values.flatten.sum { |unit| unit.chunks.size }

  # Git info
  manifest[:git_sha] = run_git('rev-parse', 'HEAD').presence
  manifest[:git_branch] = run_git('rev-parse', '--abbrev-ref', 'HEAD').presence

  # For change detection
  manifest[:gemfile_lock_sha] = gemfile_lock_sha
  manifest[:schema_sha] = schema_sha

  File.write(@output_dir.join('manifest.json'), json_serialize(manifest))
end
|
|
744
|
+
|
|
745
|
+
# Write SUMMARY.md, a compact table-of-contents view of the extraction.
#
# The file is kept small (the design target is under 8K tokens, ~24KB) by
# emitting one section per non-empty category with its unit count and its
# five largest namespaces, instead of listing every unit. Per-unit detail
# lives in each category's _index.json. A closing section reports
# dependency graph node/edge totals and any hub with more than 20 dependents.
#
# Nothing is written when there are no results.
#
# @return [void]
def write_structural_summary
  return if @results.empty?

  unit_total = @results.values.sum(&:size)
  # A unit without chunks still counts as one chunk here.
  chunk_total = @results.values.flatten.sum { |unit| [unit.chunks.size, 1].max }
  populated_categories = @results.count { |_, units| units.any? }

  lines = [
    '# Codebase Index Summary',
    "Generated: #{Time.current.iso8601}",
    "Rails #{Rails.version} / Ruby #{RUBY_VERSION}",
    "Units: #{unit_total} | Chunks: #{chunk_total} | Categories: #{populated_categories}",
    ''
  ]

  @results.each do |type, units|
    next if units.empty?

    lines << "## #{type.to_s.titleize} (#{units.size})"

    top_namespaces = units
                     .group_by { |unit| unit.namespace.nil? || unit.namespace.empty? ? '(root)' : unit.namespace }
                     .transform_values(&:size)
                     .sort_by { |_, count| -count }
                     .first(5)

    labels = top_namespaces.map { |namespace, count| "#{namespace} #{count}" }
    lines << "Namespaces: #{labels.join(', ')}" unless labels.empty?
    lines << ''
  end

  lines << '## Dependency Overview'
  lines << ''

  stats = @dependency_graph.to_h[:stats]
  if stats
    lines << "- Total nodes: #{stats[:node_count]}"
    lines << "- Total edges: #{stats[:edge_count]}"
  end

  if @graph_analysis
    big_hubs = @graph_analysis[:hubs]&.select { |hub| hub[:dependent_count] > 20 }
    if big_hubs&.any?
      lines << "- Hub nodes (>20 dependents): #{big_hubs.map { |hub| hub[:identifier] }.join(', ')}"
    end
  end

  lines << ''

  File.write(@output_dir.join('SUMMARY.md'), lines.join("\n"))
end
|
|
807
|
+
|
|
808
|
+
# Rebuild <output_dir>/<type_key>/_index.json from the unit JSON files
# already on disk.
#
# Files are found with Pathname#glob, which matches the pattern relative to
# the directory. Glob metacharacters in the output path itself ("[", "{",
# "*", "?") are therefore not interpreted; building one pattern string from
# the full path would silently match nothing in that case. Entries are
# sorted by filename so the index order is the same on every Ruby version.
#
# @param type_key [Symbol, String] Name of the type subdirectory
# @return [Integer, nil] Bytes written, or nil when the directory does not exist
def regenerate_type_index(type_key)
  type_dir = @output_dir.join(type_key.to_s)
  return unless type_dir.directory?

  # Scan existing unit JSON files (exclude _index.json)
  unit_files = type_dir.glob('*.json')
                       .reject { |file| file.basename.to_s == '_index.json' }
                       .sort

  index = unit_files.map do |file|
    data = JSON.parse(file.read)
    {
      identifier: data['identifier'],
      file_path: data['file_path'],
      namespace: data['namespace'],
      estimated_tokens: data['estimated_tokens'],
      chunk_count: (data['chunks'] || []).size
    }
  end

  File.write(
    type_dir.join('_index.json'),
    json_serialize(index)
  )
end
|
|
831
|
+
|
|
832
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
833
|
+
# Helpers
|
|
834
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
835
|
+
|
|
836
|
+
# SHA-256 hex digest of <Rails.root>/Gemfile.lock.
#
# @return [String, nil] nil when the file does not exist
def gemfile_lock_sha
  lockfile = Rails.root.join('Gemfile.lock')
  Digest::SHA256.file(lockfile).hexdigest if lockfile.exist?
end
|
|
842
|
+
|
|
843
|
+
# SHA-256 hex digest of <Rails.root>/db/schema.rb.
#
# @return [String, nil] nil when the file does not exist
def schema_sha
  schema_file = Rails.root.join('db/schema.rb')
  Digest::SHA256.file(schema_file).hexdigest if schema_file.exist?
end
|
|
849
|
+
|
|
850
|
+
# Turn a unit identifier into a filesystem-safe JSON filename.
#
# "::" becomes "__"; every remaining character outside [a-zA-Z0-9_-] becomes
# "_". Distinct identifiers can therefore map to the same filename.
#
# @param identifier [String] Unit identifier (e.g., "Admin::UsersController")
# @return [String] Safe filename (e.g., "Admin__UsersController.json")
def safe_filename(identifier)
  stem = identifier.gsub('::', '__')
  stem = stem.gsub(/[^a-zA-Z0-9_-]/, '_')
  "#{stem}.json"
end
|
|
857
|
+
|
|
858
|
+
# Build a JSON filename that stays distinct for distinct identifiers.
#
# The readable part is sanitized the same way as safe_filename, which maps
# different identifiers onto the same text (e.g., "GET /foo/bar" and
# "GET /foo_bar" both become "GET__foo_bar"). To tell them apart, the first
# 8 hex characters of the SHA-256 of the original identifier are appended.
#
# @param identifier [String] Unit identifier
# @return [String] Collision-safe filename (e.g., "GET__foo_bar_a1b2c3d4.json")
def collision_safe_filename(identifier)
  readable = identifier.gsub('::', '__')
  readable = readable.gsub(/[^a-zA-Z0-9_-]/, '_')
  fingerprint = ::Digest::SHA256.hexdigest(identifier)[0, 8]

  "#{readable}_#{fingerprint}.json"
end
|
|
870
|
+
|
|
871
|
+
# Serialize data to JSON, pretty-printed when
# CodebaseIndex.configuration.pretty_json is truthy and compact otherwise.
#
# @param data [Object] JSON-serializable value
# @return [String]
def json_serialize(data)
  return JSON.pretty_generate(data) if CodebaseIndex.configuration.pretty_json

  JSON.generate(data)
end
|
|
878
|
+
|
|
879
|
+
# Log a banner summarizing the run at info level: unit count per type,
# total units and chunks, and the output directory.
def log_summary
  unit_total = @results.values.sum(&:size)
  chunk_total = @results.values.flatten.sum { |unit| unit.chunks.size }

  heavy_rule = '[CodebaseIndex] ═══════════════════════════════════════════'
  light_rule = '[CodebaseIndex] ───────────────────────────────────────────'
  log = Rails.logger

  log.info heavy_rule
  log.info '[CodebaseIndex] Extraction Complete'
  log.info heavy_rule
  @results.each { |type, units| log.info "[CodebaseIndex] #{type}: #{units.size} units" }
  log.info light_rule
  log.info "[CodebaseIndex] Total: #{unit_total} units, #{chunk_total} chunks"
  log.info "[CodebaseIndex] Output: #{@output_dir}"
  log.info heavy_rule
end
|
|
894
|
+
|
|
895
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
896
|
+
# Incremental Re-extraction
|
|
897
|
+
# ──────────────────────────────────────────────────────────────────────
|
|
898
|
+
|
|
899
|
+
# Re-extract a single unit and overwrite its JSON file on disk.
#
# The unit's type and file path are read from the dependency graph node for
# unit_id. The method returns early, without writing anything, when:
#   - unit_id starts with "rails/" or "gems/" (framework source),
#   - the graph has no node for unit_id,
#   - the node has no file_path or the file does not exist,
#   - the type maps to no extractor key / extractor class,
#   - or the extractor yields no unit.
#
# On success the new unit is registered in the dependency graph, written to
# <output_dir>/<extractor_key>/, and the extractor key is added to
# affected_types when a collection was supplied.
#
# @param unit_id [String] Identifier of the unit to refresh
# @param affected_types [#add, nil] Optional collection that receives the
#   extractor key of the re-extracted unit (presumably a Set - confirm with callers)
# @return [void]
def re_extract_unit(unit_id, affected_types: nil)
  # Framework source only changes on version updates
  if unit_id.start_with?('rails/') || unit_id.start_with?('gems/')
    Rails.logger.debug "[CodebaseIndex] Skipping framework re-extraction for #{unit_id}"
    return
  end

  # Find the unit's type from the graph
  node = @dependency_graph.to_h[:nodes][unit_id]
  return unless node

  type = node[:type]&.to_sym
  file_path = node[:file_path]

  # NOTE(review): File.exist? resolves a relative path against the process
  # working directory - confirm whether graph nodes store absolute paths.
  return unless file_path && File.exist?(file_path)

  # Re-extract based on type
  extractor_key = TYPE_TO_EXTRACTOR_KEY[type]
  return unless extractor_key

  extractor = EXTRACTORS[extractor_key]&.new
  return unless extractor

  # Three dispatch styles: by constant (CLASS_BASED), by file path
  # (FILE_BASED), or the GraphQL file extractor. Any other type leaves unit nil.
  unit = if (method = CLASS_BASED[type])
           # Only constantize identifiers that look like a Ruby constant path;
           # lookup failures are swallowed and treated as "no class".
           klass = if unit_id.match?(/\A[A-Z][A-Za-z0-9_:]*\z/)
                     begin
                       unit_id.constantize
                     rescue StandardError
                       nil
                     end
                   end
           extractor.public_send(method, klass) if klass
         elsif (method = FILE_BASED[type])
           extractor.public_send(method, file_path)
         elsif GRAPHQL_TYPES.include?(type)
           extractor.extract_graphql_file(file_path)
         end

  return unless unit

  # Update dependency graph
  @dependency_graph.register(unit)

  # Track which type was affected
  affected_types&.add(extractor_key)

  # Write updated unit
  type_dir = @output_dir.join(extractor_key.to_s)

  File.write(
    type_dir.join(collision_safe_filename(unit.identifier)),
    json_serialize(unit.to_h)
  )

  Rails.logger.info "[CodebaseIndex] Re-extracted #{unit_id}"
end
|
|
955
|
+
end
|
|
956
|
+
end
|