codebase_index 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +29 -0
  3. data/CODE_OF_CONDUCT.md +83 -0
  4. data/CONTRIBUTING.md +65 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +481 -0
  7. data/exe/codebase-console-mcp +22 -0
  8. data/exe/codebase-index-mcp +61 -0
  9. data/exe/codebase-index-mcp-http +64 -0
  10. data/exe/codebase-index-mcp-start +58 -0
  11. data/lib/codebase_index/ast/call_site_extractor.rb +106 -0
  12. data/lib/codebase_index/ast/method_extractor.rb +76 -0
  13. data/lib/codebase_index/ast/node.rb +88 -0
  14. data/lib/codebase_index/ast/parser.rb +653 -0
  15. data/lib/codebase_index/ast.rb +6 -0
  16. data/lib/codebase_index/builder.rb +137 -0
  17. data/lib/codebase_index/chunking/chunk.rb +84 -0
  18. data/lib/codebase_index/chunking/semantic_chunker.rb +290 -0
  19. data/lib/codebase_index/console/adapters/cache_adapter.rb +58 -0
  20. data/lib/codebase_index/console/adapters/good_job_adapter.rb +66 -0
  21. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +66 -0
  22. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +66 -0
  23. data/lib/codebase_index/console/audit_logger.rb +75 -0
  24. data/lib/codebase_index/console/bridge.rb +170 -0
  25. data/lib/codebase_index/console/confirmation.rb +90 -0
  26. data/lib/codebase_index/console/connection_manager.rb +173 -0
  27. data/lib/codebase_index/console/console_response_renderer.rb +78 -0
  28. data/lib/codebase_index/console/model_validator.rb +81 -0
  29. data/lib/codebase_index/console/safe_context.rb +82 -0
  30. data/lib/codebase_index/console/server.rb +557 -0
  31. data/lib/codebase_index/console/sql_validator.rb +172 -0
  32. data/lib/codebase_index/console/tools/tier1.rb +118 -0
  33. data/lib/codebase_index/console/tools/tier2.rb +117 -0
  34. data/lib/codebase_index/console/tools/tier3.rb +110 -0
  35. data/lib/codebase_index/console/tools/tier4.rb +79 -0
  36. data/lib/codebase_index/coordination/pipeline_lock.rb +109 -0
  37. data/lib/codebase_index/cost_model/embedding_cost.rb +88 -0
  38. data/lib/codebase_index/cost_model/estimator.rb +128 -0
  39. data/lib/codebase_index/cost_model/provider_pricing.rb +67 -0
  40. data/lib/codebase_index/cost_model/storage_cost.rb +52 -0
  41. data/lib/codebase_index/cost_model.rb +22 -0
  42. data/lib/codebase_index/db/migrations/001_create_units.rb +38 -0
  43. data/lib/codebase_index/db/migrations/002_create_edges.rb +35 -0
  44. data/lib/codebase_index/db/migrations/003_create_embeddings.rb +37 -0
  45. data/lib/codebase_index/db/migrations/004_create_snapshots.rb +45 -0
  46. data/lib/codebase_index/db/migrations/005_create_snapshot_units.rb +40 -0
  47. data/lib/codebase_index/db/migrator.rb +71 -0
  48. data/lib/codebase_index/db/schema_version.rb +73 -0
  49. data/lib/codebase_index/dependency_graph.rb +227 -0
  50. data/lib/codebase_index/embedding/indexer.rb +130 -0
  51. data/lib/codebase_index/embedding/openai.rb +105 -0
  52. data/lib/codebase_index/embedding/provider.rb +135 -0
  53. data/lib/codebase_index/embedding/text_preparer.rb +112 -0
  54. data/lib/codebase_index/evaluation/baseline_runner.rb +115 -0
  55. data/lib/codebase_index/evaluation/evaluator.rb +146 -0
  56. data/lib/codebase_index/evaluation/metrics.rb +79 -0
  57. data/lib/codebase_index/evaluation/query_set.rb +148 -0
  58. data/lib/codebase_index/evaluation/report_generator.rb +90 -0
  59. data/lib/codebase_index/extracted_unit.rb +145 -0
  60. data/lib/codebase_index/extractor.rb +956 -0
  61. data/lib/codebase_index/extractors/action_cable_extractor.rb +228 -0
  62. data/lib/codebase_index/extractors/ast_source_extraction.rb +46 -0
  63. data/lib/codebase_index/extractors/behavioral_profile.rb +309 -0
  64. data/lib/codebase_index/extractors/caching_extractor.rb +261 -0
  65. data/lib/codebase_index/extractors/callback_analyzer.rb +232 -0
  66. data/lib/codebase_index/extractors/concern_extractor.rb +253 -0
  67. data/lib/codebase_index/extractors/configuration_extractor.rb +219 -0
  68. data/lib/codebase_index/extractors/controller_extractor.rb +494 -0
  69. data/lib/codebase_index/extractors/database_view_extractor.rb +278 -0
  70. data/lib/codebase_index/extractors/decorator_extractor.rb +260 -0
  71. data/lib/codebase_index/extractors/engine_extractor.rb +204 -0
  72. data/lib/codebase_index/extractors/event_extractor.rb +211 -0
  73. data/lib/codebase_index/extractors/factory_extractor.rb +289 -0
  74. data/lib/codebase_index/extractors/graphql_extractor.rb +917 -0
  75. data/lib/codebase_index/extractors/i18n_extractor.rb +117 -0
  76. data/lib/codebase_index/extractors/job_extractor.rb +369 -0
  77. data/lib/codebase_index/extractors/lib_extractor.rb +249 -0
  78. data/lib/codebase_index/extractors/mailer_extractor.rb +339 -0
  79. data/lib/codebase_index/extractors/manager_extractor.rb +202 -0
  80. data/lib/codebase_index/extractors/middleware_extractor.rb +133 -0
  81. data/lib/codebase_index/extractors/migration_extractor.rb +469 -0
  82. data/lib/codebase_index/extractors/model_extractor.rb +960 -0
  83. data/lib/codebase_index/extractors/phlex_extractor.rb +252 -0
  84. data/lib/codebase_index/extractors/policy_extractor.rb +214 -0
  85. data/lib/codebase_index/extractors/poro_extractor.rb +246 -0
  86. data/lib/codebase_index/extractors/pundit_extractor.rb +223 -0
  87. data/lib/codebase_index/extractors/rails_source_extractor.rb +473 -0
  88. data/lib/codebase_index/extractors/rake_task_extractor.rb +343 -0
  89. data/lib/codebase_index/extractors/route_extractor.rb +181 -0
  90. data/lib/codebase_index/extractors/scheduled_job_extractor.rb +331 -0
  91. data/lib/codebase_index/extractors/serializer_extractor.rb +334 -0
  92. data/lib/codebase_index/extractors/service_extractor.rb +254 -0
  93. data/lib/codebase_index/extractors/shared_dependency_scanner.rb +91 -0
  94. data/lib/codebase_index/extractors/shared_utility_methods.rb +99 -0
  95. data/lib/codebase_index/extractors/state_machine_extractor.rb +398 -0
  96. data/lib/codebase_index/extractors/test_mapping_extractor.rb +225 -0
  97. data/lib/codebase_index/extractors/validator_extractor.rb +225 -0
  98. data/lib/codebase_index/extractors/view_component_extractor.rb +310 -0
  99. data/lib/codebase_index/extractors/view_template_extractor.rb +261 -0
  100. data/lib/codebase_index/feedback/gap_detector.rb +89 -0
  101. data/lib/codebase_index/feedback/store.rb +119 -0
  102. data/lib/codebase_index/flow_analysis/operation_extractor.rb +209 -0
  103. data/lib/codebase_index/flow_analysis/response_code_mapper.rb +154 -0
  104. data/lib/codebase_index/flow_assembler.rb +290 -0
  105. data/lib/codebase_index/flow_document.rb +191 -0
  106. data/lib/codebase_index/flow_precomputer.rb +102 -0
  107. data/lib/codebase_index/formatting/base.rb +40 -0
  108. data/lib/codebase_index/formatting/claude_adapter.rb +98 -0
  109. data/lib/codebase_index/formatting/generic_adapter.rb +56 -0
  110. data/lib/codebase_index/formatting/gpt_adapter.rb +64 -0
  111. data/lib/codebase_index/formatting/human_adapter.rb +78 -0
  112. data/lib/codebase_index/graph_analyzer.rb +374 -0
  113. data/lib/codebase_index/mcp/index_reader.rb +394 -0
  114. data/lib/codebase_index/mcp/renderers/claude_renderer.rb +81 -0
  115. data/lib/codebase_index/mcp/renderers/json_renderer.rb +17 -0
  116. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +352 -0
  117. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +240 -0
  118. data/lib/codebase_index/mcp/server.rb +935 -0
  119. data/lib/codebase_index/mcp/tool_response_renderer.rb +62 -0
  120. data/lib/codebase_index/model_name_cache.rb +51 -0
  121. data/lib/codebase_index/notion/client.rb +217 -0
  122. data/lib/codebase_index/notion/exporter.rb +219 -0
  123. data/lib/codebase_index/notion/mapper.rb +39 -0
  124. data/lib/codebase_index/notion/mappers/column_mapper.rb +65 -0
  125. data/lib/codebase_index/notion/mappers/migration_mapper.rb +39 -0
  126. data/lib/codebase_index/notion/mappers/model_mapper.rb +164 -0
  127. data/lib/codebase_index/notion/rate_limiter.rb +68 -0
  128. data/lib/codebase_index/observability/health_check.rb +81 -0
  129. data/lib/codebase_index/observability/instrumentation.rb +34 -0
  130. data/lib/codebase_index/observability/structured_logger.rb +75 -0
  131. data/lib/codebase_index/operator/error_escalator.rb +81 -0
  132. data/lib/codebase_index/operator/pipeline_guard.rb +99 -0
  133. data/lib/codebase_index/operator/status_reporter.rb +80 -0
  134. data/lib/codebase_index/railtie.rb +26 -0
  135. data/lib/codebase_index/resilience/circuit_breaker.rb +99 -0
  136. data/lib/codebase_index/resilience/index_validator.rb +185 -0
  137. data/lib/codebase_index/resilience/retryable_provider.rb +108 -0
  138. data/lib/codebase_index/retrieval/context_assembler.rb +249 -0
  139. data/lib/codebase_index/retrieval/query_classifier.rb +131 -0
  140. data/lib/codebase_index/retrieval/ranker.rb +273 -0
  141. data/lib/codebase_index/retrieval/search_executor.rb +327 -0
  142. data/lib/codebase_index/retriever.rb +160 -0
  143. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +190 -0
  144. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +78 -0
  145. data/lib/codebase_index/ruby_analyzer/fqn_builder.rb +18 -0
  146. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +275 -0
  147. data/lib/codebase_index/ruby_analyzer/method_analyzer.rb +143 -0
  148. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +139 -0
  149. data/lib/codebase_index/ruby_analyzer.rb +87 -0
  150. data/lib/codebase_index/session_tracer/file_store.rb +111 -0
  151. data/lib/codebase_index/session_tracer/middleware.rb +143 -0
  152. data/lib/codebase_index/session_tracer/redis_store.rb +112 -0
  153. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +263 -0
  154. data/lib/codebase_index/session_tracer/session_flow_document.rb +223 -0
  155. data/lib/codebase_index/session_tracer/solid_cache_store.rb +145 -0
  156. data/lib/codebase_index/session_tracer/store.rb +67 -0
  157. data/lib/codebase_index/storage/graph_store.rb +120 -0
  158. data/lib/codebase_index/storage/metadata_store.rb +169 -0
  159. data/lib/codebase_index/storage/pgvector.rb +163 -0
  160. data/lib/codebase_index/storage/qdrant.rb +172 -0
  161. data/lib/codebase_index/storage/vector_store.rb +156 -0
  162. data/lib/codebase_index/temporal/snapshot_store.rb +341 -0
  163. data/lib/codebase_index/version.rb +5 -0
  164. data/lib/codebase_index.rb +223 -0
  165. data/lib/generators/codebase_index/install_generator.rb +32 -0
  166. data/lib/generators/codebase_index/pgvector_generator.rb +37 -0
  167. data/lib/generators/codebase_index/templates/add_pgvector_to_codebase_index.rb.erb +15 -0
  168. data/lib/generators/codebase_index/templates/create_codebase_index_tables.rb.erb +43 -0
  169. data/lib/tasks/codebase_index.rake +583 -0
  170. data/lib/tasks/codebase_index_evaluation.rake +115 -0
  171. metadata +252 -0
data/README.md ADDED
@@ -0,0 +1,481 @@
1
+ # CodebaseIndex
2
+
3
+ A Rails codebase extraction and indexing system designed to provide accurate, version-specific context for AI-assisted development tooling.
4
+
5
+ ## The Problem
6
+
7
+ LLMs working with Rails codebases face a fundamental accuracy gap. Training data contains documentation and examples from many Rails versions, but a production app runs on *one* version. When a developer asks "what options does `has_many` support?" or "what callbacks fire when a record is saved?", the answer depends on their exact Rails version — and generic LLM responses often get it wrong.
8
+
9
+ Beyond version accuracy, Rails conventions hide enormous amounts of implementation behind "magic." A model file might be 50 lines, but with concerns inlined, schema context, callbacks, validations, and association behavior, the *actual* surface area is 10x that. AI tools that only see the source file miss most of what matters.
10
+
11
+ CodebaseIndex solves this by:
12
+
13
+ - **Running inside Rails** to leverage runtime introspection (not just static parsing)
14
+ - **Inlining concerns** directly into model source so the full picture is visible
15
+ - **Prepending schema comments** with column types, indexes, and foreign keys
16
+ - **Mapping routes to controllers** so HTTP → action flow is explicit
17
+ - **Indexing the exact Rails/gem source** for the versions in `Gemfile.lock`
18
+ - **Tracking dependencies** bidirectionally so you can trace impact across the codebase
19
+ - **Enriching with git data** so you know what's actively changing vs. dormant
20
+
21
+ ## Installation
22
+
23
+ Add to your Gemfile:
24
+
25
+ ```ruby
26
+ gem 'codebase_index'
27
+ ```
28
+
29
+ Then:
30
+
31
+ ```bash
32
+ bundle install
33
+ ```
34
+
35
+ Or install directly:
36
+
37
+ ```bash
38
+ gem install codebase_index
39
+ ```
40
+
41
+ > **Requires Rails.** Extraction runs inside a booted Rails application using runtime introspection (`ActiveRecord::Base.descendants`, `Rails.application.routes`, etc.). The gem cannot extract from source files alone. See [Getting Started](docs/GETTING_STARTED.md) for setup.
42
+
43
+ ## Target Environment
44
+
45
+ Designed for Rails applications of any scale, with particular strength in large monoliths:
46
+
47
+ - Any database (MySQL, PostgreSQL, SQLite)
48
+ - Any background job system (Sidekiq, Solid Queue, GoodJob, inline)
49
+ - Any view layer (ERB, Phlex, ViewComponent)
50
+ - Docker or bare metal, CI or manual
51
+ - Continuous or one-shot indexing
52
+
53
+ See [docs/BACKEND_MATRIX.md](docs/BACKEND_MATRIX.md) for supported infrastructure combinations.
54
+
55
+ ## Use Cases
56
+
57
+ **1. Coding & Debugging** — Primary context for AI coding assistants. Answer "how does our checkout flow work?" with the actual service, model callbacks, controller actions, and framework behavior for the running version.
58
+
59
+ **2. Performance Analysis** — Correlate code structure with runtime behavior. Identify models with high write volume and complex callback chains, find N+1-prone association patterns, surface hot code paths.
60
+
61
+ **3. Deeper Analytics** — Query frequency by scope, error rates by action, background job characteristics. Bridge the gap between code structure and operational data.
62
+
63
+ **4. Support & Marketing Tooling** — Domain-concept retrieval for non-developers. Map business terms to code paths, surface feature flags, document user-facing behavior.
64
+
65
+ ## Architecture
66
+
67
+ ```
68
+ ┌─────────────────────────────────────────────────────────────────────┐
69
+ │                            CodebaseIndex                            │
70
+ ├─────────────────────────────────────────────────────────────────────┤
71
+ │                                                                     │
72
+ │  ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐  │
73
+ │  │   Extraction    │───▶│    Storage      │◀───│   Retrieval     │  │
74
+ │  └─────────────────┘    └─────────────────┘    └─────────────────┘  │
75
+ │           │                      │                      │           │
76
+ │           ▼                      ▼                      ▼           │
77
+ │  ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐  │
78
+ │  │ Extractors      │    │ JSON per unit   │    │ Query Classifier│  │
79
+ │  │ · Model         │    │ Vector Index    │    │ Context Assembly│  │
80
+ │  │ · Controller    │    │ Metadata Index  │    │ Result Ranking  │  │
81
+ │  │ · Service       │    │ Dep Graph       │    │                 │  │
82
+ │  │ · Component     │    │                 │    │                 │  │
83
+ │  │ · Rails Source  │    │                 │    │                 │  │
84
+ │  └─────────────────┘    └─────────────────┘    └─────────────────┘  │
85
+ │                                                                     │
86
+ └─────────────────────────────────────────────────────────────────────┘
87
+ ```
88
+
89
+ ### Extraction Pipeline
90
+
91
+ Extraction runs inside the Rails application (via rake task) to access runtime introspection — `ActiveRecord::Base.descendants`, `Rails.application.routes`, reflection APIs, etc. This is fundamentally more accurate than static parsing.
92
+
93
+ **Four phases:**
94
+
95
+ 1. **Extract** — Each extractor produces `ExtractedUnit` objects with source, metadata, and dependencies
96
+ 2. **Resolve dependents** — Build reverse dependency edges (who calls what)
97
+ 3. **Enrich with git** — Last modified, contributors, change frequency, recent commits
98
+ 4. **Write output** — JSON per unit, dependency graph, manifest, structural summary
99
+
100
+ ### Extractors (34)
101
+
102
+ **Core Application**
103
+
104
+ | Extractor | What it captures |
105
+ |-----------|-----------------|
106
+ | **ModelExtractor** | Schema (columns, indexes, FKs), associations, validations, callbacks (all 13 types), scopes, enums, inlined concerns. Chunks large models into summary/associations/callbacks/validations. |
107
+ | **ControllerExtractor** | Route mapping (verb → path → action), filter chains per action, response formats, permitted params. Per-action chunks with applicable filters and route context. |
108
+ | **ServiceExtractor** | Scans `app/services`, `app/interactors`, `app/operations`, `app/commands`, `app/use_cases`. Entry points, dependency injection, custom errors, return type inference. |
109
+ | **JobExtractor** | ActiveJob and Sidekiq workers. Queue config, retry/concurrency options, perform arguments, callbacks. |
110
+ | **MailerExtractor** | ActionMailer classes with defaults, per-action templates, callbacks, helper usage. |
111
+ | **ConfigurationExtractor** | Rails initializers from `config/initializers` and `config/environments`, plus behavioral profile from resolved `Rails.application.config`. |
112
+ | **RouteExtractor** | All Rails routes via runtime introspection of `Rails.application.routes`. |
113
+ | **MiddlewareExtractor** | Rack middleware stack as a single ordered unit. |
114
+
115
+ **UI Components**
116
+
117
+ | Extractor | What it captures |
118
+ |-----------|-----------------|
119
+ | **PhlexExtractor** | Phlex component slots, initialize params, sub-components, Stimulus controller references, route helpers. |
120
+ | **ViewComponentExtractor** | ViewComponent slots, template paths, preview classes, collection support. |
121
+ | **ViewTemplateExtractor** | ERB view templates with render calls, instance variables, helper usage. |
122
+ | **DecoratorExtractor** | Decorators, presenters, and form objects from `app/decorators`, `app/presenters`, `app/form_objects`. |
123
+
124
+ **Data Layer**
125
+
126
+ | Extractor | What it captures |
127
+ |-----------|-----------------|
128
+ | **ConcernExtractor** | ActiveSupport::Concern modules from `app/models/concerns` and `app/controllers/concerns`. |
129
+ | **PoroExtractor** | Plain Ruby objects in `app/models` (non-ActiveRecord classes, excluding concerns). |
130
+ | **SerializerExtractor** | ActiveModelSerializers, Blueprinter, Alba, and Draper. Auto-detects loaded serialization gems. |
131
+ | **ValidatorExtractor** | Custom ActiveModel validator classes with validation rules. |
132
+ | **ManagerExtractor** | SimpleDelegator subclasses — wrapped model, public methods, delegation chain. |
133
+
134
+ **API & Authorization**
135
+
136
+ | Extractor | What it captures |
137
+ |-----------|-----------------|
138
+ | **GraphQLExtractor** | graphql-ruby types, mutations, queries, resolvers, field metadata, authorization patterns. Produces 4 unit types. |
139
+ | **PunditExtractor** | Pundit authorization policies with action methods (index?, show?, create?, etc.). |
140
+ | **PolicyExtractor** | Domain policy classes with decision methods and eligibility rules. |
141
+
142
+ **Infrastructure**
143
+
144
+ | Extractor | What it captures |
145
+ |-----------|-----------------|
146
+ | **EngineExtractor** | Mounted Rails engines via runtime introspection with mount points and route counts. |
147
+ | **I18nExtractor** | Locale files from `config/locales` with translation key structures. |
148
+ | **ActionCableExtractor** | ActionCable channels with stream subscriptions, actions, broadcast patterns. |
149
+ | **ScheduledJobExtractor** | Scheduled jobs from `config/recurring.yml`, `config/sidekiq_cron.yml`, `config/schedule.rb`. |
150
+ | **RakeTaskExtractor** | Rake tasks from `lib/tasks/*.rake` with namespaces, dependencies, descriptions. |
151
+ | **MigrationExtractor** | ActiveRecord migrations with DDL metadata, table operations, reversibility, risk indicators. |
152
+ | **DatabaseViewExtractor** | SQL views from `db/views` (Scenic convention) with materialization and table references. |
153
+ | **StateMachineExtractor** | AASM, Statesman, and state_machines DSL definitions with states and transitions. |
154
+ | **EventExtractor** | Event publish/subscribe patterns (ActiveSupport::Notifications, Wisper). |
155
+ | **CachingExtractor** | Cache usage across controllers, models, and views — strategies, TTLs, cache keys. |
156
+
157
+ **Testing & Source**
158
+
159
+ | Extractor | What it captures |
160
+ |-----------|-----------------|
161
+ | **FactoryExtractor** | FactoryBot factory definitions with traits and associations. |
162
+ | **TestMappingExtractor** | Test file → subject class mapping with test counts and framework type. |
163
+ | **LibExtractor** | Ruby files from `lib/` (excluding tasks and generators). |
164
+ | **RailsSourceExtractor** | High-value Rails framework source and gem source pinned to exact installed versions. |
165
+
166
+ ### Key Design Decisions
167
+
168
+ **Concern inlining.** When extracting a model, included concerns are read from disk and embedded as formatted comments directly in the model's source. This means the full behavioral picture is in one unit — no separate lookups needed during retrieval.
169
+
170
+ **Route prepending.** Controller source gets a header block showing the HTTP routes that map to it, so the relationship between URLs and actions is immediately visible.
171
+
172
+ **Semantic chunking.** Large models are split into purpose-specific chunks (summary, associations, callbacks, validations) rather than arbitrary size-based splits. Controllers are chunked per action, with the relevant filters and route attached to each chunk.
173
+
174
+ **Dependency graph with BFS blast radius.** The graph tracks both forward dependencies (what this unit uses) and reverse dependencies (what uses this unit). Changed-file impact is computed via breadth-first traversal — if a concern changes, every model including it gets re-indexed.
175
+
176
+ ## MCP Servers
177
+
178
+ CodebaseIndex ships two [MCP](https://modelcontextprotocol.io/) servers for integrating with AI development tools (Claude Code, Cursor, Windsurf, etc.).
179
+
180
+ **Index Server** (26 tools) — Reads pre-extracted data from disk. No Rails boot required. Provides code lookup, dependency traversal, graph analysis, semantic search, pipeline management, feedback collection, and temporal snapshots.
181
+
182
+ ```bash
183
+ codebase-index-mcp /path/to/rails-app/tmp/codebase_index
184
+ ```
185
+
186
+ **Console Server** (31 tools) — Bridges to a live Rails process for database queries, model diagnostics, job monitoring, and guarded operations. All queries run in rolled-back transactions with SQL validation and audit logging.
187
+
188
+ ```bash
189
+ codebase-console-mcp
190
+ ```
191
+
192
+ See [docs/MCP_SERVERS.md](docs/MCP_SERVERS.md) for the full tool catalog and setup instructions.
193
+
194
+ ## Subsystems
195
+
196
+ ```
197
+ lib/
198
+ ├── codebase_index.rb # Module interface, Configuration, entry point
199
+ ├── codebase_index/
200
+ │ ├── extracted_unit.rb # Core value object
201
+ │ ├── extractor.rb # Orchestrator — coordinates all extractors
202
+ │ ├── dependency_graph.rb # Directed graph + PageRank scoring
203
+ │ ├── graph_analyzer.rb # Structural analysis (orphans, hubs, cycles, bridges)
204
+ │ ├── model_name_cache.rb # Precomputed regex for dependency scanning
205
+ │ ├── retriever.rb # Retriever orchestrator with degradation tiers
206
+ │ ├── builder.rb # DSL builder for configuration
207
+ │ ├── version.rb # Gem version
208
+ │ ├── railtie.rb # Rails integration
209
+ │ │
210
+ │ ├── extractors/ # 34 extractors (one per Rails concept)
211
+ │ │ ├── model_extractor.rb # ActiveRecord models
212
+ │ │ ├── controller_extractor.rb # ActionController
213
+ │ │ ├── service_extractor.rb # Service objects
214
+ │ │ ├── job_extractor.rb # ActiveJob/Sidekiq workers
215
+ │ │ ├── mailer_extractor.rb # ActionMailer
216
+ │ │ ├── phlex_extractor.rb # Phlex components
217
+ │ │ ├── view_component_extractor.rb # ViewComponent
218
+ │ │ ├── graphql_extractor.rb # GraphQL types, mutations, queries
219
+ │ │ ├── serializer_extractor.rb # Serializers/decorators
220
+ │ │ ├── manager_extractor.rb # SimpleDelegator managers
221
+ │ │ ├── policy_extractor.rb # Policy classes
222
+ │ │ ├── validator_extractor.rb # Standalone validators
223
+ │ │ ├── rails_source_extractor.rb # Framework/gem source
224
+ │ │ ├── shared_dependency_scanner.rb # Shared dependency detection
225
+ │ │ ├── shared_utility_methods.rb # Shared extractor utilities
226
+ │ │ └── ast_source_extraction.rb # AST-based source extraction
227
+ │ │
228
+ │ ├── ast/ # Prism-based AST layer
229
+ │ │ ├── parser.rb # Source parsing adapter
230
+ │ │ ├── node.rb # Normalized AST node
231
+ │ │ ├── method_extractor.rb # Method boundary detection
232
+ │ │ └── call_site_extractor.rb # Call site analysis
233
+ │ │
234
+ │ ├── ruby_analyzer/ # Static analysis
235
+ │ │ ├── class_analyzer.rb # Class structure analysis
236
+ │ │ ├── method_analyzer.rb # Method complexity/dependencies
237
+ │ │ ├── dataflow_analyzer.rb # Data flow tracing
238
+ │ │ ├── trace_enricher.rb # Enriches flow traces
239
+ │ │ ├── fqn_builder.rb # Fully-qualified name resolution
240
+ │ │ └── mermaid_renderer.rb # Diagram generation
241
+ │ │
242
+ │ ├── flow_analysis/ # Execution flow tracing
243
+ │ │ ├── operation_extractor.rb # Extract operations from AST
244
+ │ │ └── response_code_mapper.rb # HTTP response mapping
245
+ │ ├── flow_assembler.rb # Assembles execution flows
246
+ │ ├── flow_document.rb # Flow documentation format
247
+ │ │
248
+ │ ├── chunking/ # Semantic chunking
249
+ │ │ ├── chunk.rb # Chunk value object
250
+ │ │ └── semantic_chunker.rb # Type-aware splitting
251
+ │ │
252
+ │ ├── embedding/ # Embedding pipeline
253
+ │ │ ├── provider.rb # Provider interface
254
+ │ │ ├── openai.rb # OpenAI adapter
255
+ │ │ ├── text_preparer.rb # Text preparation for embedding
256
+ │ │ └── indexer.rb # Batch indexing with resumability
257
+ │ │
258
+ │ ├── storage/ # Storage backends
259
+ │ │ ├── vector_store.rb # Vector store interface + InMemory
260
+ │ │ ├── metadata_store.rb # Metadata store interface + InMemory/SQLite
261
+ │ │ ├── graph_store.rb # Graph store interface + InMemory
262
+ │ │ ├── pgvector.rb # PostgreSQL pgvector adapter
263
+ │ │ └── qdrant.rb # Qdrant adapter
264
+ │ │
265
+ │ ├── retrieval/ # Retrieval pipeline
266
+ │ │ ├── query_classifier.rb # Intent/scope/type classification
267
+ │ │ ├── search_executor.rb # Multi-strategy search
268
+ │ │ ├── ranker.rb # RRF-based ranking
269
+ │ │ └── context_assembler.rb # Token-budgeted context assembly
270
+ │ │
271
+ │ ├── formatting/ # LLM context formatting
272
+ │ │ ├── base.rb # Base formatter
273
+ │ │ ├── claude_adapter.rb # Claude-optimized output
274
+ │ │ ├── gpt_adapter.rb # GPT-optimized output
275
+ │ │ ├── generic_adapter.rb # Generic LLM output
276
+ │ │ └── human_adapter.rb # Human-readable output
277
+ │ │
278
+ │ ├── mcp/ # MCP Index Server (26 tools)
279
+ │ │ ├── server.rb # Tool definitions + dispatch
280
+ │ │ └── index_reader.rb # JSON index reader
281
+ │ │
282
+ │ ├── console/ # Console MCP Server (31 tools)
283
+ │ │ ├── server.rb # Console server + tool registration
284
+ │ │ ├── bridge.rb # JSON-lines protocol bridge
285
+ │ │ ├── safe_context.rb # Transaction rollback + timeout
286
+ │ │ ├── connection_manager.rb # Docker/direct/SSH modes
287
+ │ │ ├── model_validator.rb # AR schema validation
288
+ │ │ ├── sql_validator.rb # SQL statement validation
289
+ │ │ ├── audit_logger.rb # JSONL audit logging
290
+ │ │ ├── confirmation.rb # Human-in-the-loop confirmation
291
+ │ │ ├── tools/
292
+ │ │ │ ├── tier1.rb # 9 safe read-only tools
293
+ │ │ │ ├── tier2.rb # 9 domain-aware tools
294
+ │ │ │ ├── tier3.rb # 10 analytics tools
295
+ │ │ │ └── tier4.rb # 3 guarded tools
296
+ │ │ └── adapters/
297
+ │ │ ├── sidekiq_adapter.rb # Sidekiq job backend
298
+ │ │ ├── solid_queue_adapter.rb # Solid Queue job backend
299
+ │ │ ├── good_job_adapter.rb # GoodJob job backend
300
+ │ │ └── cache_adapter.rb # Cache backend adapters
301
+ │ │
302
+ │ ├── coordination/ # Multi-agent coordination
303
+ │ │ └── pipeline_lock.rb # File-based pipeline locking
304
+ │ │
305
+ │ ├── feedback/ # Agent self-service
306
+ │ │ ├── store.rb # JSONL feedback storage
307
+ │ │ └── gap_detector.rb # Feedback-driven gap detection
308
+ │ │
309
+ │ ├── operator/ # Pipeline management
310
+ │ │ ├── status_reporter.rb # Pipeline status
311
+ │ │ ├── error_escalator.rb # Error classification
312
+ │ │ └── pipeline_guard.rb # Rate limiting
313
+ │ │
314
+ │ ├── observability/ # Instrumentation
315
+ │ │ ├── instrumentation.rb # ActiveSupport::Notifications
316
+ │ │ ├── structured_logger.rb # JSON structured logging
317
+ │ │ └── health_check.rb # Component health checks
318
+ │ │
319
+ │ ├── resilience/ # Fault tolerance
320
+ │ │ ├── circuit_breaker.rb # Circuit breaker pattern
321
+ │ │ ├── retryable_provider.rb # Retry with backoff
322
+ │ │ └── index_validator.rb # Index integrity validation
323
+ │ │
324
+ │ ├── db/ # Schema management
325
+ │ │ ├── schema_version.rb # Version tracking
326
+ │ │ ├── migrator.rb # Standalone migration runner
327
+ │ │ └── migrations/
328
+ │ │ ├── 001_create_units.rb
329
+ │ │ ├── 002_create_edges.rb
330
+ │ │ └── 003_create_embeddings.rb
331
+ │ │
332
+ │ ├── session_tracer/ # Session tracing middleware + stores
333
+ │ │ ├── middleware.rb # Rack middleware
334
+ │ │ ├── file_store.rb # File-based trace storage
335
+ │ │ ├── redis_store.rb # Redis trace storage
336
+ │ │ └── solid_cache_store.rb # SolidCache trace storage
337
+ │ │
338
+ │ ├── temporal/ # Temporal snapshot system
339
+ │ │ ├── snapshot_store.rb # Snapshot persistence + diff
340
+ │ │ └── snapshot_metadata.rb # Snapshot metadata
341
+ │ │
342
+ │ └── evaluation/ # Retrieval evaluation
343
+ │ ├── query_set.rb # Evaluation query loading
344
+ │ ├── metrics.rb # Precision@k, Recall, MRR
345
+ │ ├── evaluator.rb # Query evaluation
346
+ │ ├── baseline_runner.rb # Grep/random/file baselines
347
+ │ └── report_generator.rb # JSON report generation
348
+
349
+ ├── generators/codebase_index/ # Rails generators
350
+ │ ├── install_generator.rb # Initial setup
351
+ │ └── pgvector_generator.rb # pgvector migration
352
+
353
+ ├── tasks/
354
+ │ └── codebase_index.rake # Rake task definitions
355
+
356
+ exe/
357
+ ├── codebase-index-mcp # MCP Index Server executable (stdio)
358
+ ├── codebase-index-mcp-start # Self-healing MCP wrapper
359
+ ├── codebase-index-mcp-http # MCP Index Server (HTTP/Rack)
360
+ └── codebase-console-mcp # Console MCP Server executable
361
+ ```
362
+
363
+ ## Context Assembly
364
+
365
+ When serving context to an LLM, token budget is allocated in layers:
366
+
367
+ ```
368
+ Budget Allocation:
369
+ ├── 10% Structural overview (always included)
370
+ ├── 50% Primary relevant units
371
+ ├── 25% Supporting context (dependencies)
372
+ └── 15% Framework reference (when needed)
373
+ ```
374
+
375
+ Queries are classified to determine whether framework source context is needed. "What options does `has_many` support?" routes to Rails source; "how do we handle checkout?" routes to application code.
376
+
377
+ ## Usage
378
+
379
+ ### Full Extraction
380
+
381
+ ```bash
382
+ bundle exec rake codebase_index:extract
383
+ ```
384
+
385
+ ### Incremental (CI)
386
+
387
+ ```bash
388
+ # Auto-detects GitHub Actions / GitLab CI environment
389
+ bundle exec rake codebase_index:incremental
390
+ ```
391
+
392
+ ```yaml
393
+ # .github/workflows/index.yml
394
+ jobs:
395
+   index:
396
+     runs-on: ubuntu-latest
397
+     steps:
398
+       - uses: actions/checkout@v4
399
+         with:
400
+           fetch-depth: 2
401
+       - name: Update index
402
+         run: bundle exec rake codebase_index:incremental
403
+         env:
404
+           GITHUB_BASE_REF: ${{ github.base_ref }}
405
+ ```
406
+
407
+ ### Framework-Only (on dependency changes)
408
+
409
+ ```bash
410
+ bundle exec rake codebase_index:extract_framework
411
+ ```
412
+
413
+ ### Other Tasks
414
+
415
+ ```bash
416
+ rake codebase_index:validate # Check index integrity
417
+ rake codebase_index:stats # Show unit counts, sizes, graph stats
418
+ rake codebase_index:clean # Remove index
419
+ ```
420
+
421
+ ### Ruby API
422
+
423
+ ```ruby
424
+ # Full extraction
425
+ CodebaseIndex.extract!
426
+
427
+ # Incremental
428
+ CodebaseIndex.extract_changed!(["app/models/user.rb", "app/services/checkout.rb"])
429
+
430
+ # Configuration
431
+ CodebaseIndex.configure do |config|
432
+   config.output_dir = Rails.root.join("tmp/codebase_index")
433
+   config.max_context_tokens = 8000
434
+   config.include_framework_sources = true
435
+   config.add_gem "devise", paths: ["lib/devise/models"], priority: :high
436
+ end
437
+ ```
438
+
439
+ ## Output Structure
440
+
441
+ ```
442
+ tmp/codebase_index/
443
+ ├── manifest.json # Extraction metadata, git SHA, checksums
444
+ ├── dependency_graph.json # Full graph with forward/reverse edges
445
+ ├── SUMMARY.md # Human-readable structural overview
446
+ ├── models/
447
+ │ ├── _index.json # Quick lookup index
448
+ │ ├── User.json # Full extracted unit
449
+ │ └── Order.json
450
+ ├── controllers/
451
+ │ ├── _index.json
452
+ │ └── OrdersController.json
453
+ ├── services/
454
+ │ ├── _index.json
455
+ │ └── CheckoutService.json
456
+ ├── components/
457
+ │ └── ...
458
+ └── rails_source/
459
+ └── ...
460
+ ```
461
+
462
+ Each unit JSON contains: `identifier`, `type`, `file_path`, `source_code` (annotated), `metadata` (rich structured data), `dependencies`, `dependents`, `chunks` (if applicable), and `estimated_tokens`.
463
+
464
+ ## Development
465
+
466
+ After checking out the repo:
467
+
468
+ ```bash
469
+ bin/setup # Install dependencies
470
+ bin/console # Interactive prompt
471
+ bundle exec rake spec # Run tests
472
+ bundle exec rubocop # Lint
473
+ ```
474
+
475
+ ## Contributing
476
+
477
+ Bug reports and pull requests are welcome on GitHub at https://github.com/LeahArmstrong/codebase_index. See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
478
+
479
+ ## License
480
+
481
+ The gem is available as open source under the terms of the [MIT License](LICENSE.txt).
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Console MCP server for querying live Rails application state.
5
+ #
6
+ # Usage:
7
+ # codebase-console-mcp
8
+ # CODEBASE_CONSOLE_CONFIG=/path/to/console.yml codebase-console-mcp
9
+ #
10
+ # Connects to a Rails application via a bridge process (Docker exec, direct,
11
+ # or SSH) and exposes read-only query tools via the Model Context Protocol
12
+ # (stdio transport).
13
+
14
+ require 'yaml'
15
+ require_relative '../lib/codebase_index/console/server'
16
+
17
+ config_path = ENV.fetch('CODEBASE_CONSOLE_CONFIG', File.expand_path('~/.codebase_index/console.yml'))
18
+ config = File.exist?(config_path) ? YAML.safe_load_file(config_path) : {}
19
+
20
+ server = CodebaseIndex::Console::Server.build(config: config)
21
+ transport = MCP::Server::Transports::StdioTransport.new(server)
22
+ transport.open
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # MCP server for querying CodebaseIndex extraction output.
5
+ #
6
+ # Usage:
7
+ # codebase-index-mcp [INDEX_DIR]
8
+ # CODEBASE_INDEX_DIR=/path/to/output codebase-index-mcp
9
+ #
10
+ # Reads JSON files from the extraction output directory and exposes
11
+ # them via the Model Context Protocol (stdio transport).
12
+ # Does NOT require Rails — only reads pre-extracted data.
13
+
14
+ require_relative '../lib/codebase_index'
15
+ require_relative '../lib/codebase_index/dependency_graph'
16
+ require_relative '../lib/codebase_index/graph_analyzer'
17
+ require_relative '../lib/codebase_index/mcp/server'
18
+ require_relative '../lib/codebase_index/embedding/text_preparer'
19
+ require_relative '../lib/codebase_index/embedding/indexer'
20
+
21
+ index_dir = ARGV[0] || ENV['CODEBASE_INDEX_DIR'] || Dir.pwd # precedence: CLI argument, then env var, then the current directory
22
+
23
+ unless Dir.exist?(index_dir)
24
+ warn "Error: Index directory does not exist: #{index_dir}"
25
+ exit 1
26
+ end
27
+
28
+ unless File.exist?(File.join(index_dir, 'manifest.json'))
29
+ warn "Error: No manifest.json found in: #{index_dir}"
30
+ warn 'Run `bundle exec rake codebase_index:extract` in your Rails app first.'
31
+ exit 1
32
+ end
33
+
34
+ # Attempt to build a retriever for semantic search; `retriever` ends up nil when no provider is configured or the build raises.
35
+ # Auto-configures from environment variables when no explicit configuration exists.
36
+ retriever = begin
37
+ config = CodebaseIndex.configuration
38
+
39
+ if !config.embedding_provider && ENV.fetch('OPENAI_API_KEY', nil) # NOTE(review): an empty-string OPENAI_API_KEY is truthy and still enables :openai
40
+ config.vector_store = :in_memory
41
+ config.metadata_store = :in_memory
42
+ config.graph_store = :in_memory
43
+ config.embedding_provider = :openai
44
+ config.embedding_options = { api_key: ENV.fetch('OPENAI_API_KEY', nil) }
45
+ end
46
+
47
+ CodebaseIndex::Builder.new(config).build_retriever if config.embedding_provider # modifier-if evaluates to nil when no provider is set
48
+ rescue StandardError => e # best-effort: a failed retriever build is downgraded to a warning
49
+ warn "Note: Semantic search unavailable (#{e.message}). Using pattern-based search only."
50
+ nil
51
+ end
52
+
53
+ server = CodebaseIndex::MCP::Server.build(index_dir: index_dir, retriever: retriever)
54
+
55
+ # Optionally pin the protocol version via MCP_PROTOCOL_VERSION for broad client compatibility (Claude Code, Cursor, etc.); when unset, the server's configuration is left as built.
56
+ if ENV['MCP_PROTOCOL_VERSION']
57
+ server.configuration = MCP::Configuration.new(protocol_version: ENV['MCP_PROTOCOL_VERSION'])
58
+ end
59
+
60
+ transport = MCP::Server::Transports::StdioTransport.new(server) # MCP traffic goes over stdin/stdout
61
+ transport.open
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # MCP server for querying CodebaseIndex extraction output over HTTP.
5
+ #
6
+ # Usage:
7
+ # codebase-index-mcp-http [INDEX_DIR]
8
+ # CODEBASE_INDEX_DIR=/path/to/output codebase-index-mcp-http
9
+ #
10
+ # Reads JSON files from the extraction output directory and exposes
11
+ # them via the Model Context Protocol (Streamable HTTP transport).
12
+ # Requires the `rackup` gem and a Rack-compatible server (e.g., puma). Listens on HOST (default localhost) and PORT (default 9292).
13
+
14
+ require 'rackup'
15
+ require_relative '../lib/codebase_index'
16
+ require_relative '../lib/codebase_index/dependency_graph'
17
+ require_relative '../lib/codebase_index/graph_analyzer'
18
+ require_relative '../lib/codebase_index/mcp/server'
19
+ require_relative '../lib/codebase_index/embedding/text_preparer'
20
+ require_relative '../lib/codebase_index/embedding/indexer'
21
+
22
+ index_dir = ARGV[0] || ENV['CODEBASE_INDEX_DIR'] || Dir.pwd # precedence: CLI argument, then env var, then the current directory
23
+
24
+ unless Dir.exist?(index_dir)
25
+ warn "Error: Index directory does not exist: #{index_dir}"
26
+ exit 1
27
+ end
28
+
29
+ unless File.exist?(File.join(index_dir, 'manifest.json'))
30
+ warn "Error: No manifest.json found in: #{index_dir}"
31
+ warn 'Run `bundle exec rake codebase_index:extract` in your Rails app first.'
32
+ exit 1
33
+ end
34
+
35
+ # Attempt to build a retriever for semantic search; `retriever` ends up nil when no provider is configured or the build raises.
36
+ # Auto-configures from environment variables when no explicit configuration exists.
37
+ retriever = begin
38
+ config = CodebaseIndex.configuration
39
+
40
+ if !config.embedding_provider && ENV.fetch('OPENAI_API_KEY', nil) # NOTE(review): an empty-string OPENAI_API_KEY is truthy and still enables :openai
41
+ config.vector_store = :in_memory
42
+ config.metadata_store = :in_memory
43
+ config.graph_store = :in_memory
44
+ config.embedding_provider = :openai
45
+ config.embedding_options = { api_key: ENV.fetch('OPENAI_API_KEY', nil) }
46
+ end
47
+
48
+ CodebaseIndex::Builder.new(config).build_retriever if config.embedding_provider # modifier-if evaluates to nil when no provider is set
49
+ rescue StandardError => e # best-effort: a failed retriever build is downgraded to a warning
50
+ warn "Note: Semantic search unavailable (#{e.message}). Using pattern-based search only."
51
+ nil
52
+ end
53
+
54
+ port = (ENV['PORT'] || 9292).to_i # default 9292; String#to_i turns a non-numeric PORT into 0 rather than raising
55
+ host = ENV['HOST'] || 'localhost'
56
+
57
+ server = CodebaseIndex::MCP::Server.build(index_dir: index_dir, retriever: retriever)
58
+ transport = MCP::Server::Transports::StreamableHTTPTransport.new(server)
59
+ server.transport = transport # NOTE(review): presumably lets the server send through this transport — confirm against the MCP SDK
60
+
61
+ app = proc { |env| transport.handle_request(Rack::Request.new(env)) } # Rack app: wrap the env in a Rack::Request and delegate to the transport
62
+
63
+ warn "CodebaseIndex MCP HTTP server starting on http://#{host}:#{port}"
64
+ Rackup::Handler.default.run(app, Port: port, Host: host)