codebase_index 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +60 -0
  3. data/README.md +95 -300
  4. data/exe/codebase-index-mcp +3 -31
  5. data/exe/codebase-index-mcp-http +3 -31
  6. data/lib/codebase_index/ast/method_extractor.rb +3 -8
  7. data/lib/codebase_index/ast/node.rb +28 -0
  8. data/lib/codebase_index/ast/parser.rb +53 -92
  9. data/lib/codebase_index/builder.rb +67 -4
  10. data/lib/codebase_index/cache/cache_middleware.rb +199 -0
  11. data/lib/codebase_index/cache/cache_store.rb +264 -0
  12. data/lib/codebase_index/cache/redis_cache_store.rb +116 -0
  13. data/lib/codebase_index/cache/solid_cache_store.rb +111 -0
  14. data/lib/codebase_index/chunking/semantic_chunker.rb +29 -24
  15. data/lib/codebase_index/console/adapters/good_job_adapter.rb +7 -40
  16. data/lib/codebase_index/console/adapters/job_adapter.rb +68 -0
  17. data/lib/codebase_index/console/adapters/sidekiq_adapter.rb +7 -40
  18. data/lib/codebase_index/console/adapters/solid_queue_adapter.rb +7 -40
  19. data/lib/codebase_index/console/bridge.rb +7 -0
  20. data/lib/codebase_index/console/console_response_renderer.rb +3 -7
  21. data/lib/codebase_index/console/embedded_executor.rb +2 -1
  22. data/lib/codebase_index/console/server.rb +1 -4
  23. data/lib/codebase_index/dependency_graph.rb +28 -19
  24. data/lib/codebase_index/embedding/indexer.rb +18 -8
  25. data/lib/codebase_index/embedding/openai.rb +27 -6
  26. data/lib/codebase_index/embedding/provider.rb +29 -2
  27. data/lib/codebase_index/evaluation/evaluator.rb +5 -12
  28. data/lib/codebase_index/extractor.rb +40 -44
  29. data/lib/codebase_index/extractors/action_cable_extractor.rb +9 -36
  30. data/lib/codebase_index/extractors/callback_analyzer.rb +22 -8
  31. data/lib/codebase_index/extractors/controller_extractor.rb +3 -93
  32. data/lib/codebase_index/extractors/decorator_extractor.rb +7 -14
  33. data/lib/codebase_index/extractors/engine_extractor.rb +20 -1
  34. data/lib/codebase_index/extractors/graphql_extractor.rb +4 -29
  35. data/lib/codebase_index/extractors/job_extractor.rb +11 -6
  36. data/lib/codebase_index/extractors/lib_extractor.rb +0 -31
  37. data/lib/codebase_index/extractors/mailer_extractor.rb +15 -85
  38. data/lib/codebase_index/extractors/manager_extractor.rb +1 -15
  39. data/lib/codebase_index/extractors/model_extractor.rb +20 -53
  40. data/lib/codebase_index/extractors/phlex_extractor.rb +8 -8
  41. data/lib/codebase_index/extractors/policy_extractor.rb +1 -24
  42. data/lib/codebase_index/extractors/poro_extractor.rb +0 -17
  43. data/lib/codebase_index/extractors/serializer_extractor.rb +12 -7
  44. data/lib/codebase_index/extractors/service_extractor.rb +1 -38
  45. data/lib/codebase_index/extractors/shared_utility_methods.rb +183 -1
  46. data/lib/codebase_index/extractors/validator_extractor.rb +3 -17
  47. data/lib/codebase_index/extractors/view_component_extractor.rb +10 -9
  48. data/lib/codebase_index/filename_utils.rb +32 -0
  49. data/lib/codebase_index/flow_analysis/operation_extractor.rb +1 -4
  50. data/lib/codebase_index/formatting/base.rb +0 -10
  51. data/lib/codebase_index/graph_analyzer.rb +1 -1
  52. data/lib/codebase_index/mcp/bootstrapper.rb +58 -0
  53. data/lib/codebase_index/mcp/renderers/markdown_renderer.rb +35 -34
  54. data/lib/codebase_index/mcp/renderers/plain_renderer.rb +29 -29
  55. data/lib/codebase_index/mcp/server.rb +59 -68
  56. data/lib/codebase_index/mcp/tool_response_renderer.rb +23 -0
  57. data/lib/codebase_index/notion/client.rb +2 -2
  58. data/lib/codebase_index/notion/mapper.rb +1 -0
  59. data/lib/codebase_index/notion/mappers/column_mapper.rb +3 -11
  60. data/lib/codebase_index/notion/mappers/model_mapper.rb +20 -23
  61. data/lib/codebase_index/notion/mappers/shared.rb +22 -0
  62. data/lib/codebase_index/observability/health_check.rb +0 -2
  63. data/lib/codebase_index/observability/structured_logger.rb +12 -30
  64. data/lib/codebase_index/operator/pipeline_guard.rb +0 -7
  65. data/lib/codebase_index/resilience/index_validator.rb +3 -21
  66. data/lib/codebase_index/retrieval/context_assembler.rb +19 -7
  67. data/lib/codebase_index/retrieval/query_classifier.rb +14 -12
  68. data/lib/codebase_index/retrieval/ranker.rb +6 -2
  69. data/lib/codebase_index/retrieval/search_executor.rb +8 -19
  70. data/lib/codebase_index/retriever.rb +1 -9
  71. data/lib/codebase_index/ruby_analyzer/class_analyzer.rb +5 -25
  72. data/lib/codebase_index/ruby_analyzer/dataflow_analyzer.rb +6 -7
  73. data/lib/codebase_index/ruby_analyzer/mermaid_renderer.rb +58 -53
  74. data/lib/codebase_index/ruby_analyzer/trace_enricher.rb +11 -7
  75. data/lib/codebase_index/session_tracer/file_store.rb +1 -8
  76. data/lib/codebase_index/session_tracer/redis_store.rb +1 -7
  77. data/lib/codebase_index/session_tracer/session_flow_assembler.rb +4 -13
  78. data/lib/codebase_index/session_tracer/solid_cache_store.rb +1 -7
  79. data/lib/codebase_index/session_tracer/store.rb +14 -0
  80. data/lib/codebase_index/storage/metadata_store.rb +37 -10
  81. data/lib/codebase_index/storage/pgvector.rb +37 -5
  82. data/lib/codebase_index/storage/qdrant.rb +39 -6
  83. data/lib/codebase_index/storage/vector_store.rb +11 -0
  84. data/lib/codebase_index/temporal/snapshot_store.rb +14 -10
  85. data/lib/codebase_index/token_utils.rb +19 -0
  86. data/lib/codebase_index/version.rb +1 -1
  87. data/lib/codebase_index.rb +25 -6
  88. data/lib/tasks/codebase_index.rake +2 -2
  89. metadata +11 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8ba2df9baa16005b8f3981639c0e1bca59bbff3382c1cd483e2a686d399054f4
4
- data.tar.gz: '0983e4f7e63febbe63631bff76caab5d11cc88dec9e360eab2a3ea33f6adb025'
3
+ metadata.gz: 982e7949df0e0db9249705ab9f009121c3c8156582c63712f0613fccc998337d
4
+ data.tar.gz: 4fb41c658901cd26606e44164da7059a7d62aa39c795a682020cfbb6252311be
5
5
  SHA512:
6
- metadata.gz: 25af1daf07cdbdbf810919db95f8dd9535b79c784e6fc74aad5885931221ca9c389507fd67eb1a1e5ec2cb3f5415fc111c4946db27a29df1882975a0741eb2f8
7
- data.tar.gz: 5a9e35f69d822a8cc0e12a82a0d33fad0fdc6fac61d65c88e4c00ccfe8fbda5b7f7bcde0a82f7051a3238e43b60f1a8143dc35a28486fbc8e6818770b9d98693
6
+ metadata.gz: 6b62fe0a1d8b0db683744214461ec5d0029e41cf4538b7313ce2701a3a985bf9b1d06ce955acb225db09e93aaa72bcbc5e3b40cd534f7d682d98d190a670d722
7
+ data.tar.gz: f0a948295982aa85951fa8cca96cc8c30b317176a53424fe18e50cc1d3e28b17df5fc9dfb6a191e6450a1318b3b0f81a0f2cdf20fe7326c0c4e492a7f8b47f70
data/CHANGELOG.md CHANGED
@@ -5,6 +5,66 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.3.1] - 2026-03-04
9
+
10
+ ### Fixed
11
+
12
+ - **Gemspec version** now reads from `version.rb` instead of being hardcoded — prevents version mismatch during gem builds
13
+ - **Release workflow** replaced `rake release` (fails on tag-triggered detached HEAD) with `gem build` + `gem push`
14
+
15
+ ## [0.3.0] - 2026-03-04
16
+
17
+ ### Added
18
+
19
+ - **Redis/SolidCache caching layer** for retrieval pipeline with TTL, namespace isolation, and nil-caching
20
+ - **Engine classification** — engines tagged as `:framework` or `:application` based on install path (handles Docker vendor paths)
21
+ - **Graph analysis staleness tracking** — `generated_at` timestamp and `graph_sha` for detecting stale analysis
22
+ - **Docker setup guide** (`docs/DOCKER_SETUP.md`) — split architecture, volume mounts, bridge mode, troubleshooting
23
+ - **Context7 documentation suite** — 10 new user-facing docs optimized for AI retrieval: FAQ, Troubleshooting, Architecture, Extractor Reference, WHY CodebaseIndex, MCP Tool Cookbook, and 3 Context7 skills
24
+ - **`context7.json`** configuration for controlling Context7 indexing scope
25
+
26
+ ### Fixed
27
+
28
+ - **Vendor path leak** in source file resolution across 9 extractors — framework gems under `vendor/bundle` no longer produce empty source
29
+ - **Prism cross-version compatibility** — handle API differences between Prism versions
30
+ - **`schema_sha`** now supports `db/structure.sql` fallback (not just `db/schema.rb`)
31
+ - **ViewComponent extractor** skips framework-internal components with no resolvable source file
32
+ - **HTTP connection reuse** and retry handling in embedding providers
33
+ - **DependencyGraph `to_h`** returns a dup to prevent cache pollution
34
+ - **MCP tool counts** corrected across all documentation (27 index / 31 console)
35
+ - **TROUBLESHOOTING.md** corrected: `config.extractors` controls retrieval scope, not which extractors run
36
+
37
+ ### Changed
38
+
39
+ - **README streamlined** from 620 to 325 lines — added Quick Start, Documentation table; removed verbose sections in favor of links to dedicated docs
40
+ - **Internal rake tasks** (`retrieve`, `self_analyze`) hidden from `rails -T`
41
+ - **Estimated tokens memoization** removed to prevent stale values after source changes
42
+ - **Simplification sweep** — dead code removal, shared helper extraction, bug fixes across caching and retrieval layers
43
+
44
+ ### Performance
45
+
46
+ - Critical hotspots fixed across extraction, storage, and retrieval pipelines
47
+ - `fetch_key` optimization for falsy value handling in cache layer
48
+
49
+ ## [0.2.1] - 2026-02-19
50
+
51
+ ### Changed
52
+
53
+ - Switch release workflow to RubyGems trusted publishing
54
+
55
+ ## [0.2.0] - 2026-02-19
56
+
57
+ ### Added
58
+
59
+ - **Embedded console MCP server** for zero-config Rails querying (no bridge process needed)
60
+ - **Console MCP setup guide** (`docs/CONSOLE_MCP_SETUP.md`) — stdio, Docker, HTTP/Rack, SSH bridge options
61
+ - **CODEOWNERS** and issue template configuration
62
+
63
+ ### Fixed
64
+
65
+ - MCP gem compatibility and symbol key handling in embedded executor
66
+ - Duplicate URI warning in gemspec
67
+
8
68
  ## [0.1.0] - 2026-02-18
9
69
 
10
70
  ### Added
data/README.md CHANGED
@@ -18,6 +18,21 @@ CodebaseIndex solves this by:
18
18
  - **Tracking dependencies** bidirectionally so you can trace impact across the codebase
19
19
  - **Enriching with git data** so you know what's actively changing vs. dormant
20
20
 
21
+ See [Why CodebaseIndex?](docs/WHY_CODEBASE_INDEX.md) for concrete before/after examples.
22
+
23
+ ## Quick Start
24
+
25
+ ```bash
26
+ # Add to your Rails app's Gemfile, then:
27
+ bundle install
28
+ rails generate codebase_index:install
29
+ bundle exec rake codebase_index:extract
30
+ bundle exec rake codebase_index:stats
31
+ # Add the MCP server to .mcp.json (see below) and start asking questions
32
+ ```
33
+
34
+ See [Getting Started](docs/GETTING_STARTED.md) for the full walkthrough including Docker, storage presets, and CI setup.
35
+
21
36
  ## Installation
22
37
 
23
38
  Add to your Gemfile:
@@ -30,15 +45,26 @@ Then:
30
45
 
31
46
  ```bash
32
47
  bundle install
48
+ rails generate codebase_index:install
49
+ rails db:migrate
50
+ ```
51
+
52
+ Create a minimal configuration:
53
+
54
+ ```ruby
55
+ # config/initializers/codebase_index.rb
56
+ CodebaseIndex.configure do |config|
57
+ config.output_dir = Rails.root.join('tmp/codebase_index')
58
+ end
33
59
  ```
34
60
 
35
- Or install directly:
61
+ Or install the gem directly:
36
62
 
37
63
  ```bash
38
64
  gem install codebase_index
39
65
  ```
40
66
 
41
- > **Requires Rails.** Extraction runs inside a booted Rails application using runtime introspection (`ActiveRecord::Base.descendants`, `Rails.application.routes`, etc.). The gem cannot extract from source files alone. See [Getting Started](docs/GETTING_STARTED.md) for setup.
67
+ > **Requires Rails.** Extraction runs inside a booted Rails application using runtime introspection (`ActiveRecord::Base.descendants`, `Rails.application.routes`, etc.). The gem cannot extract from source files alone. See [Getting Started](docs/GETTING_STARTED.md) for full setup details.
42
68
 
43
69
  ## Target Environment
44
70
 
@@ -99,85 +125,19 @@ Extraction runs inside the Rails application (via rake task) to access runtime i
99
125
 
100
126
  ### Extractors (34)
101
127
 
102
- **Core Application**
103
-
104
- | Extractor | What it captures |
105
- |-----------|-----------------|
106
- | **ModelExtractor** | Schema (columns, indexes, FKs), associations, validations, callbacks (all 13 types), scopes, enums, inlined concerns. Chunks large models into summary/associations/callbacks/validations. |
107
- | **ControllerExtractor** | Route mapping (verb → path → action), filter chains per action, response formats, permitted params. Per-action chunks with applicable filters and route context. |
108
- | **ServiceExtractor** | Scans `app/services`, `app/interactors`, `app/operations`, `app/commands`, `app/use_cases`. Entry points, dependency injection, custom errors, return type inference. |
109
- | **JobExtractor** | ActiveJob and Sidekiq workers. Queue config, retry/concurrency options, perform arguments, callbacks. |
110
- | **MailerExtractor** | ActionMailer classes with defaults, per-action templates, callbacks, helper usage. |
111
- | **ConfigurationExtractor** | Rails initializers from `config/initializers` and `config/environments`, plus behavioral profile from resolved `Rails.application.config`. |
112
- | **RouteExtractor** | All Rails routes via runtime introspection of `Rails.application.routes`. |
113
- | **MiddlewareExtractor** | Rack middleware stack as a single ordered unit. |
114
-
115
- **UI Components**
116
-
117
- | Extractor | What it captures |
118
- |-----------|-----------------|
119
- | **PhlexExtractor** | Phlex component slots, initialize params, sub-components, Stimulus controller references, route helpers. |
120
- | **ViewComponentExtractor** | ViewComponent slots, template paths, preview classes, collection support. |
121
- | **ViewTemplateExtractor** | ERB view templates with render calls, instance variables, helper usage. |
122
- | **DecoratorExtractor** | Decorators, presenters, and form objects from `app/decorators`, `app/presenters`, `app/form_objects`. |
123
-
124
- **Data Layer**
125
-
126
- | Extractor | What it captures |
127
- |-----------|-----------------|
128
- | **ConcernExtractor** | ActiveSupport::Concern modules from `app/models/concerns` and `app/controllers/concerns`. |
129
- | **PoroExtractor** | Plain Ruby objects in `app/models` (non-ActiveRecord classes, excluding concerns). |
130
- | **SerializerExtractor** | ActiveModelSerializers, Blueprinter, Alba, and Draper. Auto-detects loaded serialization gems. |
131
- | **ValidatorExtractor** | Custom ActiveModel validator classes with validation rules. |
132
- | **ManagerExtractor** | SimpleDelegator subclasses — wrapped model, public methods, delegation chain. |
133
-
134
- **API & Authorization**
135
-
136
- | Extractor | What it captures |
137
- |-----------|-----------------|
138
- | **GraphQLExtractor** | graphql-ruby types, mutations, queries, resolvers, field metadata, authorization patterns. Produces 4 unit types. |
139
- | **PunditExtractor** | Pundit authorization policies with action methods (index?, show?, create?, etc.). |
140
- | **PolicyExtractor** | Domain policy classes with decision methods and eligibility rules. |
141
-
142
- **Infrastructure**
143
-
144
- | Extractor | What it captures |
145
- |-----------|-----------------|
146
- | **EngineExtractor** | Mounted Rails engines via runtime introspection with mount points and route counts. |
147
- | **I18nExtractor** | Locale files from `config/locales` with translation key structures. |
148
- | **ActionCableExtractor** | ActionCable channels with stream subscriptions, actions, broadcast patterns. |
149
- | **ScheduledJobExtractor** | Scheduled jobs from `config/recurring.yml`, `config/sidekiq_cron.yml`, `config/schedule.rb`. |
150
- | **RakeTaskExtractor** | Rake tasks from `lib/tasks/*.rake` with namespaces, dependencies, descriptions. |
151
- | **MigrationExtractor** | ActiveRecord migrations with DDL metadata, table operations, reversibility, risk indicators. |
152
- | **DatabaseViewExtractor** | SQL views from `db/views` (Scenic convention) with materialization and table references. |
153
- | **StateMachineExtractor** | AASM, Statesman, and state_machines DSL definitions with states and transitions. |
154
- | **EventExtractor** | Event publish/subscribe patterns (ActiveSupport::Notifications, Wisper). |
155
- | **CachingExtractor** | Cache usage across controllers, models, and views — strategies, TTLs, cache keys. |
156
-
157
- **Testing & Source**
158
-
159
- | Extractor | What it captures |
160
- |-----------|-----------------|
161
- | **FactoryExtractor** | FactoryBot factory definitions with traits and associations. |
162
- | **TestMappingExtractor** | Test file → subject class mapping with test counts and framework type. |
163
- | **LibExtractor** | Ruby files from `lib/` (excluding tasks and generators). |
164
- | **RailsSourceExtractor** | High-value Rails framework source and gem source pinned to exact installed versions. |
165
-
166
- ### Key Design Decisions
167
-
168
- **Concern inlining.** When extracting a model, included concerns are read from disk and embedded as formatted comments directly in the model's source. This means the full behavioral picture is in one unit — no separate lookups needed during retrieval.
128
+ 34 extractors cover every major Rails concept: models (with inlined concerns and schema), controllers (with route context), services, jobs, mailers, GraphQL types/mutations/resolvers, serializers, view components (Phlex and ViewComponent), ERB templates, decorators, concerns, validators, policies, routes, middleware, engines, i18n, Action Cable, rake tasks, migrations, database views, state machines, events, caching patterns, factories, test mappings, and Rails framework source pinned to exact installed versions.
169
129
 
170
- **Route prepending.** Controller source gets a header block showing the HTTP routes that map to it, so the relationship between URLs and actions is immediately visible.
130
+ See [docs/EXTRACTOR_REFERENCE.md](docs/EXTRACTOR_REFERENCE.md) for per-extractor documentation with configuration, edge cases, and example output.
171
131
 
172
- **Semantic chunking.** Large models are split into purpose-specific chunks (summary, associations, callbacks, validations) rather than arbitrary size-based splits. Controllers chunk per-action with the relevant filters and route attached.
132
+ ### Key Design Decisions
173
133
 
174
- **Dependency graph with BFS blast radius.** The graph tracks both forward dependencies (what this unit uses) and reverse dependencies (what uses this unit). Changed-file impact is computed via breadth-first traversal — if a concern changes, every model including it gets re-indexed.
134
+ **Concern inlining** — included concerns are embedded directly in the model's source. **Route prepending** — controllers get a route header showing HTTP verb → path → action. **Semantic chunking** — models split by purpose (associations, callbacks, validations), controllers split per-action. **Dependency graph with BFS blast radius** — forward and reverse edges enable change-impact traversal.
175
135
 
176
136
  ## MCP Servers
177
137
 
178
138
  CodebaseIndex ships two [MCP](https://modelcontextprotocol.io/) servers for integrating with AI development tools (Claude Code, Cursor, Windsurf, etc.).
179
139
 
180
- **Index Server** (26 tools) — Reads pre-extracted data from disk. No Rails boot required. Provides code lookup, dependency traversal, graph analysis, semantic search, pipeline management, feedback collection, and temporal snapshots.
140
+ **Index Server** (27 tools) — Reads pre-extracted data from disk. No Rails boot required. Provides code lookup, dependency traversal, graph analysis, semantic search, pipeline management, feedback collection, and temporal snapshots.
181
141
 
182
142
  ```bash
183
143
  codebase-index-mcp /path/to/rails-app/tmp/codebase_index
@@ -199,7 +159,7 @@ Add the servers to your project's `.mcp.json`:
199
159
  {
200
160
  "mcpServers": {
201
161
  "codebase-index": {
202
- "command": "codebase-index-mcp",
162
+ "command": "codebase-index-mcp-start",
203
163
  "args": ["/path/to/rails-app/tmp/codebase_index"]
204
164
  },
205
165
  "codebase-console": {
@@ -211,217 +171,44 @@ Add the servers to your project's `.mcp.json`:
211
171
  }
212
172
  ```
213
173
 
214
- The **index server** reads from a pre-extracted directory run `bundle exec rake codebase_index:extract` in your Rails app first.
215
-
216
- The **console server** runs embedded inside your Rails app (no config file needed). For Docker:
174
+ > **Recommended**: Use `codebase-index-mcp-start` instead of `codebase-index-mcp` for Claude Code. It validates the index directory exists, checks for a manifest, ensures dependencies are installed, and restarts automatically on failure.
217
175
 
218
- ```json
219
- {
220
- "mcpServers": {
221
- "codebase-console": {
222
- "command": "docker",
223
- "args": ["exec", "-i", "my_container", "bundle", "exec", "rake", "codebase_index:console"]
224
- }
225
- }
226
- }
227
- ```
228
-
229
- ### Validation
230
-
231
- Verify each server starts and lists its tools:
232
-
233
- ```bash
234
- # Index server — should list 27 tools
235
- echo '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' | \
236
- codebase-index-mcp /path/to/rails-app/tmp/codebase_index
176
+ The **index server** reads from a pre-extracted directory — run `bundle exec rake codebase_index:extract` in your Rails app first.
237
177
 
238
- # Console server — should list 31 tools (requires Rails app)
239
- echo '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' | \
240
- bundle exec rake codebase_index:console
241
- ```
178
+ The **console server** runs embedded inside your Rails app (no config file needed). For Docker setups, see [docs/DOCKER_SETUP.md](docs/DOCKER_SETUP.md).
242
179
 
243
180
  ## Subsystems
244
181
 
245
182
  ```
246
- lib/
247
- ├── codebase_index.rb # Module interface, Configuration, entry point
248
- ├── codebase_index/
249
- ├── extracted_unit.rb # Core value object
250
- ├── extractor.rb # Orchestrator coordinates all extractors
251
- ├── dependency_graph.rb # Directed graph + PageRank scoring
252
- ├── graph_analyzer.rb # Structural analysis (orphans, hubs, cycles, bridges)
253
- ├── model_name_cache.rb # Precomputed regex for dependency scanning
254
- ├── retriever.rb # Retriever orchestrator with degradation tiers
255
- ├── builder.rb # DSL builder for configuration
256
- ├── version.rb # Gem version
257
- ├── railtie.rb # Rails integration
258
- │ │
259
- ├── extractors/ # 34 extractors (one per Rails concept)
260
- │ │ ├── model_extractor.rb # ActiveRecord models
261
- │ │ ├── controller_extractor.rb # ActionController
262
- │ │ ├── service_extractor.rb # Service objects
263
- │ │ ├── job_extractor.rb # ActiveJob/Sidekiq workers
264
- │ │ ├── mailer_extractor.rb # ActionMailer
265
- │ │ ├── phlex_extractor.rb # Phlex components
266
- │ │ ├── view_component_extractor.rb # ViewComponent
267
- │ │ ├── graphql_extractor.rb # GraphQL types, mutations, queries
268
- │ │ ├── serializer_extractor.rb # Serializers/decorators
269
- │ │ ├── manager_extractor.rb # SimpleDelegator managers
270
- │ │ ├── policy_extractor.rb # Policy classes
271
- │ │ ├── validator_extractor.rb # Standalone validators
272
- │ │ ├── rails_source_extractor.rb # Framework/gem source
273
- │ │ ├── shared_dependency_scanner.rb # Shared dependency detection
274
- │ │ ├── shared_utility_methods.rb # Shared extractor utilities
275
- │ │ └── ast_source_extraction.rb # AST-based source extraction
276
- │ │
277
- │ ├── ast/ # Prism-based AST layer
278
- │ │ ├── parser.rb # Source parsing adapter
279
- │ │ ├── node.rb # Normalized AST node
280
- │ │ ├── method_extractor.rb # Method boundary detection
281
- │ │ └── call_site_extractor.rb # Call site analysis
282
- │ │
283
- │ ├── ruby_analyzer/ # Static analysis
284
- │ │ ├── class_analyzer.rb # Class structure analysis
285
- │ │ ├── method_analyzer.rb # Method complexity/dependencies
286
- │ │ ├── dataflow_analyzer.rb # Data flow tracing
287
- │ │ ├── trace_enricher.rb # Enriches flow traces
288
- │ │ ├── fqn_builder.rb # Fully-qualified name resolution
289
- │ │ └── mermaid_renderer.rb # Diagram generation
290
- │ │
291
- │ ├── flow_analysis/ # Execution flow tracing
292
- │ │ ├── operation_extractor.rb # Extract operations from AST
293
- │ │ └── response_code_mapper.rb # HTTP response mapping
294
- │ ├── flow_assembler.rb # Assembles execution flows
295
- │ ├── flow_document.rb # Flow documentation format
296
- │ │
297
- │ ├── chunking/ # Semantic chunking
298
- │ │ ├── chunk.rb # Chunk value object
299
- │ │ └── semantic_chunker.rb # Type-aware splitting
300
- │ │
301
- │ ├── embedding/ # Embedding pipeline
302
- │ │ ├── provider.rb # Provider interface
303
- │ │ ├── openai.rb # OpenAI adapter
304
- │ │ ├── text_preparer.rb # Text preparation for embedding
305
- │ │ └── indexer.rb # Batch indexing with resumability
306
- │ │
307
- │ ├── storage/ # Storage backends
308
- │ │ ├── vector_store.rb # Vector store interface + InMemory
309
- │ │ ├── metadata_store.rb # Metadata store interface + InMemory/SQLite
310
- │ │ ├── graph_store.rb # Graph store interface + InMemory
311
- │ │ ├── pgvector.rb # PostgreSQL pgvector adapter
312
- │ │ └── qdrant.rb # Qdrant adapter
313
- │ │
314
- │ ├── retrieval/ # Retrieval pipeline
315
- │ │ ├── query_classifier.rb # Intent/scope/type classification
316
- │ │ ├── search_executor.rb # Multi-strategy search
317
- │ │ ├── ranker.rb # RRF-based ranking
318
- │ │ └── context_assembler.rb # Token-budgeted context assembly
319
- │ │
320
- │ ├── formatting/ # LLM context formatting
321
- │ │ ├── base.rb # Base formatter
322
- │ │ ├── claude_adapter.rb # Claude-optimized output
323
- │ │ ├── gpt_adapter.rb # GPT-optimized output
324
- │ │ ├── generic_adapter.rb # Generic LLM output
325
- │ │ └── human_adapter.rb # Human-readable output
326
- │ │
327
- │ ├── mcp/ # MCP Index Server (26 tools)
328
- │ │ ├── server.rb # Tool definitions + dispatch
329
- │ │ └── index_reader.rb # JSON index reader
330
- │ │
331
- │ ├── console/ # Console MCP Server (31 tools)
332
- │ │ ├── server.rb # Console server + tool registration
333
- │ │ ├── bridge.rb # JSON-lines protocol bridge
334
- │ │ ├── safe_context.rb # Transaction rollback + timeout
335
- │ │ ├── connection_manager.rb # Docker/direct/SSH modes
336
- │ │ ├── model_validator.rb # AR schema validation
337
- │ │ ├── sql_validator.rb # SQL statement validation
338
- │ │ ├── audit_logger.rb # JSONL audit logging
339
- │ │ ├── confirmation.rb # Human-in-the-loop confirmation
340
- │ │ ├── tools/
341
- │ │ │ ├── tier1.rb # 9 safe read-only tools
342
- │ │ │ ├── tier2.rb # 9 domain-aware tools
343
- │ │ │ ├── tier3.rb # 10 analytics tools
344
- │ │ │ └── tier4.rb # 3 guarded tools
345
- │ │ └── adapters/
346
- │ │ ├── sidekiq_adapter.rb # Sidekiq job backend
347
- │ │ ├── solid_queue_adapter.rb # Solid Queue job backend
348
- │ │ ├── good_job_adapter.rb # GoodJob job backend
349
- │ │ └── cache_adapter.rb # Cache backend adapters
350
- │ │
351
- │ ├── coordination/ # Multi-agent coordination
352
- │ │ └── pipeline_lock.rb # File-based pipeline locking
353
- │ │
354
- │ ├── feedback/ # Agent self-service
355
- │ │ ├── store.rb # JSONL feedback storage
356
- │ │ └── gap_detector.rb # Feedback-driven gap detection
357
- │ │
358
- │ ├── operator/ # Pipeline management
359
- │ │ ├── status_reporter.rb # Pipeline status
360
- │ │ ├── error_escalator.rb # Error classification
361
- │ │ └── pipeline_guard.rb # Rate limiting
362
- │ │
363
- │ ├── observability/ # Instrumentation
364
- │ │ ├── instrumentation.rb # ActiveSupport::Notifications
365
- │ │ ├── structured_logger.rb # JSON structured logging
366
- │ │ └── health_check.rb # Component health checks
367
- │ │
368
- │ ├── resilience/ # Fault tolerance
369
- │ │ ├── circuit_breaker.rb # Circuit breaker pattern
370
- │ │ ├── retryable_provider.rb # Retry with backoff
371
- │ │ └── index_validator.rb # Index integrity validation
372
- │ │
373
- │ ├── db/ # Schema management
374
- │ │ ├── schema_version.rb # Version tracking
375
- │ │ ├── migrator.rb # Standalone migration runner
376
- │ │ └── migrations/
377
- │ │ ├── 001_create_units.rb
378
- │ │ ├── 002_create_edges.rb
379
- │ │ └── 003_create_embeddings.rb
380
- │ │
381
- │ ├── session_tracer/ # Session tracing middleware + stores
382
- │ │ ├── middleware.rb # Rack middleware
383
- │ │ ├── file_store.rb # File-based trace storage
384
- │ │ ├── redis_store.rb # Redis trace storage
385
- │ │ └── solid_cache_store.rb # SolidCache trace storage
386
- │ │
387
- │ ├── temporal/ # Temporal snapshot system
388
- │ │ ├── snapshot_store.rb # Snapshot persistence + diff
389
- │ │ └── snapshot_metadata.rb # Snapshot metadata
390
- │ │
391
- │ └── evaluation/ # Retrieval evaluation
392
- │ ├── query_set.rb # Evaluation query loading
393
- │ ├── metrics.rb # Precision@k, Recall, MRR
394
- │ ├── evaluator.rb # Query evaluation
395
- │ ├── baseline_runner.rb # Grep/random/file baselines
396
- │ └── report_generator.rb # JSON report generation
397
-
398
- ├── generators/codebase_index/ # Rails generators
399
- │ ├── install_generator.rb # Initial setup
400
- │ └── pgvector_generator.rb # pgvector migration
401
-
402
- ├── tasks/
403
- │ └── codebase_index.rake # Rake task definitions
404
-
405
- exe/
406
- ├── codebase-index-mcp # MCP Index Server executable (stdio)
407
- ├── codebase-index-mcp-start # Self-healing MCP wrapper
408
- ├── codebase-index-mcp-http # MCP Index Server (HTTP/Rack)
409
- └── codebase-console-mcp # Console MCP Server executable
410
- ```
183
+ lib/codebase_index/
184
+ ├── extractor.rb # Orchestrator — coordinates all 34 extractors
185
+ ├── extracted_unit.rb # Core value object (the universal currency)
186
+ ├── dependency_graph.rb # Directed graph + PageRank scoring
187
+ ├── graph_analyzer.rb # Structural analysis (orphans, hubs, cycles, bridges)
188
+ ├── retriever.rb # Retrieval orchestrator with degradation tiers
189
+ ├── extractors/ # 34 extractors (one per Rails concept)
190
+ ├── ast/ # Prism-based AST layer
191
+ ├── ruby_analyzer/ # Static analysis (class, method, dataflow)
192
+ ├── chunking/ # Semantic chunking (type-aware splitting)
193
+ ├── embedding/ # Embedding pipeline (OpenAI, Ollama)
194
+ ├── storage/ # Storage backends (pgvector, Qdrant, SQLite)
195
+ ├── retrieval/ # Retrieval pipeline (classify, search, rank, assemble)
196
+ ├── mcp/ # MCP Index Server (27 tools)
197
+ ├── console/ # Console MCP Server (31 tools, 4 tiers)
198
+ ├── coordination/ # Multi-agent pipeline locking
199
+ ├── notion/ # Notion export
200
+ ├── session_tracer/ # Session tracing middleware
201
+ ├── temporal/ # Temporal snapshot system
202
+ └── evaluation/ # Retrieval evaluation harness
411
203
 
412
- ## Context Assembly
413
-
414
- When serving context to an LLM, token budget is allocated in layers:
415
-
416
- ```
417
- Budget Allocation:
418
- ├── 10% Structural overview (always included)
419
- ├── 50% Primary relevant units
420
- ├── 25% Supporting context (dependencies)
421
- └── 15% Framework reference (when needed)
204
+ exe/
205
+ ├── codebase-index-mcp # Index Server executable (stdio)
206
+ ├── codebase-index-mcp-start # Self-healing MCP wrapper
207
+ ├── codebase-index-mcp-http # Index Server (HTTP/Rack)
208
+ └── codebase-console-mcp # Console MCP Server executable
422
209
  ```
423
210
 
424
- Queries are classified to determine whether framework source context is needed. "What options does has_many support?" routes to Rails source; "how do we handle checkout?" routes to application code.
211
+ See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full pipeline explanation — extraction phases, dependency graph, retrieval pipeline, storage backends, and semantic chunking.
425
212
 
426
213
  ## Usage
427
214
 
@@ -434,43 +221,40 @@ bundle exec rake codebase_index:extract
434
221
  ### Incremental (CI)
435
222
 
436
223
  ```bash
437
- # Auto-detects GitHub Actions / GitLab CI environment
438
224
  bundle exec rake codebase_index:incremental
439
225
  ```
440
226
 
441
- ```yaml
442
- # .github/workflows/index.yml
443
- jobs:
444
- index:
445
- runs-on: ubuntu-latest
446
- steps:
447
- - uses: actions/checkout@v4
448
- with:
449
- fetch-depth: 2
450
- - name: Update index
451
- run: bundle exec rake codebase_index:incremental
452
- env:
453
- GITHUB_BASE_REF: ${{ github.base_ref }}
454
- ```
227
+ Auto-detects GitHub Actions / GitLab CI environment. See [Getting Started](docs/GETTING_STARTED.md) for CI workflow YAML.
228
+
229
+ ### Docker
455
230
 
456
- ### Framework-Only (on dependency changes)
231
+ Extraction runs inside the container; the Index Server runs on the host and reads the volume-mounted output. See [docs/DOCKER_SETUP.md](docs/DOCKER_SETUP.md) for Docker setup, MCP config, and troubleshooting.
457
232
 
458
233
  ```bash
459
- bundle exec rake codebase_index:extract_framework
234
+ docker compose exec app bundle exec rake codebase_index:extract
460
235
  ```
461
236
 
462
237
  ### Other Tasks
463
238
 
464
239
  ```bash
465
- rake codebase_index:validate # Check index integrity
466
- rake codebase_index:stats # Show unit counts, sizes, graph stats
467
- rake codebase_index:clean # Remove index
240
+ rake codebase_index:validate # Check index integrity
241
+ rake codebase_index:stats # Show unit counts, sizes, graph stats
242
+ rake codebase_index:clean # Remove index
243
+ rake codebase_index:embed # Embed all extracted units
244
+ rake codebase_index:embed_incremental # Embed changed units only
245
+ rake codebase_index:flow[EntryPoint] # Generate execution flow for an entry point
246
+ rake codebase_index:console # Start console MCP server
247
+ rake codebase_index:notion_sync # Sync models/columns to Notion databases
468
248
  ```
469
249
 
250
+ See [docs/NOTION_INTEGRATION.md](docs/NOTION_INTEGRATION.md) for Notion export configuration.
251
+
470
252
  ### Ruby API
471
253
 
254
+ > **Requires a booted Rails environment.** These methods use runtime introspection and must be called from within a Rails process (console, rake task, initializer).
255
+
472
256
  ```ruby
473
- # Full extraction
257
+ # Full extraction (output_dir from configuration)
474
258
  CodebaseIndex.extract!
475
259
 
476
260
  # Incremental
@@ -510,13 +294,24 @@ tmp/codebase_index/
510
294
 
511
295
  Each unit JSON contains: `identifier`, `type`, `file_path`, `source_code` (annotated), `metadata` (rich structured data), `dependencies`, `dependents`, `chunks` (if applicable), and `estimated_tokens`.
512
296
 
513
- ## Development
297
+ ## Documentation
514
298
 
515
- After checking out the repo:
299
+ | Guide | Purpose |
300
+ |-------|---------|
301
+ | [Getting Started](docs/GETTING_STARTED.md) | Install, configure, extract, inspect |
302
+ | [FAQ](docs/FAQ.md) | Common questions about setup, extraction, MCP, Docker |
303
+ | [Troubleshooting](docs/TROUBLESHOOTING.md) | Symptom → cause → fix for common problems |
304
+ | [Architecture](docs/ARCHITECTURE.md) | Pipeline stages, dependency graph, retrieval, storage |
305
+ | [Extractor Reference](docs/EXTRACTOR_REFERENCE.md) | What each of the 34 extractors captures |
306
+ | [MCP Servers](docs/MCP_SERVERS.md) | Full tool catalog and setup for Claude Code, Cursor, Windsurf |
307
+ | [MCP Tool Cookbook](docs/MCP_TOOL_COOKBOOK.md) | Scenario-based examples for common tasks |
308
+ | [Configuration Reference](docs/CONFIGURATION_REFERENCE.md) | All options with defaults |
309
+ | [Backend Matrix](docs/BACKEND_MATRIX.md) | Supported infrastructure combinations |
310
+
311
+ ## Development
516
312
 
517
313
  ```bash
518
314
  bin/setup # Install dependencies
519
- bin/console # Interactive prompt
520
315
  bundle exec rake spec # Run tests
521
316
  bundle exec rubocop # Lint
522
317
  ```
@@ -15,40 +15,12 @@ require_relative '../lib/codebase_index'
15
15
  require_relative '../lib/codebase_index/dependency_graph'
16
16
  require_relative '../lib/codebase_index/graph_analyzer'
17
17
  require_relative '../lib/codebase_index/mcp/server'
18
+ require_relative '../lib/codebase_index/mcp/bootstrapper'
18
19
  require_relative '../lib/codebase_index/embedding/text_preparer'
19
20
  require_relative '../lib/codebase_index/embedding/indexer'
20
21
 
21
- index_dir = ARGV[0] || ENV['CODEBASE_INDEX_DIR'] || Dir.pwd
22
-
23
- unless Dir.exist?(index_dir)
24
- warn "Error: Index directory does not exist: #{index_dir}"
25
- exit 1
26
- end
27
-
28
- unless File.exist?(File.join(index_dir, 'manifest.json'))
29
- warn "Error: No manifest.json found in: #{index_dir}"
30
- warn 'Run `bundle exec rake codebase_index:extract` in your Rails app first.'
31
- exit 1
32
- end
33
-
34
- # Attempt to build a retriever for semantic search.
35
- # Auto-configures from environment variables when no explicit configuration exists.
36
- retriever = begin
37
- config = CodebaseIndex.configuration
38
-
39
- if !config.embedding_provider && ENV.fetch('OPENAI_API_KEY', nil)
40
- config.vector_store = :in_memory
41
- config.metadata_store = :in_memory
42
- config.graph_store = :in_memory
43
- config.embedding_provider = :openai
44
- config.embedding_options = { api_key: ENV.fetch('OPENAI_API_KEY', nil) }
45
- end
46
-
47
- CodebaseIndex::Builder.new(config).build_retriever if config.embedding_provider
48
- rescue StandardError => e
49
- warn "Note: Semantic search unavailable (#{e.message}). Using pattern-based search only."
50
- nil
51
- end
22
+ index_dir = CodebaseIndex::MCP::Bootstrapper.resolve_index_dir(ARGV)
23
+ retriever = CodebaseIndex::MCP::Bootstrapper.build_retriever
52
24
 
53
25
  server = CodebaseIndex::MCP::Server.build(index_dir: index_dir, retriever: retriever)
54
26