codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. codebase_index/__init__.py +7 -0
  2. codebase_index/__main__.py +3 -0
  3. codebase_index/cli.py +916 -0
  4. codebase_index/config.py +110 -0
  5. codebase_index/discovery/__init__.py +10 -0
  6. codebase_index/discovery/classify.py +151 -0
  7. codebase_index/discovery/ignore.py +58 -0
  8. codebase_index/discovery/walker.py +75 -0
  9. codebase_index/doctor.py +138 -0
  10. codebase_index/embeddings/__init__.py +2 -0
  11. codebase_index/embeddings/backend.py +67 -0
  12. codebase_index/embeddings/external.py +56 -0
  13. codebase_index/embeddings/local.py +41 -0
  14. codebase_index/embeddings/noop.py +15 -0
  15. codebase_index/graph/__init__.py +8 -0
  16. codebase_index/graph/analysis.py +468 -0
  17. codebase_index/graph/builder.py +160 -0
  18. codebase_index/graph/expand.py +136 -0
  19. codebase_index/graph/export.py +381 -0
  20. codebase_index/graph/navigate.py +201 -0
  21. codebase_index/indexer/__init__.py +8 -0
  22. codebase_index/indexer/doc_chunks.py +202 -0
  23. codebase_index/indexer/freshness.py +109 -0
  24. codebase_index/indexer/pipeline.py +423 -0
  25. codebase_index/mcp/__init__.py +2 -0
  26. codebase_index/mcp/server.py +354 -0
  27. codebase_index/models.py +145 -0
  28. codebase_index/output/__init__.py +6 -0
  29. codebase_index/output/json.py +13 -0
  30. codebase_index/output/markdown.py +316 -0
  31. codebase_index/output/redact.py +31 -0
  32. codebase_index/parsers/__init__.py +9 -0
  33. codebase_index/parsers/base.py +47 -0
  34. codebase_index/parsers/languages.py +290 -0
  35. codebase_index/parsers/line_chunker.py +39 -0
  36. codebase_index/parsers/symbol_chunks.py +62 -0
  37. codebase_index/parsers/treesitter.py +439 -0
  38. codebase_index/retrieval/__init__.py +9 -0
  39. codebase_index/retrieval/budget.py +82 -0
  40. codebase_index/retrieval/fusion.py +62 -0
  41. codebase_index/retrieval/intent.py +56 -0
  42. codebase_index/retrieval/pipeline.py +207 -0
  43. codebase_index/retrieval/rerank.py +69 -0
  44. codebase_index/retrieval/searchers.py +291 -0
  45. codebase_index/retrieval/skeleton.py +251 -0
  46. codebase_index/retrieval/types.py +79 -0
  47. codebase_index/scaffold.py +399 -0
  48. codebase_index/service.py +158 -0
  49. codebase_index/skill_template/SKILL.md +198 -0
  50. codebase_index/skill_template/examples/hooks/settings.json +16 -0
  51. codebase_index/skill_template/scripts/cbx +25 -0
  52. codebase_index/skill_template/scripts/cbx.ps1 +25 -0
  53. codebase_index/skill_update.py +150 -0
  54. codebase_index/storage/__init__.py +8 -0
  55. codebase_index/storage/db.py +116 -0
  56. codebase_index/storage/repo.py +701 -0
  57. codebase_index/storage/schema.sql +125 -0
  58. codebase_index/watch/__init__.py +5 -0
  59. codebase_index/watch/watcher.py +93 -0
  60. codebase_index-1.6.0.dist-info/METADATA +748 -0
  61. codebase_index-1.6.0.dist-info/RECORD +64 -0
  62. codebase_index-1.6.0.dist-info/WHEEL +4 -0
  63. codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
  64. codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,748 @@
1
+ Metadata-Version: 2.4
2
+ Name: codebase-index
3
+ Version: 1.6.0
4
+ Summary: Local-first hybrid codebase index for AI coding agents, exposed as CLI, Skill, and MCP tools.
5
+ Project-URL: Homepage, https://github.com/denfry/codebase-index
6
+ Project-URL: Documentation, https://github.com/denfry/codebase-index/tree/main/docs
7
+ Project-URL: Changelog, https://github.com/denfry/codebase-index/blob/main/CHANGELOG.md
8
+ Project-URL: Issues, https://github.com/denfry/codebase-index/issues
9
+ Author: codebase-index contributors
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: ai-agents,claude-code,cli,code-search,codebase-indexing,codebase-rag,codex-cli,fts5,local-first,mcp,opencode,rag,semantic-code-search,sqlite,tree-sitter
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Software Development :: Libraries
22
+ Classifier: Topic :: Text Processing :: Indexing
23
+ Requires-Python: >=3.11
24
+ Requires-Dist: pathspec>=0.12
25
+ Requires-Dist: pydantic>=2.6
26
+ Requires-Dist: rich>=13.0
27
+ Requires-Dist: tree-sitter-language-pack==1.8.1
28
+ Requires-Dist: tree-sitter==0.25.2
29
+ Requires-Dist: typer>=0.12
30
+ Provides-Extra: build
31
+ Requires-Dist: build>=1.2; extra == 'build'
32
+ Requires-Dist: twine>=5.0; extra == 'build'
33
+ Provides-Extra: dev
34
+ Requires-Dist: mcp>=1.0; extra == 'dev'
35
+ Requires-Dist: mypy>=1.10; extra == 'dev'
36
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
37
+ Requires-Dist: pytest>=8.0; extra == 'dev'
38
+ Requires-Dist: pyyaml>=6.0; extra == 'dev'
39
+ Requires-Dist: ruff>=0.5; extra == 'dev'
40
+ Provides-Extra: embeddings
41
+ Requires-Dist: numpy>=1.26; extra == 'embeddings'
42
+ Requires-Dist: sqlite-vec>=0.1; extra == 'embeddings'
43
+ Provides-Extra: embeddings-local
44
+ Requires-Dist: sentence-transformers>=3.0; extra == 'embeddings-local'
45
+ Provides-Extra: mcp
46
+ Requires-Dist: mcp>=1.0; extra == 'mcp'
47
+ Provides-Extra: watch
48
+ Requires-Dist: watchdog>=4.0; extra == 'watch'
49
+ Description-Content-Type: text/markdown
50
+
51
+ # codebase-index: Local Codebase Indexing for AI Coding Agents
52
+
53
+ `codebase-index` is a local-first codebase indexing tool that helps Claude Code,
54
+ Codex CLI, OpenCode, and other AI coding agents find relevant files, symbols, and
55
+ references without scanning an entire repository.
56
+
57
+ [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
58
+ [![Python](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/)
59
+ [![CI](https://github.com/denfry/codebase-index/actions/workflows/ci.yml/badge.svg)](https://github.com/denfry/codebase-index/actions)
60
+ [![Claude Code Skill](https://img.shields.io/badge/Claude%20Code%20Skill-yes-green.svg)](skill/SKILL.md)
61
+ [![Codex CLI](https://img.shields.io/badge/Codex%20CLI-supported-green.svg)](#which-ai-clis-does-codebase-index-support)
62
+ [![OpenCode](https://img.shields.io/badge/OpenCode-supported-green.svg)](#which-ai-clis-does-codebase-index-support)
63
+ [![MCP](https://img.shields.io/badge/MCP-stdio%20server-green.svg)](docs/MCP.md)
64
+ [![Local First](https://img.shields.io/badge/local--first-yes-green.svg)](#safety-and-privacy)
65
+ [![No Telemetry](https://img.shields.io/badge/no%20telemetry-yes-green.svg)](#safety-and-privacy)
66
+ [![No Network By Default](https://img.shields.io/badge/no%20network%20by%20default-yes-green.svg)](#safety-and-privacy)
67
+ [![SQLite](https://img.shields.io/badge/database-SQLite-blue.svg)](docs/DATABASE_SCHEMA.md)
68
+ [![Tree-sitter](https://img.shields.io/badge/parsing-Tree--sitter-orange.svg)](docs/ARCHITECTURE.md)
69
+
70
+ <p align="center">
71
+ <img src="assets/demo.png" width="820"
72
+ alt="codebase-index ranking a local search for 'where is user authentication implemented?' into scored files with recommended file:line ranges to read">
73
+ </p>
74
+
75
+ ## What Is codebase-index?
76
+
77
+ **codebase-index is a private, offline retrieval layer for AI code search.** It
78
+ builds a SQLite index of your repository, extracts symbols with Tree-sitter,
79
+ ranks matches with hybrid retrieval, and returns compact file:line ranges that
80
+ an AI coding agent can read instead of opening broad file sets.
81
+
82
+ Use it when you want Cursor-like codebase awareness in terminal-based AI tools
83
+ while keeping source code, snippets, and search metadata on your machine.
84
+
85
+ > **codebase-index is not an IDE and not a coding agent.** It is the local
86
+ > retrieval/index layer that gives terminal and MCP-based AI agents precise
87
+ > codebase context. The agent stays your interface; this gives it better aim.
88
+
89
+ ## Who Is It For?
90
+
91
+ - **Claude Code / Codex CLI / OpenCode users** on medium-to-large repos who want
92
+ the agent to read 3 ranked files instead of grepping and scanning 60.
93
+ - **Privacy-constrained teams** (proprietary or regulated code) who cannot send
94
+ source to a cloud code-intelligence service.
95
+ - **MCP power users** who want a stable, queryable code index as a tool, not a
96
+ black box baked into one agent's prompt.
97
+ - **Tooling authors** who need scriptable retrieval (`--json`, SQLite, MCP) that
98
+ other tools can build on.
99
+
100
+ Not for you if you want a full IDE, org-scale multi-repo search, or a hosted
101
+ platform — use Cursor or Sourcegraph for those.
102
+
103
+ ## Start Here
104
+
105
+ If you are opening this repository for the first time, follow this order:
106
+
107
+ 1. [Quick Start (5 minutes)](docs/QUICKSTART.md)
108
+ 2. [Installation Guide](docs/INSTALLATION.md)
109
+ 3. [Benchmarks](docs/BENCHMARKS.md)
110
+ 4. [How the skill works](skill/SKILL.md)
111
+ 5. [MCP server](docs/MCP.md)
112
+ 6. [FAQ](docs/FAQ.md)
113
+
114
+ If you only need the shortest path, run:
115
+
116
+ ```bash
117
+ pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.6.0"
118
+ cd your-project
119
+ codebase-index init # prompts for Claude Code / Codex CLI / OpenCode
120
+ codebase-index index
121
+ codebase-index search "where is authentication implemented?"
122
+ ```
123
+
124
+ ## Project Status
125
+
126
+ **`1.6.0` is released.** The current release includes repository discovery,
127
+ SQLite FTS5 storage, Tree-sitter symbols and references, hybrid ranking, graph
128
+ impact analysis, token-budgeted retrieval packets, optional local embeddings,
129
+ hooks/watch support, multi-CLI installation, MCP server support, and a tested
130
+ GitHub-only `pipx` install path.
131
+
132
+ The `1.6.0` release turns the dependency graph into a navigable map: every edge
133
+ carries a `confidence` audit trail (`extracted`/`inferred`/`ambiguous`, surfaced
134
+ in `refs`/`impact`); a new zero-dependency analytics pass computes modules
135
+ (communities), god nodes, and surprising cross-module links, exposed via the
136
+ `architecture` command/MCP tool; `path` traces the shortest dependency chain
137
+ between two symbols and `describe` prints a symbol's node card; and the HTML
138
+ graph is coloured by module and sized by connectivity, with `--format
139
+ graphml|dot|neo4j` exports for external tools. Upgrading requires a one-time reindex
140
+ (schema 2 → 3).
141
+
142
+ The earlier `1.4.0` release hardened the MCP contract (a `schema_version` +
143
+ `tool` envelope on every payload, golden-locked per tool, plus a fix so the
144
+ server loads on current `mcp`/`pydantic`), dampened the god-class `in_degree`
145
+ rerank tiebreak (logarithmic, validated no-regression on the public benchmark),
146
+ and labelled config/IaC files (Dockerfile, Terraform, HCL, INI, Makefiles) so
147
+ infra surfaces in `stats` and search.
148
+
149
+ The earlier `1.3.0` release added a content-addressed embedding cache (rebuilds reuse
150
+ vectors for unchanged content), a batched graph build (7–28× faster edge
151
+ resolution plus a new `edges(file_id)` index), a shared CLI/MCP service layer
152
+ (MCP hybrid search now uses the vector channel; `index_stats` reports the
153
+ per-language graph tier), graph-coverage signals in `stats`/`refs`/`impact`,
154
+ CLI pagination via `search --offset`, and single-source versioning with a CI
155
+ gate that keeps every committed skill copy in sync.
156
+ The `1.2.1` release added skill auto-update/rollback commands and version
157
+ stamps so installed skills stay in sync with the package automatically.
158
+ See [CHANGELOG.md](CHANGELOG.md) and
159
+ [docs/ROADMAP.md](docs/ROADMAP.md).
160
+
161
+ MCP is now available as a stdio server via `codebase-index mcp --root <repo>`.
162
+ It exposes `healthcheck`, `search_code`, `find_symbol`, `find_refs`,
163
+ `impact_of`, `explain_code`, `architecture_overview`, `path_between`,
164
+ `describe_symbol`, and `index_stats`; see [docs/MCP.md](docs/MCP.md).
165
+
166
+ ```
167
+ You: "Where is user authentication implemented?"
168
+ Agent: searches local index (symbols + FTS5 + graph)
169
+ reads only 3 ranked files instead of scanning 60
170
+ answers with citations: src/auth/AuthService.ts:12-148
171
+ ```
172
+
173
+ ---
174
+
175
+ ## How Do I Install codebase-index?
176
+
177
+ For most users, install the package from the tagged GitHub release and run
178
+ `init` inside the repository you want to index:
179
+
180
+ ```bash
181
+ pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.6.0"
182
+ cd your-project
183
+ codebase-index init # choose Claude Code, Codex CLI, OpenCode, or all
184
+ codebase-index index
185
+ ```
186
+
187
+ In a non-interactive script, pass a target explicitly:
188
+
189
+ ```bash
190
+ codebase-index init --target auto # install into detected AI CLIs
191
+ codebase-index init --target codex # write AGENTS.md + Codex resources
192
+ codebase-index init --target claude # write .claude/skills/codebase-index
193
+ codebase-index init --target opencode # write OpenCode command + agent files
194
+ ```
195
+
196
+ ### Install as a Claude Code plugin
197
+
198
+ Run these two commands in Claude Code:
199
+
200
+ ```
201
+ /plugin marketplace add denfry/codebase-index
202
+ /plugin install codebase-index@codebase-index
203
+ ```
204
+
205
+ Or just ask: "install the codebase-index plugin".
206
+
207
+ **What happens on first run:** when a session starts, a `SessionStart` hook
208
+ (`scripts/bootstrap.sh` / `.ps1`) creates a private Python virtual environment under
209
+ `~/.claude/plugins/data/codebase-index-*/venv` and installs the pinned
210
+ `codebase-index` package (from `requirements.lock`) into it — using `uv` if present,
211
+ otherwise `python -m venv` + `pip`. It reinstalls only when the lock file changes.
212
+ Nothing is installed globally; uninstalling the plugin removes the data directory.
213
+
214
+ **Prerequisite:** Python 3.11+ on your PATH. The first install needs network access to
215
+ fetch the package; later sessions are offline. The skill builds its index on
216
+ your first codebase question, so there is no manual `index` step.
217
+
218
+ **Distribution note:** the plugin bootstrap installs the pinned requirement from
219
+ `requirements.lock`. In `1.6.0`, that lock points at the tagged GitHub release
220
+ instead of PyPI. You can override it with `CBX_INSTALL_SPEC` when testing a local
221
+ checkout or a different Git ref.
222
+
223
+ ## What Problem Does codebase-index Solve?
224
+
225
+ AI coding agents struggle with large repositories when they rely on broad file
226
+ reads, grep output, or user-provided context. `codebase-index` gives those agents
227
+ a ranked local retrieval packet before they read source files.
228
+
229
+ - **Token waste** — Scanning entire files or running broad grep/glob queries burns through the context window on irrelevant content.
230
+ - **No symbol awareness** — Standard search can't distinguish a function definition from a call, or a class from a variable.
231
+ - **No ranking** — Grep returns all matches with no relevance ordering. The agent must read everything.
232
+ - **No context** — Grep doesn't know which files are related or what to read next.
233
+ - **Cloud dependency** — External code indexing services send your proprietary code to remote servers.
234
+
235
+ Developers get Cursor-like codebase awareness in Claude Code, Codex CLI, and
236
+ OpenCode without leaving the terminal or sending code to a remote indexing
237
+ service.
238
+
239
+ ## How Is This Different?
240
+
241
+ Short answers to the questions people actually ask. The full, honest matrix —
242
+ including when you should pick the other tool — is in
243
+ [docs/COMPARISON.md](docs/COMPARISON.md).
244
+
245
+ - **Why not just `grep`/`rg`?** Grep returns every match with no ranking, no
246
+ symbol awareness, and no idea which files relate. codebase-index ranks results,
247
+ knows a definition from a call, expands along the dependency graph, and returns
248
+ specific line ranges under a token budget — so the agent reads less and answers
249
+ with citations.
250
+ - **Why not Cursor?** Cursor is a great AI IDE with strong codebase awareness, but
251
+ it is proprietary and IDE-centric. codebase-index is a local, open retrieval
252
+ layer for **terminal and MCP** agents, offline by default, with no IDE lock-in.
253
+ If you live inside Cursor, keep using Cursor.
254
+ - **Why not Aider repo-map?** Aider's repo-map is a good graph-ranked,
255
+ token-budgeted context map — but it is optimized to feed Aider's own chat.
256
+ codebase-index is a **reusable, queryable index**: CLI/JSON/MCP commands return
257
+ ranked `file:line` ranges, symbols, references, and impact that *any*
258
+ shell-capable agent can consume, with freshness and security gates.
259
+ - **Why not Sourcegraph / Cody / Amp?** They are excellent enterprise-grade,
260
+ cross-repo code intelligence platforms. They are also heavier and
261
+ account/platform-oriented. codebase-index is single-repo, local, and
262
+ lightweight — no server, no account, no code leaving the machine by default.
263
+ - **Why not Codebase-Memory MCP?** It is the closest direct alternative — a
264
+ broader graph engine with a static binary and wide language/agent coverage. We
265
+ do **not** claim to beat it globally. We differentiate on simplicity, a strict
266
+ privacy model, token-budgeted retrieval packets, a transparent Python
267
+ implementation, the Claude/Codex/OpenCode workflow, and honest benchmarks. If
268
+ you need its broader graph and language reach today, choose it.
269
+
270
+ **What makes it trustworthy?** No telemetry, no network by default, a multi-gate
271
+ exclusion pipeline (secrets/binaries/generated/dependencies never indexed),
272
+ output-time secret redaction, a `doctor --strict` safety self-check, and a
273
+ public benchmark suite wired as a CI regression gate. Claims that aren't proven
274
+ in this repo are marked as roadmap, not done.
275
+
276
+ ### Proven today vs. roadmap
277
+
278
+ | Capability | Status |
279
+ |---|---|
280
+ | Hybrid retrieval (path + symbol + FTS5 + graph), token-budgeted packets | ✅ Shipped |
281
+ | Tree-sitter symbols for 12 Tier-A languages + Tier-B generic path | ✅ Shipped |
282
+ | Import/call/reference/inheritance graph, `refs`/`impact` | ✅ Shipped |
283
+ | Optional local embeddings; external embeddings gated 3 ways | ✅ Shipped |
284
+ | stdio MCP server; CLI/skill/MCP share one service layer | ✅ Shipped |
285
+ | Honest 55k LOC Java benchmark (recall@3 70% vs 40% `rg`, ~13× fewer tokens) | ✅ Shipped |
286
+ | 10k/100k/1M LOC public-repo benchmarks | 🚧 Roadmap |
287
+ | Framework-aware typed edges (route→handler→service→model) | 🚧 Roadmap |
288
+ | PyPI / `uvx` / Homebrew, signed checksums, SBOM | 🚧 Roadmap |
289
+ | Verified per-client MCP docs, paged/progressive results | 🚧 Roadmap |
290
+
291
+ See [docs/PRODUCT_UPGRADE_PLAN.md](docs/PRODUCT_UPGRADE_PLAN.md) for the full
292
+ upgrade plan and ranked roadmap.
293
+
294
+ ## How Does codebase-index Work?
295
+
296
+ `codebase-index` builds a local hybrid index that combines:
297
+
298
+ - **Symbol search** — Tree-sitter AST parsing extracts classes, functions, methods, and variables across the supported code-language set.
299
+ - **Full-text search** — SQLite FTS5 for fast lexical search across code chunks.
300
+ - **Path search** — File path matching for location-aware queries.
301
+ - **Optional semantic search** — Vector embeddings for similarity-based retrieval (opt-in, local by default).
302
+ - **Dependency graph** — Import, call, and reference edges for impact analysis and graph expansion.
303
+ - **Token-budgeted output** — Ranked retrieval packets with specific line ranges, not whole files.
304
+
305
+ The AI agent reads only the recommended files and line ranges, not the entire
306
+ repository.
307
+
308
+ ## Quick Demo
309
+
310
+ ```
311
+ /codebase-index "where is user authentication implemented?"
312
+ ```
313
+
314
+ Expected output:
315
+
316
+ ```
317
+ Top matches:
318
+ ┌──────┬──────────────────────────┬──────────────────────────┬───────┬──────────────────────────────┐
319
+ │ Rank │ Path │ Symbols │ Score │ Reason │
320
+ ├──────┼──────────────────────────┼──────────────────────────┼───────┼──────────────────────────────┤
321
+ │ 1 │ src/auth/AuthService.ts │ AuthService, login │ 0.92 │ exact symbol match │
322
+ │ 2 │ src/routes/auth.ts │ loginHandler, logout │ 0.78 │ FTS match · 4 callers │
323
+ │ 3 │ src/middleware/auth.ts │ requireAuth │ 0.65 │ path match · FTS match │
324
+ └──────┴──────────────────────────┴──────────────────────────┴───────┴──────────────────────────────┘
325
+
326
+ Recommended reads:
327
+ 1. src/auth/AuthService.ts:12-148
328
+ reason: matched AuthService, login(), validatePassword()
329
+ 2. src/routes/auth.ts:20-91
330
+ reason: /login route calls AuthService.login()
331
+ 3. src/middleware/auth.ts:5-42
332
+ reason: auth middleware validates sessions
333
+ ```
334
+
335
+ ## Installation Options
336
+
337
+ If you are new to this repo, start with [docs/QUICKSTART.md](docs/QUICKSTART.md).
338
+ If you want all install options and troubleshooting, use [docs/INSTALLATION.md](docs/INSTALLATION.md).
339
+
340
+ **Multi-CLI installer (Claude Code + Codex CLI + OpenCode):** one command via
341
+ `install.sh` / `install.ps1` — see [docs/installer.md](docs/installer.md).
342
+
343
+ ```bash
344
+ # macOS / Linux
345
+ curl -fsSL https://raw.githubusercontent.com/denfry/codebase-index/main/install.sh | sh
346
+ ```
347
+ ```powershell
348
+ # Windows PowerShell
349
+ irm https://raw.githubusercontent.com/denfry/codebase-index/main/install.ps1 | iex
350
+ ```
351
+
352
+ ### Option 1: Install from a tagged GitHub release
353
+
354
+ ```bash
355
+ cd your-project
356
+ pip install "codebase-index @ git+https://github.com/denfry/codebase-index.git@v1.6.0"
357
+ codebase-index init
358
+ codebase-index index
359
+ ```
360
+
361
+ ### Python version compatibility
362
+
363
+ `codebase-index` requires Python 3.11 or newer.
364
+
365
+ If `codebase-index init --target opencode` fails with:
366
+
367
+ ```text
368
+ ModuleNotFoundError: No module named 'importlib.resources.abc'; 'importlib.resources' is not a package
369
+ ```
370
+
371
+ the `pipx` environment was likely created with an older Python version. Reinstall `codebase-index` using Python 3.11+ explicitly:
372
+
373
+ ```powershell
374
+ pipx uninstall codebase-index
375
+ py -0p
376
+ pipx install --python "<path-to-python-3.11-or-newer>\python.exe" "git+https://github.com/denfry/codebase-index.git@v1.6.0"
377
+ ```
378
+
379
+ For example:
380
+
381
+ ```powershell
382
+ pipx install --python "C:\Users\you\AppData\Local\Programs\Python\Python312\python.exe" "git+https://github.com/denfry/codebase-index.git@v1.6.0"
383
+ ```
384
+
385
+ Then run initialization again:
386
+
387
+ ```powershell
388
+ codebase-index init --target opencode
389
+ codebase-index index
390
+ ```
391
+
392
+
393
+ ### Option 2: Install with pipx from GitHub
394
+
395
+ ```bash
396
+ pipx install "git+https://github.com/denfry/codebase-index.git@v1.6.0"
397
+ cd your-project
398
+ codebase-index init --target auto
399
+ codebase-index index
400
+ ```
401
+
402
+ ### Option 3: Install from source
403
+
404
+ ```bash
405
+ git clone https://github.com/denfry/codebase-index.git
406
+ cd codebase-index
407
+ pip install -e ".[dev]"
408
+ ```
409
+
410
+ ### Distribution roadmap
411
+
412
+ PyPI, `uvx`, Homebrew, signed release checksums, and SBOMs are important for a
413
+ tool that reads entire repositories, but they are not all verified as shipped in
414
+ `1.6.0`. The target install story, once these channels ship, is:
415
+
416
+ ```bash
417
+ uvx codebase-index init
418
+ pipx install codebase-index
419
+ brew install denfry/tap/codebase-index
420
+ ```
421
+
422
+ ### Verify the install
423
+
424
+ ```bash
425
+ codebase-index doctor
426
+ ```
427
+
428
+ See [docs/INSTALLATION.md](docs/INSTALLATION.md) for the full guide, including optional extras (embeddings, watch mode) and troubleshooting.
429
+
430
+ ## Usage
431
+
432
+ ```bash
433
+ # Initialize the index for your project
434
+ codebase-index init
435
+
436
+ # Build the index
437
+ codebase-index index
438
+
439
+ # Search for something
440
+ codebase-index search "where is authentication implemented?"
441
+
442
+ # Look up a specific symbol
443
+ codebase-index symbol "AuthService"
444
+
445
+ # Find callers and references
446
+ codebase-index refs "AuthService.login"
447
+
448
+ # Analyze impact of a change
449
+ codebase-index impact "src/auth/AuthService.ts"
450
+
451
+ # Map the codebase: modules, god nodes, surprising links, suggested questions
452
+ codebase-index architecture
453
+
454
+ # How are two symbols/files connected? Shortest dependency/call path
455
+ codebase-index path "renew" "refresh_access_token"
456
+
457
+ # Node card: definition, callers, callees, centrality, module
458
+ codebase-index describe "Database"
459
+
460
+ # Visualize the graph (modules coloured, size = connectivity, edge style = confidence)
461
+ codebase-index graph --open
462
+ # …or export for external tools: graphml (Gephi/yEd), dot (Graphviz), neo4j (Cypher)
463
+ codebase-index graph --format graphml -o graph.graphml
464
+
465
+ # View index statistics
466
+ codebase-index stats
467
+
468
+ # Run diagnostics
469
+ codebase-index doctor
470
+ ```
471
+
472
+ Add `--json` to any command for machine-readable output.
473
+
474
+ ## How Does Retrieval Flow Through codebase-index?
475
+
476
+ ```
477
+ User question
478
+
479
+ CLI instructions or skill
480
+
481
+ Hybrid retrieval
482
+ ├─ Path search
483
+ ├─ Symbol search (Tree-sitter AST)
484
+ ├─ SQLite FTS5 full-text search
485
+ ├─ Optional embeddings (vector search)
486
+ └─ Graph expansion (callers, imports, references)
487
+
488
+ Ranked retrieval packet
489
+
490
+ Agent reads only the recommended line ranges
491
+
492
+ Answer with precise file:line citations
493
+ ```
494
+
495
+ ## Features
496
+
497
+ - [x] **Local-first indexing** — All data stays on your machine
498
+ - [x] **No network by default** — Zero external API calls out of the box
499
+ - [x] **Respects ignore files** — `.gitignore`, `.claudeignore`, `.codeindexignore`, `.cursorignore`
500
+ - [x] **SQLite storage** — Fast, reliable, single-file database
501
+ - [x] **FTS5 lexical search** — Full-text search with code-aware tokenization
502
+ - [x] **Tree-sitter AST parsing** — Tier-A symbol extraction for Python, JavaScript, TypeScript, Java, Go, Rust, C, C++, C#, Ruby, PHP, and Kotlin; Tier-B generic extraction for code languages with a loadable grammar such as Lua
503
+ - [x] **Symbol extraction** — Classes, functions, methods, variables with line ranges
504
+ - [x] **Incremental indexing** — Only changed files are re-indexed
505
+ - [x] **Token-budgeted output** — Configurable max output size
506
+ - [x] **Secret redaction** — Masks keys, tokens, and credentials in snippets
507
+ - [x] **Optional embeddings** — Local or remote vector search (opt-in)
508
+ - [x] **Optional hooks/watch** — Auto-update index after file edits
509
+ - [x] **Multi-CLI setup** — Claude Code, Codex CLI, and OpenCode instructions
510
+ - [x] **MCP server** — stdio MCP tools for search, symbols, refs, impact, explain, architecture, path, describe, health, and stats
511
+
512
+ ## Safety and Privacy
513
+
514
+ > **Trust model in 60 seconds**
515
+ > 1. **Offline by default** — the base install has zero network dependencies; nothing leaves your machine.
516
+ > 2. **One opt-in exit, triple-gated** — external embeddings require `allow_external` **and** an env API key **and** a printed endpoint warning, or they are refused.
517
+ > 3. **Secrets never get in** — `.env`, keys, certs, and credential files are excluded before parsing (multi-gate ignore pipeline).
518
+ > 4. **Secrets never get out** — every snippet is redacted (AWS keys, private keys, JWTs, bearer tokens, connection strings) before it reaches the agent.
519
+ > 5. **No telemetry, ever** — no analytics, no phone-home, no usage data.
520
+ > 6. **Verify it yourself** — `codebase-index doctor --strict` audits all of the above and exits non-zero in CI on any high-severity finding.
521
+
522
+ `codebase-index` is designed with privacy as a first principle:
523
+
524
+ - **No telemetry** — No usage data, analytics, or crash reports are collected or transmitted.
525
+ - **No external API calls by default** — All indexing, storage, and search happen locally.
526
+ - **Does not index sensitive files** — `.env`, private keys, certificates, tokens, and credential files are excluded before parsing.
527
+ - **Respects ignore files** — `.gitignore`, `.claudeignore`, `.codeindexignore`, and `.cursorignore` are all honored.
528
+ - **Index stored locally** — SQLite database in `.claude/cache/codebase-index/` (gitignored by default).
529
+ - **Optional embeddings are local by default** — External embedding APIs require explicit opt-in with warnings.
530
+ - **Secret redaction** — Snippets are scrubbed for AWS keys, private keys, JWTs, bearer tokens, and connection strings before output.
531
+
532
+ See [docs/SECURITY_MODEL.md](docs/SECURITY_MODEL.md) for the full security model and threat analysis.
533
+
534
+
535
+ ## Benchmark Results
536
+
537
+ There are three benchmark surfaces today:
538
+
539
+ 1. **Public benchmark suite** in `tests/benchmark_public.py`: reproducible
540
+ multi-language fixture with Recall@1/3/5, MRR, nDCG, answer-correctness proxy,
541
+ token economy, language breakdown, freshness latency, graph tasks, and scale counters.
542
+ 2. **Smoke benchmark** on `sample_repo`: validates the CLI is fast and stable on
543
+ a tiny fixture, but it is not evidence of production retrieval quality.
544
+ 3. **Honest benchmark** on a real Java repository: `tests/benchmark_honest.py`
545
+ compares codebase-index against a disciplined `rg` + read-window baseline on
546
+ 10 realistic questions. Results are documented in
547
+ [tests/benchmark_honest_RESULTS.md](tests/benchmark_honest_RESULTS.md).
548
+
549
+ Run the public suite:
550
+
551
+ ```bash
552
+ python tests/benchmark_public.py --workdir .tmp-public-benchmark
553
+ ```
554
+
555
+ Current honest benchmark headline:
556
+
557
+ | Metric | Result |
558
+ |---|---|
559
+ | Repo | 303 Java files, ~55k LOC |
560
+ | Retrieval quality | recall@3: 70% index vs 40% `rg` baseline |
561
+ | Token economy | ~13x fewer answer tokens than `rg` + 80-line windows |
562
+ | Verified language impact | Java symbol extraction fixed: 0 → 3,543 symbols indexed |
563
+
564
+ The public suite now has the metric framework. It still needs larger public or
565
+ documented external repos for 10k/100k/1M LOC scale claims and deeper framework
566
+ graph tasks. See [docs/BENCHMARKS.md](docs/BENCHMARKS.md).
567
+
568
+ ## Repository Layout
569
+
570
+ ```
571
+ ├── skill/ # Source instruction package (SKILL.md, scripts, examples)
572
+ ├── skills/ # Plugin skill copy
573
+ ├── src/codebase_index/ # Python package (CLI, indexer, retrieval, storage)
574
+ ├── docs/ # Documentation (architecture, schema, security, FAQ)
575
+ ├── examples/ # Sample queries, retrieval output, demo project
576
+ ├── tests/ # Test suite with fixture repositories
577
+ ├── bin/ # Plugin CLI wrappers (cbx, codebase-index)
578
+ ├── scripts/ # Bootstrap scripts (bootstrap.sh, bootstrap.ps1)
579
+ ├── hooks/ # Plugin hooks (hooks.json)
580
+ ├── .claude-plugin/ # Plugin manifest + marketplace catalog
581
+ ├── .github/ # Issue templates, CI workflows, PR template
582
+ ├── README.md # This file
583
+ ├── LICENSE # MIT License
584
+ ├── CHANGELOG.md # Release history
585
+ ├── CONTRIBUTING.md # Contributor guide
586
+ ├── SECURITY.md # Security policy
587
+ ├── ROADMAP.md # Development milestones
588
+ ├── requirements.lock # Pinned install spec for bootstrap
589
+ └── pyproject.toml # Package configuration
590
+ ```
591
+
592
+ ## Configuration
593
+
594
+ Create `.codeindex.json` in your project root:
595
+
596
+ ```json
597
+ {
598
+ "index": {
599
+ "max_file_bytes": 1048576,
600
+ "chunk_size": 500,
601
+ "chunk_overlap": 50
602
+ },
603
+ "embeddings": {
604
+ "backend": "noop",
605
+ "allow_external": false
606
+ }
607
+ }
608
+ ```
609
+
610
+ ### Ignore Files
611
+
612
+ - `.codeindexignore` — Tool-specific ignore patterns (highest priority)
613
+ - `.gitignore` — Standard git ignore patterns
614
+ - `.claudeignore` — Claude-specific ignore patterns
615
+
616
+ ### Cache Location
617
+
618
+ ```
619
+ .claude/cache/codebase-index/
620
+ ├── index.sqlite # SQLite database with FTS5
621
+ └── config.json # Resolved configuration
622
+ ```
623
+
624
+ ## Which AI CLIs Does codebase-index Support?
625
+
626
+ `codebase-index init` can install instructions for three AI coding CLIs:
627
+
628
+ | CLI | Files written by `init` | Best command |
629
+ |---|---|---|
630
+ | Claude Code | `.claude/skills/codebase-index/` | `codebase-index init --target claude` |
631
+ | Codex CLI | `AGENTS.md` + `.codex/skills/codebase-index/` | `codebase-index init --target codex` |
632
+ | OpenCode | `.opencode/commands/` + `.opencode/agents/` + resources | `codebase-index init --target opencode` |
633
+
634
+ Use `codebase-index init --target auto` to install into detected CLIs, or
635
+ `codebase-index init --target all` to write every supported integration.
636
+
637
+ ### Claude Code Integration
638
+
639
+ The Claude Code skill is defined in [`skill/SKILL.md`](skill/SKILL.md) with
640
+ YAML frontmatter for automatic selection.
641
+
642
+ Example `.claude/CLAUDE.md`:
643
+
644
+ ```markdown
645
+ ## Codebase Questions
646
+
647
+ Before answering any question about this project's code:
648
+ 1. Use the codebase-index skill to search the local index first.
649
+ 2. Read only the recommended line ranges — do not scan entire files.
650
+ 3. Answer with file:line citations.
651
+ ```
652
+
653
+ ### Optional Hooks
654
+
655
+ Configure automatic index updates in `.codeindex.json`:
656
+
657
+ ```json
658
+ {
659
+ "hooks": {
660
+ "post_tool_use": {
661
+ "enabled": true,
662
+ "events": ["Write", "Edit"],
663
+ "command": "codebase-index update --quiet"
664
+ }
665
+ }
666
+ }
667
+ ```
668
+
669
+ See [skill/examples/](skill/examples/) for full examples.
670
+
671
+ ## FAQ
672
+
673
+ ### Is this a Cursor replacement?
674
+
675
+ No. `codebase-index` is not a replacement for Cursor or any IDE. It is a
676
+ local retrieval layer for terminal AI coding agents. You still use Claude Code,
677
+ Codex CLI, OpenCode, or another agent as your primary interface.
678
+
679
+ ### Does it send my code anywhere?
680
+
681
+ No. By default, `codebase-index` is completely local-first and offline. All indexing, storage, and search happen on your machine. External embeddings are opt-in only and require explicit configuration.
682
+
683
+ ### Does it work without embeddings?
684
+
685
+ Yes. The default configuration disables embeddings entirely (`"backend": "noop"` under `embeddings` in `.codeindex.json`). Search uses SQLite FTS5, Tree-sitter symbol extraction, path matching, and graph expansion. Embeddings are an optional enhancement.
686
+
687
+ ### Does it support large repositories?
688
+
689
+ Yes. The index is incremental — only changed files are re-indexed. SQLite with FTS5 handles large datasets efficiently. Generated files, dependencies, and binaries are excluded automatically.
690
+
691
+ ### Why not just use Grep?
692
+
693
+ Grep returns all matches with no ranking, no symbol awareness, and no context about related files. `codebase-index` combines lexical search with symbol extraction and graph expansion to return **ranked, contextual results** with specific line ranges to read.
694
+
695
+ ### Does it support MCP?
696
+
697
+ Yes. Run `codebase-index mcp --root <repo>` to expose the local index over stdio
698
+ MCP. See [docs/MCP.md](docs/MCP.md) for tools and client config templates.
699
+
700
+ ### Can I use it with other agents?
701
+
702
+ Yes. The CLI is agent-agnostic. Any agent that can run shell commands can use
703
+ `codebase-index`, and JSON output (`--json`) is parseable by other tools.
704
+
705
+ ### How do I reset the index?
706
+
707
+ ```bash
708
+ codebase-index clean # reset the index DB (keeps the skill)
709
+ codebase-index clean --all # wipe the whole .claude/cache/codebase-index/ dir
710
+ # Or manually: rm -rf .claude/cache/codebase-index/
711
+ codebase-index index
712
+ ```
713
+
714
+ ## Contributing
715
+
716
+ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for the full guide.
717
+
718
+ Quick start:
719
+
720
+ ```bash
721
+ git clone https://github.com/denfry/codebase-index.git
722
+ cd codebase-index
723
+ pip install -e ".[dev]"
724
+ pytest
725
+ ruff check src/ tests/
726
+ ```
727
+
728
+ ## Roadmap
729
+
730
+ See [ROADMAP.md](ROADMAP.md) for the full milestone plan.
731
+
732
+ | Milestone | Status | Description |
733
+ |---|---|---|
734
+ | M0 | ✅ Done | Repository packaging |
735
+ | M1 | ✅ Done | SQLite + FTS5 index |
736
+ | M2 | ✅ Done | Tree-sitter symbol extraction |
737
+ | M3 | ✅ Done | Hybrid retrieval |
738
+ | M4 | ✅ Done | Graph expansion |
739
+ | M5 | ✅ Done | Token-budgeted retrieval packets |
740
+ | M6 | ✅ Done | Optional local embeddings |
741
+ | M7 | ✅ Done | Claude Code Skill packaging |
742
+ | M7.5 | ✅ Done | One-command plugin install |
743
+ | M8 | ✅ Done | Hooks + watch mode |
744
+ | M9 | ✅ Done | Public release |
745
+
746
+ ## License
747
+
748
+ [MIT](LICENSE)