claude-local-docs 1.0.13 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/.mcp.json +2 -1
  2. package/README.md +124 -58
  3. package/commands/fetch-docs.md +54 -28
  4. package/commands/index-codebase.md +53 -0
  5. package/dist/code-indexer.d.ts +14 -0
  6. package/dist/code-indexer.js +519 -0
  7. package/dist/code-indexer.js.map +1 -0
  8. package/dist/code-search.d.ts +14 -0
  9. package/dist/code-search.js +155 -0
  10. package/dist/code-search.js.map +1 -0
  11. package/dist/code-store.d.ts +39 -0
  12. package/dist/code-store.js +206 -0
  13. package/dist/code-store.js.map +1 -0
  14. package/dist/code.test.d.ts +7 -0
  15. package/dist/code.test.js +197 -0
  16. package/dist/code.test.js.map +1 -0
  17. package/dist/discovery.js +56 -4
  18. package/dist/discovery.js.map +1 -1
  19. package/dist/docs.test.d.ts +7 -0
  20. package/dist/docs.test.js +105 -0
  21. package/dist/docs.test.js.map +1 -0
  22. package/dist/file-walker.d.ts +34 -0
  23. package/dist/file-walker.js +199 -0
  24. package/dist/file-walker.js.map +1 -0
  25. package/dist/index.js +321 -22
  26. package/dist/index.js.map +1 -1
  27. package/dist/indexer.js +4 -23
  28. package/dist/indexer.js.map +1 -1
  29. package/dist/integration.test.d.ts +3 -2
  30. package/dist/integration.test.js +461 -11
  31. package/dist/integration.test.js.map +1 -1
  32. package/dist/reranker.d.ts +2 -2
  33. package/dist/reranker.js +10 -12
  34. package/dist/reranker.js.map +1 -1
  35. package/dist/rrf.d.ts +17 -0
  36. package/dist/rrf.js +25 -0
  37. package/dist/rrf.js.map +1 -0
  38. package/dist/search.d.ts +2 -0
  39. package/dist/search.js +30 -52
  40. package/dist/search.js.map +1 -1
  41. package/dist/sfc-extractor.d.ts +14 -0
  42. package/dist/sfc-extractor.js +70 -0
  43. package/dist/sfc-extractor.js.map +1 -0
  44. package/dist/store.d.ts +2 -0
  45. package/dist/store.js +39 -24
  46. package/dist/store.js.map +1 -1
  47. package/dist/tei-client.d.ts +70 -0
  48. package/dist/tei-client.js +153 -0
  49. package/dist/tei-client.js.map +1 -0
  50. package/dist/types.d.ts +49 -0
  51. package/dist/types.js +4 -1
  52. package/dist/types.js.map +1 -1
  53. package/dist/unit.test.d.ts +8 -0
  54. package/dist/unit.test.js +1241 -0
  55. package/dist/unit.test.js.map +1 -0
  56. package/docker-compose.nvidia.yml +7 -0
  57. package/docker-compose.yml +9 -0
  58. package/package.json +8 -2
  59. package/scripts/ensure-tei.sh +93 -19
  60. package/start-tei.sh +17 -3
package/.mcp.json CHANGED
@@ -5,7 +5,8 @@
5
5
  "args": ["-y", "claude-local-docs@latest"],
6
6
  "env": {
7
7
  "TEI_EMBED_URL": "http://localhost:39281",
8
- "TEI_RERANK_URL": "http://localhost:39282"
8
+ "TEI_RERANK_URL": "http://localhost:39282",
9
+ "TEI_CODE_EMBED_URL": "http://localhost:39283"
9
10
  }
10
11
  }
11
12
  }
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # claude-local-docs
2
2
 
3
- A local-first alternative to Context7 for Claude Code. Indexes your project's dependency documentation locally and provides production-grade semantic search. Embeddings and reranking run via TEI (HuggingFace Text Embeddings Inference) Docker containers with auto GPU detection.
3
+ A local-first alternative to Context7 for Claude Code. Indexes your project's dependency documentation **and source code** locally with production-grade semantic search. Embeddings and reranking run via TEI (HuggingFace Text Embeddings Inference) Docker containers with auto GPU detection. Supports JS/TS, Vue, Svelte, and Astro with AST-aware chunking, JSDoc extraction, and git-diff incremental indexing.
4
4
 
5
5
  ## Why not Context7?
6
6
 
@@ -13,8 +13,11 @@ A local-first alternative to Context7 for Claude Code. Indexes your project's de
13
13
  | **GPU accelerated** | NVIDIA CUDA / Apple Metal | N/A |
14
14
  | **Search quality** | 4-stage RAG (vector + BM25 + RRF + cross-encoder reranking) | Single-stage retrieval |
15
15
  | **Doc sources** | Prefers llms.txt, falls back to official docs | Pre-indexed source repos |
16
- | **Scope** | Your project's actual dependencies | Any library |
16
+ | **Code search** | Semantic AST-level search via Qodo-Embed-1-1.5B | N/A |
17
+ | **Framework support** | JS, TS, Vue, Svelte, Astro (SFC script extraction) | N/A |
18
+ | **Scope** | Your project's actual dependencies + source code | Any library |
17
19
  | **Monorepo** | Detects pnpm/npm/yarn workspaces, resolves catalogs | N/A |
20
+ | **Resilience** | BM25-only fallback when TEI is down, retry + timeout | N/A |
18
21
 
19
22
  ## Prerequisites
20
23
 
@@ -25,32 +28,17 @@ A local-first alternative to Context7 for Claude Code. Indexes your project's de
25
28
 
26
29
  ## Installation
27
30
 
28
- ### As a Claude Code MCP server (recommended)
29
-
30
- Add this to your project's `.mcp.json` (or global `~/.claude/mcp.json`):
31
-
32
- ```json
33
- {
34
- "mcpServers": {
35
- "local-docs": {
36
- "command": "npx",
37
- "args": ["-y", "claude-local-docs@latest"],
38
- "env": {
39
- "TEI_EMBED_URL": "http://localhost:39281",
40
- "TEI_RERANK_URL": "http://localhost:39282"
41
- }
42
- }
43
- }
44
- }
45
- ```
46
-
47
- Then start the TEI containers (clone the repo or download `start-tei.sh` + `docker-compose.yml`):
31
+ ### As a Claude Code plugin (recommended)
48
32
 
49
33
  ```bash
50
- ./start-tei.sh
34
+ # Add the marketplace
35
+ /plugin marketplace add matteodante/claude-local-docs
36
+
37
+ # Install the plugin
38
+ /plugin install claude-local-docs
51
39
  ```
52
40
 
53
- The plugin includes a SessionStart hook that auto-checks TEI health and starts containers if needed.
41
+ The plugin starts TEI containers automatically on session start via a SessionStart hook.
54
42
 
55
43
  ### Manual / development setup
56
44
 
@@ -66,6 +54,8 @@ npm run build
66
54
 
67
55
  ## How it works
68
56
 
57
+ ### Documentation search
58
+
69
59
  ```
70
60
  /fetch-docs search_docs("how to use useState")
71
61
  | |
@@ -87,6 +77,33 @@ npm run build
87
77
  Top-K results
88
78
  ```
89
79
 
80
+ ### Codebase search
81
+
82
+ ```
83
+ /index-codebase search_code("RRF fusion logic")
84
+ | |
85
+ v v
86
+ Walk project files +--- Vector search (LanceDB) -------+
87
+ Respect .gitignore | Qodo-Embed-1-1.5B (1536-dim) |
88
+ Git-diff incremental skip | |
89
+ | | +-> RRF Fusion
90
+ v | | (k=60)
91
+ For each JS/TS/Vue/ +-- BM25 search (LanceDB FTS) ------+
92
+ Svelte/Astro file: | camelCase split + stemming |
93
+ - Extract <script> (SFC) | |
94
+ - Parse AST (tree-sitter) +-- File-path boost (optional) -----+
95
+ - Extract functions/classes | v
96
+ - Extract JSDoc/decorators | Cross-encoder rerank
97
+ - Contextual headers | ms-marco-MiniLM-L-6-v2
98
+ - Embed with Qodo-Embed | (via TEI :39282)
99
+ - Store in LanceDB +--------------------------------------+
100
+ |
101
+ v
102
+ Function-level results
103
+ (file, lines, scope, score)
104
+ + neighbor chunk expansion
105
+ ```
106
+
90
107
  ## Usage
91
108
 
92
109
  ### 1. Index your project's docs
@@ -97,23 +114,38 @@ npm run build
97
114
 
98
115
  Claude analyzes your project (including monorepo workspaces), finds all runtime dependencies, searches the web for the best documentation for each one (preferring `llms-full.txt` > `llms.txt` > official docs), and indexes everything locally.
99
116
 
100
- ### 2. Search
117
+ ### 2. Index your source code
118
+
119
+ ```
120
+ /index-codebase
121
+ ```
122
+
123
+ Parses all JS/TS/Vue/Svelte/Astro files with tree-sitter, extracts JSDoc comments and decorators, generates Qodo-Embed-1-1.5B embeddings for function/class/method-level chunks, and stores them in LanceDB. Incremental via git-diff (falls back to SHA-256 hashing for non-git projects). Only changed files are re-indexed.
101
124
 
102
- Ask Claude anything about your dependencies. It will automatically use `search_docs` to find relevant documentation chunks:
125
+ ### 3. Search
126
+
127
+ Ask Claude anything. It will automatically use the right search tool:
103
128
 
104
129
  ```
130
+ # Library documentation (search_docs)
105
131
  How do I set up middleware in Express?
106
132
  What are the options for useQuery in TanStack Query?
107
133
  Show me the API for zod's .refine()
134
+
135
+ # Your codebase (search_code)
136
+ Where is the authentication middleware?
137
+ Find the database connection setup
138
+ How does the search pipeline work?
108
139
  ```
109
140
 
110
- ### 3. Other tools
141
+ ### 4. Other tools
111
142
 
112
143
  - **`list_docs`** — See what's indexed, when it was fetched, chunk counts
113
144
  - **`get_doc_section`** — Retrieve specific sections by heading or chunk ID
145
+ - **`get_codebase_status`** — Check index status, language breakdown, changed files
114
146
  - **`analyze_dependencies`** — List all deps (monorepo-aware, catalog-resolved, runtime/dev tagged)
115
147
  - **`fetch_and_store_doc`** — Fetch a URL and index it directly (no AI truncation)
116
- - **`discover_and_fetch_docs`** — Auto-discover and index docs for a library (probes npm, llms.txt, GitHub, homepage)
148
+ - **`discover_and_fetch_docs`** — Auto-discover and index docs for any npm package
117
149
 
118
150
  ## TEI backend
119
151
 
@@ -121,8 +153,11 @@ ML inference runs in TEI (HuggingFace Text Embeddings Inference) containers:
121
153
 
122
154
  | Container | Port | Model | Purpose |
123
155
  |---|---|---|---|
124
- | tei-embed | `:39281` | `nomic-ai/nomic-embed-text-v1.5` | Text embeddings (384-dim Matryoshka) |
125
- | tei-rerank | `:39282` | `cross-encoder/ms-marco-MiniLM-L-6-v2` | Cross-encoder reranking |
156
+ | tei-embed | `:39281` | `nomic-ai/nomic-embed-text-v1.5` | Doc embeddings (384-dim Matryoshka) |
157
+ | tei-rerank | `:39282` | `cross-encoder/ms-marco-MiniLM-L-6-v2` | Cross-encoder reranking (docs + code) |
158
+ | tei-code-embed | `:39283` | `Qodo/Qodo-Embed-1-1.5B` | Code embeddings (1536-dim, 68.5 CoIR) |
159
+
160
+ All TEI communication goes through a shared `TeiClient` class (`src/tei-client.ts`) with automatic retry (2 attempts, exponential backoff), 30s timeout, and batch splitting. If TEI is unavailable, search pipelines gracefully degrade to BM25-only results.
126
161
 
127
162
  ### Starting TEI
128
163
 
@@ -150,25 +185,39 @@ docker compose -f docker-compose.yml -f docker-compose.nvidia.yml up -d
150
185
 
151
186
  ## Search pipeline
152
187
 
153
- 4-stage RAG pipeline:
188
+ Both doc search and code search use the same 4-stage RAG pipeline:
154
189
 
155
190
  | Stage | Technology | Purpose |
156
191
  |---|---|---|
157
- | **Vector search** | LanceDB + nomic-embed-text-v1.5 via TEI | Semantic similarity (understands meaning) |
192
+ | **Vector search** | LanceDB + nomic-embed / Qodo-Embed via TEI | Semantic similarity (understands meaning) |
158
193
  | **BM25 search** | LanceDB native FTS (BM25, stemming, stop words) | Keyword matching (exact terms like `useEffect`) |
159
194
  | **RRF fusion** | Reciprocal Rank Fusion (k=60) | Merges both ranked lists, handles different score scales |
160
195
  | **Cross-encoder rerank** | ms-marco-MiniLM-L-6-v2 via TEI | Rescores top 50 candidates with deep relevance model |
161
196
 
197
+ ### Code search specifics
198
+
199
+ - **AST chunking**: tree-sitter parses JS/TS/Vue/Svelte/Astro into function/class/method/interface/namespace entities
200
+ - **JSDoc + decorators**: Extracted from AST and prepended to chunk text for richer search context
201
+ - **Metadata flags**: `exported`, `async`, `abstract` tracked per entity
202
+ - **Qodo-Embed-1-1.5B**: 1.5B parameter model, 68.5 CoIR score, 32K context window, 1536-dim embeddings
203
+ - **Contextual headers**: file path + scope chain + flags + decorators + JSDoc prepended for BM25
204
+ - **File-path boost**: Queries containing file names (e.g., "rrf.ts") get a third RRF signal boosting matching files
205
+ - **Neighbor expansion**: Adjacent chunks from the same file are merged for fuller context
206
+ - **Incremental indexing**: Git-diff based (fast, ~50-100ms), falls back to SHA-256 hashing for non-git projects
207
+ - **Graceful degradation**: BM25-only results when vector embedding or reranker is unavailable
208
+ - **SFC support**: Vue `<script>`/`<script setup>`, Svelte `<script>`/`<script context="module">`, Astro `---` frontmatter + `<script>` tags
209
+
162
210
  ## Storage
163
211
 
164
212
  All data stays in your project directory:
165
213
 
166
214
  ```
167
215
  your-project/.claude/docs/
168
- ├── lancedb/ # Vector database (LanceDB files)
169
- ├── .metadata.json # Fetch timestamps, source URLs per library
216
+ ├── lancedb/ # Vector database (docs + code tables)
217
+ ├── .metadata.json # Doc fetch timestamps, source URLs per library
218
+ ├── .code-metadata.json # File hashes, language, chunk counts, last index
170
219
  └── raw/
171
- ├── react.md # Raw fetched documentation
220
+ ├── react.md # Raw fetched documentation
172
221
  ├── next.md
173
222
  └── tanstack__query.md
174
223
  ```
@@ -177,13 +226,16 @@ your-project/.claude/docs/
177
226
 
178
227
  | Tool | Description |
179
228
  |---|---|
180
- | `analyze_dependencies` | Monorepo-aware dep analysis: detects workspaces, resolves catalog versions, tags runtime/dev |
181
- | `store_and_index_doc` | Receive markdown, chunk, embed via TEI, store in LanceDB |
182
- | `search_docs` | Full RAG pipeline: vector + BM25 + RRF + rerank via TEI |
183
- | `list_docs` | List indexed libraries with metadata |
184
- | `get_doc_section` | Get specific chunks by library + heading or chunk ID |
185
- | `fetch_and_store_doc` | Fetch URL directly (raw HTTP, no truncation), then chunk + embed + store |
186
- | `discover_and_fetch_docs` | Auto-discover docs: probes npm registry, llms.txt URLs, GitHub, homepage HTML. Detects and expands index files |
229
+ | `analyze_dependencies` | Detect and list all npm dependencies (monorepo-aware, runtime/dev tagged) |
230
+ | `store_and_index_doc` | Index documentation content you already have as a string |
231
+ | `fetch_and_store_doc` | Fetch documentation from a URL and index it (raw HTTP, no truncation) |
232
+ | `discover_and_fetch_docs` | Auto-discover and index docs for an npm package |
233
+ | `search_docs` | Semantic search across indexed library documentation |
234
+ | `list_docs` | List indexed libraries with version and fetch date |
235
+ | `get_doc_section` | Retrieve specific doc sections by heading or chunk ID |
236
+ | `index_codebase` | Index project source code for semantic search (incremental, .gitignore-aware) |
237
+ | `search_code` | Semantic search across project source code (function/class-level) |
238
+ | `get_codebase_status` | Check codebase index status, language breakdown, changed files |
187
239
 
188
240
  ## Dependencies
189
241
 
@@ -191,23 +243,27 @@ your-project/.claude/docs/
191
243
  |---|---|---|
192
244
  | `@lancedb/lancedb` | Apache 2.0 | Embedded vector database + native FTS |
193
245
  | `@modelcontextprotocol/sdk` | MIT | MCP server framework |
246
+ | `web-tree-sitter` | MIT | WASM-based AST parsing for code chunking |
247
+ | `tree-sitter-wasms` | MIT | Pre-built WASM grammars (JS/TS/Vue/Svelte) |
248
+ | `ignore` | MIT | .gitignore pattern matching |
194
249
  | `zod` | MIT | Schema validation |
195
- | `turndown` | MIT | HTML to markdown conversion |
196
- | `turndown-plugin-gfm` | MIT | GFM support for turndown (tables, strikethrough, etc.) |
197
250
 
198
251
  TEI containers (Docker):
199
252
 
200
253
  | Image | Model | Purpose |
201
254
  |---|---|---|
202
- | `text-embeddings-inference:*` | `nomic-ai/nomic-embed-text-v1.5` | Text embeddings |
255
+ | `text-embeddings-inference:*` | `nomic-ai/nomic-embed-text-v1.5` | Doc embeddings |
203
256
  | `text-embeddings-inference:*` | `cross-encoder/ms-marco-MiniLM-L-6-v2` | Cross-encoder reranking |
257
+ | `text-embeddings-inference:*` | `Qodo/Qodo-Embed-1-1.5B` | Code embeddings (1536-dim) |
204
258
 
205
259
  ## Development
206
260
 
207
261
  ```bash
208
- npm run dev # Watch mode — rebuilds on file changes
209
- npm run build # One-time build
210
- npm test # Integration test (requires TEI running)
262
+ npm run dev # Watch mode — rebuilds on file changes
263
+ npm run build # One-time build
264
+ npm run test:unit # Unit tests (no TEI needed)
265
+ npm run test:docs # Doc search integration tests (requires TEI on :39281, :39282)
266
+ npm run test:code # Code search integration tests (requires TEI on :39281, :39282, :39283)
211
267
  ```
212
268
 
213
269
  ## Project structure
@@ -219,7 +275,8 @@ claude-local-docs/
219
275
  │ └── marketplace.json # Marketplace listing
220
276
  ├── .mcp.json # MCP server config (stdio transport)
221
277
  ├── commands/
222
- └── fetch-docs.md # /fetch-docs — Claude as research agent
278
+ │ ├── fetch-docs.md # /fetch-docs — Claude as research agent
279
+ │ └── index-codebase.md # /index-codebase — index source code
223
280
  ├── hooks/
224
281
  │ └── hooks.json # SessionStart hook for TEI containers
225
282
  ├── scripts/
@@ -228,17 +285,25 @@ claude-local-docs/
228
285
  ├── docker-compose.nvidia.yml # NVIDIA GPU device passthrough
229
286
  ├── start-tei.sh # Auto-detect GPU, start TEI
230
287
  ├── src/
231
- │ ├── index.ts # MCP server entry, 7 tool definitions
232
- │ ├── discovery.ts # Doc discovery: npm registry, URL probing, index expansion, HTML→markdown
233
- │ ├── indexer.ts # Chunking + TEI embeddings
234
- │ ├── search.ts # 4-stage pipeline: vector + BM25 + RRF + rerank
288
+ │ ├── index.ts # MCP server entry, 10 tool definitions
289
+ │ ├── tei-client.ts # Shared TEI HTTP client (retry, timeout, batching)
290
+ │ ├── indexer.ts # Doc chunking + nomic-embed-text embeddings
291
+ │ ├── search.ts # Doc search pipeline (vector + BM25 + RRF + rerank)
292
+ │ ├── rrf.ts # Shared Reciprocal Rank Fusion utility
235
293
  │ ├── reranker.ts # TEI cross-encoder reranking
236
- │ ├── store.ts # LanceDB storage + metadata persistence
294
+ │ ├── store.ts # LanceDB "docs" table + metadata
295
+ │ ├── code-indexer.ts # AST chunking (tree-sitter) + Qodo-Embed embeddings
296
+ │ ├── code-search.ts # Code search pipeline (4-stage + file-path boost + neighbors)
297
+ │ ├── code-store.ts # LanceDB "code" table + file hash tracking + schema migration
298
+ │ ├── file-walker.ts # Project file discovery + .gitignore + git-diff
299
+ │ ├── sfc-extractor.ts # Vue/Svelte/Astro <script> block extraction
237
300
  │ ├── fetcher.ts # Raw HTTP fetch (no AI truncation)
238
301
  │ ├── workspace.ts # Monorepo detection + pnpm catalog
302
+ │ ├── discovery.ts # npm registry + URL probing for docs
239
303
  │ ├── types.ts # Shared TypeScript interfaces
240
- │ ├── turndown-plugin-gfm.d.ts # Type declarations for turndown-plugin-gfm
241
- └── integration.test.ts # Integration tests (requires TEI running)
304
+ │ ├── unit.test.ts # Unit tests (no TEI needed)
305
+ │ ├── docs.test.ts # Doc search integration tests
306
+ │ └── code.test.ts # Code search integration tests
242
307
  ├── LICENSE
243
308
  ├── package.json
244
309
  └── tsconfig.json
@@ -254,15 +319,16 @@ docker info
254
319
  # Check container logs
255
320
  docker compose logs tei-embed
256
321
  docker compose logs tei-rerank
322
+ docker compose logs tei-code-embed
257
323
 
258
324
  # Restart
259
325
  ./start-tei.sh --stop && ./start-tei.sh
260
326
  ```
261
327
 
262
328
  ### Port conflicts
263
- If 39281/39282 are in use, override via env vars:
329
+ If 39281/39282/39283 are in use, override via env vars:
264
330
  ```bash
265
- TEI_EMBED_URL=http://localhost:49281 TEI_RERANK_URL=http://localhost:49282 node dist/index.js
331
+ TEI_EMBED_URL=http://localhost:49281 TEI_RERANK_URL=http://localhost:49282 TEI_CODE_EMBED_URL=http://localhost:49283 node dist/index.js
266
332
  ```
267
333
 
268
334
  ### Apple Silicon — slow performance
@@ -33,47 +33,48 @@ Call `list_docs` to see which libraries are already indexed. **Skip** any librar
33
33
 
34
34
  ### 4. Fetch Documentation
35
35
 
36
- For each remaining library, follow this multi-step strategy. The goal is to find the **best quality** source — `llms-full.txt` > `llms.txt` (expanded index) > homepage HTML > README.
36
+ For each remaining library, follow this strategy. The goal is to find the **best quality** source — `llms-full.txt` > `llms.txt` (expanded index) > homepage HTML > README.
37
37
 
38
38
  #### Step A: Check Known URLs first
39
39
 
40
- Before any searching, check if the library is in the **Known URLs Reference** below. If there's a known `llms-full.txt` or `llms.txt` URL, use it directly with `fetch_and_store_doc`. This is the fastest path.
40
+ Before any probing, check if the library is in the **Known URLs Reference** below. If there's a known `llms-full.txt` or `llms.txt` URL, use it directly with `fetch_and_store_doc`. This is the fastest path.
41
41
 
42
- #### Step B: WebSearch for llms.txt
42
+ #### Step B: `discover_and_fetch_docs` (automatic probing)
43
43
 
44
- For libraries NOT in the known list, use **WebSearch** to find the actual `llms.txt` or `llms-full.txt` URL. Use queries like:
44
+ For libraries NOT in the known list, call **`discover_and_fetch_docs`**. This tool automatically:
45
+ 1. Checks npm registry for `llms`/`llmsFull` fields in package.json (newest convention)
46
+ 2. Probes homepage (skipping GitHub homepages), `docs.{domain}`, `llms.{domain}`, `/docs/` subpath for llms-full.txt/llms.txt
47
+ 3. Validates redirect domains (rejects cross-domain redirects like GitHub → docs.github.com)
48
+ 4. Validates content quality (rejects 404 pages, too-short content)
49
+ 5. Probes GitHub raw for llms-full.txt/llms.txt on main/master branches
50
+ 6. Falls back to README.md from GitHub
51
+ 7. Falls back to homepage HTML → markdown conversion
52
+ 8. Detects index files and expands them by fetching linked pages
45
53
 
46
- > `{library-name} llms-full.txt site:{homepage-domain}`
54
+ #### Step C: WebSearch fallback
47
55
 
48
- or more broadly:
56
+ If `discover_and_fetch_docs` fails or returns very thin results (< 3 chunks), use **WebSearch** to find the actual `llms.txt` or `llms-full.txt` URL:
49
57
 
50
58
  > `{library-name} llms-full.txt OR llms.txt documentation`
51
59
 
52
- If the search finds a concrete URL to an `llms.txt` or `llms-full.txt` file, pass it directly to **`fetch_and_store_doc`**. Prefer `llms-full.txt` over `llms.txt` when both exist.
53
-
54
- **Batch the searches**: Run WebSearch for multiple libraries in parallel (up to 5 at a time) to collect URLs upfront. Then fetch them one by one.
55
-
56
- #### Step C: `discover_and_fetch_docs` (automatic probing)
57
-
58
- If neither known URLs nor WebSearch found an `llms.txt` URL, call **`discover_and_fetch_docs`**. This tool automatically:
59
- 1. Checks npm registry for `llms`/`llmsFull` fields in package.json (newest convention)
60
- 2. Probes homepage, `docs.{domain}`, `llms.{domain}`, `/docs/` subpath for llms-full.txt/llms.txt
61
- 3. Probes GitHub raw for llms-full.txt/llms.txt on main/master branches
62
- 4. Falls back to README.md from GitHub
63
- 5. Falls back to homepage HTML → markdown conversion
64
- 6. Detects index files and expands them by fetching linked pages
60
+ If the search finds a concrete URL, pass it to **`fetch_and_store_doc`**. Prefer `llms-full.txt` over `llms.txt`.
65
61
 
66
62
  #### Step D: Training data fallback
67
63
 
68
64
  If all above fail, try **`fetch_and_store_doc`** with documentation URLs you know from your training data (GitHub raw docs, official doc site pages, etc.).
69
65
 
70
- #### Evaluating results
66
+ #### Evaluating results & chunk quality
71
67
 
72
68
  After each library is fetched, check the chunk count:
73
- - **< 5 chunks**: Very thin. Use WebSearch to find additional doc pages (API reference, guides) and fetch with `fetch_and_store_doc` to supplement.
69
+ - **< 3 chunks**: Very thin — flag as "very thin, may need supplementing". Try `fetch_and_store_doc` with additional doc pages from training data.
70
+ - **3-4 chunks**: Thin. Acceptable for small/simple libraries, but note it in the summary.
74
71
  - **5-20 chunks**: Acceptable for small libraries.
75
72
  - **20+ chunks**: Good coverage.
76
73
 
74
+ Also note the source type:
75
+ - `readme` fallback means the library has no proper docs site — worth noting
76
+ - `homepage-html` means HTML was converted — quality varies
77
+
77
78
  #### Progress reporting
78
79
 
79
80
  After each library, report:
@@ -93,6 +94,12 @@ Done! Indexed X/Y libraries.
93
94
  express — 30 chunks (homepage-html)
94
95
  lodash — FAILED (no docs found)
95
96
 
97
+ Thin coverage (< 5 chunks):
98
+ some-lib — 2 chunks (readme) ⚠️
99
+
100
+ README fallback (no docs site found):
101
+ another-lib — 8 chunks (readme)
102
+
96
103
  Total: 280 chunks across 4 libraries.
97
104
  Use search_docs to query your documentation.
98
105
  ```
@@ -112,10 +119,11 @@ Use these URLs directly with `fetch_and_store_doc` — no searching needed. Pref
112
119
  | svelte | `https://svelte.dev/llms-full.txt` |
113
120
  | @sveltejs/kit | `https://svelte.dev/llms-full.txt` |
114
121
  | vue | (no official llms.txt — use `discover_and_fetch_docs`) |
115
- | react-native | `https://reactnative.dev/llms.txt` |
122
+ | react-native | `https://reactnative.dev/llms-full.txt` |
116
123
  | expo | `https://docs.expo.dev/llms-full.txt` |
117
124
  | hono | `https://hono.dev/llms.txt` |
118
125
  | bun | `https://bun.sh/llms.txt` |
126
+ | astro | `https://astro.build/llms.txt` |
119
127
 
120
128
  ### Styling & UI
121
129
 
@@ -139,6 +147,7 @@ Use these URLs directly with `fetch_and_store_doc` — no searching needed. Pref
139
147
  | drizzle-orm | `https://orm.drizzle.team/llms-full.txt` |
140
148
  | @prisma/client | `https://prisma.io/docs/llms-full.txt` |
141
149
  | convex | `https://docs.convex.dev/llms.txt` |
150
+ | zustand | `https://zustand.docs.pmnd.rs/llms-full.txt` |
142
151
 
143
152
  ### Backend & APIs
144
153
 
@@ -149,6 +158,7 @@ Use these URLs directly with `fetch_and_store_doc` — no searching needed. Pref
149
158
  | resend | `https://resend.com/docs/llms-full.txt` |
150
159
  | @medusajs/medusa | `https://docs.medusajs.com/llms-full.txt` |
151
160
  | better-auth | `https://www.better-auth.com/llms.txt` |
161
+ | bullmq | `https://docs.bullmq.io/llms-full.txt` |
152
162
 
153
163
  ### AI & LLM
154
164
 
@@ -171,11 +181,27 @@ Use these URLs directly with `fetch_and_store_doc` — no searching needed. Pref
171
181
  | @netlify/functions | `https://docs.netlify.com/llms.txt` |
172
182
  | @liveblocks/client | `https://liveblocks.io/llms-full.txt` |
173
183
 
184
+ ### React Native Libraries
185
+
186
+ | Library | Best URL |
187
+ |---|---|
188
+ | react-native-reanimated | `https://docs.swmansion.com/react-native-reanimated/llms.txt` |
189
+ | react-native-gesture-handler | `https://docs.swmansion.com/react-native-gesture-handler/llms.txt` |
190
+ | @react-navigation/native | `https://reactnavigation.org/llms.txt` |
191
+ | react-native-keyboard-controller | `https://kirillzyusko.github.io/react-native-keyboard-controller/llms-full.txt` |
192
+
193
+ ### i18n
194
+
195
+ | Library | Best URL |
196
+ |---|---|
197
+ | i18next | `https://www.i18next.com/llms-full.txt` |
198
+ | react-i18next | `https://react.i18next.com/llms-full.txt` |
199
+
174
200
  ### Animation
175
201
 
176
202
  | Library | Best URL |
177
203
  |---|---|
178
- | motion / framer-motion | Special: `https://llms.motion.dev/docs/react-quick-start.md` (or use WebSearch for full index) |
204
+ | motion / framer-motion | Special: `https://llms.motion.dev/docs/react-quick-start.md` (or use `discover_and_fetch_docs`) |
179
205
 
180
206
  ### Notes on special patterns
181
207
 
@@ -188,13 +214,13 @@ Use these URLs directly with `fetch_and_store_doc` — no searching needed. Pref
188
214
 
189
215
  ## Critical Rules
190
216
 
191
- - **Check known URLs first** — the reference table above is faster and more reliable than searching.
192
- - **Search second, probe third** — use WebSearch to find llms.txt URLs before falling back to blind URL probing via `discover_and_fetch_docs`.
217
+ - **Check known URLs first** — the reference table above is faster and more reliable than probing.
218
+ - **Use `discover_and_fetch_docs` for unknown libraries** — it now correctly handles GitHub homepages and validates redirects.
193
219
  - **Prefer `llms-full.txt` over `llms.txt`** — the full version has complete documentation without truncation.
194
- - **Use `fetch_and_store_doc` when you have a known URL** — from the reference table, WebSearch results, or training data.
220
+ - **Use `fetch_and_store_doc` when you have a known URL** — from the reference table or training data.
195
221
  - **Use `discover_and_fetch_docs` when you have no URL** — it will probe common patterns automatically.
196
- - **Supplement thin results** — if a library has < 5 chunks, search for additional doc pages and fetch them.
222
+ - **Flag thin results** — report libraries with < 3 chunks as "very thin" in the summary.
197
223
  - **NEVER write files to the filesystem directly.** Do NOT use the Write tool, Bash tool, or any other method to save documentation content to disk. ALL storage goes through the MCP tools.
198
- - **One library at a time for fetching** — clear progress, no batching (but WebSearch can be batched)
224
+ - **One library at a time for fetching** — clear progress, no batching
199
225
  - **Skip dev deps by default** — runtime deps only
200
226
  - Handle errors gracefully: if a library fails, log it and move to the next one
@@ -0,0 +1,53 @@
1
+ ---
2
+ description: "Index the project's source code for semantic search"
3
+ allowed-tools: ["mcp__local-docs__get_codebase_status", "mcp__local-docs__index_codebase"]
4
+ ---
5
+
6
+ # Index Project Codebase
7
+
8
+ You are a codebase indexing agent. Your job is to index the project's source code so it can be searched semantically with `search_code`.
9
+
10
+ ## Steps
11
+
12
+ ### 1. Check Current Status
13
+
14
+ Call `get_codebase_status` to see:
15
+ - Whether any code has been indexed before
16
+ - How many files are currently indexed
17
+ - Language breakdown (TypeScript vs JavaScript)
18
+ - Files that have changed since last index
19
+
20
+ ### 2. Run Indexing
21
+
22
+ Based on the status:
23
+
24
+ - **First time**: Call `index_codebase` with no parameters. This will index all supported source files (JS/TS, plus Vue/Svelte/Astro).
25
+ - **Files changed**: Call `index_codebase` with no parameters. Incremental indexing will only process changed files.
26
+ - **Force refresh**: Call `index_codebase` with `forceReindex: true` to re-index everything.
27
+ - **Up to date**: If no files have changed, tell the user the index is current.
28
+
29
+ ### 3. Report Results
30
+
31
+ After indexing completes, report:
32
+
33
+ ```
34
+ Codebase indexed!
35
+
36
+ TypeScript: 45 files
37
+ JavaScript: 12 files
38
+ Total: 57 files, 320 chunks
39
+
40
+ Indexed: 15 files (changed)
41
+ Skipped: 42 files (unchanged)
42
+ Removed: 0 files (deleted)
43
+
44
+ Use search_code to search your codebase semantically.
45
+ ```
46
+
47
+ If there were errors, list them so the user can investigate.
48
+
49
+ ## Critical Rules
50
+
51
+ - Always check status first — avoid unnecessary full re-indexing
52
+ - Report per-language breakdown
53
+ - Mention `search_code` is available after indexing
@@ -0,0 +1,14 @@
1
+ /**
2
+ * AST-based code chunking via web-tree-sitter + code embedding.
3
+ * Parses JS/TS files into function/class/method-level chunks with contextual headers.
4
+ * Extracts JSDoc, decorators, and metadata flags (exported, async, abstract).
5
+ */
6
+ import type { CodeRow } from "./types.js";
7
+ /**
8
+ * Parse and chunk a code file into entities. Accepts an optional lineOffset
9
+ * for SFC files where script content starts at a non-zero line.
10
+ */
11
+ export declare function chunkCodeFile(source: string, filePath: string, language: string, lineOffset?: number): Promise<Omit<CodeRow, "id" | "vector">[]>;
12
+ export declare function embedCodeTexts(texts: string[], mode?: "document" | "query"): Promise<number[][]>;
13
+ /** Parse and embed a code file, returning rows ready for LanceDB. */
14
+ export declare function indexCodeFile(source: string, filePath: string, language: string): Promise<Omit<CodeRow, "id">[]>;