mcp-local-rag 0.14.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +150 -10
- package/dist/cli/common.d.ts +37 -0
- package/dist/cli/common.d.ts.map +1 -1
- package/dist/cli/common.js +81 -1
- package/dist/cli/common.js.map +1 -1
- package/dist/cli/delete.js +3 -3
- package/dist/cli/delete.js.map +1 -1
- package/dist/cli/ingest.d.ts +38 -14
- package/dist/cli/ingest.d.ts.map +1 -1
- package/dist/cli/ingest.js +146 -74
- package/dist/cli/ingest.js.map +1 -1
- package/dist/cli/list.d.ts +6 -1
- package/dist/cli/list.d.ts.map +1 -1
- package/dist/cli/list.js +158 -34
- package/dist/cli/list.js.map +1 -1
- package/dist/cli/options.d.ts +22 -3
- package/dist/cli/options.d.ts.map +1 -1
- package/dist/cli/options.js +37 -32
- package/dist/cli/options.js.map +1 -1
- package/dist/cli/query.d.ts.map +1 -1
- package/dist/cli/query.js +2 -3
- package/dist/cli/query.js.map +1 -1
- package/dist/cli/read-neighbors.js +2 -2
- package/dist/cli/read-neighbors.js.map +1 -1
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/dist/ingest/visual.d.ts +6 -4
- package/dist/ingest/visual.d.ts.map +1 -1
- package/dist/ingest/visual.js +18 -3
- package/dist/ingest/visual.js.map +1 -1
- package/dist/parser/index.d.ts +47 -10
- package/dist/parser/index.d.ts.map +1 -1
- package/dist/parser/index.js +70 -14
- package/dist/parser/index.js.map +1 -1
- package/dist/pdf-visual/captioner.d.ts +9 -9
- package/dist/pdf-visual/captioner.d.ts.map +1 -1
- package/dist/pdf-visual/captioner.js +48 -157
- package/dist/pdf-visual/captioner.js.map +1 -1
- package/dist/pdf-visual/captioners/fast.d.ts +7 -0
- package/dist/pdf-visual/captioners/fast.d.ts.map +1 -0
- package/dist/pdf-visual/captioners/fast.js +129 -0
- package/dist/pdf-visual/captioners/fast.js.map +1 -0
- package/dist/pdf-visual/captioners/quality.d.ts +7 -0
- package/dist/pdf-visual/captioners/quality.d.ts.map +1 -0
- package/dist/pdf-visual/captioners/quality.js +150 -0
- package/dist/pdf-visual/captioners/quality.js.map +1 -0
- package/dist/pdf-visual/captioners/shared.d.ts +11 -0
- package/dist/pdf-visual/captioners/shared.d.ts.map +1 -0
- package/dist/pdf-visual/captioners/shared.js +40 -0
- package/dist/pdf-visual/captioners/shared.js.map +1 -0
- package/dist/pdf-visual/index.d.ts +24 -10
- package/dist/pdf-visual/index.d.ts.map +1 -1
- package/dist/pdf-visual/index.js +27 -29
- package/dist/pdf-visual/index.js.map +1 -1
- package/dist/pdf-visual/types.d.ts +14 -9
- package/dist/pdf-visual/types.d.ts.map +1 -1
- package/dist/server/error-utils.d.ts +40 -0
- package/dist/server/error-utils.d.ts.map +1 -1
- package/dist/server/error-utils.js +66 -0
- package/dist/server/error-utils.js.map +1 -1
- package/dist/server/index.d.ts +92 -37
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +296 -75
- package/dist/server/index.js.map +1 -1
- package/dist/server/tool-definitions.d.ts.map +1 -1
- package/dist/server/tool-definitions.js +6 -0
- package/dist/server/tool-definitions.js.map +1 -1
- package/dist/server/types.d.ts +63 -8
- package/dist/server/types.d.ts.map +1 -1
- package/dist/server-main.d.ts.map +1 -1
- package/dist/server-main.js +91 -10
- package/dist/server-main.js.map +1 -1
- package/dist/utils/base-dirs.d.ts +203 -0
- package/dist/utils/base-dirs.d.ts.map +1 -0
- package/dist/utils/base-dirs.js +407 -0
- package/dist/utils/base-dirs.js.map +1 -0
- package/dist/utils/raw-data-utils.d.ts +15 -5
- package/dist/utils/raw-data-utils.d.ts.map +1 -1
- package/dist/utils/raw-data-utils.js +39 -8
- package/dist/utils/raw-data-utils.js.map +1 -1
- package/dist/utils/sensitive-path.d.ts +23 -0
- package/dist/utils/sensitive-path.d.ts.map +1 -0
- package/dist/utils/sensitive-path.js +91 -0
- package/dist/utils/sensitive-path.js.map +1 -0
- package/package.json +7 -7
- package/skills/mcp-local-rag/SKILL.md +77 -17
- package/skills/mcp-local-rag/references/cli-reference.md +18 -6
package/README.md
CHANGED
|
@@ -33,7 +33,7 @@ Semantic search with keyword boost for exact technical terms — fully private,
|
|
|
33
33
|
|
|
34
34
|
## Quick Start
|
|
35
35
|
|
|
36
|
-
Set `BASE_DIR` to the folder you want to search. Documents must live under
|
|
36
|
+
Set `BASE_DIR` to the folder you want to search (or `BASE_DIRS` for multiple roots — see [Configuration](#configuration)). Documents must live under one of the configured roots.
|
|
37
37
|
|
|
38
38
|
Add the MCP server to your AI coding tool:
|
|
39
39
|
|
|
@@ -121,7 +121,7 @@ Re-ingesting the same file replaces the old version automatically.
|
|
|
121
121
|
|
|
122
122
|
##### Ingesting PDFs with figures (visual mode)
|
|
123
123
|
|
|
124
|
-
PDFs with charts, tables, or diagrams can optionally add local VLM-generated captions to the
|
|
124
|
+
PDFs with charts, tables, or diagrams can optionally add local VLM-generated captions to the document index, giving visual content some searchable representation in the same vector + FTS pipeline. Captions are auxiliary text — not image search, not OCR, and not a faithful transcription of the figure.
|
|
125
125
|
|
|
126
126
|
**Via MCP**:
|
|
127
127
|
```
|
|
@@ -133,9 +133,34 @@ PDFs with charts, tables, or diagrams can optionally add local VLM-generated cap
|
|
|
133
133
|
npx mcp-local-rag ingest ./docs/spec.pdf --visual
|
|
134
134
|
```
|
|
135
135
|
|
|
136
|
-
|
|
136
|
+
Each caption is emitted as its own chunk with the envelope `[Visual content on page N: …]`, alongside the page-body chunks. It flows through the existing embedder and FTS index — no schema differences, no separate index.
|
|
137
137
|
|
|
138
|
-
Visual mode is opt-in; normal ingest does not load the VLM.
|
|
138
|
+
Visual mode is opt-in; normal ingest does not load the VLM. Per-page VLM failures are tolerated — that page proceeds with text only.
|
|
139
|
+
|
|
140
|
+
###### Choosing a visual-quality profile
|
|
141
|
+
|
|
142
|
+
Visual mode offers two profiles, selected per ingest call:
|
|
143
|
+
|
|
144
|
+
| Profile | Model | Disk (cache) | Per-page inference | Suited for |
|
|
145
|
+
|---|---|---|---|---|
|
|
146
|
+
| `fast` (default) | `HuggingFaceTB/SmolVLM-256M-Instruct` | ~250 MB | baseline | Light visual indexing, quick first-run setup. |
|
|
147
|
+
| `quality` | `onnx-community/Qwen2.5-VL-3B-Instruct-ONNX` | ~2.9 GB | ~2× `fast` | Figures with in-image text (axis labels, panel sub-labels, annotations) where caption fidelity matters more than inference time. |
|
|
148
|
+
|
|
149
|
+
The numbers above are measured on CPU during development on the project's probe PDFs; they may shift with model updates or differ on your hardware.
|
|
150
|
+
|
|
151
|
+
**Via MCP** — `ingest_file` accepts an optional `visualQuality` parameter (enum: `'fast' | 'quality'`, default `'fast'`; ignored when `visual` is false):
|
|
152
|
+
```
|
|
153
|
+
"Ingest /Users/me/docs/research-paper.pdf with visual: true and visualQuality: 'quality'"
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
**Via CLI** — `--visual-quality fast|quality` (default `fast`; silently ignored when `--visual` is absent):
|
|
157
|
+
```bash
|
|
158
|
+
npx mcp-local-rag ingest ./docs/research-paper.pdf --visual --visual-quality quality
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Profile model identifiers and quantization variants are fixed per release. Both profiles share the same `CACHE_DIR` (default: `./models/`); the first run on each profile downloads its model.
|
|
162
|
+
|
|
163
|
+
> **Behavior change from v0.14.0**: Captions are now emitted as dedicated chunks rather than appended to the page text before chunking. As a side effect, `metadata.fileSize` for visual ingests no longer includes the caption character count — it measures the post-extraction body length only. The underlying PDF is unchanged; only the reported `fileSize` for visual-ingested PDFs may shrink across the release boundary.
|
|
139
164
|
|
|
140
165
|
> **Security note**: Visual captions are derived from PDF contents and may inherit attacker-controlled text. Downstream LLM consumers should treat retrieved chunks as untrusted data, not as instructions. The `[Visual content on page N: …]` envelope helps consumers distinguish caption text from prose.
|
|
141
166
|
|
|
@@ -181,7 +206,7 @@ Pass the `filePath` and `chunkIndex` from the search result. The response includ
|
|
|
181
206
|
#### Managing Files
|
|
182
207
|
|
|
183
208
|
```
|
|
184
|
-
"List all files in
|
|
209
|
+
"List all files in configured base directories and their ingested status" # See what's indexed
|
|
185
210
|
"Delete old-spec.pdf from RAG" # Remove a file
|
|
186
211
|
"Show RAG server status" # Check system health
|
|
187
212
|
```
|
|
@@ -212,6 +237,15 @@ npx mcp-local-rag delete --source "https://..." # Remove by source URL
|
|
|
212
237
|
npx mcp-local-rag --db-path ./my-db query "auth" --base-dir ./docs
|
|
213
238
|
```
|
|
214
239
|
|
|
240
|
+
The `--base-dir` flag is repeatable on `ingest` and `list`; pass it once per root:
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
npx mcp-local-rag ingest --base-dir ./docs --base-dir ./specs ./docs/readme.md
|
|
244
|
+
npx mcp-local-rag list --base-dir ./docs --base-dir ./specs
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
The positional path to `ingest` must sit inside one of the configured roots. When at least one `--base-dir` is supplied, CLI roots replace any env-var roots (no merge).
|
|
248
|
+
|
|
215
249
|
**Environment variables** — set in your shell:
|
|
216
250
|
|
|
217
251
|
```bash
|
|
@@ -220,6 +254,13 @@ export BASE_DIR=./docs
|
|
|
220
254
|
npx mcp-local-rag query "auth"
|
|
221
255
|
```
|
|
222
256
|
|
|
257
|
+
For multiple roots, use `BASE_DIRS` (JSON array of non-empty path strings):
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
export BASE_DIRS='["/Users/me/Documents/work","/Users/me/Projects/specs"]'
|
|
261
|
+
npx mcp-local-rag list
|
|
262
|
+
```
|
|
263
|
+
|
|
223
264
|
**Sharing config between MCP and CLI** — if your MCP client inherits shell environment variables, you can set them in your shell profile (e.g., `~/.zshrc`) so both use the same values. Otherwise, set them explicitly in your MCP config as well.
|
|
224
265
|
|
|
225
266
|
```bash
|
|
@@ -334,7 +375,8 @@ The MCP server is configured by environment variables only — pass them through
|
|
|
334
375
|
|
|
335
376
|
| Environment Variable | CLI Flag | Default | Description |
|
|
336
377
|
|---------------------|----------|---------|-------------|
|
|
337
|
-
| `BASE_DIR` | `--base-dir` | Current directory |
|
|
378
|
+
| `BASE_DIR` | `--base-dir` (repeatable) | Current directory | Single document root directory (security boundary). See [Document Roots](#document-roots-base_dir-and-base_dirs) for multi-root setup. |
|
|
379
|
+
| `BASE_DIRS` | — | (unset) | JSON array of document roots (security boundary). Takes precedence over `BASE_DIR`. See [Document Roots](#document-roots-base_dir-and-base_dirs). |
|
|
338
380
|
| `DB_PATH` | `--db-path` | `./lancedb/` | Vector database location |
|
|
339
381
|
| `CACHE_DIR` | `--cache-dir` | `./models/` | Model cache directory |
|
|
340
382
|
| `MODEL_NAME` | `--model-name` | `Xenova/all-MiniLM-L6-v2` | HuggingFace model ID ([available models](https://huggingface.co/models?library=transformers.js&pipeline_tag=feature-extraction)) |
|
|
@@ -349,6 +391,89 @@ The MCP server is configured by environment variables only — pass them through
|
|
|
349
391
|
|
|
350
392
|
⚠️ Changing `MODEL_NAME` changes embedding dimensions. Delete `DB_PATH` and re-ingest after switching models.
|
|
351
393
|
|
|
394
|
+
### Document Roots (`BASE_DIR` and `BASE_DIRS`)
|
|
395
|
+
|
|
396
|
+
mcp-local-rag enforces a security boundary: only files under a configured root are accessible to ingest, list, delete, or read-neighbor operations.
|
|
397
|
+
|
|
398
|
+
**Single root** — use `BASE_DIR`:
|
|
399
|
+
|
|
400
|
+
```bash
|
|
401
|
+
export BASE_DIR=/Users/me/Documents/work
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
**Multiple roots** — use `BASE_DIRS` with a JSON array:
|
|
405
|
+
|
|
406
|
+
```bash
|
|
407
|
+
export BASE_DIRS='["/Users/me/Documents/work","/Users/me/Projects/specs"]'
|
|
408
|
+
```
|
|
409
|
+
|
|
410
|
+
Only JSON-array syntax is supported. Delimiter syntax such as `BASE_DIRS=/a:/b` is intentionally **not** supported (avoids ambiguity with spaces, colons, commas, and Windows paths).
|
|
411
|
+
|
|
412
|
+
**Resolution order** (highest precedence first):
|
|
413
|
+
|
|
414
|
+
1. CLI `--base-dir <path>` flags (repeatable on `ingest` and `list`)
|
|
415
|
+
2. `BASE_DIRS` environment variable
|
|
416
|
+
3. `BASE_DIR` environment variable
|
|
417
|
+
4. `process.cwd()` (current working directory)
|
|
418
|
+
|
|
419
|
+
CLI roots **replace** env roots — they are never merged. `BASE_DIRS` and `BASE_DIR` are never merged either: `BASE_DIRS` wins when both are set.
|
|
420
|
+
|
|
421
|
+
**Precedence warning** — when `BASE_DIRS` and `BASE_DIR` are both set (and no CLI `--base-dir` is supplied), `BASE_DIR` is ignored and a warning is surfaced. The warning is visible:
|
|
422
|
+
|
|
423
|
+
- In MCP tool responses (as an additional content block, on every tool — including `status`, `query_documents`, `ingest_file`, `ingest_data`, `list_files`, `delete_file`, `read_chunk_neighbors`).
|
|
424
|
+
- On CLI `stderr`.
|
|
425
|
+
|
|
426
|
+
Unset `BASE_DIR` (or remove `BASE_DIRS`) to silence the warning.
|
|
427
|
+
|
|
428
|
+
**Nested-root pruning** — if one configured root sits inside another after realpath resolution, the nested child is dropped to avoid duplicate scan results. A pruning warning is surfaced the same way as the precedence warning. The surviving parent root still defines the security boundary.
|
|
429
|
+
|
|
430
|
+
**Invalid `BASE_DIRS`** — when `BASE_DIRS` is not a valid JSON array of non-empty strings (malformed JSON, empty array, non-string elements, ...), root-dependent MCP tools return a structured error and CLI subcommands exit non-zero. There is **no silent fallback** to `BASE_DIR` or `cwd`. The MCP `status` tool remains callable so you can diagnose the config error through your MCP client.
|
|
431
|
+
|
|
432
|
+
**MCP client examples** — multi-root setup:
|
|
433
|
+
|
|
434
|
+
Cursor (`~/.cursor/mcp.json`):
|
|
435
|
+
```json
|
|
436
|
+
{
|
|
437
|
+
"mcpServers": {
|
|
438
|
+
"local-rag": {
|
|
439
|
+
"command": "npx",
|
|
440
|
+
"args": ["-y", "mcp-local-rag"],
|
|
441
|
+
"env": {
|
|
442
|
+
"BASE_DIRS": "[\"/Users/me/Documents/work\",\"/Users/me/Projects/specs\"]"
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
Codex (`~/.codex/config.toml`):
|
|
450
|
+
```toml
|
|
451
|
+
[mcp_servers.local-rag]
|
|
452
|
+
command = "npx"
|
|
453
|
+
args = ["-y", "mcp-local-rag"]
|
|
454
|
+
|
|
455
|
+
[mcp_servers.local-rag.env]
|
|
456
|
+
BASE_DIRS = "[\"/Users/me/Documents/work\",\"/Users/me/Projects/specs\"]"
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
Claude Code:
|
|
460
|
+
```bash
|
|
461
|
+
claude mcp add local-rag --scope user \
|
|
462
|
+
--env BASE_DIRS='["/Users/me/Documents/work","/Users/me/Projects/specs"]' \
|
|
463
|
+
-- npx -y mcp-local-rag
|
|
464
|
+
```
|
|
465
|
+
|
|
466
|
+
**CLI examples** — multi-root invocations:
|
|
467
|
+
|
|
468
|
+
```bash
|
|
469
|
+
# Repeatable --base-dir
|
|
470
|
+
npx mcp-local-rag ingest --base-dir /Users/me/work --base-dir /Users/me/specs /Users/me/work/readme.md
|
|
471
|
+
npx mcp-local-rag list --base-dir /Users/me/work --base-dir /Users/me/specs
|
|
472
|
+
|
|
473
|
+
# Or via BASE_DIRS env
|
|
474
|
+
BASE_DIRS='["/Users/me/work","/Users/me/specs"]' npx mcp-local-rag list
|
|
475
|
+
```
|
|
476
|
+
|
|
352
477
|
### Client-Specific Setup
|
|
353
478
|
|
|
354
479
|
**Cursor** — Global: `~/.cursor/mcp.json`, Project: `.cursor/mcp.json`
|
|
@@ -392,9 +517,13 @@ The embedding model (~90MB) downloads on first use. Takes 1-2 minutes, then work
|
|
|
392
517
|
|
|
393
518
|
### Security
|
|
394
519
|
|
|
395
|
-
- **Path restriction**: Only files within `BASE_DIR` are accessible
|
|
520
|
+
- **Path restriction**: Only files within a configured root (`BASE_DIR` or any `BASE_DIRS` / `--base-dir` entry) are accessible. Symlinks resolving outside all configured roots, and sibling-prefix paths (e.g. `/foo/barista` for root `/foo/bar`), are rejected.
|
|
396
521
|
- **Local only**: No network requests after model download
|
|
397
|
-
- **Model
|
|
522
|
+
- **Model sources** (all official HuggingFace repositories):
|
|
523
|
+
- Embedder: [`Xenova/all-MiniLM-L6-v2`](https://huggingface.co/Xenova/all-MiniLM-L6-v2)
|
|
524
|
+
- Visual `fast` profile: [`HuggingFaceTB/SmolVLM-256M-Instruct`](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct)
|
|
525
|
+
- Visual `quality` profile: [`onnx-community/Qwen2.5-VL-3B-Instruct-ONNX`](https://huggingface.co/onnx-community/Qwen2.5-VL-3B-Instruct-ONNX)
|
|
526
|
+
- **Visual caption fidelity**: The `quality` profile reproduces in-image text more faithfully than `fast`. Both profiles output captions wrapped as `[Visual content on page N: …]`, but a faithful reproduction means attacker-controlled in-image text — including characters like `]` that visually close the envelope — can appear verbatim in retrieved chunks. Downstream LLM consumers should treat retrieved chunks as untrusted data, not as instructions, regardless of envelope shape.
|
|
398
527
|
|
|
399
528
|
<details>
|
|
400
529
|
<summary><strong>Performance</strong></summary>
|
|
@@ -436,7 +565,18 @@ Check chunk count with `status`. Large documents with many chunks may slow queri
|
|
|
436
565
|
|
|
437
566
|
### "Path outside BASE_DIR"
|
|
438
567
|
|
|
439
|
-
Ensure file paths are within `BASE_DIR`. Use absolute paths.
|
|
568
|
+
Ensure file paths are within one of the configured roots (`BASE_DIR`, any `BASE_DIRS` entry, or any CLI `--base-dir`). Use absolute paths.
|
|
569
|
+
|
|
570
|
+
### "BASE_DIRS must be a JSON array..."
|
|
571
|
+
|
|
572
|
+
`BASE_DIRS` accepts only a JSON array of one or more non-empty path strings. Examples:
|
|
573
|
+
|
|
574
|
+
- Valid: `BASE_DIRS='["/Users/me/work","/Users/me/specs"]'`
|
|
575
|
+
- Invalid: `BASE_DIRS=/a:/b` (delimiter syntax not supported)
|
|
576
|
+
- Invalid: `BASE_DIRS='[]'` (empty array)
|
|
577
|
+
- Invalid: `BASE_DIRS='["",""]'` (empty string element)
|
|
578
|
+
|
|
579
|
+
When invalid, root-dependent operations fail with a clear error rather than silently falling back. The MCP `status` tool remains callable so you can inspect the diagnostic.
|
|
440
580
|
|
|
441
581
|
### MCP client doesn't see tools
|
|
442
582
|
|
|
@@ -453,7 +593,7 @@ Ensure file paths are within `BASE_DIR`. Use absolute paths.
|
|
|
453
593
|
Yes. After model download, nothing leaves your machine. Verify with network monitoring.
|
|
454
594
|
|
|
455
595
|
**Can I use this offline?**
|
|
456
|
-
Yes, after the required models are cached locally. Text ingest/search needs the embedding model. PDF visual mode is opt-in and also needs the VLM model on first use; the download is
|
|
596
|
+
Yes, after the required models are cached locally. Text ingest/search needs the embedding model. PDF visual mode is opt-in and also needs the VLM model on first use; the download is ~250 MB for the default `fast` profile (SmolVLM-256M) or ~2.9 GB for the `quality` profile (Qwen2.5-VL-3B), cached under `CACHE_DIR` (default: `./models/`).
|
|
457
597
|
|
|
458
598
|
**How does this compare to cloud RAG?**
|
|
459
599
|
Cloud services offer better accuracy at scale but require sending data externally. This trades some accuracy for complete privacy and zero runtime cost.
|
package/dist/cli/common.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { Embedder } from '../embedder/index.js';
|
|
2
|
+
import { type BaseDirsConfig, type BaseDirsConfigWarning } from '../utils/base-dirs.js';
|
|
2
3
|
import { VectorStore } from '../vectordb/index.js';
|
|
3
4
|
import { type ResolvedGlobalConfig } from './options.js';
|
|
4
5
|
/**
|
|
@@ -11,4 +12,40 @@ export declare function createVectorStore(config: ResolvedGlobalConfig): VectorS
|
|
|
11
12
|
* Callers are responsible for managing the Embedder lifecycle.
|
|
12
13
|
*/
|
|
13
14
|
export declare function createEmbedder(config: ResolvedGlobalConfig): Embedder;
|
|
15
|
+
/**
|
|
16
|
+
* Result of {@link resolveCliBaseDirsOrExit}. Resolution warnings travel with
|
|
17
|
+
* the config so subcommands can render them per their own UI contract (CLI
|
|
18
|
+
* subcommands generally write them to stderr).
|
|
19
|
+
*/
|
|
20
|
+
export interface CliBaseDirsResolution {
|
|
21
|
+
config: BaseDirsConfig;
|
|
22
|
+
warnings: BaseDirsConfigWarning[];
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Resolve effective base directories for a CLI subcommand using the shared
|
|
26
|
+
* resolver, surfacing any configuration error as a process-level failure.
|
|
27
|
+
*
|
|
28
|
+
* Inputs (single source of truth for CLI precedence — kept here so per-
|
|
29
|
+
* subcommand entry points don't each replicate the env-fallback chain):
|
|
30
|
+
* - `cliRoots`: repeated `--base-dir` flag values in CLI order. When non-
|
|
31
|
+
* empty, REPLACES env roots — no merge.
|
|
32
|
+
* - `process.env['BASE_DIRS']`: JSON array, used only when CLI roots are
|
|
33
|
+
* absent.
|
|
34
|
+
* - `process.env['BASE_DIR']`: single path, used only when CLI roots and
|
|
35
|
+
* `BASE_DIRS` are absent.
|
|
36
|
+
* - `process.cwd()`: final fallback.
|
|
37
|
+
*
|
|
38
|
+
* Failure mode: a `BaseDirsConfigError` (invalid `BASE_DIRS` JSON, missing
|
|
39
|
+
* directory, not-a-directory, ...) is reported to stderr and exits with
|
|
40
|
+
* code 1. This is intentional: the resolver explicitly does NOT fall back
|
|
41
|
+
* (see §Technical Decisions → Resolution order in the multi-base-dirs
|
|
42
|
+
* plan), so CLI consumers should fail fast rather than silently degrading
|
|
43
|
+
* to `cwd`.
|
|
44
|
+
*
|
|
45
|
+
* Warnings (`base-dirs-overrides-base-dir`, `nested-root-pruned`) are
|
|
46
|
+
* returned to the caller rather than written here, so each subcommand can
|
|
47
|
+
* decide its own rendering (JSON-output subcommands like `list` may need
|
|
48
|
+
* to keep stderr clean even when warnings are present).
|
|
49
|
+
*/
|
|
50
|
+
export declare function resolveCliBaseDirsOrExit(cliRoots: string[]): Promise<CliBaseDirsResolution>;
|
|
14
51
|
//# sourceMappingURL=common.d.ts.map
|
package/dist/cli/common.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"common.d.ts","sourceRoot":"","sources":["../../src/cli/common.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"common.d.ts","sourceRoot":"","sources":["../../src/cli/common.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAA;AAC/C,OAAO,EACL,KAAK,cAAc,EACnB,KAAK,qBAAqB,EAG3B,MAAM,uBAAuB,CAAA;AAE9B,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAClD,OAAO,EAAE,KAAK,oBAAoB,EAA+B,MAAM,cAAc,CAAA;AAErF;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,oBAAoB,GAAG,WAAW,CAK3E;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,oBAAoB,GAAG,QAAQ,CAOrE;AAED;;;;GAIG;AACH,MAAM,WAAW,qBAAqB;IACpC,MAAM,EAAE,cAAc,CAAA;IACtB,QAAQ,EAAE,qBAAqB,EAAE,CAAA;CAClC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,wBAAsB,wBAAwB,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAqDjG"}
|
package/dist/cli/common.js
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
// Shared CLI component helpers — factory functions for VectorStore and Embedder
|
|
2
|
+
// plus base-directory resolution shared by every subcommand that scans files.
|
|
2
3
|
import { Embedder } from '../embedder/index.js';
|
|
4
|
+
import { parseBaseDirsEnv, resolveBaseDirs, } from '../utils/base-dirs.js';
|
|
5
|
+
import { checkSensitivePath } from '../utils/sensitive-path.js';
|
|
3
6
|
import { VectorStore } from '../vectordb/index.js';
|
|
4
|
-
import { resolveDevice } from './options.js';
|
|
7
|
+
import { resolveDevice, validatePath } from './options.js';
|
|
5
8
|
/**
|
|
6
9
|
* Create an uninitialized VectorStore from resolved global config.
|
|
7
10
|
* Callers are responsible for calling initialize() before use.
|
|
@@ -24,4 +27,81 @@ export function createEmbedder(config) {
|
|
|
24
27
|
device: resolveDevice(process.env['RAG_DEVICE']),
|
|
25
28
|
});
|
|
26
29
|
}
|
|
30
|
+
/**
|
|
31
|
+
* Resolve effective base directories for a CLI subcommand using the shared
|
|
32
|
+
* resolver, surfacing any configuration error as a process-level failure.
|
|
33
|
+
*
|
|
34
|
+
* Inputs (single source of truth for CLI precedence — kept here so per-
|
|
35
|
+
* subcommand entry points don't each replicate the env-fallback chain):
|
|
36
|
+
* - `cliRoots`: repeated `--base-dir` flag values in CLI order. When non-
|
|
37
|
+
* empty, REPLACES env roots — no merge.
|
|
38
|
+
* - `process.env['BASE_DIRS']`: JSON array, used only when CLI roots are
|
|
39
|
+
* absent.
|
|
40
|
+
* - `process.env['BASE_DIR']`: single path, used only when CLI roots and
|
|
41
|
+
* `BASE_DIRS` are absent.
|
|
42
|
+
* - `process.cwd()`: final fallback.
|
|
43
|
+
*
|
|
44
|
+
* Failure mode: a `BaseDirsConfigError` (invalid `BASE_DIRS` JSON, missing
|
|
45
|
+
* directory, not-a-directory, ...) is reported to stderr and exits with
|
|
46
|
+
* code 1. This is intentional: the resolver explicitly does NOT fall back
|
|
47
|
+
* (see §Technical Decisions → Resolution order in the multi-base-dirs
|
|
48
|
+
* plan), so CLI consumers should fail fast rather than silently degrading
|
|
49
|
+
* to `cwd`.
|
|
50
|
+
*
|
|
51
|
+
* Warnings (`base-dirs-overrides-base-dir`, `nested-root-pruned`) are
|
|
52
|
+
* returned to the caller rather than written here, so each subcommand can
|
|
53
|
+
* decide its own rendering (JSON-output subcommands like `list` may need
|
|
54
|
+
* to keep stderr clean even when warnings are present).
|
|
55
|
+
*/
|
|
56
|
+
export async function resolveCliBaseDirsOrExit(cliRoots) {
|
|
57
|
+
// Screen the raw env-supplied paths before the resolver realpath-
|
|
58
|
+
// normalizes them, so a literal `BASE_DIR=/etc` is rejected with the
|
|
59
|
+
// env var as the attribution surface.
|
|
60
|
+
if (cliRoots.length === 0) {
|
|
61
|
+
if (process.env['BASE_DIRS'] !== undefined && process.env['BASE_DIRS'].length > 0) {
|
|
62
|
+
const parsed = parseBaseDirsEnv(process.env['BASE_DIRS']);
|
|
63
|
+
if (parsed.ok) {
|
|
64
|
+
for (const raw of parsed.value) {
|
|
65
|
+
const sensitive = checkSensitivePath(raw, 'BASE_DIRS');
|
|
66
|
+
if (sensitive) {
|
|
67
|
+
console.error(sensitive);
|
|
68
|
+
process.exit(1);
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
// Malformed BASE_DIRS surfaces below via resolveBaseDirs.
|
|
73
|
+
}
|
|
74
|
+
else if (process.env['BASE_DIR'] !== undefined && process.env['BASE_DIR'].trim().length > 0) {
|
|
75
|
+
const sensitive = checkSensitivePath(process.env['BASE_DIR'], 'BASE_DIR');
|
|
76
|
+
if (sensitive) {
|
|
77
|
+
console.error(sensitive);
|
|
78
|
+
process.exit(1);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
const result = await resolveBaseDirs({
|
|
83
|
+
cliRoots,
|
|
84
|
+
envBaseDirs: process.env['BASE_DIRS'],
|
|
85
|
+
envBaseDir: process.env['BASE_DIR'],
|
|
86
|
+
cwd: process.cwd(),
|
|
87
|
+
});
|
|
88
|
+
if (!result.ok) {
|
|
89
|
+
console.error(result.error.message);
|
|
90
|
+
process.exit(1);
|
|
91
|
+
}
|
|
92
|
+
// Apply the sensitive-path policy uniformly to every effective root
|
|
93
|
+
// (CLI, env, or cwd). Pre-multi-root code validated `BASE_DIR` here; the
|
|
94
|
+
// same policy must continue to apply to `BASE_DIRS` entries and to CLI
|
|
95
|
+
// roots that pre-validation in the subcommand may have missed (e.g.
|
|
96
|
+
// realpath-resolved targets of symlinks). Reported under `--base-dir`
|
|
97
|
+
// because that is the flag the user most directly controls.
|
|
98
|
+
for (const root of result.config.baseDirs) {
|
|
99
|
+
const sensitive = validatePath(root, '--base-dir');
|
|
100
|
+
if (sensitive) {
|
|
101
|
+
console.error(sensitive);
|
|
102
|
+
process.exit(1);
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
return { config: result.config, warnings: result.warnings };
|
|
106
|
+
}
|
|
27
107
|
//# sourceMappingURL=common.js.map
|
package/dist/cli/common.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"common.js","sourceRoot":"","sources":["../../src/cli/common.ts"],"names":[],"mappings":"AAAA,gFAAgF;
|
|
1
|
+
{"version":3,"file":"common.js","sourceRoot":"","sources":["../../src/cli/common.ts"],"names":[],"mappings":"AAAA,gFAAgF;AAChF,8EAA8E;AAE9E,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAA;AAC/C,OAAO,EAGL,gBAAgB,EAChB,eAAe,GAChB,MAAM,uBAAuB,CAAA;AAC9B,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAA;AAC/D,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAClD,OAAO,EAA6B,aAAa,EAAE,YAAY,EAAE,MAAM,cAAc,CAAA;AAErF;;;GAGG;AACH,MAAM,UAAU,iBAAiB,CAAC,MAA4B;IAC5D,OAAO,IAAI,WAAW,CAAC;QACrB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,SAAS,EAAE,QAAQ;KACpB,CAAC,CAAA;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,MAA4B;IACzD,OAAO,IAAI,QAAQ,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,EAAE;QACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;QACzB,MAAM,EAAE,aAAa,CAAC,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;KACjD,CAAC,CAAA;AACJ,CAAC;AAYD;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,MAAM,CAAC,KAAK,UAAU,wBAAwB,CAAC,QAAkB;IAC/D,kEAAkE;IAClE,qEAAqE;IACrE,sCAAsC;IACtC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,IAAI,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,KAAK,SAAS,IAAI,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAClF,MAAM,MAAM,GAAG,gBAAgB,CAAC,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,CAAA;YACzD,IAAI,MAAM,CAAC,EAAE,EAAE,CAAC;gBACd,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;oBAC/B,MAAM,SAAS,GAAG,kBAAkB,CAAC,GAAG,EAAE,WAAW,CAAC,CAAA;oBACtD,IAAI,SAAS,EAAE,CAAC;wBACd,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;wBACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;oBACjB,CAAC;gBACH,CAAC;YACH,CAAC;YACD,0DAA0D;QAC5D,CAAC;aAAM,IAAI,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,KAAK,SAAS,IAAI,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC9F,MAAM,SAAS,GAAG,kBAAkB,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,UAAU,CAAC,CAAA;YACzE,IAAI,SAAS,EAAE,CAAC;gBACd,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;gBACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACjB,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC;QACnC,QAAQ;QACR,WAAW,EAAE,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC;QACrC,UAAU,EAAE,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;QACnC,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE;KACnB,CAAC,CAAA;IAEF,IAAI,CAAC,MAAM,CAAC,EAAE,EA
AE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAA;QACnC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,oEAAoE;IACpE,yEAAyE;IACzE,uEAAuE;IACvE,oEAAoE;IACpE,sEAAsE;IACtE,4DAA4D;IAC5D,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC;QAC1C,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,EAAE,YAAY,CAAC,CAAA;QAClD,IAAI,SAAS,EAAE,CAAC;YACd,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;YACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACjB,CAAC;IACH,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAA;AAC7D,CAAC"}
|
package/dist/cli/delete.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// CLI delete subcommand — delete ingested content by file path or source URL
|
|
2
2
|
import { unlink } from 'node:fs/promises';
|
|
3
3
|
import { resolve } from 'node:path';
|
|
4
|
-
import { generateMetaJsonPath, generateRawDataPath,
|
|
4
|
+
import { generateMetaJsonPath, generateRawDataPath, isPathInRawDataDirLexical, } from '../utils/raw-data-utils.js';
|
|
5
5
|
import { createVectorStore } from './common.js';
|
|
6
6
|
import { resolveGlobalConfig, validatePath } from './options.js';
|
|
7
7
|
// ============================================
|
|
@@ -120,8 +120,8 @@ export async function runDelete(args, globalOptions = {}) {
|
|
|
120
120
|
}
|
|
121
121
|
// Delete chunks from VectorStore
|
|
122
122
|
await vectorStore.deleteChunks(targetPath);
|
|
123
|
-
// Clean up physical raw-data files if applicable
|
|
124
|
-
if (
|
|
123
|
+
// Clean up physical raw-data files if applicable.
|
|
124
|
+
if (isPathInRawDataDirLexical(targetPath, globalConfig.dbPath)) {
|
|
125
125
|
try {
|
|
126
126
|
await unlink(targetPath);
|
|
127
127
|
}
|
package/dist/cli/delete.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"delete.js","sourceRoot":"","sources":["../../src/cli/delete.ts"],"names":[],"mappings":"AAAA,6EAA6E;AAE7E,OAAO,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAA;AACzC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AACnC,OAAO,EACL,oBAAoB,EACpB,mBAAmB,EACnB,
|
|
1
|
+
{"version":3,"file":"delete.js","sourceRoot":"","sources":["../../src/cli/delete.ts"],"names":[],"mappings":"AAAA,6EAA6E;AAE7E,OAAO,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAA;AACzC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AACnC,OAAO,EACL,oBAAoB,EACpB,mBAAmB,EACnB,yBAAyB,GAC1B,MAAM,4BAA4B,CAAA;AACnC,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAAE,mBAAmB,EAAE,YAAY,EAAE,MAAM,cAAc,CAAA;AAEhE,+CAA+C;AAC/C,OAAO;AACP,+CAA+C;AAE/C,MAAM,SAAS,GAAG;;;;;;;;;;;;;;;;yCAgBuB,CAAA;AAYzC;;;;GAIG;AACH,SAAS,SAAS,CAAC,IAAc;IAC/B,IAAI,IAAI,GAAG,KAAK,CAAA;IAChB,IAAI,MAA0B,CAAA;IAC9B,IAAI,QAA4B,CAAA;IAEhC,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAE,CAAA;QAEpB,IAAI,GAAG,KAAK,IAAI,IAAI,GAAG,KAAK,QAAQ,EAAE,CAAC;YACrC,IAAI,GAAG,IAAI,CAAA;YACX,CAAC,EAAE,CAAA;QACL,CAAC;aAAM,IAAI,GAAG,KAAK,UAAU,EAAE,CAAC;YAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;YACvB,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBACjD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAA;gBAC3C,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;gBACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACjB,CAAC;YACD,MAAM,GAAG,KAAK,CAAA;YACd,CAAC,EAAE,CAAA;QACL,CAAC;aAAM,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/B,OAAO,CAAC,KAAK,CAAC,mBAAmB,GAAG,EAAE,CAAC,CAAA;YACvC,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;YACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACjB,CAAC;aAAM,CAAC;YACN,iCAAiC;YACjC,QAAQ,GAAG,GAAG,CAAA;YACd,CAAC,EAAE,CAAA;QACL,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAe,EAAE,IAAI,EAAE,CAAA;IACnC,IAAI,MAAM,KAAK,SAAS;QAAE,MAAM,CAAC,MAAM,GAAG,MAAM,CAAA;IAChD,IAAI,QAAQ,KAAK,SAAS;QAAE,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAA;IACtD,OAAO,MAAM,CAAA;AACf,CAAC;AAED,+CAA+C;AAC/C,mBAAmB;AACnB,+CAA+C;AAE/C;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,IAAc,EAAE,gBAA+B,EAAE;IAC/E,oBAAoB;IACpB,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAA;IAE9B,gBAAgB;IAChB,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,4DAA4D;IAC5D,IAAI,CAAC,MAAM,CAAC,QAAQ,IAAI,CAAC,MAA
M,CAAC,MAAM,EAAE,CAAC;QACvC,OAAO,CAAC,KAAK,CAAC,4CAA4C,CAAC,CAAA;QAC3D,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,IAAI,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;QACrC,OAAO,CAAC,KAAK,CAAC,8CAA8C,CAAC,CAAA;QAC7D,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,wBAAwB;IACxB,MAAM,YAAY,GAAG,mBAAmB,CAAC,aAAa,CAAC,CAAA;IAEvD,IAAI,CAAC;QACH,oEAAoE;QACpE,MAAM,WAAW,GAAG,iBAAiB,CAAC,YAAY,CAAC,CAAA;QACnD,MAAM,WAAW,CAAC,UAAU,EAAE,CAAA;QAE9B,6BAA6B;QAC7B,IAAI,UAAkB,CAAA;QAEtB,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClB,yCAAyC;YACzC,UAAU,GAAG,mBAAmB,CAAC,YAAY,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;QAClF,CAAC;aAAM,CAAC;YACN,8CAA8C;YAC9C,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,QAAS,CAAC,CAAA;YAEtC,sDAAsD;YACtD,MAAM,SAAS,GAAG,YAAY,CAAC,UAAU,EAAE,aAAa,CAAC,CAAA;YACzD,IAAI,SAAS,EAAE,CAAC;gBACd,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;gBACxB,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAA;gBACpB,OAAM;YACR,CAAC;QACH,CAAC;QAED,iCAAiC;QACjC,MAAM,WAAW,CAAC,YAAY,CAAC,UAAU,CAAC,CAAA;QAE1C,kDAAkD;QAClD,IAAI,yBAAyB,CAAC,UAAU,EAAE,YAAY,CAAC,MAAM,CAAC,EAAE,CAAC;YAC/D,IAAI,CAAC;gBACH,MAAM,MAAM,CAAC,UAAU,CAAC,CAAA;YAC1B,CAAC;YAAC,OAAO,KAAc,EAAE,CAAC;gBACxB,uDAAuD;gBACvD,IACE,CAAC,CAAC,KAAK,YAAY,KAAK,CAAC;oBACzB,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC;oBACjB,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAClD,CAAC;oBACD,MAAM,KAAK,CAAA;gBACb,CAAC;YACH,CAAC;YAED,IAAI,CAAC;gBACH,MAAM,MAAM,CAAC,oBAAoB,CAAC,UAAU,CAAC,CAAC,CAAA;YAChD,CAAC;YAAC,OAAO,KAAc,EAAE,CAAC;gBACxB,gBAAgB;gBAChB,IACE,CAAC,CAAC,KAAK,YAAY,KAAK,CAAC;oBACzB,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC;oBACjB,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAClD,CAAC;oBACD,MAAM,KAAK,CAAA;gBACb,CAAC;YACH,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,MAAM,WAAW,CAAC,QAAQ,EAAE,CAAA;QAE5B,+BAA+B;QAC/B,MAAM,MAAM,GAAG;YACb,QAAQ,EAAE,UAAU;YACpB,OAAO,EAAE,IAAI;YACb,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAA;QACD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAA;IAC9C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,MAAM,GAAG,KAAK,YAAY,KAAK,CAAC
,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QACrE,OAAO,CAAC,KAAK,CAAC,UAAU,MAAM,EAAE,CAAC,CAAA;QACjC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;AACH,CAAC"}
|
package/dist/cli/ingest.d.ts
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import { SemanticChunker } from '../chunker/index.js';
|
|
2
2
|
import type { Embedder } from '../embedder/index.js';
|
|
3
3
|
import { DocumentParser } from '../parser/index.js';
|
|
4
|
+
import type { QualityProfile } from '../pdf-visual/types.js';
|
|
5
|
+
import type { BaseDirsConfig, BaseDirsConfigWarning } from '../utils/base-dirs.js';
|
|
4
6
|
import type { VectorStore } from '../vectordb/index.js';
|
|
5
7
|
import type { GlobalOptions, ResolvedGlobalConfig } from './options.js';
|
|
6
8
|
interface IngestConfig {
|
|
7
|
-
|
|
9
|
+
baseDirs: BaseDirsConfig;
|
|
10
|
+
baseDirsWarnings: BaseDirsConfigWarning[];
|
|
8
11
|
dbPath: string;
|
|
9
12
|
cacheDir: string;
|
|
10
13
|
modelName: string;
|
|
@@ -12,10 +15,21 @@ interface IngestConfig {
|
|
|
12
15
|
chunkMinLength?: number;
|
|
13
16
|
}
|
|
14
17
|
interface IngestCliOptions {
|
|
15
|
-
|
|
18
|
+
/**
|
|
19
|
+
* Collected `--base-dir` values in CLI order. Repeatable: each flag
|
|
20
|
+
* occurrence appends one entry. An empty array means the flag was not
|
|
21
|
+
* provided (resolver then falls through to env / cwd).
|
|
22
|
+
*/
|
|
23
|
+
baseDirs?: string[] | undefined;
|
|
16
24
|
maxFileSize?: number | undefined;
|
|
17
25
|
chunkMinLength?: number | undefined;
|
|
18
26
|
visual?: boolean | undefined;
|
|
27
|
+
/**
|
|
28
|
+
* Visual-quality profile selector. Only meaningful when `visual` is true;
|
|
29
|
+
* silently ignored otherwise (mirrors the existing `--visual` precedent
|
|
30
|
+
* of silently coercing for non-PDF files). Defaults to `'fast'`.
|
|
31
|
+
*/
|
|
32
|
+
visualQuality?: QualityProfile | undefined;
|
|
19
33
|
}
|
|
20
34
|
interface ParsedArgs {
|
|
21
35
|
positional: string | undefined;
|
|
@@ -30,27 +44,37 @@ interface ParsedArgs {
|
|
|
30
44
|
export declare function parseArgs(args: string[]): ParsedArgs;
|
|
31
45
|
/**
|
|
32
46
|
* Resolve ingest config by merging global config with ingest-specific options.
|
|
33
|
-
*
|
|
34
|
-
*
|
|
47
|
+
*
|
|
48
|
+
* Base directories are resolved via the shared CLI resolver
|
|
49
|
+
* ({@link resolveCliBaseDirsOrExit}) which applies the documented precedence
|
|
50
|
+
* (CLI roots > `BASE_DIRS` > `BASE_DIR` > `cwd`), realpath-normalizes every
|
|
51
|
+
* effective root, dedupes exact duplicates, and prunes nested roots. CLI
|
|
52
|
+
* roots are pre-validated against the sensitive-path policy here so the
|
|
53
|
+
* user sees `--base-dir`-attributed errors before the resolver touches the
|
|
54
|
+
* filesystem.
|
|
55
|
+
*
|
|
56
|
+
* Other ingest-specific values (maxFileSize, chunkMinLength) follow the
|
|
57
|
+
* existing CLI > env > defaults order and are validated against the same
|
|
58
|
+
* ranges as before.
|
|
35
59
|
*/
|
|
36
|
-
export declare function resolveConfig(globalConfig: ResolvedGlobalConfig, ingestOptions?: IngestCliOptions): IngestConfig
|
|
60
|
+
export declare function resolveConfig(globalConfig: ResolvedGlobalConfig, ingestOptions?: IngestCliOptions): Promise<IngestConfig>;
|
|
37
61
|
/**
|
|
38
|
-
* Options for `ingestSingleFile`. Discriminated on `visual` so the visual
|
|
39
|
-
* is type-only callable with the VLM config it actually needs:
|
|
62
|
+
* Options for `ingestSingleFile`. Discriminated on `visual` so the visual
|
|
63
|
+
* path is type-only callable with the VLM config it actually needs:
|
|
40
64
|
* - `visual` absent or `false` → no VLM fields required (and not accepted).
|
|
41
|
-
* - `visual: true` → `
|
|
65
|
+
* - `visual: true` → `profile` and `cacheDir` required; `device` optional.
|
|
42
66
|
*
|
|
43
|
-
* Why a union rather than always-required fields: making
|
|
44
|
-
*
|
|
45
|
-
*
|
|
46
|
-
*
|
|
47
|
-
*
|
|
67
|
+
* Why a union rather than always-required fields: making the VLM fields
|
|
68
|
+
* unconditionally required forces non-visual callers (default-mode tests,
|
|
69
|
+
* future direct-import callers that only ingest non-PDF files) to fabricate
|
|
70
|
+
* VLM config they will never use. The visual-true variant still catches
|
|
71
|
+
* accidental misuse at compile time, which was the original goal.
|
|
48
72
|
*/
|
|
49
73
|
export type IngestSingleFileOptions = {
|
|
50
74
|
visual?: false | undefined;
|
|
51
75
|
} | {
|
|
52
76
|
visual: true;
|
|
53
|
-
|
|
77
|
+
profile: QualityProfile;
|
|
54
78
|
cacheDir: string;
|
|
55
79
|
device?: string | undefined;
|
|
56
80
|
};
|
package/dist/cli/ingest.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../../src/cli/ingest.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAA;AACrD,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAA;AAGpD,OAAO,EAAE,cAAc,EAAwB,MAAM,oBAAoB,CAAA;AACzE,OAAO,KAAK,EAAe,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAEpE,OAAO,KAAK,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAA;
|
|
1
|
+
{"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../../src/cli/ingest.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAA;AACrD,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAA;AAGpD,OAAO,EAAE,cAAc,EAAwB,MAAM,oBAAoB,CAAA;AACzE,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAA;AAC5D,OAAO,KAAK,EAAE,cAAc,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAA;AAClF,OAAO,KAAK,EAAe,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAEpE,OAAO,KAAK,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAA;AAoBvE,UAAU,YAAY;IACpB,QAAQ,EAAE,cAAc,CAAA;IACxB,gBAAgB,EAAE,qBAAqB,EAAE,CAAA;IACzC,MAAM,EAAE,MAAM,CAAA;IACd,QAAQ,EAAE,MAAM,CAAA;IAChB,SAAS,EAAE,MAAM,CAAA;IACjB,WAAW,EAAE,MAAM,CAAA;IACnB,cAAc,CAAC,EAAE,MAAM,CAAA;CACxB;AAQD,UAAU,gBAAgB;IACxB;;;;OAIG;IACH,QAAQ,CAAC,EAAE,MAAM,EAAE,GAAG,SAAS,CAAA;IAC/B,WAAW,CAAC,EAAE,MAAM,GAAG,SAAS,CAAA;IAChC,cAAc,CAAC,EAAE,MAAM,GAAG,SAAS,CAAA;IACnC,MAAM,CAAC,EAAE,OAAO,GAAG,SAAS,CAAA;IAC5B;;;;OAIG;IACH,aAAa,CAAC,EAAE,cAAc,GAAG,SAAS,CAAA;CAC3C;AAED,UAAU,UAAU;IAClB,UAAU,EAAE,MAAM,GAAG,SAAS,CAAA;IAC9B,OAAO,EAAE,gBAAgB,CAAA;IACzB,IAAI,EAAE,OAAO,CAAA;CACd;AAmCD;;;;GAIG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,UAAU,CA+FpD;AAMD;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,aAAa,CACjC,YAAY,EAAE,oBAAoB,EAClC,aAAa,GAAE,gBAAqB,GACnC,OAAO,CAAC,YAAY,CAAC,CAyDvB;AAmHD;;;;;;;;;;;GAWG;AACH,MAAM,MAAM,uBAAuB,GAC/B;IAAE,MAAM,CAAC,EAAE,KAAK,GAAG,SAAS,CAAA;CAAE,GAC9B;IACE,MAAM,EAAE,IAAI,CAAA;IACZ,OAAO,EAAE,cAAc,CAAA;IACvB,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,CAAC,EAAE,MAAM,GAAG,SAAS,CAAA;CAC5B,CAAA;AAEL;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,gBAAgB,CACpC,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,cAAc,EACtB,OAAO,EAAE,eAAe,EACxB,QAAQ,EAAE,QAAQ,EAClB,WAAW,EAAE,WAAW,EACxB,OAAO,CAAC,EAAE,uBAAuB,GAChC,OAAO,CAAC,MAAM,CAAC,CAuGjB;AAMD;;;;GAIG;AACH,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,aAAa,GAAE,aAAkB,GAAG,OAAO,CAAC,IAAI,CAAC,CA0IhG"}
|