mcp-local-rag 0.8.2 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/dist/cli/common.d.ts +14 -0
- package/dist/cli/common.d.ts.map +1 -0
- package/dist/cli/common.js +25 -0
- package/dist/cli/common.js.map +1 -0
- package/dist/cli/delete.d.ts +8 -0
- package/dist/cli/delete.d.ts.map +1 -0
- package/dist/cli/delete.js +164 -0
- package/dist/cli/delete.js.map +1 -0
- package/dist/cli/ingest.d.ts +37 -0
- package/dist/cli/ingest.d.ts.map +1 -0
- package/dist/cli/ingest.js +345 -0
- package/dist/cli/ingest.js.map +1 -0
- package/dist/cli/list.d.ts +23 -0
- package/dist/cli/list.d.ts.map +1 -0
- package/dist/cli/list.js +142 -0
- package/dist/cli/list.js.map +1 -0
- package/dist/cli/options.d.ts +49 -0
- package/dist/cli/options.d.ts.map +1 -0
- package/dist/cli/options.js +193 -0
- package/dist/cli/options.js.map +1 -0
- package/dist/cli/query.d.ts +23 -0
- package/dist/cli/query.d.ts.map +1 -0
- package/dist/cli/query.js +162 -0
- package/dist/cli/query.js.map +1 -0
- package/dist/cli/status.d.ts +8 -0
- package/dist/cli/status.d.ts.map +1 -0
- package/dist/cli/status.js +78 -0
- package/dist/cli/status.js.map +1 -0
- package/dist/cli-main.d.ts +4 -2
- package/dist/cli-main.d.ts.map +1 -1
- package/dist/cli-main.js +24 -3
- package/dist/cli-main.js.map +1 -1
- package/dist/embedder/index.d.ts.map +1 -1
- package/dist/embedder/index.js +3 -1
- package/dist/embedder/index.js.map +1 -1
- package/dist/index.js +8 -4
- package/dist/index.js.map +1 -1
- package/dist/server/index.js +1 -1
- package/dist/server/index.js.map +1 -1
- package/dist/server/types.d.ts +1 -1
- package/dist/server/types.d.ts.map +1 -1
- package/dist/utils/raw-data-utils.d.ts +116 -0
- package/dist/utils/raw-data-utils.d.ts.map +1 -0
- package/dist/utils/raw-data-utils.js +202 -0
- package/dist/utils/raw-data-utils.js.map +1 -0
- package/package.json +3 -4
- package/skills/mcp-local-rag/SKILL.md +29 -13
- package/skills/mcp-local-rag/references/cli-reference.md +77 -0
- package/skills/mcp-local-rag/references/html-ingestion.md +11 -7
- package/skills/mcp-local-rag/references/query-optimization.md +1 -1
package/README.md
CHANGED
|
@@ -119,6 +119,25 @@ HTML is automatically cleaned—you get the article content, not the boilerplate
|
|
|
119
119
|
|
|
120
120
|
> **Note:** The RAG server itself doesn't fetch web content—your AI assistant retrieves it and passes the HTML to `ingest_data`. This keeps the server fully local while letting you index any content your assistant can access. Please respect website terms of service and copyright when ingesting external content.
|
|
121
121
|
|
|
122
|
+
### CLI Commands
|
|
123
|
+
|
|
124
|
+
All MCP tools are also available as CLI commands — no MCP server needed:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
npx mcp-local-rag ingest ./docs/ # Bulk ingest files
|
|
128
|
+
npx mcp-local-rag query "authentication API" # Search documents
|
|
129
|
+
npx mcp-local-rag list # Show ingestion status
|
|
130
|
+
npx mcp-local-rag status # Database stats
|
|
131
|
+
npx mcp-local-rag delete ./docs/old.pdf # Remove content
|
|
132
|
+
npx mcp-local-rag delete --source "https://..." # Remove by source URL
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Global options (`--db-path`, `--cache-dir`, `--model-name`) go before the subcommand. Run `npx mcp-local-rag --help` for details.
|
|
136
|
+
|
|
137
|
+
`query`, `list`, `status`, and `delete` output JSON to stdout for piping (e.g., `| jq`). `ingest` outputs progress to stderr.
|
|
138
|
+
|
|
139
|
+
> ⚠️ **CLI options must match your MCP server config.** Especially `--model-name` — using a different embedding model against an existing database produces incompatible vectors, silently degrading search quality.
|
|
140
|
+
|
|
122
141
|
### Searching Documents
|
|
123
142
|
|
|
124
143
|
```
|
|
@@ -407,6 +426,7 @@ pnpm run check:all # Full quality check
|
|
|
407
426
|
src/
|
|
408
427
|
index.ts # Entry point
|
|
409
428
|
server/ # MCP tool handlers
|
|
429
|
+
cli/ # CLI subcommands (ingest, query, list, status, delete)
|
|
410
430
|
parser/ # PDF, DOCX, TXT, MD parsing
|
|
411
431
|
chunker/ # Text splitting
|
|
412
432
|
embedder/ # Transformers.js embeddings
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { Embedder } from '../embedder/index.js';
|
|
2
|
+
import { VectorStore } from '../vectordb/index.js';
|
|
3
|
+
import type { ResolvedGlobalConfig } from './options.js';
|
|
4
|
+
/**
|
|
5
|
+
* Create an uninitialized VectorStore from resolved global config.
|
|
6
|
+
* Callers are responsible for calling initialize() before use.
|
|
7
|
+
*/
|
|
8
|
+
export declare function createVectorStore(config: ResolvedGlobalConfig): VectorStore;
|
|
9
|
+
/**
|
|
10
|
+
* Create an uninitialized Embedder from resolved global config.
|
|
11
|
+
* Callers are responsible for managing the Embedder lifecycle.
|
|
12
|
+
*/
|
|
13
|
+
export declare function createEmbedder(config: ResolvedGlobalConfig): Embedder;
|
|
14
|
+
//# sourceMappingURL=common.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"common.d.ts","sourceRoot":"","sources":["../../src/cli/common.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAA;AAC/C,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAClD,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAA;AAExD;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,oBAAoB,GAAG,WAAW,CAK3E;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,oBAAoB,GAAG,QAAQ,CAMrE"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
// Shared CLI component helpers — factory functions for VectorStore and Embedder
|
|
2
|
+
import { Embedder } from '../embedder/index.js';
|
|
3
|
+
import { VectorStore } from '../vectordb/index.js';
|
|
4
|
+
/**
 * Build a VectorStore that points at the configured database path and the
 * fixed 'chunks' table.
 *
 * The store is returned uninitialized: the caller must call initialize()
 * on it before use.
 *
 * @param config - Resolved global config; only `dbPath` is read here
 * @returns A new, uninitialized VectorStore
 */
export function createVectorStore(config) {
    const storeOptions = {
        dbPath: config.dbPath,
        tableName: 'chunks',
    };
    return new VectorStore(storeOptions);
}
|
|
14
|
+
/**
 * Build an Embedder for the configured model and cache directory, using a
 * fixed batch size of 16.
 *
 * The Embedder is returned as constructed; managing its lifecycle is left
 * to the caller.
 *
 * @param config - Resolved global config; `modelName` and `cacheDir` are read
 * @returns A new Embedder instance
 */
export function createEmbedder(config) {
    const { modelName, cacheDir } = config;
    return new Embedder({ modelPath: modelName, batchSize: 16, cacheDir });
}
|
|
25
|
+
//# sourceMappingURL=common.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"common.js","sourceRoot":"","sources":["../../src/cli/common.ts"],"names":[],"mappings":"AAAA,gFAAgF;AAEhF,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAA;AAC/C,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAGlD;;;GAGG;AACH,MAAM,UAAU,iBAAiB,CAAC,MAA4B;IAC5D,OAAO,IAAI,WAAW,CAAC;QACrB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,SAAS,EAAE,QAAQ;KACpB,CAAC,CAAA;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,MAA4B;IACzD,OAAO,IAAI,QAAQ,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,EAAE;QACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;KAC1B,CAAC,CAAA;AACJ,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { GlobalOptions } from './options.js';
|
|
2
|
+
/**
|
|
3
|
+
* Run the delete CLI subcommand.
|
|
4
|
+
* @param args - Arguments after "delete"
|
|
5
|
+
* @param globalOptions - Global options parsed before the subcommand
|
|
6
|
+
*/
|
|
7
|
+
export declare function runDelete(args: string[], globalOptions?: GlobalOptions): Promise<void>;
|
|
8
|
+
//# sourceMappingURL=delete.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"delete.d.ts","sourceRoot":"","sources":["../../src/cli/delete.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,cAAc,CAAA;AAkFjD;;;;GAIG;AACH,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,aAAa,GAAE,aAAkB,GAAG,OAAO,CAAC,IAAI,CAAC,CAiGhG"}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
// CLI delete subcommand — delete ingested content by file path or source URL
|
|
2
|
+
import { unlink } from 'node:fs/promises';
|
|
3
|
+
import { resolve } from 'node:path';
|
|
4
|
+
import { generateMetaJsonPath, generateRawDataPath, isRawDataPath, } from '../utils/raw-data-utils.js';
|
|
5
|
+
import { createVectorStore } from './common.js';
|
|
6
|
+
import { resolveGlobalConfig, validatePath } from './options.js';
|
|
7
|
+
// ============================================
|
|
8
|
+
// Help
|
|
9
|
+
// ============================================
|
|
10
|
+
const HELP_TEXT = `Usage: mcp-local-rag [global-options] delete [--source <url>] [<file-path>]
|
|
11
|
+
|
|
12
|
+
Delete ingested content by file path or source URL.
|
|
13
|
+
|
|
14
|
+
Either <file-path> or --source is required (not both).
|
|
15
|
+
|
|
16
|
+
Arguments:
|
|
17
|
+
<file-path> File path of ingested content to delete
|
|
18
|
+
|
|
19
|
+
Options:
|
|
20
|
+
--source <url> Delete by source URL (for content ingested via ingest_data)
|
|
21
|
+
-h, --help Show this help
|
|
22
|
+
|
|
23
|
+
Global options (must appear before "delete"):
|
|
24
|
+
--db-path <path> LanceDB database path
|
|
25
|
+
--cache-dir <path> Model cache directory
|
|
26
|
+
--model-name <name> Embedding model`;
|
|
27
|
+
/**
|
|
28
|
+
* Parse delete-specific CLI arguments.
|
|
29
|
+
* Accepts a positional <file-path>, --source <url>, and -h/--help.
|
|
30
|
+
* Unknown flags or conflicting args cause exit(1).
|
|
31
|
+
*/
|
|
32
|
+
function parseArgs(args) {
|
|
33
|
+
let help = false;
|
|
34
|
+
let source;
|
|
35
|
+
let filePath;
|
|
36
|
+
let i = 0;
|
|
37
|
+
while (i < args.length) {
|
|
38
|
+
const arg = args[i];
|
|
39
|
+
if (arg === '-h' || arg === '--help') {
|
|
40
|
+
help = true;
|
|
41
|
+
i++;
|
|
42
|
+
}
|
|
43
|
+
else if (arg === '--source') {
|
|
44
|
+
const value = args[++i];
|
|
45
|
+
if (value === undefined || value.startsWith('-')) {
|
|
46
|
+
console.error('Missing value for --source');
|
|
47
|
+
console.error(HELP_TEXT);
|
|
48
|
+
process.exit(1);
|
|
49
|
+
}
|
|
50
|
+
source = value;
|
|
51
|
+
i++;
|
|
52
|
+
}
|
|
53
|
+
else if (arg.startsWith('-')) {
|
|
54
|
+
console.error(`Unknown option: ${arg}`);
|
|
55
|
+
console.error(HELP_TEXT);
|
|
56
|
+
process.exit(1);
|
|
57
|
+
}
|
|
58
|
+
else {
|
|
59
|
+
// Positional argument: file-path
|
|
60
|
+
filePath = arg;
|
|
61
|
+
i++;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
const result = { help };
|
|
65
|
+
if (source !== undefined)
|
|
66
|
+
result.source = source;
|
|
67
|
+
if (filePath !== undefined)
|
|
68
|
+
result.filePath = filePath;
|
|
69
|
+
return result;
|
|
70
|
+
}
|
|
71
|
+
// ============================================
|
|
72
|
+
// Main Entry Point
|
|
73
|
+
// ============================================
|
|
74
|
+
/**
 * Tell whether a caught value is an Error carrying the 'ENOENT' code, i.e.
 * the file to remove was not there.
 */
function isMissingFileError(error) {
    return error instanceof Error && 'code' in error && error.code === 'ENOENT';
}
/**
 * Run the delete CLI subcommand.
 *
 * Removes the chunks stored for one target from the vector store, then
 * optimizes the store and writes a JSON result to stdout. The target is
 * either a file path (resolved to absolute and checked with validatePath)
 * or a source URL (mapped to its raw-data path). For raw-data targets the
 * raw-data file and its meta JSON file are also unlinked; a missing file is
 * not an error.
 *
 * Exit behavior: status 0 after printing help; status 1 on usage errors and
 * on any failure while deleting; `process.exitCode = 1` when the path is
 * rejected by validatePath.
 *
 * @param args - Arguments after "delete"
 * @param globalOptions - Global options parsed before the subcommand
 */
export async function runDelete(args, globalOptions = {}) {
    const { help, filePath, source } = parseArgs(args);
    if (help) {
        console.error(HELP_TEXT);
        process.exit(0);
    }
    // Exactly one of <file-path> / --source must be supplied.
    let usageError;
    if (!filePath && !source) {
        usageError = 'Either <file-path> or --source is required';
    }
    else if (filePath && source) {
        usageError = 'Cannot specify both <file-path> and --source';
    }
    if (usageError !== undefined) {
        console.error(usageError);
        console.error(HELP_TEXT);
        process.exit(1);
    }
    const globalConfig = resolveGlobalConfig(globalOptions);
    try {
        // Only the vector store is needed; delete does not embed anything.
        const store = createVectorStore(globalConfig);
        await store.initialize();
        let targetPath;
        if (source) {
            // Content ingested from a URL is keyed by its generated raw-data path.
            targetPath = generateRawDataPath(globalConfig.dbPath, source, 'markdown');
        }
        else {
            targetPath = resolve(filePath);
            const pathError = validatePath(targetPath, '<file-path>');
            if (pathError) {
                console.error(pathError);
                process.exitCode = 1;
                return;
            }
        }
        await store.deleteChunks(targetPath);
        if (isRawDataPath(targetPath)) {
            // Remove the raw-data file, then its meta JSON; each may already be gone.
            try {
                await unlink(targetPath);
            }
            catch (error) {
                if (!isMissingFileError(error)) {
                    throw error;
                }
            }
            try {
                await unlink(generateMetaJsonPath(targetPath));
            }
            catch (error) {
                if (!isMissingFileError(error)) {
                    throw error;
                }
            }
        }
        await store.optimize();
        // Machine-readable result goes to stdout.
        const payload = JSON.stringify({
            filePath: targetPath,
            deleted: true,
            timestamp: new Date().toISOString(),
        });
        process.stdout.write(payload);
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Error: ${reason}`);
        process.exit(1);
    }
}
|
|
164
|
+
//# sourceMappingURL=delete.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"delete.js","sourceRoot":"","sources":["../../src/cli/delete.ts"],"names":[],"mappings":"AAAA,6EAA6E;AAE7E,OAAO,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAA;AACzC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AACnC,OAAO,EACL,oBAAoB,EACpB,mBAAmB,EACnB,aAAa,GACd,MAAM,4BAA4B,CAAA;AACnC,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAAE,mBAAmB,EAAE,YAAY,EAAE,MAAM,cAAc,CAAA;AAEhE,+CAA+C;AAC/C,OAAO;AACP,+CAA+C;AAE/C,MAAM,SAAS,GAAG;;;;;;;;;;;;;;;;yCAgBuB,CAAA;AAYzC;;;;GAIG;AACH,SAAS,SAAS,CAAC,IAAc;IAC/B,IAAI,IAAI,GAAG,KAAK,CAAA;IAChB,IAAI,MAA0B,CAAA;IAC9B,IAAI,QAA4B,CAAA;IAEhC,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAE,CAAA;QAEpB,IAAI,GAAG,KAAK,IAAI,IAAI,GAAG,KAAK,QAAQ,EAAE,CAAC;YACrC,IAAI,GAAG,IAAI,CAAA;YACX,CAAC,EAAE,CAAA;QACL,CAAC;aAAM,IAAI,GAAG,KAAK,UAAU,EAAE,CAAC;YAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;YACvB,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBACjD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAA;gBAC3C,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;gBACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACjB,CAAC;YACD,MAAM,GAAG,KAAK,CAAA;YACd,CAAC,EAAE,CAAA;QACL,CAAC;aAAM,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/B,OAAO,CAAC,KAAK,CAAC,mBAAmB,GAAG,EAAE,CAAC,CAAA;YACvC,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;YACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACjB,CAAC;aAAM,CAAC;YACN,iCAAiC;YACjC,QAAQ,GAAG,GAAG,CAAA;YACd,CAAC,EAAE,CAAA;QACL,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAe,EAAE,IAAI,EAAE,CAAA;IACnC,IAAI,MAAM,KAAK,SAAS;QAAE,MAAM,CAAC,MAAM,GAAG,MAAM,CAAA;IAChD,IAAI,QAAQ,KAAK,SAAS;QAAE,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAA;IACtD,OAAO,MAAM,CAAA;AACf,CAAC;AAED,+CAA+C;AAC/C,mBAAmB;AACnB,+CAA+C;AAE/C;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,IAAc,EAAE,gBAA+B,EAAE;IAC/E,oBAAoB;IACpB,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAA;IAE9B,gBAAgB;IAChB,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,4DAA4D;IAC5D,IAAI,CAAC,MAAM,CAAC,QAAQ,IAAI,CAAC,MAAM,C
AAC,MAAM,EAAE,CAAC;QACvC,OAAO,CAAC,KAAK,CAAC,4CAA4C,CAAC,CAAA;QAC3D,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,IAAI,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;QACrC,OAAO,CAAC,KAAK,CAAC,8CAA8C,CAAC,CAAA;QAC7D,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,wBAAwB;IACxB,MAAM,YAAY,GAAG,mBAAmB,CAAC,aAAa,CAAC,CAAA;IAEvD,IAAI,CAAC;QACH,oEAAoE;QACpE,MAAM,WAAW,GAAG,iBAAiB,CAAC,YAAY,CAAC,CAAA;QACnD,MAAM,WAAW,CAAC,UAAU,EAAE,CAAA;QAE9B,6BAA6B;QAC7B,IAAI,UAAkB,CAAA;QAEtB,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClB,yCAAyC;YACzC,UAAU,GAAG,mBAAmB,CAAC,YAAY,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;QAClF,CAAC;aAAM,CAAC;YACN,8CAA8C;YAC9C,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,QAAS,CAAC,CAAA;YAEtC,sDAAsD;YACtD,MAAM,SAAS,GAAG,YAAY,CAAC,UAAU,EAAE,aAAa,CAAC,CAAA;YACzD,IAAI,SAAS,EAAE,CAAC;gBACd,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;gBACxB,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAA;gBACpB,OAAM;YACR,CAAC;QACH,CAAC;QAED,iCAAiC;QACjC,MAAM,WAAW,CAAC,YAAY,CAAC,UAAU,CAAC,CAAA;QAE1C,iDAAiD;QACjD,IAAI,aAAa,CAAC,UAAU,CAAC,EAAE,CAAC;YAC9B,IAAI,CAAC;gBACH,MAAM,MAAM,CAAC,UAAU,CAAC,CAAA;YAC1B,CAAC;YAAC,OAAO,KAAc,EAAE,CAAC;gBACxB,uDAAuD;gBACvD,IACE,CAAC,CAAC,KAAK,YAAY,KAAK,CAAC;oBACzB,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC;oBACjB,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAClD,CAAC;oBACD,MAAM,KAAK,CAAA;gBACb,CAAC;YACH,CAAC;YAED,IAAI,CAAC;gBACH,MAAM,MAAM,CAAC,oBAAoB,CAAC,UAAU,CAAC,CAAC,CAAA;YAChD,CAAC;YAAC,OAAO,KAAc,EAAE,CAAC;gBACxB,gBAAgB;gBAChB,IACE,CAAC,CAAC,KAAK,YAAY,KAAK,CAAC;oBACzB,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC;oBACjB,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAClD,CAAC;oBACD,MAAM,KAAK,CAAA;gBACb,CAAC;YACH,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,MAAM,WAAW,CAAC,QAAQ,EAAE,CAAA;QAE5B,+BAA+B;QAC/B,MAAM,MAAM,GAAG;YACb,QAAQ,EAAE,UAAU;YACpB,OAAO,EAAE,IAAI;YACb,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAA;QACD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAA;IAC9C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,MAAM,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO
,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QACrE,OAAO,CAAC,KAAK,CAAC,UAAU,MAAM,EAAE,CAAC,CAAA;QACjC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type { GlobalOptions, ResolvedGlobalConfig } from './options.js';
|
|
2
|
+
interface IngestConfig {
|
|
3
|
+
baseDir: string;
|
|
4
|
+
dbPath: string;
|
|
5
|
+
cacheDir: string;
|
|
6
|
+
modelName: string;
|
|
7
|
+
maxFileSize: number;
|
|
8
|
+
}
|
|
9
|
+
interface IngestCliOptions {
|
|
10
|
+
baseDir?: string | undefined;
|
|
11
|
+
maxFileSize?: number | undefined;
|
|
12
|
+
}
|
|
13
|
+
interface ParsedArgs {
|
|
14
|
+
positional: string | undefined;
|
|
15
|
+
options: IngestCliOptions;
|
|
16
|
+
help: boolean;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Parse ingest-specific CLI arguments into options and a positional path.
|
|
20
|
+
* Flags: --base-dir, --max-file-size, -h/--help
|
|
21
|
+
* Unknown flags (including global flags passed after subcommand) cause an error.
|
|
22
|
+
*/
|
|
23
|
+
export declare function parseArgs(args: string[]): ParsedArgs;
|
|
24
|
+
/**
|
|
25
|
+
* Resolve ingest config by merging global config with ingest-specific options.
|
|
26
|
+
* Ingest-specific: baseDir, maxFileSize (CLI flags > env vars > defaults).
|
|
27
|
+
* Validates all resolved values before returning.
|
|
28
|
+
*/
|
|
29
|
+
export declare function resolveConfig(globalConfig: ResolvedGlobalConfig, ingestOptions?: IngestCliOptions): IngestConfig;
|
|
30
|
+
/**
|
|
31
|
+
* Run the ingest CLI subcommand.
|
|
32
|
+
* @param args - Arguments after "ingest" (e.g., option flags and file/directory path)
|
|
33
|
+
* @param globalOptions - Global options parsed before the subcommand
|
|
34
|
+
*/
|
|
35
|
+
export declare function runIngest(args: string[], globalOptions?: GlobalOptions): Promise<void>;
|
|
36
|
+
export {};
|
|
37
|
+
//# sourceMappingURL=ingest.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../../src/cli/ingest.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAA;AAavE,UAAU,YAAY;IACpB,OAAO,EAAE,MAAM,CAAA;IACf,MAAM,EAAE,MAAM,CAAA;IACd,QAAQ,EAAE,MAAM,CAAA;IAChB,SAAS,EAAE,MAAM,CAAA;IACjB,WAAW,EAAE,MAAM,CAAA;CACpB;AAQD,UAAU,gBAAgB;IACxB,OAAO,CAAC,EAAE,MAAM,GAAG,SAAS,CAAA;IAC5B,WAAW,CAAC,EAAE,MAAM,GAAG,SAAS,CAAA;CACjC;AAED,UAAU,UAAU;IAClB,UAAU,EAAE,MAAM,GAAG,SAAS,CAAA;IAC9B,OAAO,EAAE,gBAAgB,CAAA;IACzB,IAAI,EAAE,OAAO,CAAA;CACd;AAgCD;;;;GAIG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,UAAU,CAqDpD;AAiBD;;;;GAIG;AACH,wBAAgB,aAAa,CAC3B,YAAY,EAAE,oBAAoB,EAClC,aAAa,GAAE,gBAAqB,GACnC,YAAY,CA8Bd;AAuJD;;;;GAIG;AACH,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,aAAa,GAAE,aAAkB,GAAG,OAAO,CAAC,IAAI,CAAC,CA0FhG"}
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
// CLI ingest subcommand — bulk file ingestion with single optimize() at end
|
|
2
|
+
import { randomUUID } from 'node:crypto';
|
|
3
|
+
import { opendir, stat } from 'node:fs/promises';
|
|
4
|
+
import { basename, extname, join, resolve, sep } from 'node:path';
|
|
5
|
+
import { SemanticChunker } from '../chunker/index.js';
|
|
6
|
+
import { DocumentParser, SUPPORTED_EXTENSIONS } from '../parser/index.js';
|
|
7
|
+
import { createEmbedder, createVectorStore } from './common.js';
|
|
8
|
+
import { resolveGlobalConfig, validateMaxFileSize, validatePath } from './options.js';
|
|
9
|
+
// ============================================
|
|
10
|
+
// Constants
|
|
11
|
+
// ============================================
|
|
12
|
+
// Directory depth limit for collectFiles: the ingest root is depth 0, and any
// directory whose depth is >= MAX_DEPTH is skipped (with a warning).
const MAX_DEPTH = 10;
|
|
13
|
+
// ============================================
|
|
14
|
+
// Defaults
|
|
15
|
+
// ============================================
|
|
16
|
+
// Fallback values for ingest-specific settings, used when neither a CLI flag
// nor an environment variable provides one.
const INGEST_DEFAULTS = {
    // Maximum file size in bytes (100 MiB).
    maxFileSize: 100 * 1024 * 1024,
};
|
|
19
|
+
// ============================================
|
|
20
|
+
// Help
|
|
21
|
+
// ============================================
|
|
22
|
+
const HELP_TEXT = `Usage: mcp-local-rag [global-options] ingest [options] <path>
|
|
23
|
+
|
|
24
|
+
Ingest a single file or all supported files under a directory.
|
|
25
|
+
|
|
26
|
+
Options:
|
|
27
|
+
--base-dir <path> Base directory for documents (default: cwd)
|
|
28
|
+
--max-file-size <n> Max file size in bytes (default: ${INGEST_DEFAULTS.maxFileSize})
|
|
29
|
+
-h, --help Show this help
|
|
30
|
+
|
|
31
|
+
Global options (must appear before "ingest"):
|
|
32
|
+
--db-path <path> LanceDB database path
|
|
33
|
+
--cache-dir <path> Model cache directory
|
|
34
|
+
--model-name <name> Embedding model`;
|
|
35
|
+
// ============================================
|
|
36
|
+
// Arg Parsing
|
|
37
|
+
// ============================================
|
|
38
|
+
/**
|
|
39
|
+
* Parse ingest-specific CLI arguments into options and a positional path.
|
|
40
|
+
* Flags: --base-dir, --max-file-size, -h/--help
|
|
41
|
+
* Unknown flags (including global flags passed after subcommand) cause an error.
|
|
42
|
+
*/
|
|
43
|
+
export function parseArgs(args) {
|
|
44
|
+
const options = {};
|
|
45
|
+
let positional;
|
|
46
|
+
let help = false;
|
|
47
|
+
let i = 0;
|
|
48
|
+
while (i < args.length) {
|
|
49
|
+
const arg = args[i];
|
|
50
|
+
switch (arg) {
|
|
51
|
+
case '-h':
|
|
52
|
+
case '--help':
|
|
53
|
+
help = true;
|
|
54
|
+
i++;
|
|
55
|
+
break;
|
|
56
|
+
case '--base-dir': {
|
|
57
|
+
const value = args[++i];
|
|
58
|
+
if (value === undefined || value.startsWith('-')) {
|
|
59
|
+
console.error('Missing value for --base-dir');
|
|
60
|
+
process.exit(1);
|
|
61
|
+
}
|
|
62
|
+
options.baseDir = value;
|
|
63
|
+
i++;
|
|
64
|
+
break;
|
|
65
|
+
}
|
|
66
|
+
case '--max-file-size': {
|
|
67
|
+
const raw = args[++i];
|
|
68
|
+
if (raw === undefined || raw.startsWith('-')) {
|
|
69
|
+
console.error('Missing value for --max-file-size');
|
|
70
|
+
process.exit(1);
|
|
71
|
+
}
|
|
72
|
+
const parsed = Number.parseInt(raw, 10);
|
|
73
|
+
options.maxFileSize = Number.isNaN(parsed) ? undefined : parsed;
|
|
74
|
+
i++;
|
|
75
|
+
break;
|
|
76
|
+
}
|
|
77
|
+
default:
|
|
78
|
+
if (arg.startsWith('-')) {
|
|
79
|
+
console.error(`Unknown option: ${arg}`);
|
|
80
|
+
console.error(HELP_TEXT);
|
|
81
|
+
process.exit(1);
|
|
82
|
+
}
|
|
83
|
+
if (positional !== undefined) {
|
|
84
|
+
console.error(`Unexpected argument: ${arg}`);
|
|
85
|
+
console.error('Only one path is accepted. Use a directory to ingest multiple files.');
|
|
86
|
+
process.exit(1);
|
|
87
|
+
}
|
|
88
|
+
positional = arg;
|
|
89
|
+
i++;
|
|
90
|
+
break;
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
return { positional, options, help };
|
|
94
|
+
}
|
|
95
|
+
// ============================================
|
|
96
|
+
// NaN Defense
|
|
97
|
+
// ============================================
|
|
98
|
+
/**
|
|
99
|
+
* Ensure maxFileSize is a valid number, falling back to default if NaN.
|
|
100
|
+
*/
|
|
101
|
+
function sanitizeMaxFileSize(value) {
|
|
102
|
+
return Number.isNaN(value) ? INGEST_DEFAULTS.maxFileSize : value;
|
|
103
|
+
}
|
|
104
|
+
// ============================================
|
|
105
|
+
// Config Resolution
|
|
106
|
+
// ============================================
|
|
107
|
+
/**
 * Combine the resolved global config with the ingest-only settings.
 *
 * Precedence for each ingest-only setting is CLI flag, then environment
 * variable, then default:
 * - baseDir: `--base-dir`, then BASE_DIR, then the current working directory
 * - maxFileSize: `--max-file-size`, then MAX_FILE_SIZE (parsed as a base-10
 *   integer), then the built-in default; a NaN result falls back to the
 *   default
 *
 * Both values are validated; a validation message is printed to stderr and
 * the process exits with status 1.
 *
 * @param globalConfig - Resolved global config (dbPath, cacheDir, modelName)
 * @param ingestOptions - Options parsed from the ingest command line
 * @returns The merged ingest configuration
 */
export function resolveConfig(globalConfig, ingestOptions = {}) {
    const baseDir = ingestOptions.baseDir ?? process.env['BASE_DIR'] ?? process.cwd();
    // Only consulted when no CLI value was given.
    const maxFileSizeFromEnv = () => {
        const raw = process.env['MAX_FILE_SIZE'];
        return raw ? Number.parseInt(raw, 10) : INGEST_DEFAULTS.maxFileSize;
    };
    const maxFileSize = sanitizeMaxFileSize(ingestOptions.maxFileSize ?? maxFileSizeFromEnv());
    // Validators return a message on failure; report it and stop.
    const abortOn = (problem) => {
        if (problem) {
            console.error(problem);
            process.exit(1);
        }
    };
    abortOn(validatePath(baseDir, '--base-dir'));
    abortOn(validateMaxFileSize(maxFileSize));
    const { dbPath, cacheDir, modelName } = globalConfig;
    return { dbPath, cacheDir, modelName, baseDir, maxFileSize };
}
|
|
138
|
+
// ============================================
|
|
139
|
+
// File Collection
|
|
140
|
+
// ============================================
|
|
141
|
+
/**
 * Build the list of files to ingest for a path.
 *
 * - A file: returned as a one-element list when its extension is supported;
 *   otherwise a message is printed and the list is empty.
 * - A directory: walked breadth-first. Directories at depth >= MAX_DEPTH are
 *   not read (a single warning is printed at the end), unreadable
 *   directories are skipped with a warning, and symlinks and anything under
 *   an excluded prefix are ignored. The result is sorted.
 * - Anything else: empty list.
 *
 * @param targetPath - File or directory to scan
 * @param excludePaths - Path prefixes whose contents must not be collected
 * @returns Absolute paths of the supported files found
 */
async function collectFiles(targetPath, excludePaths) {
    const root = resolve(targetPath);
    const rootInfo = await stat(root);
    if (rootInfo.isFile()) {
        const ext = extname(root).toLowerCase();
        if (SUPPORTED_EXTENSIONS.has(ext)) {
            return [root];
        }
        console.error(`Unsupported file extension: ${ext} (supported: ${[...SUPPORTED_EXTENSIONS].join(', ')})`);
        return [];
    }
    if (!rootInfo.isDirectory()) {
        return [];
    }
    const found = [];
    let hitDepthLimit = false;
    // FIFO work list; `head` advances instead of removing entries.
    const pending = [{ dirPath: root, depth: 0 }];
    for (let head = 0; head < pending.length; head++) {
        const { dirPath, depth } = pending[head];
        if (depth >= MAX_DEPTH) {
            hitDepthLimit = true;
            continue;
        }
        let dir;
        try {
            dir = await opendir(dirPath);
        }
        catch {
            console.error(`Warning: cannot read directory: ${dirPath}`);
            continue;
        }
        for await (const entry of dir) {
            const fullPath = join(dirPath, entry.name);
            const skipped = !fullPath.startsWith(root) ||
                entry.isSymbolicLink() ||
                excludePaths.some((ep) => fullPath.startsWith(ep));
            if (skipped) {
                continue;
            }
            if (entry.isDirectory()) {
                pending.push({ dirPath: fullPath, depth: depth + 1 });
            }
            else if (entry.isFile() && SUPPORTED_EXTENSIONS.has(extname(entry.name).toLowerCase())) {
                found.push(fullPath);
            }
        }
    }
    if (hitDepthLimit) {
        console.error(`Warning: some directories were skipped because they exceed the maximum depth (${MAX_DEPTH})`);
    }
    return found.sort();
}
|
|
199
|
+
// ============================================
|
|
200
|
+
// Per-file Ingestion
|
|
201
|
+
// ============================================
|
|
202
|
+
/**
|
|
203
|
+
* Ingest a single file: parse, chunk, embed, delete old chunks, insert new chunks.
|
|
204
|
+
* Returns the number of chunks inserted.
|
|
205
|
+
*/
|
|
206
|
+
async function ingestSingleFile(filePath, parser, chunker, embedder, vectorStore) {
|
|
207
|
+
// Parse file
|
|
208
|
+
const isPdf = filePath.toLowerCase().endsWith('.pdf');
|
|
209
|
+
let text;
|
|
210
|
+
let title = null;
|
|
211
|
+
if (isPdf) {
|
|
212
|
+
const result = await parser.parsePdf(filePath, embedder);
|
|
213
|
+
text = result.content;
|
|
214
|
+
title = result.title || null;
|
|
215
|
+
}
|
|
216
|
+
else {
|
|
217
|
+
const result = await parser.parseFile(filePath);
|
|
218
|
+
text = result.content;
|
|
219
|
+
title = result.title || null;
|
|
220
|
+
}
|
|
221
|
+
// Chunk text
|
|
222
|
+
const chunks = await chunker.chunkText(text, embedder);
|
|
223
|
+
if (chunks.length === 0) {
|
|
224
|
+
console.error(` Warning: 0 chunks generated (file may be empty or too short)`);
|
|
225
|
+
return 0;
|
|
226
|
+
}
|
|
227
|
+
// Generate embeddings
|
|
228
|
+
const embeddings = await embedder.embedBatch(chunks.map((c) => c.text));
|
|
229
|
+
// Delete existing chunks for this file
|
|
230
|
+
await vectorStore.deleteChunks(filePath);
|
|
231
|
+
// Build vector chunks
|
|
232
|
+
const timestamp = new Date().toISOString();
|
|
233
|
+
const vectorChunks = chunks.map((chunk, index) => {
|
|
234
|
+
const embedding = embeddings[index];
|
|
235
|
+
if (!embedding) {
|
|
236
|
+
throw new Error(`Missing embedding for chunk ${index}`);
|
|
237
|
+
}
|
|
238
|
+
return {
|
|
239
|
+
id: randomUUID(),
|
|
240
|
+
filePath,
|
|
241
|
+
chunkIndex: chunk.index,
|
|
242
|
+
text: chunk.text,
|
|
243
|
+
vector: embedding,
|
|
244
|
+
metadata: {
|
|
245
|
+
fileName: filePath.split('/').pop() || filePath,
|
|
246
|
+
fileSize: text.length,
|
|
247
|
+
fileType: filePath.split('.').pop() || '',
|
|
248
|
+
},
|
|
249
|
+
fileTitle: title,
|
|
250
|
+
timestamp,
|
|
251
|
+
};
|
|
252
|
+
});
|
|
253
|
+
// Insert chunks
|
|
254
|
+
await vectorStore.insertChunks(vectorChunks);
|
|
255
|
+
return vectorChunks.length;
|
|
256
|
+
}
|
|
257
|
+
// ============================================
|
|
258
|
+
// Main Entry Point
|
|
259
|
+
// ============================================
|
|
260
|
+
/**
 * Run the ingest CLI subcommand.
 *
 * Collects the supported files under the given path, ingests them one by one
 * with a single parser, chunker, embedder and vector store, optimizes the
 * store once at the end, and prints a summary. All output goes to stderr.
 *
 * Exit behavior: status 0 after printing help; status 1 when no path is given,
 * the path does not exist, or no supported files are found. A failure in one
 * file does not stop the run; `process.exitCode` is set to 1 if any file failed.
 *
 * @param args - Arguments after "ingest" (e.g., option flags and file/directory path)
 * @param globalOptions - Global options parsed before the subcommand
 */
export async function runIngest(args, globalOptions = {}) {
    // Parse CLI options
    const { positional, options, help } = parseArgs(args);
    // Handle --help
    if (help) {
        console.error(HELP_TEXT);
        process.exit(0);
    }
    // Validate positional argument
    if (!positional) {
        console.error('Usage: mcp-local-rag ingest [options] <path>');
        console.error(' Ingest a single file or all supported files under a directory.');
        console.error(' Run with --help for all options.');
        process.exit(1);
    }
    const targetPath = positional;
    // Validate path exists
    try {
        await stat(targetPath);
    }
    catch {
        console.error(`Error: path does not exist: ${targetPath}`);
        process.exit(1);
    }
    // Resolve config: CLI flags > env vars > defaults
    const globalConfig = resolveGlobalConfig(globalOptions);
    const config = resolveConfig(globalConfig, options);
    // Never ingest the tool's own database or model cache. The prefixes end in
    // the platform separator because collectFiles compares them against paths
    // built by join(): a hard-coded '/' never matches on Windows.
    const excludePaths = [config.dbPath, config.cacheDir].map((dir) => `${resolve(dir)}${sep}`);
    // Collect files
    const files = await collectFiles(targetPath, excludePaths);
    if (files.length === 0) {
        console.error('No supported files found.');
        process.exit(1);
    }
    console.error(`Found ${files.length} file(s) to ingest.`);
    // Initialize components (single instances reused across all files)
    const parser = new DocumentParser({
        baseDir: config.baseDir,
        maxFileSize: config.maxFileSize,
    });
    const chunker = new SemanticChunker();
    const embedder = createEmbedder(globalConfig);
    const vectorStore = createVectorStore(globalConfig);
    await vectorStore.initialize();
    // Process each file; one failure is recorded and the run continues
    const summary = { succeeded: 0, failed: 0, totalChunks: 0 };
    for (let i = 0; i < files.length; i++) {
        const filePath = files[i];
        const label = `[${i + 1}/${files.length}]`;
        try {
            const chunkCount = await ingestSingleFile(filePath, parser, chunker, embedder, vectorStore);
            if (chunkCount === 0) {
                // 0 chunks is a skip/warning, not a failure
                console.error(`${label} ${filePath} ... SKIPPED (0 chunks)`);
                summary.succeeded++;
            }
            else {
                console.error(`${label} ${filePath} ... OK (${chunkCount} chunks)`);
                summary.succeeded++;
                summary.totalChunks += chunkCount;
            }
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            console.error(`${label} ${filePath} ... FAILED: ${reason}`);
            summary.failed++;
        }
    }
    // Optimize once at end (not per-file)
    await vectorStore.optimize();
    // Print summary
    console.error('');
    console.error('--- Ingest Summary ---');
    console.error(`Succeeded: ${summary.succeeded}`);
    console.error(`Failed: ${summary.failed}`);
    console.error(`Total chunks: ${summary.totalChunks}`);
    if (summary.failed > 0) {
        process.exitCode = 1;
    }
}
|
|
345
|
+
//# sourceMappingURL=ingest.js.map
|