mcp-local-rag 0.8.2 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/README.md +20 -0
  2. package/dist/cli/common.d.ts +14 -0
  3. package/dist/cli/common.d.ts.map +1 -0
  4. package/dist/cli/common.js +25 -0
  5. package/dist/cli/common.js.map +1 -0
  6. package/dist/cli/delete.d.ts +8 -0
  7. package/dist/cli/delete.d.ts.map +1 -0
  8. package/dist/cli/delete.js +164 -0
  9. package/dist/cli/delete.js.map +1 -0
  10. package/dist/cli/ingest.d.ts +37 -0
  11. package/dist/cli/ingest.d.ts.map +1 -0
  12. package/dist/cli/ingest.js +345 -0
  13. package/dist/cli/ingest.js.map +1 -0
  14. package/dist/cli/list.d.ts +23 -0
  15. package/dist/cli/list.d.ts.map +1 -0
  16. package/dist/cli/list.js +142 -0
  17. package/dist/cli/list.js.map +1 -0
  18. package/dist/cli/options.d.ts +49 -0
  19. package/dist/cli/options.d.ts.map +1 -0
  20. package/dist/cli/options.js +193 -0
  21. package/dist/cli/options.js.map +1 -0
  22. package/dist/cli/query.d.ts +23 -0
  23. package/dist/cli/query.d.ts.map +1 -0
  24. package/dist/cli/query.js +162 -0
  25. package/dist/cli/query.js.map +1 -0
  26. package/dist/cli/status.d.ts +8 -0
  27. package/dist/cli/status.d.ts.map +1 -0
  28. package/dist/cli/status.js +78 -0
  29. package/dist/cli/status.js.map +1 -0
  30. package/dist/cli-main.d.ts +4 -2
  31. package/dist/cli-main.d.ts.map +1 -1
  32. package/dist/cli-main.js +24 -3
  33. package/dist/cli-main.js.map +1 -1
  34. package/dist/embedder/index.d.ts.map +1 -1
  35. package/dist/embedder/index.js +3 -1
  36. package/dist/embedder/index.js.map +1 -1
  37. package/dist/index.js +8 -4
  38. package/dist/index.js.map +1 -1
  39. package/dist/server/index.js +1 -1
  40. package/dist/server/index.js.map +1 -1
  41. package/dist/server/types.d.ts +1 -1
  42. package/dist/server/types.d.ts.map +1 -1
  43. package/dist/utils/raw-data-utils.d.ts +116 -0
  44. package/dist/utils/raw-data-utils.d.ts.map +1 -0
  45. package/dist/utils/raw-data-utils.js +202 -0
  46. package/dist/utils/raw-data-utils.js.map +1 -0
  47. package/package.json +3 -4
  48. package/skills/mcp-local-rag/SKILL.md +29 -13
  49. package/skills/mcp-local-rag/references/cli-reference.md +77 -0
  50. package/skills/mcp-local-rag/references/html-ingestion.md +11 -7
  51. package/skills/mcp-local-rag/references/query-optimization.md +1 -1
package/README.md CHANGED
@@ -119,6 +119,25 @@ HTML is automatically cleaned—you get the article content, not the boilerplate
119
119
 
120
120
  > **Note:** The RAG server itself doesn't fetch web content—your AI assistant retrieves it and passes the HTML to `ingest_data`. This keeps the server fully local while letting you index any content your assistant can access. Please respect website terms of service and copyright when ingesting external content.
121
121
 
122
+ ### CLI Commands
123
+
124
+ All MCP tools are also available as CLI commands — no MCP server needed:
125
+
126
+ ```bash
127
+ npx mcp-local-rag ingest ./docs/ # Bulk ingest files
128
+ npx mcp-local-rag query "authentication API" # Search documents
129
+ npx mcp-local-rag list # Show ingestion status
130
+ npx mcp-local-rag status # Database stats
131
+ npx mcp-local-rag delete ./docs/old.pdf # Remove content
132
+ npx mcp-local-rag delete --source "https://..." # Remove by source URL
133
+ ```
134
+
135
+ Global options (`--db-path`, `--cache-dir`, `--model-name`) go before the subcommand. Run `npx mcp-local-rag --help` for details.
136
+
137
+ `query`, `list`, `status`, and `delete` output JSON to stdout for piping (e.g., `| jq`). `ingest` outputs progress to stderr.
138
+
139
+ > ⚠️ **CLI options must match your MCP server config.** Especially `--model-name` — using a different embedding model against an existing database produces incompatible vectors, silently degrading search quality.
140
+
122
141
  ### Searching Documents
123
142
 
124
143
  ```
@@ -407,6 +426,7 @@ pnpm run check:all # Full quality check
407
426
  src/
408
427
  index.ts # Entry point
409
428
  server/ # MCP tool handlers
429
+ cli/ # CLI subcommands (ingest, query, list, status, delete)
410
430
  parser/ # PDF, DOCX, TXT, MD parsing
411
431
  chunker/ # Text splitting
412
432
  embedder/ # Transformers.js embeddings
@@ -0,0 +1,14 @@
1
+ import { Embedder } from '../embedder/index.js';
2
+ import { VectorStore } from '../vectordb/index.js';
3
+ import type { ResolvedGlobalConfig } from './options.js';
4
+ /**
5
+ * Create an uninitialized VectorStore from resolved global config.
6
+ * Callers are responsible for calling initialize() before use.
7
+ */
8
+ export declare function createVectorStore(config: ResolvedGlobalConfig): VectorStore;
9
+ /**
10
+ * Create an uninitialized Embedder from resolved global config.
11
+ * Callers are responsible for managing the Embedder lifecycle.
12
+ */
13
+ export declare function createEmbedder(config: ResolvedGlobalConfig): Embedder;
14
+ //# sourceMappingURL=common.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"common.d.ts","sourceRoot":"","sources":["../../src/cli/common.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAA;AAC/C,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAClD,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAA;AAExD;;;GAGG;AACH,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,oBAAoB,GAAG,WAAW,CAK3E;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,oBAAoB,GAAG,QAAQ,CAMrE"}
@@ -0,0 +1,25 @@
1
+ // Shared CLI component helpers — factory functions for VectorStore and Embedder
2
+ import { Embedder } from '../embedder/index.js';
3
+ import { VectorStore } from '../vectordb/index.js';
4
/**
 * Build a VectorStore pointed at the configured database path.
 *
 * The store always targets the 'chunks' table. It is returned uninitialized:
 * the caller must invoke initialize() on it before use.
 *
 * @param config - Resolved global config; only `dbPath` is read here
 * @returns A new, uninitialized VectorStore
 */
export function createVectorStore(config) {
    const storeOptions = {
        dbPath: config.dbPath,
        tableName: 'chunks',
    };
    return new VectorStore(storeOptions);
}
14
/**
 * Build an Embedder from the configured model name and cache directory.
 *
 * The batch size is fixed at 16. The instance is returned as constructed;
 * the caller owns its lifecycle.
 *
 * @param config - Resolved global config; `modelName` and `cacheDir` are read
 * @returns A new Embedder
 */
export function createEmbedder(config) {
    const { modelName, cacheDir } = config;
    const embedderOptions = {
        modelPath: modelName,
        batchSize: 16,
        cacheDir,
    };
    return new Embedder(embedderOptions);
}
25
+ //# sourceMappingURL=common.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"common.js","sourceRoot":"","sources":["../../src/cli/common.ts"],"names":[],"mappings":"AAAA,gFAAgF;AAEhF,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAA;AAC/C,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAA;AAGlD;;;GAGG;AACH,MAAM,UAAU,iBAAiB,CAAC,MAA4B;IAC5D,OAAO,IAAI,WAAW,CAAC;QACrB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,SAAS,EAAE,QAAQ;KACpB,CAAC,CAAA;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,MAA4B;IACzD,OAAO,IAAI,QAAQ,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,EAAE;QACb,QAAQ,EAAE,MAAM,CAAC,QAAQ;KAC1B,CAAC,CAAA;AACJ,CAAC"}
@@ -0,0 +1,8 @@
1
+ import type { GlobalOptions } from './options.js';
2
+ /**
3
+ * Run the delete CLI subcommand.
4
+ * @param args - Arguments after "delete"
5
+ * @param globalOptions - Global options parsed before the subcommand
6
+ */
7
+ export declare function runDelete(args: string[], globalOptions?: GlobalOptions): Promise<void>;
8
+ //# sourceMappingURL=delete.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"delete.d.ts","sourceRoot":"","sources":["../../src/cli/delete.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,cAAc,CAAA;AAkFjD;;;;GAIG;AACH,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,aAAa,GAAE,aAAkB,GAAG,OAAO,CAAC,IAAI,CAAC,CAiGhG"}
@@ -0,0 +1,164 @@
1
+ // CLI delete subcommand — delete ingested content by file path or source URL
2
+ import { unlink } from 'node:fs/promises';
3
+ import { resolve } from 'node:path';
4
+ import { generateMetaJsonPath, generateRawDataPath, isRawDataPath, } from '../utils/raw-data-utils.js';
5
+ import { createVectorStore } from './common.js';
6
+ import { resolveGlobalConfig, validatePath } from './options.js';
7
+ // ============================================
8
+ // Help
9
+ // ============================================
10
+ const HELP_TEXT = `Usage: mcp-local-rag [global-options] delete [--source <url>] [<file-path>]
11
+
12
+ Delete ingested content by file path or source URL.
13
+
14
+ Either <file-path> or --source is required (not both).
15
+
16
+ Arguments:
17
+ <file-path> File path of ingested content to delete
18
+
19
+ Options:
20
+ --source <url> Delete by source URL (for content ingested via ingest_data)
21
+ -h, --help Show this help
22
+
23
+ Global options (must appear before "delete"):
24
+ --db-path <path> LanceDB database path
25
+ --cache-dir <path> Model cache directory
26
+ --model-name <name> Embedding model`;
27
/**
 * Parse delete-specific CLI arguments.
 *
 * Recognized input: a single positional <file-path>, `--source <url>`, and
 * `-h` / `--help`.
 *
 * On a usage error the message and the help text are written to stderr and the
 * process exits with status 1. Usage errors are:
 *   - an unknown flag (anything else starting with '-'),
 *   - `--source` without a value (or followed by another flag),
 *   - more than one positional path. Previously a later path silently replaced
 *     an earlier one, so `delete a.pdf b.pdf` would delete only b.pdf.
 *
 * @param args - Arguments that follow the "delete" subcommand
 * @returns `{ help }`, plus `source` and/or `filePath` only when supplied
 */
function parseArgs(args) {
    let help = false;
    let source;
    let filePath;
    // Report a usage error together with the help text, then terminate.
    const fail = (message) => {
        console.error(message);
        console.error(HELP_TEXT);
        process.exit(1);
    };
    let i = 0;
    while (i < args.length) {
        const arg = args[i];
        if (arg === '-h' || arg === '--help') {
            help = true;
            i++;
        }
        else if (arg === '--source') {
            // The value is the next token; a token starting with '-' is treated as a flag, not a value.
            const value = args[++i];
            if (value === undefined || value.startsWith('-')) {
                fail('Missing value for --source');
            }
            source = value;
            i++;
        }
        else if (arg.startsWith('-')) {
            fail(`Unknown option: ${arg}`);
            i++;
        }
        else if (filePath !== undefined) {
            // Deleting is destructive: refuse ambiguous input instead of guessing which path was meant.
            fail(`Unexpected argument: ${arg}`);
            i++;
        }
        else {
            // Positional argument: file-path
            filePath = arg;
            i++;
        }
    }
    // Only attach optional keys that were actually provided.
    const result = { help };
    if (source !== undefined)
        result.source = source;
    if (filePath !== undefined)
        result.filePath = filePath;
    return result;
}
71
+ // ============================================
72
+ // Main Entry Point
73
+ // ============================================
74
/**
 * Remove a file, treating "file does not exist" (ENOENT) as success.
 * Any other failure is rethrown to the caller.
 */
async function unlinkIgnoringMissing(targetFile) {
    try {
        await unlink(targetFile);
    }
    catch (error) {
        if (!(error instanceof Error) ||
            !('code' in error) ||
            error.code !== 'ENOENT') {
            throw error;
        }
    }
}
/**
 * Run the delete CLI subcommand.
 *
 * Flow:
 *   1. Parse arguments; `--help` prints the help text to stderr and exits 0.
 *   2. Require exactly one of <file-path> or --source (exit 1 otherwise).
 *   3. Work out the target path and validate it BEFORE opening the database,
 *      so a rejected path never touches the vector store.
 *   4. Delete the chunks for that path; if it is a raw-data path, also remove
 *      the raw-data file and its meta JSON (missing files are ignored).
 *   5. Optimize the store and write a JSON result to stdout (no trailing newline).
 *
 * Any error thrown in steps 3-5 is reported on stderr as `Error: <message>`
 * and the process exits with status 1.
 *
 * @param args - Arguments after "delete"
 * @param globalOptions - Global options parsed before the subcommand
 */
export async function runDelete(args, globalOptions = {}) {
    // Parse CLI options
    const parsed = parseArgs(args);
    // Handle --help
    if (parsed.help) {
        console.error(HELP_TEXT);
        process.exit(0);
    }
    // Validate: either file-path or --source required, not both
    if (!parsed.filePath && !parsed.source) {
        console.error('Either <file-path> or --source is required');
        console.error(HELP_TEXT);
        process.exit(1);
    }
    if (parsed.filePath && parsed.source) {
        console.error('Cannot specify both <file-path> and --source');
        console.error(HELP_TEXT);
        process.exit(1);
    }
    // Resolve global config
    const globalConfig = resolveGlobalConfig(globalOptions);
    try {
        // Determine target file path first: this needs no database access.
        let targetPath;
        if (parsed.source) {
            // Generate raw-data path from source URL
            targetPath = generateRawDataPath(globalConfig.dbPath, parsed.source, 'markdown');
        }
        else {
            // Use provided file path, resolve to absolute
            targetPath = resolve(parsed.filePath);
            // validatePath yields an error message when the path is rejected
            // (per the original note: sensitive system directories).
            const pathError = validatePath(targetPath, '<file-path>');
            if (pathError) {
                console.error(pathError);
                process.exitCode = 1;
                return;
            }
        }
        // Create and initialize VectorStore (no Embedder needed for delete)
        const vectorStore = createVectorStore(globalConfig);
        await vectorStore.initialize();
        // Delete chunks from VectorStore
        await vectorStore.deleteChunks(targetPath);
        // Clean up physical raw-data files if applicable
        if (isRawDataPath(targetPath)) {
            await unlinkIgnoringMissing(targetPath);
            await unlinkIgnoringMissing(generateMetaJsonPath(targetPath));
        }
        // Optimize VectorStore after deletion
        await vectorStore.optimize();
        // Output result JSON to stdout
        const result = {
            filePath: targetPath,
            deleted: true,
            timestamp: new Date().toISOString(),
        };
        process.stdout.write(JSON.stringify(result));
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Error: ${reason}`);
        process.exit(1);
    }
}
164
+ //# sourceMappingURL=delete.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"delete.js","sourceRoot":"","sources":["../../src/cli/delete.ts"],"names":[],"mappings":"AAAA,6EAA6E;AAE7E,OAAO,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAA;AACzC,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AACnC,OAAO,EACL,oBAAoB,EACpB,mBAAmB,EACnB,aAAa,GACd,MAAM,4BAA4B,CAAA;AACnC,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAAE,mBAAmB,EAAE,YAAY,EAAE,MAAM,cAAc,CAAA;AAEhE,+CAA+C;AAC/C,OAAO;AACP,+CAA+C;AAE/C,MAAM,SAAS,GAAG;;;;;;;;;;;;;;;;yCAgBuB,CAAA;AAYzC;;;;GAIG;AACH,SAAS,SAAS,CAAC,IAAc;IAC/B,IAAI,IAAI,GAAG,KAAK,CAAA;IAChB,IAAI,MAA0B,CAAA;IAC9B,IAAI,QAA4B,CAAA;IAEhC,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,OAAO,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAE,CAAA;QAEpB,IAAI,GAAG,KAAK,IAAI,IAAI,GAAG,KAAK,QAAQ,EAAE,CAAC;YACrC,IAAI,GAAG,IAAI,CAAA;YACX,CAAC,EAAE,CAAA;QACL,CAAC;aAAM,IAAI,GAAG,KAAK,UAAU,EAAE,CAAC;YAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;YACvB,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBACjD,OAAO,CAAC,KAAK,CAAC,4BAA4B,CAAC,CAAA;gBAC3C,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;gBACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACjB,CAAC;YACD,MAAM,GAAG,KAAK,CAAA;YACd,CAAC,EAAE,CAAA;QACL,CAAC;aAAM,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/B,OAAO,CAAC,KAAK,CAAC,mBAAmB,GAAG,EAAE,CAAC,CAAA;YACvC,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;YACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACjB,CAAC;aAAM,CAAC;YACN,iCAAiC;YACjC,QAAQ,GAAG,GAAG,CAAA;YACd,CAAC,EAAE,CAAA;QACL,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAe,EAAE,IAAI,EAAE,CAAA;IACnC,IAAI,MAAM,KAAK,SAAS;QAAE,MAAM,CAAC,MAAM,GAAG,MAAM,CAAA;IAChD,IAAI,QAAQ,KAAK,SAAS;QAAE,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAA;IACtD,OAAO,MAAM,CAAA;AACf,CAAC;AAED,+CAA+C;AAC/C,mBAAmB;AACnB,+CAA+C;AAE/C;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,IAAc,EAAE,gBAA+B,EAAE;IAC/E,oBAAoB;IACpB,MAAM,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAA;IAE9B,gBAAgB;IAChB,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;QAChB,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,4DAA4D;IAC5D,IAAI,CAAC,MAAM,CAAC,QAAQ,IAAI,CAAC,MAAM
,CAAC,MAAM,EAAE,CAAC;QACvC,OAAO,CAAC,KAAK,CAAC,4CAA4C,CAAC,CAAA;QAC3D,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,IAAI,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;QACrC,OAAO,CAAC,KAAK,CAAC,8CAA8C,CAAC,CAAA;QAC7D,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACxB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;IAED,wBAAwB;IACxB,MAAM,YAAY,GAAG,mBAAmB,CAAC,aAAa,CAAC,CAAA;IAEvD,IAAI,CAAC;QACH,oEAAoE;QACpE,MAAM,WAAW,GAAG,iBAAiB,CAAC,YAAY,CAAC,CAAA;QACnD,MAAM,WAAW,CAAC,UAAU,EAAE,CAAA;QAE9B,6BAA6B;QAC7B,IAAI,UAAkB,CAAA;QAEtB,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClB,yCAAyC;YACzC,UAAU,GAAG,mBAAmB,CAAC,YAAY,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;QAClF,CAAC;aAAM,CAAC;YACN,8CAA8C;YAC9C,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,QAAS,CAAC,CAAA;YAEtC,sDAAsD;YACtD,MAAM,SAAS,GAAG,YAAY,CAAC,UAAU,EAAE,aAAa,CAAC,CAAA;YACzD,IAAI,SAAS,EAAE,CAAC;gBACd,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;gBACxB,OAAO,CAAC,QAAQ,GAAG,CAAC,CAAA;gBACpB,OAAM;YACR,CAAC;QACH,CAAC;QAED,iCAAiC;QACjC,MAAM,WAAW,CAAC,YAAY,CAAC,UAAU,CAAC,CAAA;QAE1C,iDAAiD;QACjD,IAAI,aAAa,CAAC,UAAU,CAAC,EAAE,CAAC;YAC9B,IAAI,CAAC;gBACH,MAAM,MAAM,CAAC,UAAU,CAAC,CAAA;YAC1B,CAAC;YAAC,OAAO,KAAc,EAAE,CAAC;gBACxB,uDAAuD;gBACvD,IACE,CAAC,CAAC,KAAK,YAAY,KAAK,CAAC;oBACzB,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC;oBACjB,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAClD,CAAC;oBACD,MAAM,KAAK,CAAA;gBACb,CAAC;YACH,CAAC;YAED,IAAI,CAAC;gBACH,MAAM,MAAM,CAAC,oBAAoB,CAAC,UAAU,CAAC,CAAC,CAAA;YAChD,CAAC;YAAC,OAAO,KAAc,EAAE,CAAC;gBACxB,gBAAgB;gBAChB,IACE,CAAC,CAAC,KAAK,YAAY,KAAK,CAAC;oBACzB,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC;oBACjB,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAClD,CAAC;oBACD,MAAM,KAAK,CAAA;gBACb,CAAC;YACH,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,MAAM,WAAW,CAAC,QAAQ,EAAE,CAAA;QAE5B,+BAA+B;QAC/B,MAAM,MAAM,GAAG;YACb,QAAQ,EAAE,UAAU;YACpB,OAAO,EAAE,IAAI;YACb,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACpC,CAAA;QACD,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC,CAAA;IAC9C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,MAAM,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OA
AO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QACrE,OAAO,CAAC,KAAK,CAAC,UAAU,MAAM,EAAE,CAAC,CAAA;QACjC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjB,CAAC;AACH,CAAC"}
@@ -0,0 +1,37 @@
1
+ import type { GlobalOptions, ResolvedGlobalConfig } from './options.js';
2
+ interface IngestConfig {
3
+ baseDir: string;
4
+ dbPath: string;
5
+ cacheDir: string;
6
+ modelName: string;
7
+ maxFileSize: number;
8
+ }
9
+ interface IngestCliOptions {
10
+ baseDir?: string | undefined;
11
+ maxFileSize?: number | undefined;
12
+ }
13
+ interface ParsedArgs {
14
+ positional: string | undefined;
15
+ options: IngestCliOptions;
16
+ help: boolean;
17
+ }
18
+ /**
19
+ * Parse ingest-specific CLI arguments into options and a positional path.
20
+ * Flags: --base-dir, --max-file-size, -h/--help
21
+ * Unknown flags (including global flags passed after subcommand) cause an error.
22
+ */
23
+ export declare function parseArgs(args: string[]): ParsedArgs;
24
+ /**
25
+ * Resolve ingest config by merging global config with ingest-specific options.
26
+ * Ingest-specific: baseDir, maxFileSize (CLI flags > env vars > defaults).
27
+ * Validates all resolved values before returning.
28
+ */
29
+ export declare function resolveConfig(globalConfig: ResolvedGlobalConfig, ingestOptions?: IngestCliOptions): IngestConfig;
30
+ /**
31
+ * Run the ingest CLI subcommand.
32
+ * @param args - Arguments after "ingest" (e.g., option flags and file/directory path)
33
+ * @param globalOptions - Global options parsed before the subcommand
34
+ */
35
+ export declare function runIngest(args: string[], globalOptions?: GlobalOptions): Promise<void>;
36
+ export {};
37
+ //# sourceMappingURL=ingest.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ingest.d.ts","sourceRoot":"","sources":["../../src/cli/ingest.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,cAAc,CAAA;AAavE,UAAU,YAAY;IACpB,OAAO,EAAE,MAAM,CAAA;IACf,MAAM,EAAE,MAAM,CAAA;IACd,QAAQ,EAAE,MAAM,CAAA;IAChB,SAAS,EAAE,MAAM,CAAA;IACjB,WAAW,EAAE,MAAM,CAAA;CACpB;AAQD,UAAU,gBAAgB;IACxB,OAAO,CAAC,EAAE,MAAM,GAAG,SAAS,CAAA;IAC5B,WAAW,CAAC,EAAE,MAAM,GAAG,SAAS,CAAA;CACjC;AAED,UAAU,UAAU;IAClB,UAAU,EAAE,MAAM,GAAG,SAAS,CAAA;IAC9B,OAAO,EAAE,gBAAgB,CAAA;IACzB,IAAI,EAAE,OAAO,CAAA;CACd;AAgCD;;;;GAIG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,UAAU,CAqDpD;AAiBD;;;;GAIG;AACH,wBAAgB,aAAa,CAC3B,YAAY,EAAE,oBAAoB,EAClC,aAAa,GAAE,gBAAqB,GACnC,YAAY,CA8Bd;AAuJD;;;;GAIG;AACH,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,aAAa,GAAE,aAAkB,GAAG,OAAO,CAAC,IAAI,CAAC,CA0FhG"}
@@ -0,0 +1,345 @@
1
+ // CLI ingest subcommand — bulk file ingestion with single optimize() at end
2
+ import { randomUUID } from 'node:crypto';
3
+ import { opendir, stat } from 'node:fs/promises';
4
+ import { extname, join, resolve } from 'node:path';
5
+ import { SemanticChunker } from '../chunker/index.js';
6
+ import { DocumentParser, SUPPORTED_EXTENSIONS } from '../parser/index.js';
7
+ import { createEmbedder, createVectorStore } from './common.js';
8
+ import { resolveGlobalConfig, validateMaxFileSize, validatePath } from './options.js';
9
+ // ============================================
10
+ // Constants
11
+ // ============================================
12
+ const MAX_DEPTH = 10;
13
+ // ============================================
14
+ // Defaults
15
+ // ============================================
16
+ const INGEST_DEFAULTS = {
17
+ maxFileSize: 104857600,
18
+ };
19
+ // ============================================
20
+ // Help
21
+ // ============================================
22
+ const HELP_TEXT = `Usage: mcp-local-rag [global-options] ingest [options] <path>
23
+
24
+ Ingest a single file or all supported files under a directory.
25
+
26
+ Options:
27
+ --base-dir <path> Base directory for documents (default: cwd)
28
+ --max-file-size <n> Max file size in bytes (default: ${INGEST_DEFAULTS.maxFileSize})
29
+ -h, --help Show this help
30
+
31
+ Global options (must appear before "ingest"):
32
+ --db-path <path> LanceDB database path
33
+ --cache-dir <path> Model cache directory
34
+ --model-name <name> Embedding model`;
35
+ // ============================================
36
+ // Arg Parsing
37
+ // ============================================
38
/**
 * Turn the arguments that follow "ingest" into a positional path plus options.
 *
 * Supported flags: `--base-dir <path>`, `--max-file-size <n>`, `-h` / `--help`.
 * A `--max-file-size` value that does not parse as an integer is stored as
 * `undefined`. Exits with status 1 (after writing to stderr) on a missing flag
 * value, an unknown flag (global flags placed after the subcommand count as
 * unknown), or a second positional path.
 *
 * @param args - Arguments after the "ingest" subcommand
 * @returns `{ positional, options, help }`
 */
export function parseArgs(args) {
    const options = {};
    let positional;
    let help = false;
    for (let index = 0; index < args.length; index++) {
        const token = args[index];
        if (token === '-h' || token === '--help') {
            help = true;
            continue;
        }
        if (token === '--base-dir') {
            // Consume the following token as the flag's value.
            index += 1;
            const dir = args[index];
            if (dir === undefined || dir.startsWith('-')) {
                console.error('Missing value for --base-dir');
                process.exit(1);
            }
            options.baseDir = dir;
            continue;
        }
        if (token === '--max-file-size') {
            index += 1;
            const rawSize = args[index];
            if (rawSize === undefined || rawSize.startsWith('-')) {
                console.error('Missing value for --max-file-size');
                process.exit(1);
            }
            const size = Number.parseInt(rawSize, 10);
            options.maxFileSize = Number.isNaN(size) ? undefined : size;
            continue;
        }
        if (token.startsWith('-')) {
            console.error(`Unknown option: ${token}`);
            console.error(HELP_TEXT);
            process.exit(1);
        }
        if (positional !== undefined) {
            console.error(`Unexpected argument: ${token}`);
            console.error('Only one path is accepted. Use a directory to ingest multiple files.');
            process.exit(1);
        }
        positional = token;
    }
    return { positional, options, help };
}
95
+ // ============================================
96
+ // NaN Defense
97
+ // ============================================
98
/**
 * Guard against a NaN max-file-size.
 *
 * @param value - Candidate size
 * @returns `INGEST_DEFAULTS.maxFileSize` when `value` is NaN, otherwise `value` unchanged
 */
function sanitizeMaxFileSize(value) {
    if (Number.isNaN(value)) {
        return INGEST_DEFAULTS.maxFileSize;
    }
    return value;
}
104
+ // ============================================
105
+ // Config Resolution
106
+ // ============================================
107
/**
 * Combine the resolved global config with ingest-only settings.
 *
 * Precedence for each ingest-only value is: CLI option, then environment
 * variable (`BASE_DIR`, `MAX_FILE_SIZE`), then default (current working
 * directory / `INGEST_DEFAULTS.maxFileSize`). A NaN size falls back to the
 * default via sanitizeMaxFileSize.
 *
 * If validatePath or validateMaxFileSize reports an error, the message is
 * written to stderr and the process exits with status 1.
 *
 * @param globalConfig - Resolved global config (dbPath, cacheDir, modelName)
 * @param ingestOptions - Ingest-specific CLI options
 * @returns The merged ingest config
 */
export function resolveConfig(globalConfig, ingestOptions = {}) {
    // baseDir: option > BASE_DIR > cwd
    let baseDir = ingestOptions.baseDir;
    if (baseDir === undefined || baseDir === null) {
        baseDir = process.env['BASE_DIR'];
    }
    if (baseDir === undefined || baseDir === null) {
        baseDir = process.cwd();
    }
    // maxFileSize: option > MAX_FILE_SIZE > default
    let requestedSize = ingestOptions.maxFileSize;
    if (requestedSize === undefined || requestedSize === null) {
        requestedSize = process.env['MAX_FILE_SIZE']
            ? Number.parseInt(process.env['MAX_FILE_SIZE'], 10)
            : INGEST_DEFAULTS.maxFileSize;
    }
    const maxFileSize = sanitizeMaxFileSize(requestedSize);
    // Validate in the same order as the values were resolved; stop at the first failure.
    const problems = [
        () => validatePath(baseDir, '--base-dir'),
        () => validateMaxFileSize(maxFileSize),
    ];
    for (const check of problems) {
        const message = check();
        if (message) {
            console.error(message);
            process.exit(1);
        }
    }
    const { dbPath, cacheDir, modelName } = globalConfig;
    return { dbPath, cacheDir, modelName, baseDir, maxFileSize };
}
138
+ // ============================================
139
+ // File Collection
140
+ // ============================================
141
/**
 * Gather the files to ingest starting from `targetPath`.
 *
 * - A regular file is returned as a one-element list when its (lower-cased)
 *   extension is in SUPPORTED_EXTENSIONS; otherwise a message is written to
 *   stderr and the result is empty.
 * - A directory is walked breadth-first. Directories at depth MAX_DEPTH or
 *   deeper are not read (a single warning is printed at the end if that
 *   happened). Unreadable directories produce a warning and are skipped.
 *   Symbolic links and any entry whose path starts with one of `excludePaths`
 *   are ignored. The result is sorted.
 * - Anything else yields an empty list.
 *
 * @param targetPath - File or directory to scan (resolved to an absolute path)
 * @param excludePaths - Path prefixes to leave out of the walk
 * @returns Absolute paths of the files to ingest
 */
async function collectFiles(targetPath, excludePaths) {
    const rootPath = resolve(targetPath);
    const rootInfo = await stat(rootPath);
    if (rootInfo.isFile()) {
        const ext = extname(rootPath).toLowerCase();
        if (SUPPORTED_EXTENSIONS.has(ext)) {
            return [rootPath];
        }
        console.error(`Unsupported file extension: ${ext} (supported: ${[...SUPPORTED_EXTENSIONS].join(', ')})`);
        return [];
    }
    if (!rootInfo.isDirectory()) {
        return [];
    }
    const found = [];
    let hitDepthLimit = false;
    // The list doubles as the BFS queue: the cursor advances while new directories are appended.
    const pending = [{ dirPath: rootPath, depth: 0 }];
    for (let cursor = 0; cursor < pending.length; cursor++) {
        const { dirPath, depth } = pending[cursor];
        if (depth >= MAX_DEPTH) {
            hitDepthLimit = true;
            continue;
        }
        let handle;
        try {
            handle = await opendir(dirPath);
        }
        catch {
            console.error(`Warning: cannot read directory: ${dirPath}`);
            continue;
        }
        for await (const entry of handle) {
            const entryPath = join(dirPath, entry.name);
            const skipped = !entryPath.startsWith(rootPath) ||
                entry.isSymbolicLink() ||
                excludePaths.some((prefix) => entryPath.startsWith(prefix));
            if (skipped) {
                continue;
            }
            if (entry.isDirectory()) {
                pending.push({ dirPath: entryPath, depth: depth + 1 });
                continue;
            }
            if (entry.isFile() && SUPPORTED_EXTENSIONS.has(extname(entry.name).toLowerCase())) {
                found.push(entryPath);
            }
        }
    }
    if (hitDepthLimit) {
        console.error(`Warning: some directories were skipped because they exceed the maximum depth (${MAX_DEPTH})`);
    }
    found.sort();
    return found;
}
199
+ // ============================================
200
+ // Per-file Ingestion
201
+ // ============================================
202
/**
 * Ingest one file end to end: parse, chunk, embed, replace stored chunks.
 *
 * Paths ending in '.pdf' (case-insensitive) go through `parser.parsePdf`
 * (which also receives the embedder); everything else uses `parser.parseFile`.
 * When chunking yields nothing, a warning is printed and 0 is returned without
 * touching the store. Otherwise existing chunks for `filePath` are deleted
 * and the new ones inserted.
 *
 * @param filePath - Path of the file to ingest; also used as the chunk key
 * @param parser - Object exposing parsePdf / parseFile
 * @param chunker - Object exposing chunkText
 * @param embedder - Object exposing embedBatch
 * @param vectorStore - Object exposing deleteChunks / insertChunks
 * @returns Number of chunks inserted
 * @throws Error when an embedding is missing for a chunk
 */
async function ingestSingleFile(filePath, parser, chunker, embedder, vectorStore) {
    // Parse file
    const parsed = filePath.toLowerCase().endsWith('.pdf')
        ? await parser.parsePdf(filePath, embedder)
        : await parser.parseFile(filePath);
    const text = parsed.content;
    const title = parsed.title || null;
    // Chunk text
    const chunks = await chunker.chunkText(text, embedder);
    if (chunks.length === 0) {
        console.error(` Warning: 0 chunks generated (file may be empty or too short)`);
        return 0;
    }
    // Generate embeddings
    const chunkTexts = chunks.map((piece) => piece.text);
    const embeddings = await embedder.embedBatch(chunkTexts);
    // Delete existing chunks for this file
    await vectorStore.deleteChunks(filePath);
    // Build vector chunks; all rows of one ingest share a single timestamp.
    const timestamp = new Date().toISOString();
    const vectorChunks = [];
    for (const [position, piece] of chunks.entries()) {
        const vector = embeddings[position];
        if (!vector) {
            throw new Error(`Missing embedding for chunk ${position}`);
        }
        vectorChunks.push({
            id: randomUUID(),
            filePath,
            chunkIndex: piece.index,
            text: piece.text,
            vector,
            metadata: {
                fileName: filePath.split('/').pop() || filePath,
                // Note: this is the length of the extracted text, not the on-disk byte size.
                fileSize: text.length,
                fileType: filePath.split('.').pop() || '',
            },
            fileTitle: title,
            timestamp,
        });
    }
    // Insert chunks
    await vectorStore.insertChunks(vectorChunks);
    return vectorChunks.length;
}
257
+ // ============================================
258
+ // Main Entry Point
259
+ // ============================================
260
/**
 * Run the ingest CLI subcommand.
 *
 * Steps: parse arguments, check that the path exists, resolve config, collect
 * the files, then ingest them one by one with a single shared parser, chunker,
 * embedder and vector store. A failure on one file is logged and counted but
 * does not stop the run. The store is optimized once after all files, a
 * summary is printed to stderr, and the exit code is set to 1 if any file
 * failed.
 *
 * Exits immediately with status 0 for --help, and with status 1 when the path
 * argument is missing, the path does not exist, or no supported files are found.
 *
 * @param args - Arguments after "ingest" (e.g., option flags and file/directory path)
 * @param globalOptions - Global options parsed before the subcommand
 */
export async function runIngest(args, globalOptions = {}) {
    // Parse CLI options
    const cli = parseArgs(args);
    // Handle --help
    if (cli.help) {
        console.error(HELP_TEXT);
        process.exit(0);
    }
    // Validate positional argument
    if (!cli.positional) {
        console.error('Usage: mcp-local-rag ingest [options] <path>');
        console.error(' Ingest a single file or all supported files under a directory.');
        console.error(' Run with --help for all options.');
        process.exit(1);
    }
    const targetPath = cli.positional;
    // Validate path exists
    try {
        await stat(targetPath);
    }
    catch {
        console.error(`Error: path does not exist: ${targetPath}`);
        process.exit(1);
    }
    // Resolve config: CLI flags > env vars > defaults
    const globalConfig = resolveGlobalConfig(globalOptions);
    const config = resolveConfig(globalConfig, cli.options);
    // Keep the database and model cache directories out of the scan.
    const excludePaths = [config.dbPath, config.cacheDir].map((dir) => `${resolve(dir)}/`);
    // Collect files
    const files = await collectFiles(targetPath, excludePaths);
    if (files.length === 0) {
        console.error('No supported files found.');
        process.exit(1);
    }
    console.error(`Found ${files.length} file(s) to ingest.`);
    // Initialize components (single instances reused across all files)
    const parser = new DocumentParser({
        baseDir: config.baseDir,
        maxFileSize: config.maxFileSize,
    });
    const chunker = new SemanticChunker();
    const embedder = createEmbedder(globalConfig);
    const vectorStore = createVectorStore(globalConfig);
    await vectorStore.initialize();
    // Process each file
    let succeeded = 0;
    let failed = 0;
    let totalChunks = 0;
    for (const [position, filePath] of files.entries()) {
        const label = `[${position + 1}/${files.length}]`;
        try {
            const chunkCount = await ingestSingleFile(filePath, parser, chunker, embedder, vectorStore);
            // 0 chunks is a skip/warning, not a failure
            const outcome = chunkCount === 0 ? 'SKIPPED (0 chunks)' : `OK (${chunkCount} chunks)`;
            console.error(`${label} ${filePath} ... ${outcome}`);
            succeeded += 1;
            totalChunks += chunkCount;
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            console.error(`${label} ${filePath} ... FAILED: ${reason}`);
            failed += 1;
        }
    }
    // Optimize once at end (not per-file)
    await vectorStore.optimize();
    // Print summary
    const summaryLines = [
        '',
        '--- Ingest Summary ---',
        `Succeeded: ${succeeded}`,
        `Failed: ${failed}`,
        `Total chunks: ${totalChunks}`,
    ];
    for (const line of summaryLines) {
        console.error(line);
    }
    if (failed > 0) {
        process.exitCode = 1;
    }
}
345
+ //# sourceMappingURL=ingest.js.map