@framers/agentos 0.1.101 → 0.1.102
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -0
- package/dist/memory/config.d.ts +39 -0
- package/dist/memory/config.d.ts.map +1 -1
- package/dist/memory/config.js.map +1 -1
- package/dist/memory/consolidation/ConsolidationLoop.d.ts +177 -0
- package/dist/memory/consolidation/ConsolidationLoop.d.ts.map +1 -0
- package/dist/memory/consolidation/ConsolidationLoop.js +517 -0
- package/dist/memory/consolidation/ConsolidationLoop.js.map +1 -0
- package/dist/memory/consolidation/ConsolidationPipeline.d.ts.map +1 -1
- package/dist/memory/consolidation/ConsolidationPipeline.js +7 -0
- package/dist/memory/consolidation/ConsolidationPipeline.js.map +1 -1
- package/dist/memory/consolidation/index.d.ts +8 -0
- package/dist/memory/consolidation/index.d.ts.map +1 -0
- package/dist/memory/consolidation/index.js +7 -0
- package/dist/memory/consolidation/index.js.map +1 -0
- package/dist/memory/decay/DecayModel.d.ts +33 -0
- package/dist/memory/decay/DecayModel.d.ts.map +1 -1
- package/dist/memory/decay/DecayModel.js +31 -0
- package/dist/memory/decay/DecayModel.js.map +1 -1
- package/dist/memory/facade/Memory.d.ts +228 -0
- package/dist/memory/facade/Memory.d.ts.map +1 -0
- package/dist/memory/facade/Memory.js +823 -0
- package/dist/memory/facade/Memory.js.map +1 -0
- package/dist/memory/facade/index.d.ts +13 -0
- package/dist/memory/facade/index.d.ts.map +1 -0
- package/dist/memory/facade/index.js +11 -0
- package/dist/memory/facade/index.js.map +1 -0
- package/dist/memory/facade/types.d.ts +606 -0
- package/dist/memory/facade/types.d.ts.map +1 -0
- package/dist/memory/facade/types.js +11 -0
- package/dist/memory/facade/types.js.map +1 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.d.ts +132 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.d.ts.map +1 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.js +178 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.js.map +1 -0
- package/dist/memory/feedback/index.d.ts +13 -0
- package/dist/memory/feedback/index.d.ts.map +1 -0
- package/dist/memory/feedback/index.js +12 -0
- package/dist/memory/feedback/index.js.map +1 -0
- package/dist/memory/index.d.ts +22 -0
- package/dist/memory/index.d.ts.map +1 -1
- package/dist/memory/index.js +24 -0
- package/dist/memory/index.js.map +1 -1
- package/dist/memory/ingestion/ChunkingEngine.d.ts +143 -0
- package/dist/memory/ingestion/ChunkingEngine.d.ts.map +1 -0
- package/dist/memory/ingestion/ChunkingEngine.js +508 -0
- package/dist/memory/ingestion/ChunkingEngine.js.map +1 -0
- package/dist/memory/ingestion/DoclingLoader.d.ts +44 -0
- package/dist/memory/ingestion/DoclingLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/DoclingLoader.js +228 -0
- package/dist/memory/ingestion/DoclingLoader.js.map +1 -0
- package/dist/memory/ingestion/DocxLoader.d.ts +37 -0
- package/dist/memory/ingestion/DocxLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/DocxLoader.js +111 -0
- package/dist/memory/ingestion/DocxLoader.js.map +1 -0
- package/dist/memory/ingestion/FolderScanner.d.ts +116 -0
- package/dist/memory/ingestion/FolderScanner.d.ts.map +1 -0
- package/dist/memory/ingestion/FolderScanner.js +127 -0
- package/dist/memory/ingestion/FolderScanner.js.map +1 -0
- package/dist/memory/ingestion/HtmlLoader.d.ts +49 -0
- package/dist/memory/ingestion/HtmlLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/HtmlLoader.js +202 -0
- package/dist/memory/ingestion/HtmlLoader.js.map +1 -0
- package/dist/memory/ingestion/IDocumentLoader.d.ts +63 -0
- package/dist/memory/ingestion/IDocumentLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/IDocumentLoader.js +11 -0
- package/dist/memory/ingestion/IDocumentLoader.js.map +1 -0
- package/dist/memory/ingestion/LoaderRegistry.d.ts +140 -0
- package/dist/memory/ingestion/LoaderRegistry.d.ts.map +1 -0
- package/dist/memory/ingestion/LoaderRegistry.js +229 -0
- package/dist/memory/ingestion/LoaderRegistry.js.map +1 -0
- package/dist/memory/ingestion/MarkdownLoader.d.ts +50 -0
- package/dist/memory/ingestion/MarkdownLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/MarkdownLoader.js +169 -0
- package/dist/memory/ingestion/MarkdownLoader.js.map +1 -0
- package/dist/memory/ingestion/MultimodalAggregator.d.ts +88 -0
- package/dist/memory/ingestion/MultimodalAggregator.d.ts.map +1 -0
- package/dist/memory/ingestion/MultimodalAggregator.js +96 -0
- package/dist/memory/ingestion/MultimodalAggregator.js.map +1 -0
- package/dist/memory/ingestion/OcrPdfLoader.d.ts +41 -0
- package/dist/memory/ingestion/OcrPdfLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/OcrPdfLoader.js +149 -0
- package/dist/memory/ingestion/OcrPdfLoader.js.map +1 -0
- package/dist/memory/ingestion/PdfLoader.d.ts +78 -0
- package/dist/memory/ingestion/PdfLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/PdfLoader.js +179 -0
- package/dist/memory/ingestion/PdfLoader.js.map +1 -0
- package/dist/memory/ingestion/TextLoader.d.ts +66 -0
- package/dist/memory/ingestion/TextLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/TextLoader.js +207 -0
- package/dist/memory/ingestion/TextLoader.js.map +1 -0
- package/dist/memory/ingestion/UrlLoader.d.ts +95 -0
- package/dist/memory/ingestion/UrlLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/UrlLoader.js +174 -0
- package/dist/memory/ingestion/UrlLoader.js.map +1 -0
- package/dist/memory/io/ChatGptImporter.d.ts +85 -0
- package/dist/memory/io/ChatGptImporter.d.ts.map +1 -0
- package/dist/memory/io/ChatGptImporter.js +231 -0
- package/dist/memory/io/ChatGptImporter.js.map +1 -0
- package/dist/memory/io/JsonExporter.d.ts +67 -0
- package/dist/memory/io/JsonExporter.d.ts.map +1 -0
- package/dist/memory/io/JsonExporter.js +132 -0
- package/dist/memory/io/JsonExporter.js.map +1 -0
- package/dist/memory/io/JsonImporter.d.ts +84 -0
- package/dist/memory/io/JsonImporter.d.ts.map +1 -0
- package/dist/memory/io/JsonImporter.js +234 -0
- package/dist/memory/io/JsonImporter.js.map +1 -0
- package/dist/memory/io/MarkdownExporter.d.ts +95 -0
- package/dist/memory/io/MarkdownExporter.d.ts.map +1 -0
- package/dist/memory/io/MarkdownExporter.js +130 -0
- package/dist/memory/io/MarkdownExporter.js.map +1 -0
- package/dist/memory/io/MarkdownImporter.d.ts +84 -0
- package/dist/memory/io/MarkdownImporter.d.ts.map +1 -0
- package/dist/memory/io/MarkdownImporter.js +166 -0
- package/dist/memory/io/MarkdownImporter.js.map +1 -0
- package/dist/memory/io/ObsidianExporter.d.ts +80 -0
- package/dist/memory/io/ObsidianExporter.d.ts.map +1 -0
- package/dist/memory/io/ObsidianExporter.js +127 -0
- package/dist/memory/io/ObsidianExporter.js.map +1 -0
- package/dist/memory/io/ObsidianImporter.d.ts +93 -0
- package/dist/memory/io/ObsidianImporter.d.ts.map +1 -0
- package/dist/memory/io/ObsidianImporter.js +221 -0
- package/dist/memory/io/ObsidianImporter.js.map +1 -0
- package/dist/memory/io/SqliteExporter.d.ts +47 -0
- package/dist/memory/io/SqliteExporter.d.ts.map +1 -0
- package/dist/memory/io/SqliteExporter.js +56 -0
- package/dist/memory/io/SqliteExporter.js.map +1 -0
- package/dist/memory/io/SqliteImporter.d.ts +82 -0
- package/dist/memory/io/SqliteImporter.d.ts.map +1 -0
- package/dist/memory/io/SqliteImporter.js +232 -0
- package/dist/memory/io/SqliteImporter.js.map +1 -0
- package/dist/memory/io/index.d.ts +31 -0
- package/dist/memory/io/index.d.ts.map +1 -0
- package/dist/memory/io/index.js +31 -0
- package/dist/memory/io/index.js.map +1 -0
- package/dist/memory/store/SqliteBrain.d.ts +125 -0
- package/dist/memory/store/SqliteBrain.d.ts.map +1 -0
- package/dist/memory/store/SqliteBrain.js +407 -0
- package/dist/memory/store/SqliteBrain.js.map +1 -0
- package/dist/memory/store/SqliteKnowledgeGraph.d.ts +259 -0
- package/dist/memory/store/SqliteKnowledgeGraph.d.ts.map +1 -0
- package/dist/memory/store/SqliteKnowledgeGraph.js +1062 -0
- package/dist/memory/store/SqliteKnowledgeGraph.js.map +1 -0
- package/dist/memory/store/SqliteMemoryGraph.d.ts +251 -0
- package/dist/memory/store/SqliteMemoryGraph.d.ts.map +1 -0
- package/dist/memory/store/SqliteMemoryGraph.js +637 -0
- package/dist/memory/store/SqliteMemoryGraph.js.map +1 -0
- package/dist/memory/tools/MemoryAddTool.d.ts +98 -0
- package/dist/memory/tools/MemoryAddTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryAddTool.js +131 -0
- package/dist/memory/tools/MemoryAddTool.js.map +1 -0
- package/dist/memory/tools/MemoryDeleteTool.d.ts +83 -0
- package/dist/memory/tools/MemoryDeleteTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryDeleteTool.js +96 -0
- package/dist/memory/tools/MemoryDeleteTool.js.map +1 -0
- package/dist/memory/tools/MemoryMergeTool.d.ts +95 -0
- package/dist/memory/tools/MemoryMergeTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryMergeTool.js +164 -0
- package/dist/memory/tools/MemoryMergeTool.js.map +1 -0
- package/dist/memory/tools/MemoryReflectTool.d.ts +86 -0
- package/dist/memory/tools/MemoryReflectTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryReflectTool.js +102 -0
- package/dist/memory/tools/MemoryReflectTool.js.map +1 -0
- package/dist/memory/tools/MemorySearchTool.d.ts +117 -0
- package/dist/memory/tools/MemorySearchTool.d.ts.map +1 -0
- package/dist/memory/tools/MemorySearchTool.js +162 -0
- package/dist/memory/tools/MemorySearchTool.js.map +1 -0
- package/dist/memory/tools/MemoryUpdateTool.d.ts +92 -0
- package/dist/memory/tools/MemoryUpdateTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryUpdateTool.js +125 -0
- package/dist/memory/tools/MemoryUpdateTool.js.map +1 -0
- package/dist/memory/tools/index.d.ts +32 -0
- package/dist/memory/tools/index.d.ts.map +1 -0
- package/dist/memory/tools/index.js +26 -0
- package/dist/memory/tools/index.js.map +1 -0
- package/package.json +6 -1
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview DoclingLoader — high-fidelity PDF/DOCX extraction via Python Docling.
|
|
3
|
+
*
|
|
4
|
+
* Docling (https://github.com/DS4SD/docling) is an IBM Research open-source
|
|
5
|
+
* library that converts PDFs and office documents to structured JSON, preserving
|
|
6
|
+
* tables, figures, and layout information far beyond what pure-JS text extraction
|
|
7
|
+
* can achieve.
|
|
8
|
+
*
|
|
9
|
+
* This module provides a factory function {@link createDoclingLoader} that:
|
|
10
|
+
* 1. Checks whether `python3 -m docling --version` succeeds in the current PATH.
|
|
11
|
+
* 2. If it does, returns a {@link DoclingLoader} instance that spawns a
|
|
12
|
+
* `python3 -m docling` subprocess for each document.
|
|
13
|
+
* 3. If Docling is not installed, returns `null` gracefully.
|
|
14
|
+
*
|
|
15
|
+
* ### Opting in
|
|
16
|
+
* ```sh
|
|
17
|
+
* pip install docling
|
|
18
|
+
* ```
|
|
19
|
+
*
|
|
20
|
+
* @module memory/ingestion/DoclingLoader
|
|
21
|
+
*/
|
|
22
|
+
import { spawn, spawnSync } from 'node:child_process';
|
|
23
|
+
import path from 'node:path';
|
|
24
|
+
import os from 'node:os';
|
|
25
|
+
import fs from 'node:fs/promises';
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// Constants
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
/** Extensions this loader can handle (Docling supports PDF and DOCX). */
|
|
30
|
+
const SUPPORTED_EXTENSIONS = ['.pdf', '.docx'];
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Helpers
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
/**
 * Extract a path's file extension, normalised to lower case.
 *
 * The leading dot is kept (e.g. `'.pdf'`); a path without an extension
 * yields an empty string.
 *
 * @param filePath - Absolute or relative file path.
 * @returns The lower-cased extension, including its leading dot.
 */
function extOf(filePath) {
    const extension = path.extname(filePath);
    return extension.toLowerCase();
}
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
// DoclingLoader (internal class)
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
/**
 * High-fidelity document loader that delegates to a `python3 -m docling`
 * subprocess.
 *
 * Consumers should use {@link createDoclingLoader} rather than constructing
 * this class directly so that the Python availability check is always run
 * before first use.
 *
 * @implements {IDocumentLoader}
 */
class DoclingLoader {
    constructor() {
        /** @inheritdoc */
        this.supportedExtensions = [...SUPPORTED_EXTENSIONS];
    }
    // -------------------------------------------------------------------------
    // canLoad
    // -------------------------------------------------------------------------
    /**
     * Reports whether `source` is a path whose extension is supported.
     *
     * Buffers are always rejected here, although {@link DoclingLoader#load}
     * itself accepts them.
     *
     * @inheritdoc
     */
    canLoad(source) {
        if (Buffer.isBuffer(source)) {
            // Without an extension we can't determine compatibility from bytes alone.
            return false;
        }
        return SUPPORTED_EXTENSIONS.includes(extOf(source));
    }
    // -------------------------------------------------------------------------
    // load
    // -------------------------------------------------------------------------
    /**
     * Run Docling over a file path or an in-memory buffer.
     *
     * Buffers are written to a file inside a freshly created private temp
     * directory (unique per call), which is removed again whether or not the
     * conversion succeeds.
     *
     * @param source - File path, or a Buffer holding the document bytes.
     * @param _options - Unused.
     * @returns The mapped document (`content`, `metadata`, `format`).
     * @throws When the temp file cannot be written or the Docling subprocess fails.
     * @inheritdoc
     */
    async load(source, _options) {
        const fromBuffer = Buffer.isBuffer(source);
        const format = this._detectFormat(source);
        let tempDir = null;
        try {
            let filePath = source;
            if (fromBuffer) {
                // mkdtemp yields a unique directory, so concurrent loads never
                // collide and the file name is not guessable by other users of
                // the shared tmpdir.
                tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'docling-input-'));
                filePath = path.join(tempDir, `input.${format}`);
                await fs.writeFile(filePath, source);
            }
            const jsonOutput = await this._runDocling(filePath);
            return this._mapToLoadedDocument(jsonOutput, fromBuffer ? undefined : source, format);
        }
        finally {
            // Best-effort cleanup of the temp directory (and the file inside it).
            if (tempDir !== null) {
                await fs.rm(tempDir, { recursive: true, force: true }).catch(() => { });
            }
        }
    }
    // -------------------------------------------------------------------------
    // Private: format detection
    // -------------------------------------------------------------------------
    /**
     * Decide whether a source should be treated as DOCX or PDF.
     *
     * Buffers are classified by the ZIP local-file signature (`PK\x03\x04`)
     * that every `.docx` archive starts with; paths are classified by
     * extension. Anything that is not recognisably DOCX is treated as PDF.
     *
     * @param source - File path or document bytes.
     * @returns `'docx'` or `'pdf'`.
     */
    _detectFormat(source) {
        if (Buffer.isBuffer(source)) {
            const isZip = source.length >= 4 &&
                source[0] === 0x50 && source[1] === 0x4B &&
                source[2] === 0x03 && source[3] === 0x04;
            return isZip ? 'docx' : 'pdf';
        }
        return extOf(source) === '.docx' ? 'docx' : 'pdf';
    }
    // -------------------------------------------------------------------------
    // Private: subprocess invocation
    // -------------------------------------------------------------------------
    /**
     * Spawn `python3 -m docling --output-format json <filePath>` and collect
     * stdout.
     *
     * NOTE(review): the CLI flags are taken as-is from the existing
     * implementation — confirm them against the installed Docling version.
     *
     * @param filePath - Path to the PDF or DOCX file.
     * @returns Parsed Docling JSON output.
     * @throws When the subprocess cannot be spawned, exits with a non-zero
     *   code, or stdout is not valid JSON.
     */
    async _runDocling(filePath) {
        return new Promise((resolve, reject) => {
            // Keep raw Buffers and decode once at the end: decoding chunk by
            // chunk corrupts multi-byte UTF-8 sequences that straddle a chunk
            // boundary.
            const stdoutChunks = [];
            const stderrChunks = [];
            const proc = spawn('python3', ['-m', 'docling', '--output-format', 'json', filePath], {
                stdio: ['ignore', 'pipe', 'pipe'],
            });
            proc.stdout.on('data', (chunk) => {
                stdoutChunks.push(chunk);
            });
            proc.stderr.on('data', (chunk) => {
                stderrChunks.push(chunk);
            });
            proc.on('error', (err) => {
                reject(new Error(`DoclingLoader: failed to spawn python3: ${err.message}`, { cause: err }));
            });
            proc.on('close', (code) => {
                if (code !== 0) {
                    const stderr = Buffer.concat(stderrChunks).toString('utf8');
                    reject(new Error(`DoclingLoader: python3 -m docling exited with code ${code}.\n${stderr.slice(0, 500)}`));
                    return;
                }
                const stdout = Buffer.concat(stdoutChunks).toString('utf8');
                try {
                    resolve(JSON.parse(stdout));
                }
                catch (err) {
                    reject(new Error(`DoclingLoader: failed to parse Docling JSON output: ${String(err)}\n` +
                        `stdout (first 500 chars): ${stdout.slice(0, 500)}`));
                }
            });
        });
    }
    // -------------------------------------------------------------------------
    // Private: JSON → LoadedDocument mapping
    // -------------------------------------------------------------------------
    /**
     * Convert a Docling JSON output object to a {@link LoadedDocument}.
     *
     * Handles both the newer (`text` top-level string) and older
     * (`pages[].text` array) Docling output shapes. Non-object JSON (for
     * example `null`) and malformed page entries are tolerated and simply
     * contribute no text.
     *
     * @param json - Parsed Docling JSON.
     * @param resolvedPath - Original source path for the `source` metadata field.
     * @param format - Document format to report; defaults to `'pdf'`.
     * @returns The document with `content`, `metadata` and `format`.
     */
    _mapToLoadedDocument(json, resolvedPath, format = 'pdf') {
        // JSON.parse may legally return null or a primitive; treat that as empty.
        const doc = json !== null && typeof json === 'object' ? json : {};
        // Prefer top-level `text` (Docling v2+), fall back to concatenating pages.
        let content;
        if (typeof doc['text'] === 'string') {
            content = doc['text'];
        }
        else if (Array.isArray(doc['pages'])) {
            content = doc['pages']
                .map((p) => (typeof p?.['text'] === 'string' ? p['text'] : ''))
                .join('\n\n');
        }
        else {
            content = '';
        }
        const rawMeta = doc['metadata'] ?? {};
        // Accept both camelCase and snake_case page-count keys.
        const pageCount = typeof rawMeta['pageCount'] === 'number' ? rawMeta['pageCount'] :
            typeof rawMeta['page_count'] === 'number' ? rawMeta['page_count'] :
                undefined;
        const meta = {
            ...(typeof rawMeta['title'] === 'string' && rawMeta['title']
                ? { title: rawMeta['title'] }
                : {}),
            ...(typeof rawMeta['author'] === 'string' ? { author: rawMeta['author'] } : {}),
            ...(pageCount !== undefined ? { pageCount } : {}),
            ...(resolvedPath ? { source: resolvedPath } : {}),
        };
        return {
            content,
            metadata: meta,
            format,
        };
    }
}
|
|
187
|
+
// ---------------------------------------------------------------------------
|
|
188
|
+
// Factory
|
|
189
|
+
// ---------------------------------------------------------------------------
|
|
190
|
+
/**
 * Probe the environment for Docling and build a loader when it is present.
 *
 * Runs `python3 -m docling --version` synchronously (via `spawnSync`, with a
 * five second timeout) and returns a new {@link DoclingLoader} only when the
 * probe finishes without an error and with exit status 0. In every other
 * case the result is `null`.
 *
 * ### Usage
 * ```ts
 * import { createDoclingLoader } from './DoclingLoader.js';
 * import { PdfLoader } from './PdfLoader.js';
 *
 * const doclingLoader = createDoclingLoader();
 * const loader = new PdfLoader(null, doclingLoader);
 * ```
 *
 * @returns A `DoclingLoader` instance when Docling is installed, or `null`.
 */
export function createDoclingLoader() {
    try {
        const { error, status } = spawnSync('python3', ['-m', 'docling', '--version'], {
            stdio: 'ignore',
            timeout: 5000,
        });
        // A missing binary or a timeout is reported through `error`; a
        // non-zero status means python3 ran but docling is not usable.
        const isAvailable = error === undefined && status === 0;
        return isAvailable ? new DoclingLoader() : null;
    }
    catch {
        // Defensive catch-all: any unexpected failure counts as "not available".
        return null;
    }
}
|
|
228
|
+
//# sourceMappingURL=DoclingLoader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"DoclingLoader.js","sourceRoot":"","sources":["../../../src/memory/ingestion/DoclingLoader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACtD,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAIlC,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E,yEAAyE;AACzE,MAAM,oBAAoB,GAAG,CAAC,MAAM,EAAE,OAAO,CAAU,CAAC;AAExD,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E;;;;GAIG;AACH,SAAS,KAAK,CAAC,QAAgB;IAC7B,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;AAC9C,CAAC;AAgCD,8EAA8E;AAC9E,iCAAiC;AACjC,8EAA8E;AAE9E;;;;;;;;;GASG;AACH,MAAM,aAAa;IAAnB;QACE,kBAAkB;QACT,wBAAmB,GAAa,CAAC,GAAG,oBAAoB,CAAC,CAAC;IAqJrE,CAAC;IAnJC,4EAA4E;IAC5E,UAAU;IACV,4EAA4E;IAE5E,kBAAkB;IAClB,OAAO,CAAC,MAAuB;QAC7B,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5B,0EAA0E;YAC1E,OAAO,KAAK,CAAC;QACf,CAAC;QACD,OAAQ,oBAA0C,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAqB,CAAC,CAAC;IACjG,CAAC;IAED,4EAA4E;IAC5E,OAAO;IACP,4EAA4E;IAE5E,kBAAkB;IAClB,KAAK,CAAC,IAAI,CAAC,MAAuB,EAAE,QAAsB;QACxD,IAAI,QAAgB,CAAC;QACrB,IAAI,QAAQ,GAAkB,IAAI,CAAC;QAEnC,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5B,kEAAkE;YAClE,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,iBAAiB,IAAI,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;YACrE,MAAM,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;YACrC,QAAQ,GAAG,QAAQ,CAAC;QACtB,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,MAAM,CAAC;QACpB,CAAC;QAED,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;YACpD,OAAO,IAAI,CAAC,oBAAoB,CAAC,UAAU,EAAE,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAC7F,CAAC;gBAAS,CAAC;YACT,qCAAqC;YACrC,IAAI,QAAQ,KAAK,IAAI,EAAE,CAAC;gBACtB,MAAM,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAA+B,CAAC,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;IACH,CAAC;IAED,4EAA4E;IAC5E,iCAAiC;IACjC,4EAA4E;IAE5E;;;;;;;;OAQG;IACK,KAAK,CAAC,WAAW,CAAC,QAAgB;QACxC,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,IAAI,MAAM,GAAG,EAAE,CAAC;YAChB,IAAI,MAAM,GAAG,EAAE,CAAC;YAEh
B,MAAM,IAAI,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,SAAS,EAAE,iBAAiB,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE;gBACpF,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC;aAClC,CAAC,CAAC;YAEH,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;gBACvC,MAAM,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YACnC,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;gBACvC,MAAM,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;YACnC,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;oBACf,MAAM,CAAC,IAAI,KAAK,CACd,sDAAsD,IAAI,MAAM,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CACvF,CAAC,CAAC;oBACH,OAAO;gBACT,CAAC;gBAED,IAAI,CAAC;oBACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAsB,CAAC;oBACvD,OAAO,CAAC,MAAM,CAAC,CAAC;gBAClB,CAAC;gBAAC,OAAO,GAAG,EAAE,CAAC;oBACb,MAAM,CAAC,IAAI,KAAK,CACd,uDAAuD,MAAM,CAAC,GAAG,CAAC,IAAI;wBACtE,6BAA6B,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CACpD,CAAC,CAAC;gBACL,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE;gBACvB,MAAM,CAAC,IAAI,KAAK,CAAC,2CAA2C,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC9E,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED,4EAA4E;IAC5E,yCAAyC;IACzC,4EAA4E;IAE5E;;;;;;;;OAQG;IACK,oBAAoB,CAC1B,IAAuB,EACvB,YAAqB;QAErB,2EAA2E;QAC3E,IAAI,OAAe,CAAC;QACpB,IAAI,OAAO,IAAI,CAAC,MAAM,CAAC,KAAK,QAAQ,EAAE,CAAC;YACrC,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC;QACzB,CAAC;aAAM,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,EAAE,CAAC;YACxC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;iBACpB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;iBAC5D,IAAI,CAAC,MAAM,CAAC,CAAC;QAClB,CAAC;aAAM,CAAC;YACN,OAAO,GAAG,EAAE,CAAC;QACf,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;QACvC,MAAM,SAAS,GACb,OAAO,OAAO,CAAC,WAAW,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC;YACjE,OAAO,OAAO,CAAC,YAAY,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC;gBACnE,SAAS,CAAC;QAEZ,MAAM,IAA
I,GAAqB;YAC7B,GAAG,CAAC,OAAO,OAAO,CAAC,OAAO,CAAC,KAAK,QAAQ,IAAI,OAAO,CAAC,OAAO,CAAC;gBAC1D,CAAC,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,OAAO,CAAC,EAAE;gBAC7B,CAAC,CAAC,EAAE,CAAC;YACP,GAAG,CAAC,OAAO,OAAO,CAAC,QAAQ,CAAC,KAAK,QAAQ,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC/E,GAAG,CAAC,SAAS,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACjD,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SAClD,CAAC;QAEF,OAAO;YACL,OAAO;YACP,QAAQ,EAAE,IAAI;YACd,MAAM,EAAE,KAAK;SACd,CAAC;IACJ,CAAC;CACF;AAED,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,UAAU,mBAAmB;IACjC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,EAAE,CAAC,IAAI,EAAE,SAAS,EAAE,WAAW,CAAC,EAAE;YAClE,KAAK,EAAE,QAAQ;YACf,OAAO,EAAE,IAAI;SACd,CAAC,CAAC;QACH,wEAAwE;QACxE,wEAAwE;QACxE,IAAI,MAAM,CAAC,KAAK,KAAK,SAAS,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtD,OAAO,IAAI,CAAC;QACd,CAAC;QACD,OAAO,IAAI,aAAa,EAAE,CAAC;IAC7B,CAAC;IAAC,MAAM,CAAC;QACP,sDAAsD;QACtD,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview DocxLoader — loads `.docx` documents using `mammoth`.
|
|
3
|
+
*
|
|
4
|
+
* The `mammoth` library extracts raw text from OOXML (Office Open XML) Word
|
|
5
|
+
* documents by stripping all formatting and returning the plain-text content.
|
|
6
|
+
* This keeps the ingestion pipeline fast and dependency-light while still
|
|
7
|
+
* producing high-quality text suitable for chunking and embedding.
|
|
8
|
+
*
|
|
9
|
+
* @module memory/ingestion/DocxLoader
|
|
10
|
+
*/
|
|
11
|
+
import type { IDocumentLoader } from './IDocumentLoader.js';
|
|
12
|
+
import type { LoadOptions, LoadedDocument } from '../facade/types.js';
|
|
13
|
+
/**
|
|
14
|
+
* Document loader for Microsoft Word (`.docx`) files.
|
|
15
|
+
*
|
|
16
|
+
* Uses `mammoth.extractRawText()` to strip all styling and return plain
|
|
17
|
+
* prose text, which is then stored as the `content` field. The `metadata`
|
|
18
|
+
* block includes an approximate `wordCount`.
|
|
19
|
+
*
|
|
20
|
+
* @implements {IDocumentLoader}
|
|
21
|
+
*
|
|
22
|
+
* @example
|
|
23
|
+
* ```ts
|
|
24
|
+
* const loader = new DocxLoader();
|
|
25
|
+
* const doc = await loader.load('/docs/spec.docx');
|
|
26
|
+
* console.log(doc.metadata.wordCount); // e.g. 1842
|
|
27
|
+
* ```
|
|
28
|
+
*/
|
|
29
|
+
export declare class DocxLoader implements IDocumentLoader {
|
|
30
|
+
/** File extensions (each with a leading dot) this loader handles. @inheritdoc */
|
|
31
|
+
readonly supportedExtensions: string[];
|
|
32
|
+
/** Reports whether `source` (a file path or an in-memory Buffer) is accepted by this loader. @inheritdoc */
|
|
33
|
+
canLoad(source: string | Buffer): boolean;
|
|
34
|
+
/** Loads `source` (a file path or a Buffer) and resolves to the extracted document. @inheritdoc */
|
|
35
|
+
load(source: string | Buffer, _options?: LoadOptions): Promise<LoadedDocument>;
|
|
36
|
+
}
|
|
37
|
+
//# sourceMappingURL=DocxLoader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"DocxLoader.d.ts","sourceRoot":"","sources":["../../../src/memory/ingestion/DocxLoader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAKH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAC5D,OAAO,KAAK,EAAE,WAAW,EAAE,cAAc,EAAoB,MAAM,oBAAoB,CAAC;AAsCxF;;;;;;;;;;;;;;;GAeG;AACH,qBAAa,UAAW,YAAW,eAAe;IAChD,kBAAkB;IAClB,QAAQ,CAAC,mBAAmB,EAAE,MAAM,EAAE,CAA6B;IAMnE,kBAAkB;IAClB,OAAO,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO;IAezC,kBAAkB;IACZ,IAAI,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,QAAQ,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,cAAc,CAAC;CA+BrF"}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview DocxLoader — loads `.docx` documents using `mammoth`.
|
|
3
|
+
*
|
|
4
|
+
* The `mammoth` library extracts raw text from OOXML (Office Open XML) Word
|
|
5
|
+
* documents by stripping all formatting and returning the plain-text content.
|
|
6
|
+
* This keeps the ingestion pipeline fast and dependency-light while still
|
|
7
|
+
* producing high-quality text suitable for chunking and embedding.
|
|
8
|
+
*
|
|
9
|
+
* @module memory/ingestion/DocxLoader
|
|
10
|
+
*/
|
|
11
|
+
import fs from 'node:fs/promises';
|
|
12
|
+
import path from 'node:path';
|
|
13
|
+
import mammoth from 'mammoth';
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Constants
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
/** Extensions handled by this loader, each with a leading dot. */
|
|
18
|
+
const SUPPORTED_EXTENSIONS = ['.docx'];
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Helpers
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
/**
 * Extract a path's file extension, normalised to lower case.
 *
 * The leading dot is kept (e.g. `'.docx'`); a path without an extension
 * yields an empty string.
 *
 * @param filePath - Absolute or relative file path.
 * @returns The lower-cased extension, including its leading dot.
 */
function extOf(filePath) {
    const extension = path.extname(filePath);
    return extension.toLowerCase();
}
|
|
30
|
+
/**
 * Approximate the number of words in a piece of text.
 *
 * Leading and trailing whitespace is discarded, then the remainder is split
 * on runs of whitespace. Empty or whitespace-only input counts as zero words.
 *
 * @param text - Raw text to count.
 * @returns The number of whitespace-separated tokens.
 */
function wordCount(text) {
    const trimmed = text.trim();
    if (trimmed === '') {
        return 0;
    }
    const tokens = trimmed.split(/\s+/);
    return tokens.length;
}
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
// DocxLoader
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
/**
 * Loader for Microsoft Word (`.docx`) documents.
 *
 * Text is obtained with `mammoth.extractRawText()` and returned as the
 * `content` field; `metadata` carries an approximate `wordCount` and, when
 * the document was read from disk, the `source` path.
 *
 * @implements {IDocumentLoader}
 *
 * @example
 * ```ts
 * const loader = new DocxLoader();
 * const doc = await loader.load('/docs/spec.docx');
 * console.log(doc.metadata.wordCount); // e.g. 1842
 * ```
 */
export class DocxLoader {
    constructor() {
        /** @inheritdoc */
        this.supportedExtensions = Array.from(SUPPORTED_EXTENSIONS);
    }
    // -------------------------------------------------------------------------
    // canLoad
    // -------------------------------------------------------------------------
    /**
     * Paths are accepted by extension; Buffers are accepted when they begin
     * with the ZIP local-file signature that `.docx` archives start with.
     *
     * @inheritdoc
     */
    canLoad(source) {
        if (!Buffer.isBuffer(source)) {
            const extension = extOf(source);
            return SUPPORTED_EXTENSIONS.includes(extension);
        }
        if (source.length < 4) {
            return false;
        }
        // "PK\x03\x04" — the header every ZIP archive (and thus .docx) opens with.
        const zipSignature = [0x50, 0x4B, 0x03, 0x04];
        return zipSignature.every((byte, index) => source[index] === byte);
    }
    // -------------------------------------------------------------------------
    // load
    // -------------------------------------------------------------------------
    /**
     * Read the document (from disk when given a path) and extract its text.
     *
     * @inheritdoc
     */
    async load(source, _options) {
        const fromBuffer = Buffer.isBuffer(source);
        const resolvedPath = fromBuffer ? undefined : source;
        const buffer = fromBuffer ? source : await fs.readFile(resolvedPath);
        // mammoth takes the Node Buffer directly, so no temp file is involved.
        // Only `value` (the text) is used; the conversion `messages` are dropped.
        const { value: content } = await mammoth.extractRawText({ buffer });
        const metadata = { wordCount: wordCount(content) };
        if (resolvedPath) {
            metadata.source = resolvedPath;
        }
        return { content, metadata, format: 'docx' };
    }
}
|
|
111
|
+
//# sourceMappingURL=DocxLoader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"DocxLoader.js","sourceRoot":"","sources":["../../../src/memory/ingestion/DocxLoader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,OAAO,MAAM,SAAS,CAAC;AAI9B,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E,kEAAkE;AAClE,MAAM,oBAAoB,GAAG,CAAC,OAAO,CAAU,CAAC;AAEhD,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E;;;;GAIG;AACH,SAAS,KAAK,CAAC,QAAgB;IAC7B,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;AAC9C,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;AAClE,CAAC;AAED,8EAA8E;AAC9E,aAAa;AACb,8EAA8E;AAE9E;;;;;;;;;;;;;;;GAeG;AACH,MAAM,OAAO,UAAU;IAAvB;QACE,kBAAkB;QACT,wBAAmB,GAAa,CAAC,GAAG,oBAAoB,CAAC,CAAC;IAsDrE,CAAC;IApDC,4EAA4E;IAC5E,UAAU;IACV,4EAA4E;IAE5E,kBAAkB;IAClB,OAAO,CAAC,MAAuB;QAC7B,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5B,uDAAuD;YACvD,0DAA0D;YAC1D,OAAO,MAAM,CAAC,MAAM,IAAI,CAAC;gBACvB,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI;gBACxC,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC;QAC7C,CAAC;QACD,OAAQ,oBAA0C,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAY,CAAC,CAAC;IACxF,CAAC;IAED,4EAA4E;IAC5E,OAAO;IACP,4EAA4E;IAE5E,kBAAkB;IAClB,KAAK,CAAC,IAAI,CAAC,MAAuB,EAAE,QAAsB;QACxD,IAAI,MAAc,CAAC;QACnB,IAAI,YAAgC,CAAC;QAErC,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5B,MAAM,GAAG,MAAM,CAAC;QAClB,CAAC;aAAM,CAAC;YACN,YAAY,GAAG,MAAM,CAAC;YACtB,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;QAC3C,CAAC;QAED,6EAA6E;QAC7E,4EAA4E;QAC5E,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;QAExD,oEAAoE;QACpE,oEAAoE;QACpE,yBAAyB;QACzB,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC;QAE7B,MAAM,IAAI,GAAqB;YAC7B,SAAS,EAAE,SAAS,CAAC,OAAO,CAAC;YAC7B,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SAClD,CAAC;QAEF,OAAO;YACL,OAAO;YACP,QAAQ,EAAE,IAAI;YACd,MAAM,EAAE,MAAM;SACf,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview FolderScanner — recursive directory walker for the ingestion
|
|
3
|
+
* pipeline.
|
|
4
|
+
*
|
|
5
|
+
* FolderScanner is NOT an {@link IDocumentLoader}; instead it orchestrates a
|
|
6
|
+
* {@link LoaderRegistry} to batch-process every file in a directory tree. It
|
|
7
|
+
* supports glob-based include/exclude filters via `minimatch`, per-file
|
|
8
|
+
* progress callbacks, and graceful error collection so a single unreadable file
|
|
9
|
+
* never aborts a whole scan.
|
|
10
|
+
*
|
|
11
|
+
* @module memory/ingestion/FolderScanner
|
|
12
|
+
*/
|
|
13
|
+
import type { LoadedDocument } from '../facade/types.js';
|
|
14
|
+
import type { LoaderRegistry } from './LoaderRegistry.js';
|
|
15
|
+
/**
 * Configuration options for {@link FolderScanner.scan}.
 *
 * Every field is optional; calling `scan(dirPath)` with no options walks the
 * whole tree and loads each file that has a registered extension.
 */
export interface FolderScanOptions {
    /**
     * Whether to descend into sub-directories.
     *
     * When `false` only the direct children of `dirPath` are considered.
     * The value is forwarded as the `recursive` option of the directory
     * listing call.
     *
     * @default true
     */
    recursive?: boolean;
    /**
     * Glob patterns that a file path must match at least one of in order to be
     * processed. Patterns are evaluated against the path *relative* to the
     * scanned root directory using {@link minimatch} with `dot: true`, so
     * dot-files and dot-directories can match.
     *
     * When omitted (or an empty array) every file with a registered extension
     * is processed. A file whose extension has no registered loader is skipped
     * even if it matches an include pattern.
     *
     * @example ['**\/*.pdf', '**\/*.md']
     */
    include?: string[];
    /**
     * Glob patterns that cause a file to be skipped when its relative path
     * matches any of them. Evaluated *after* `include`, with the same
     * `dot: true` matching. An empty array excludes nothing.
     *
     * @example ['**\/node_modules\/**', '**\/.git\/**']
     */
    exclude?: string[];
    /**
     * Called after each file attempt (success *or* failure).
     *
     * The callback is invoked outside the per-file error handling, so an
     * exception thrown from it propagates out of `scan` and stops the run.
     *
     * @param file - Absolute path of the file that was just processed.
     * @param index - 1-based index of the file in the total discovered list.
     * @param total - Total number of matching files discovered before processing began.
     */
    onProgress?: (file: string, index: number, total: number) => void;
}
|
|
53
|
+
/**
 * The aggregated result of a {@link FolderScanner.scan} call.
 *
 * Every discovered candidate file ends up in exactly one of `succeeded` or
 * `failed`.
 */
export interface FolderScanResult {
    /**
     * Successfully loaded documents, one per file listed in `succeeded` and
     * in the same order.
     */
    documents: LoadedDocument[];
    /**
     * Absolute paths of files that were loaded without error.
     */
    succeeded: string[];
    /**
     * Files that could not be processed, with per-file error messages.
     *
     * `error` is the thrown `Error`'s `message`, or the string form of the
     * thrown value when it is not an `Error`.
     */
    failed: Array<{
        path: string;
        error: string;
    }>;
}
|
|
73
|
+
/**
 * Recursively scans a directory and loads every file whose extension has a
 * registered loader in the supplied {@link LoaderRegistry}.
 *
 * Failures of individual files are collected in the result instead of being
 * thrown, so one bad file does not stop the rest of the scan.
 *
 * ### Example
 * ```ts
 * const registry = new LoaderRegistry();
 * const scanner = new FolderScanner(registry);
 *
 * const result = await scanner.scan('/knowledge-base', {
 *   recursive: true,
 *   include: ['**\/*.md', '**\/*.pdf'],
 *   exclude: ['**\/node_modules\/**'],
 *   onProgress: (file, i, total) => console.log(`${i}/${total} ${file}`),
 * });
 *
 * console.log(`Loaded ${result.documents.length} documents`);
 * console.log(`Failed: ${result.failed.length}`);
 * ```
 */
export declare class FolderScanner {
    private readonly registry;
    /**
     * @param registry - The {@link LoaderRegistry} used to dispatch each file to
     *   the appropriate loader. It also supplies the list of supported
     *   extensions used to pre-filter discovered files.
     */
    constructor(registry: LoaderRegistry);
    /**
     * Walk `dirPath` and load every matching file.
     *
     * Files are discovered first and then loaded sequentially, one at a time.
     * Errors thrown by individual loaders are caught and accumulated in
     * {@link FolderScanResult.failed} rather than propagating.
     *
     * @param dirPath - Absolute path to the directory to scan.
     * @param options - Optional include/exclude filters and progress callback.
     * @returns A promise that resolves to a {@link FolderScanResult}.
     *
     * @throws {Error} When `dirPath` cannot be read as a directory (e.g.
     *   it does not exist or is a regular file). Because the method is async
     *   this surfaces as a rejected promise.
     */
    scan(dirPath: string, options?: FolderScanOptions): Promise<FolderScanResult>;
}
|
|
116
|
+
//# sourceMappingURL=FolderScanner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"FolderScanner.d.ts","sourceRoot":"","sources":["../../../src/memory/ingestion/FolderScanner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAKH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAM1D;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC;;;;;;OAMG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB;;;;;;;;OAQG;IACH,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IAEnB;;;;;OAKG;IACH,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IAEnB;;;;;;OAMG;IACH,UAAU,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACnE;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,SAAS,EAAE,cAAc,EAAE,CAAC;IAE5B;;OAEG;IACH,SAAS,EAAE,MAAM,EAAE,CAAC;IAEpB;;OAEG;IACH,MAAM,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;CAChD;AAMD;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,aAAa;IAKZ,OAAO,CAAC,QAAQ,CAAC,QAAQ;IAJrC;;;OAGG;gBAC0B,QAAQ,EAAE,cAAc;IAMrD;;;;;;;;;;;;;OAaG;IACG,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,GAAE,iBAAsB,GAAG,OAAO,CAAC,gBAAgB,CAAC;CAoFxF"}
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview FolderScanner — recursive directory walker for the ingestion
|
|
3
|
+
* pipeline.
|
|
4
|
+
*
|
|
5
|
+
* FolderScanner is NOT an {@link IDocumentLoader}; instead it orchestrates a
|
|
6
|
+
* {@link LoaderRegistry} to batch-process every file in a directory tree. It
|
|
7
|
+
* supports glob-based include/exclude filters via `minimatch`, per-file
|
|
8
|
+
* progress callbacks, and graceful error collection so a single unreadable file
|
|
9
|
+
* never aborts a whole scan.
|
|
10
|
+
*
|
|
11
|
+
* @module memory/ingestion/FolderScanner
|
|
12
|
+
*/
|
|
13
|
+
import fs from 'node:fs/promises';
|
|
14
|
+
import path from 'node:path';
|
|
15
|
+
import { minimatch } from 'minimatch';
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// FolderScanner
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
/**
 * Batch-loads the files of a directory tree through a {@link LoaderRegistry}.
 *
 * Only regular files whose lower-cased extension is reported by the
 * registry are considered; optional `include` / `exclude` globs narrow that
 * set further. Per-file load failures are collected, not thrown.
 *
 * ### Example
 * ```ts
 * const registry = new LoaderRegistry();
 * const scanner = new FolderScanner(registry);
 *
 * const result = await scanner.scan('/knowledge-base', {
 *   recursive: true,
 *   include: ['**\/*.md', '**\/*.pdf'],
 *   exclude: ['**\/node_modules\/**'],
 *   onProgress: (file, i, total) => console.log(`${i}/${total} ${file}`),
 * });
 *
 * console.log(`Loaded ${result.documents.length} documents`);
 * console.log(`Failed: ${result.failed.length}`);
 * ```
 */
export class FolderScanner {
    /**
     * @param registry - The {@link LoaderRegistry} that reports the supported
     *   extensions and loads each selected file.
     */
    constructor(registry) {
        this.registry = registry;
    }

    // -------------------------------------------------------------------------
    // scan
    // -------------------------------------------------------------------------

    /**
     * List `dirPath`, pick the files to ingest, then load them one by one.
     *
     * A loader failure is recorded in {@link FolderScanResult.failed} and the
     * scan moves on to the next file.
     *
     * @param dirPath - Absolute path to the directory to scan.
     * @param options - Optional include/exclude filters and progress callback.
     * @returns A promise that resolves to a {@link FolderScanResult}.
     *
     * @throws {Error} When `dirPath` cannot be read as a directory (e.g.
     *   it does not exist or is a regular file).
     */
    async scan(dirPath, options = {}) {
        const { recursive = true, include, exclude, onProgress } = options;

        // --- Phase 1: discovery -------------------------------------------
        const dirents = await fs.readdir(dirPath, { recursive, withFileTypes: true });
        const loadableExtensions = new Set(this.registry.getSupportedExtensions());
        // Shared glob test for both filters; `dot: true` lets dot-files match.
        const hitsAny = (globs, relPath) => globs.some((glob) => minimatch(relPath, glob, { dot: true }));

        const queue = [];
        for (const dirent of dirents) {
            // Directories and other non-file entries are listed too; drop
            // them together with files that no loader is registered for.
            const loadable = dirent.isFile() &&
                loadableExtensions.has(path.extname(dirent.name).toLowerCase());
            if (!loadable) {
                continue;
            }
            // Newer runtimes expose the containing directory as `parentPath`,
            // older ones as the deprecated `path`; with neither, the entry is
            // a direct child of the scanned root.
            const owningDir = dirent.parentPath ?? dirent.path ?? dirPath;
            const fullPath = path.join(owningDir, dirent.name);
            // Globs are written relative to the scanned root.
            const relPath = path.relative(dirPath, fullPath);
            if (include && include.length > 0 && !hitsAny(include, relPath)) {
                continue;
            }
            if (exclude && exclude.length > 0 && hitsAny(exclude, relPath)) {
                continue;
            }
            queue.push(fullPath);
        }

        // --- Phase 2: sequential loading ------------------------------------
        const outcome = { documents: [], succeeded: [], failed: [] };
        let position = 0;
        for (const filePath of queue) {
            position += 1;
            try {
                const loaded = await this.registry.loadFile(filePath);
                outcome.documents.push(loaded);
                outcome.succeeded.push(filePath);
            }
            catch (err) {
                outcome.failed.push({
                    path: filePath,
                    error: err instanceof Error ? err.message : String(err),
                });
            }
            // Report progress for every attempt, using a 1-based position.
            onProgress?.(filePath, position, queue.length);
        }
        return outcome;
    }
}
|
|
127
|
+
//# sourceMappingURL=FolderScanner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"FolderScanner.js","sourceRoot":"","sources":["../../../src/memory/ingestion/FolderScanner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAsEtC,8EAA8E;AAC9E,gBAAgB;AAChB,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,OAAO,aAAa;IACxB;;;OAGG;IACH,YAA6B,QAAwB;QAAxB,aAAQ,GAAR,QAAQ,CAAgB;IAAG,CAAC;IAEzD,4EAA4E;IAC5E,OAAO;IACP,4EAA4E;IAE5E;;;;;;;;;;;;;OAaG;IACH,KAAK,CAAC,IAAI,CAAC,OAAe,EAAE,UAA6B,EAAE;QACzD,MAAM,EACJ,SAAS,GAAG,IAAI,EAChB,OAAO,EACP,OAAO,EACP,UAAU,GACX,GAAG,OAAO,CAAC;QAEZ,qEAAqE;QACrE,wCAAwC;QACxC,qEAAqE;QAErE,MAAM,UAAU,GAAG,MAAM,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;QAEjF,8DAA8D;QAC9D,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,QAAQ,CAAC,sBAAsB,EAAE,CAAC,CAAC;QAE5E,MAAM,cAAc,GAAa,EAAE,CAAC;QAEpC,KAAK,MAAM,KAAK,IAAI,UAAU,EAAE,CAAC;YAC/B,oEAAoE;YACpE,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE;gBAAE,SAAS;YAE9B,gEAAgE;YAChE,wEAAwE;YACxE,2BAA2B;YAC3B,MAAM,UAAU,GACb,KAAgD,CAAC,UAAU;gBAC3D,KAAgD,CAAC,IAAI;gBACtD,OAAO,CAAC;YACV,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;YAEvD,oDAAoD;YACpD,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC;YACnD,IAAI,CAAC,mBAAmB,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,SAAS;YAE5C,yCAAyC;YACzC,MAAM,YAAY,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;YAE1D,+DAA+D;YAC/D,IAAI,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClC,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CACvC,SAAS,CAAC,YAAY,EAAE,OAAO,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAChD,CAAC;gBACF,IAAI,CAAC,OAAO;oBAAE,SAAS;YACzB,CAAC;YAED,0DAA0D;YAC1D,IAAI,OAAO,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAClC,MAAM,QAAQ,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,EAAE,CACxC,SAAS,CAAC,YAAY,EAAE,OAAO,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAChD,CAAC;gBACF,IAAI,QAAQ;oBAAE,SAAS;YACzB,CAAC;YAED,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QACpC,CAAC;QAED,qEAAqE;QACrE,8DAA8D;QAC9D,qEAAqE;QAErE,MAAM,SAAS,GAAqB
,EAAE,CAAC;QACvC,MAAM,SAAS,GAAa,EAAE,CAAC;QAC/B,MAAM,MAAM,GAA2C,EAAE,CAAC;QAE1D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC/C,MAAM,QAAQ,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC;YAEnC,IAAI,CAAC;gBACH,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;gBACnD,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACpB,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YAC3B,CAAC;YAAC,OAAO,GAAY,EAAE,CAAC;gBACtB,MAAM,OAAO,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;gBACjE,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC,CAAC;YAClD,CAAC;YAED,0CAA0C;YAC1C,UAAU,EAAE,CAAC,QAAQ,EAAE,CAAC,GAAG,CAAC,EAAE,cAAc,CAAC,MAAM,CAAC,CAAC;QACvD,CAAC;QAED,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC;IAC1C,CAAC;CACF"}
|