@framers/agentos 0.1.101 → 0.1.102
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -0
- package/dist/memory/config.d.ts +39 -0
- package/dist/memory/config.d.ts.map +1 -1
- package/dist/memory/config.js.map +1 -1
- package/dist/memory/consolidation/ConsolidationLoop.d.ts +177 -0
- package/dist/memory/consolidation/ConsolidationLoop.d.ts.map +1 -0
- package/dist/memory/consolidation/ConsolidationLoop.js +517 -0
- package/dist/memory/consolidation/ConsolidationLoop.js.map +1 -0
- package/dist/memory/consolidation/ConsolidationPipeline.d.ts.map +1 -1
- package/dist/memory/consolidation/ConsolidationPipeline.js +7 -0
- package/dist/memory/consolidation/ConsolidationPipeline.js.map +1 -1
- package/dist/memory/consolidation/index.d.ts +8 -0
- package/dist/memory/consolidation/index.d.ts.map +1 -0
- package/dist/memory/consolidation/index.js +7 -0
- package/dist/memory/consolidation/index.js.map +1 -0
- package/dist/memory/decay/DecayModel.d.ts +33 -0
- package/dist/memory/decay/DecayModel.d.ts.map +1 -1
- package/dist/memory/decay/DecayModel.js +31 -0
- package/dist/memory/decay/DecayModel.js.map +1 -1
- package/dist/memory/facade/Memory.d.ts +228 -0
- package/dist/memory/facade/Memory.d.ts.map +1 -0
- package/dist/memory/facade/Memory.js +823 -0
- package/dist/memory/facade/Memory.js.map +1 -0
- package/dist/memory/facade/index.d.ts +13 -0
- package/dist/memory/facade/index.d.ts.map +1 -0
- package/dist/memory/facade/index.js +11 -0
- package/dist/memory/facade/index.js.map +1 -0
- package/dist/memory/facade/types.d.ts +606 -0
- package/dist/memory/facade/types.d.ts.map +1 -0
- package/dist/memory/facade/types.js +11 -0
- package/dist/memory/facade/types.js.map +1 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.d.ts +132 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.d.ts.map +1 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.js +178 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.js.map +1 -0
- package/dist/memory/feedback/index.d.ts +13 -0
- package/dist/memory/feedback/index.d.ts.map +1 -0
- package/dist/memory/feedback/index.js +12 -0
- package/dist/memory/feedback/index.js.map +1 -0
- package/dist/memory/index.d.ts +22 -0
- package/dist/memory/index.d.ts.map +1 -1
- package/dist/memory/index.js +24 -0
- package/dist/memory/index.js.map +1 -1
- package/dist/memory/ingestion/ChunkingEngine.d.ts +143 -0
- package/dist/memory/ingestion/ChunkingEngine.d.ts.map +1 -0
- package/dist/memory/ingestion/ChunkingEngine.js +508 -0
- package/dist/memory/ingestion/ChunkingEngine.js.map +1 -0
- package/dist/memory/ingestion/DoclingLoader.d.ts +44 -0
- package/dist/memory/ingestion/DoclingLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/DoclingLoader.js +228 -0
- package/dist/memory/ingestion/DoclingLoader.js.map +1 -0
- package/dist/memory/ingestion/DocxLoader.d.ts +37 -0
- package/dist/memory/ingestion/DocxLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/DocxLoader.js +111 -0
- package/dist/memory/ingestion/DocxLoader.js.map +1 -0
- package/dist/memory/ingestion/FolderScanner.d.ts +116 -0
- package/dist/memory/ingestion/FolderScanner.d.ts.map +1 -0
- package/dist/memory/ingestion/FolderScanner.js +127 -0
- package/dist/memory/ingestion/FolderScanner.js.map +1 -0
- package/dist/memory/ingestion/HtmlLoader.d.ts +49 -0
- package/dist/memory/ingestion/HtmlLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/HtmlLoader.js +202 -0
- package/dist/memory/ingestion/HtmlLoader.js.map +1 -0
- package/dist/memory/ingestion/IDocumentLoader.d.ts +63 -0
- package/dist/memory/ingestion/IDocumentLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/IDocumentLoader.js +11 -0
- package/dist/memory/ingestion/IDocumentLoader.js.map +1 -0
- package/dist/memory/ingestion/LoaderRegistry.d.ts +140 -0
- package/dist/memory/ingestion/LoaderRegistry.d.ts.map +1 -0
- package/dist/memory/ingestion/LoaderRegistry.js +229 -0
- package/dist/memory/ingestion/LoaderRegistry.js.map +1 -0
- package/dist/memory/ingestion/MarkdownLoader.d.ts +50 -0
- package/dist/memory/ingestion/MarkdownLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/MarkdownLoader.js +169 -0
- package/dist/memory/ingestion/MarkdownLoader.js.map +1 -0
- package/dist/memory/ingestion/MultimodalAggregator.d.ts +88 -0
- package/dist/memory/ingestion/MultimodalAggregator.d.ts.map +1 -0
- package/dist/memory/ingestion/MultimodalAggregator.js +96 -0
- package/dist/memory/ingestion/MultimodalAggregator.js.map +1 -0
- package/dist/memory/ingestion/OcrPdfLoader.d.ts +41 -0
- package/dist/memory/ingestion/OcrPdfLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/OcrPdfLoader.js +149 -0
- package/dist/memory/ingestion/OcrPdfLoader.js.map +1 -0
- package/dist/memory/ingestion/PdfLoader.d.ts +78 -0
- package/dist/memory/ingestion/PdfLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/PdfLoader.js +179 -0
- package/dist/memory/ingestion/PdfLoader.js.map +1 -0
- package/dist/memory/ingestion/TextLoader.d.ts +66 -0
- package/dist/memory/ingestion/TextLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/TextLoader.js +207 -0
- package/dist/memory/ingestion/TextLoader.js.map +1 -0
- package/dist/memory/ingestion/UrlLoader.d.ts +95 -0
- package/dist/memory/ingestion/UrlLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/UrlLoader.js +174 -0
- package/dist/memory/ingestion/UrlLoader.js.map +1 -0
- package/dist/memory/io/ChatGptImporter.d.ts +85 -0
- package/dist/memory/io/ChatGptImporter.d.ts.map +1 -0
- package/dist/memory/io/ChatGptImporter.js +231 -0
- package/dist/memory/io/ChatGptImporter.js.map +1 -0
- package/dist/memory/io/JsonExporter.d.ts +67 -0
- package/dist/memory/io/JsonExporter.d.ts.map +1 -0
- package/dist/memory/io/JsonExporter.js +132 -0
- package/dist/memory/io/JsonExporter.js.map +1 -0
- package/dist/memory/io/JsonImporter.d.ts +84 -0
- package/dist/memory/io/JsonImporter.d.ts.map +1 -0
- package/dist/memory/io/JsonImporter.js +234 -0
- package/dist/memory/io/JsonImporter.js.map +1 -0
- package/dist/memory/io/MarkdownExporter.d.ts +95 -0
- package/dist/memory/io/MarkdownExporter.d.ts.map +1 -0
- package/dist/memory/io/MarkdownExporter.js +130 -0
- package/dist/memory/io/MarkdownExporter.js.map +1 -0
- package/dist/memory/io/MarkdownImporter.d.ts +84 -0
- package/dist/memory/io/MarkdownImporter.d.ts.map +1 -0
- package/dist/memory/io/MarkdownImporter.js +166 -0
- package/dist/memory/io/MarkdownImporter.js.map +1 -0
- package/dist/memory/io/ObsidianExporter.d.ts +80 -0
- package/dist/memory/io/ObsidianExporter.d.ts.map +1 -0
- package/dist/memory/io/ObsidianExporter.js +127 -0
- package/dist/memory/io/ObsidianExporter.js.map +1 -0
- package/dist/memory/io/ObsidianImporter.d.ts +93 -0
- package/dist/memory/io/ObsidianImporter.d.ts.map +1 -0
- package/dist/memory/io/ObsidianImporter.js +221 -0
- package/dist/memory/io/ObsidianImporter.js.map +1 -0
- package/dist/memory/io/SqliteExporter.d.ts +47 -0
- package/dist/memory/io/SqliteExporter.d.ts.map +1 -0
- package/dist/memory/io/SqliteExporter.js +56 -0
- package/dist/memory/io/SqliteExporter.js.map +1 -0
- package/dist/memory/io/SqliteImporter.d.ts +82 -0
- package/dist/memory/io/SqliteImporter.d.ts.map +1 -0
- package/dist/memory/io/SqliteImporter.js +232 -0
- package/dist/memory/io/SqliteImporter.js.map +1 -0
- package/dist/memory/io/index.d.ts +31 -0
- package/dist/memory/io/index.d.ts.map +1 -0
- package/dist/memory/io/index.js +31 -0
- package/dist/memory/io/index.js.map +1 -0
- package/dist/memory/store/SqliteBrain.d.ts +125 -0
- package/dist/memory/store/SqliteBrain.d.ts.map +1 -0
- package/dist/memory/store/SqliteBrain.js +407 -0
- package/dist/memory/store/SqliteBrain.js.map +1 -0
- package/dist/memory/store/SqliteKnowledgeGraph.d.ts +259 -0
- package/dist/memory/store/SqliteKnowledgeGraph.d.ts.map +1 -0
- package/dist/memory/store/SqliteKnowledgeGraph.js +1062 -0
- package/dist/memory/store/SqliteKnowledgeGraph.js.map +1 -0
- package/dist/memory/store/SqliteMemoryGraph.d.ts +251 -0
- package/dist/memory/store/SqliteMemoryGraph.d.ts.map +1 -0
- package/dist/memory/store/SqliteMemoryGraph.js +637 -0
- package/dist/memory/store/SqliteMemoryGraph.js.map +1 -0
- package/dist/memory/tools/MemoryAddTool.d.ts +98 -0
- package/dist/memory/tools/MemoryAddTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryAddTool.js +131 -0
- package/dist/memory/tools/MemoryAddTool.js.map +1 -0
- package/dist/memory/tools/MemoryDeleteTool.d.ts +83 -0
- package/dist/memory/tools/MemoryDeleteTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryDeleteTool.js +96 -0
- package/dist/memory/tools/MemoryDeleteTool.js.map +1 -0
- package/dist/memory/tools/MemoryMergeTool.d.ts +95 -0
- package/dist/memory/tools/MemoryMergeTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryMergeTool.js +164 -0
- package/dist/memory/tools/MemoryMergeTool.js.map +1 -0
- package/dist/memory/tools/MemoryReflectTool.d.ts +86 -0
- package/dist/memory/tools/MemoryReflectTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryReflectTool.js +102 -0
- package/dist/memory/tools/MemoryReflectTool.js.map +1 -0
- package/dist/memory/tools/MemorySearchTool.d.ts +117 -0
- package/dist/memory/tools/MemorySearchTool.d.ts.map +1 -0
- package/dist/memory/tools/MemorySearchTool.js +162 -0
- package/dist/memory/tools/MemorySearchTool.js.map +1 -0
- package/dist/memory/tools/MemoryUpdateTool.d.ts +92 -0
- package/dist/memory/tools/MemoryUpdateTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryUpdateTool.js +125 -0
- package/dist/memory/tools/MemoryUpdateTool.js.map +1 -0
- package/dist/memory/tools/index.d.ts +32 -0
- package/dist/memory/tools/index.d.ts.map +1 -0
- package/dist/memory/tools/index.js +26 -0
- package/dist/memory/tools/index.js.map +1 -0
- package/package.json +6 -1
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview OcrPdfLoader — optional OCR-based PDF loader using Tesseract.js.
|
|
3
|
+
*
|
|
4
|
+
* This module provides a factory function {@link createOcrPdfLoader} that
|
|
5
|
+
* attempts to lazy-load `tesseract.js` at runtime. When the package is not
|
|
6
|
+
* installed the factory returns `null` gracefully so callers can treat OCR as
|
|
7
|
+
* fully opt-in without any hard dependency.
|
|
8
|
+
*
|
|
9
|
+
* ### Opting in
|
|
10
|
+
* ```sh
|
|
11
|
+
* pnpm add tesseract.js
|
|
12
|
+
* ```
|
|
13
|
+
*
|
|
14
|
+
* Once installed, pass the result of {@link createOcrPdfLoader} to
|
|
15
|
+
* {@link PdfLoader}'s constructor as the `ocrLoader` argument.
|
|
16
|
+
*
|
|
17
|
+
* @module memory/ingestion/OcrPdfLoader
|
|
18
|
+
*/
|
|
19
|
+
import type { IDocumentLoader } from './IDocumentLoader.js';
|
|
20
|
+
/**
|
|
21
|
+
* Checks whether `tesseract.js` is available in the current environment and,
|
|
22
|
+
* if so, returns a new {@link OcrPdfLoader} instance; otherwise returns `null`.
|
|
23
|
+
*
|
|
24
|
+
* The check is performed by attempting to resolve the package path using
|
|
25
|
+
* Node's `createRequire`. This avoids a full async dynamic import at call
|
|
26
|
+
* time while still being accurate.
|
|
27
|
+
*
|
|
28
|
+
* ### Usage
|
|
29
|
+
* ```ts
|
|
30
|
+
* import { createOcrPdfLoader } from './OcrPdfLoader.js';
|
|
31
|
+
* import { PdfLoader } from './PdfLoader.js';
|
|
32
|
+
*
|
|
33
|
+
* const ocrLoader = createOcrPdfLoader();
|
|
34
|
+
* const loader = new PdfLoader(ocrLoader);
|
|
35
|
+
* ```
|
|
36
|
+
*
|
|
37
|
+
* @returns An `OcrPdfLoader` instance when tesseract.js is installed, or
|
|
38
|
+
* `null` when it is not.
|
|
39
|
+
*/
|
|
40
|
+
export declare function createOcrPdfLoader(): IDocumentLoader | null;
|
|
41
|
+
//# sourceMappingURL=OcrPdfLoader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"OcrPdfLoader.d.ts","sourceRoot":"","sources":["../../../src/memory/ingestion/OcrPdfLoader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAKH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAgH5D;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAgB,kBAAkB,IAAI,eAAe,GAAG,IAAI,CAY3D"}
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview OcrPdfLoader — optional OCR-based PDF loader using Tesseract.js.
|
|
3
|
+
*
|
|
4
|
+
* This module provides a factory function {@link createOcrPdfLoader} that
|
|
5
|
+
* attempts to lazy-load `tesseract.js` at runtime. When the package is not
|
|
6
|
+
* installed the factory returns `null` gracefully so callers can treat OCR as
|
|
7
|
+
* fully opt-in without any hard dependency.
|
|
8
|
+
*
|
|
9
|
+
* ### Opting in
|
|
10
|
+
* ```sh
|
|
11
|
+
* pnpm add tesseract.js
|
|
12
|
+
* ```
|
|
13
|
+
*
|
|
14
|
+
* Once installed, pass the result of {@link createOcrPdfLoader} to
|
|
15
|
+
* {@link PdfLoader}'s constructor as the `ocrLoader` argument.
|
|
16
|
+
*
|
|
17
|
+
* @module memory/ingestion/OcrPdfLoader
|
|
18
|
+
*/
|
|
19
|
+
import path from 'node:path';
|
|
20
|
+
import fs from 'node:fs/promises';
|
|
21
|
+
import { createRequire } from 'node:module';
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Constants
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
/** Extensions this loader handles. */
|
|
26
|
+
const SUPPORTED_EXTENSIONS = ['.pdf']; // lower-case with a leading dot, so entries compare directly against extOf() output
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
// Helpers
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
/**
 * Extracts the file extension from a path and normalises it to lower case.
 *
 * The leading dot is kept (e.g. `'.pdf'`); a path without an extension
 * yields the empty string.
 *
 * @param filePath - Absolute or relative file path.
 * @returns The lower-cased extension including its dot, or `''`.
 */
function extOf(filePath) {
    const rawExtension = path.extname(filePath);
    return rawExtension.toLowerCase();
}
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
// OcrPdfLoader (internal class)
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
/**
 * Internal loader implementation. Consumers should use {@link createOcrPdfLoader}
 * rather than instantiating this class directly, as the factory performs the
 * availability check and returns `null` when tesseract.js is absent.
 *
 * @implements {IDocumentLoader}
 */
class OcrPdfLoader {
    constructor() {
        /** @inheritdoc */
        this.supportedExtensions = [...SUPPORTED_EXTENSIONS];
    }
    // -------------------------------------------------------------------------
    // canLoad
    // -------------------------------------------------------------------------
    /**
     * Reports whether `source` looks like a PDF.
     *
     * Buffers are sniffed for the four signature bytes `%PDF` at offset 0;
     * any other source is treated as a path and matched by extension.
     *
     * @param source - File path or raw bytes.
     * @returns `true` when the source is recognised as a PDF.
     */
    canLoad(source) {
        if (Buffer.isBuffer(source)) {
            // 0x25 0x50 0x44 0x46 === "%PDF" (the start of the "%PDF-x.y" header).
            return source.length >= 4 &&
                source[0] === 0x25 && source[1] === 0x50 &&
                source[2] === 0x44 && source[3] === 0x46;
        }
        return SUPPORTED_EXTENSIONS.includes(extOf(source));
    }
    // -------------------------------------------------------------------------
    // load
    // -------------------------------------------------------------------------
    /**
     * Runs Tesseract.js OCR over the source and returns the recognised text.
     *
     * @param source - File path (read from disk) or raw bytes.
     * @param _options - Accepted for interface compatibility; not used here.
     * @returns A document with `content` set to the OCR text, `format: 'pdf'`,
     *   and `metadata.source` set to the path when a path was supplied.
     * @throws {Error} When tesseract.js cannot be imported, or when the
     *   imported module does not expose a `recognize` function.
     */
    async load(source, _options) {
        let buffer;
        let resolvedPath;
        if (Buffer.isBuffer(source)) {
            buffer = source;
        }
        else {
            resolvedPath = source;
            buffer = await fs.readFile(resolvedPath);
        }
        // Attempt to dynamically import tesseract.js. The import is wrapped in
        // try/catch so a missing package yields a clear message rather than a
        // cryptic MODULE_NOT_FOUND stack trace.
        let Tesseract;
        try {
            // eslint-disable-next-line @typescript-eslint/ban-ts-comment
            // @ts-expect-error — optional peer dependency; types not guaranteed to be installed
            const imported = await import('tesseract.js');
            // When a CommonJS package is imported from ESM, members that the
            // static export analysis cannot see are only reachable through the
            // `default` export. Prefer a direct `recognize`, else unwrap `default`.
            Tesseract = typeof imported.recognize === 'function' ? imported : imported.default;
        }
        catch {
            throw new Error('OcrPdfLoader: tesseract.js is not installed. ' +
                'Run `pnpm add tesseract.js` (or the equivalent for your package manager) ' +
                'to enable OCR-based PDF extraction.');
        }
        // Fail with an explicit message instead of "recognize is not a function".
        if (!Tesseract || typeof Tesseract.recognize !== 'function') {
            throw new Error('OcrPdfLoader: the imported tesseract.js module does not expose a `recognize` function.');
        }
        // Perform OCR on the raw bytes, with progress logging silenced.
        // NOTE(review): the bytes are handed to Tesseract as-is; confirm that the
        // installed tesseract.js version accepts PDF input rather than images only.
        const { data: { text }, } = await Tesseract.recognize(buffer, 'eng', {
            logger: () => { },
        });
        const meta = {
            ...(resolvedPath ? { source: resolvedPath } : {}),
        };
        return {
            content: text,
            metadata: meta,
            format: 'pdf',
        };
    }
}
|
|
112
|
+
// ---------------------------------------------------------------------------
|
|
113
|
+
// Factory
|
|
114
|
+
// ---------------------------------------------------------------------------
|
|
115
|
+
/**
 * Factory for the optional OCR loader.
 *
 * Probes for `tesseract.js` by resolving its package path through a
 * `createRequire`-based `require` anchored at this module, so nothing is
 * imported at call time. A non-null result therefore only means the package
 * could be resolved; the module itself is loaded later, inside `load()`.
 *
 * ### Usage
 * ```ts
 * import { createOcrPdfLoader } from './OcrPdfLoader.js';
 * import { PdfLoader } from './PdfLoader.js';
 *
 * const ocrLoader = createOcrPdfLoader();
 * const loader = new PdfLoader(ocrLoader);
 * ```
 *
 * @returns An `OcrPdfLoader` instance when tesseract.js resolves, or `null`
 *   when resolution (or construction) throws.
 */
export function createOcrPdfLoader() {
    let loader = null;
    try {
        // Resolution only: throws when the package cannot be found.
        createRequire(import.meta.url).resolve('tesseract.js');
        loader = new OcrPdfLoader();
    }
    catch {
        // OCR is unavailable; keep the null result.
    }
    return loader;
}
|
|
149
|
+
//# sourceMappingURL=OcrPdfLoader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"OcrPdfLoader.js","sourceRoot":"","sources":["../../../src/memory/ingestion/OcrPdfLoader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAEH,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAI5C,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E,sCAAsC;AACtC,MAAM,oBAAoB,GAAG,CAAC,MAAM,CAAU,CAAC;AAE/C,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E;;;;GAIG;AACH,SAAS,KAAK,CAAC,QAAgB;IAC7B,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;AAC9C,CAAC;AAED,8EAA8E;AAC9E,gCAAgC;AAChC,8EAA8E;AAE9E;;;;;;GAMG;AACH,MAAM,YAAY;IAAlB;QACE,kBAAkB;QACT,wBAAmB,GAAa,CAAC,GAAG,oBAAoB,CAAC,CAAC;IAsErE,CAAC;IApEC,4EAA4E;IAC5E,UAAU;IACV,4EAA4E;IAE5E,kBAAkB;IAClB,OAAO,CAAC,MAAuB;QAC7B,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5B,6CAA6C;YAC7C,OAAO,MAAM,CAAC,MAAM,IAAI,CAAC;gBACvB,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI;gBACxC,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC;QAC7C,CAAC;QACD,OAAQ,oBAA0C,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAW,CAAC,CAAC;IACvF,CAAC;IAED,4EAA4E;IAC5E,OAAO;IACP,4EAA4E;IAE5E,kBAAkB;IAClB,KAAK,CAAC,IAAI,CAAC,MAAuB,EAAE,QAAsB;QACxD,IAAI,MAAc,CAAC;QACnB,IAAI,YAAgC,CAAC;QAErC,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5B,MAAM,GAAG,MAAM,CAAC;QAClB,CAAC;aAAM,CAAC;YACN,YAAY,GAAG,MAAM,CAAC;YACtB,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;QAC3C,CAAC;QAED,wEAAwE;QACxE,sEAAsE;QACtE,uEAAuE;QACvE,yEAAyE;QACzE,8DAA8D;QAC9D,IAAI,SAAc,CAAC;QACnB,IAAI,CAAC;YACH,6DAA6D;YAC7D,oFAAoF;YACpF,SAAS,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QAC3C,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CACb,gDAAgD;gBAChD,2EAA2E;gBAC3E,qCAAqC,CACtC,CAAC;QACJ,CAAC;QAED,oCAAoC;QACpC,oEAAoE;QACpE,2EAA2E;QAC3E,MAAM,EACJ,IAAI,EAAE,EAAE,IAAI,EAAE,GACf,GAAG,MAAM,SAAS,CAAC,SAAS,CAAC,MAAM,EAAE,KAAK,EAAE;YAC3C,MAAM,EAAE,GAAG,EAAE,GAAkC,CAAC;SACjD,CAAC,CAAC;QAEH,MAAM,IAAI,GAAqB;YAC7B,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SAClD,CAAC;QAEF,OAAO;YACL,OAAO,EAAE,IAAI;YACb,QA
AQ,EAAE,IAAI;YACd,MAAM,EAAE,KAAK;SACd,CAAC;IACJ,CAAC;CACF;AAED,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,UAAU,kBAAkB;IAChC,IAAI,CAAC;QACH,0EAA0E;QAC1E,4EAA4E;QAC5E,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC/C,OAAO,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;QAChC,oDAAoD;QACpD,OAAO,IAAI,YAAY,EAAE,CAAC;IAC5B,CAAC;IAAC,MAAM,CAAC;QACP,8CAA8C;QAC9C,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview PdfLoader — loads `.pdf` documents using `unpdf`.
|
|
3
|
+
*
|
|
4
|
+
* Implements a tiered extraction strategy:
|
|
5
|
+
* 1. **unpdf** (Tier 1, always available) — pure-JS text extraction via
|
|
6
|
+
* `getDocumentProxy` + `extractText`. Fast and dependency-free.
|
|
7
|
+
* 2. **OcrPdfLoader** (Tier 2, opt-in) — Tesseract.js OCR, engaged when the
|
|
8
|
+
* unpdf extraction produces sparse text (< 50 chars per page on average).
|
|
9
|
+
* 3. **DoclingLoader** (Tier 3, opt-in) — Python `docling` subprocess, engaged
|
|
10
|
+
* whenever a Docling loader is passed to the constructor; it then takes precedence over tiers 1 and 2.
|
|
11
|
+
*
|
|
12
|
+
* Both fallback loaders are optional and injected via constructor parameters;
|
|
13
|
+
* callers supply them by calling {@link createOcrPdfLoader} and
|
|
14
|
+
* {@link createDoclingLoader} and checking for non-null values.
|
|
15
|
+
*
|
|
16
|
+
* @module memory/ingestion/PdfLoader
|
|
17
|
+
*/
|
|
18
|
+
import type { IDocumentLoader } from './IDocumentLoader.js';
|
|
19
|
+
import type { LoadOptions, LoadedDocument } from '../facade/types.js';
|
|
20
|
+
/**
|
|
21
|
+
* Document loader for PDF files.
|
|
22
|
+
*
|
|
23
|
+
* ### Extraction tiers
|
|
24
|
+
* 1. **unpdf** — always used as the primary extraction engine. Performs
|
|
25
|
+
* pure-JS PDF text layer extraction with no native binaries required.
|
|
26
|
+
* 2. **OcrPdfLoader** (optional) — supplied at construction time and engaged
|
|
27
|
+
* automatically when unpdf yields sparse text (< 50 chars per page on
|
|
28
|
+
* average), indicating a scanned document.
|
|
29
|
+
* 3. **DoclingLoader** (optional) — when provided, takes precedence over both
|
|
30
|
+
* unpdf and OCR, yielding the highest-fidelity extraction at the cost of
|
|
31
|
+
* requiring a Python runtime.
|
|
32
|
+
*
|
|
33
|
+
* @implements {IDocumentLoader}
|
|
34
|
+
*
|
|
35
|
+
* @example
|
|
36
|
+
* ```ts
|
|
37
|
+
* const ocrLoader = createOcrPdfLoader(); // null if tesseract.js absent
|
|
38
|
+
* const doclingLoader = createDoclingLoader(); // null if docling absent
|
|
39
|
+
* const pdfLoader = new PdfLoader(ocrLoader, doclingLoader);
|
|
40
|
+
* const doc = await pdfLoader.load('/reports/q3.pdf');
|
|
41
|
+
* ```
|
|
42
|
+
*/
|
|
43
|
+
export declare class PdfLoader implements IDocumentLoader {
|
|
44
|
+
/** @inheritdoc */
|
|
45
|
+
readonly supportedExtensions: string[];
|
|
46
|
+
/**
|
|
47
|
+
* Optional OCR fallback loader, engaged when primary extraction is sparse.
|
|
48
|
+
* Pass `null` to disable OCR fallback.
|
|
49
|
+
*/
|
|
50
|
+
private readonly _ocrLoader;
|
|
51
|
+
/**
|
|
52
|
+
* Optional Docling loader that, when present, takes precedence over the
|
|
53
|
+
* entire unpdf + OCR pipeline.
|
|
54
|
+
* Pass `null` to disable Docling.
|
|
55
|
+
*/
|
|
56
|
+
private readonly _doclingLoader;
|
|
57
|
+
/**
|
|
58
|
+
* Creates a new PdfLoader.
|
|
59
|
+
*
|
|
60
|
+
* @param ocrLoader - Optional OCR fallback (e.g. from {@link createOcrPdfLoader}).
|
|
61
|
+
* @param doclingLoader - Optional Docling loader (e.g. from {@link createDoclingLoader}).
|
|
62
|
+
*/
|
|
63
|
+
constructor(ocrLoader?: IDocumentLoader | null, doclingLoader?: IDocumentLoader | null);
|
|
64
|
+
/** @inheritdoc */
|
|
65
|
+
canLoad(source: string | Buffer): boolean;
|
|
66
|
+
/** @inheritdoc */
|
|
67
|
+
load(source: string | Buffer, options?: LoadOptions): Promise<LoadedDocument>;
|
|
68
|
+
/**
|
|
69
|
+
* Extract text from a PDF buffer using the `unpdf` package.
|
|
70
|
+
*
|
|
71
|
+
* Returns the concatenated page text, the page count, and an optional title
|
|
72
|
+
* string extracted from the PDF metadata dictionary when available.
|
|
73
|
+
*
|
|
74
|
+
* @param buffer - Raw PDF bytes.
|
|
75
|
+
*/
|
|
76
|
+
private _extractWithUnpdf;
|
|
77
|
+
}
|
|
78
|
+
//# sourceMappingURL=PdfLoader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"PdfLoader.d.ts","sourceRoot":"","sources":["../../../src/memory/ingestion/PdfLoader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAIH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAC5D,OAAO,KAAK,EAAE,WAAW,EAAE,cAAc,EAAoB,MAAM,oBAAoB,CAAC;AA+CxF;;;;;;;;;;;;;;;;;;;;;;GAsBG;AACH,qBAAa,SAAU,YAAW,eAAe;IAC/C,kBAAkB;IAClB,QAAQ,CAAC,mBAAmB,EAAE,MAAM,EAAE,CAA6B;IAEnE;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAyB;IAEpD;;;;OAIG;IACH,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAyB;IAExD;;;;;OAKG;gBAED,SAAS,GAAE,eAAe,GAAG,IAAW,EACxC,aAAa,GAAE,eAAe,GAAG,IAAW;IAU9C,kBAAkB;IAClB,OAAO,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO;IAazC,kBAAkB;IACZ,IAAI,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,OAAO,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,cAAc,CAAC;IA2CnF;;;;;;;OAOG;YACW,iBAAiB;CA+BhC"}
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview PdfLoader — loads `.pdf` documents using `unpdf`.
|
|
3
|
+
*
|
|
4
|
+
* Implements a tiered extraction strategy:
|
|
5
|
+
* 1. **unpdf** (Tier 1, always available) — pure-JS text extraction via
|
|
6
|
+
* `getDocumentProxy` + `extractText`. Fast and dependency-free.
|
|
7
|
+
* 2. **OcrPdfLoader** (Tier 2, opt-in) — Tesseract.js OCR, engaged when the
|
|
8
|
+
* unpdf extraction produces sparse text (< 50 chars per page on average).
|
|
9
|
+
* 3. **DoclingLoader** (Tier 3, opt-in) — Python `docling` subprocess, engaged
|
|
10
|
+
* whenever a Docling loader is passed to the constructor; it then takes precedence over tiers 1 and 2.
|
|
11
|
+
*
|
|
12
|
+
* Both fallback loaders are optional and injected via constructor parameters;
|
|
13
|
+
* callers supply them by calling {@link createOcrPdfLoader} and
|
|
14
|
+
* {@link createDoclingLoader} and checking for non-null values.
|
|
15
|
+
*
|
|
16
|
+
* @module memory/ingestion/PdfLoader
|
|
17
|
+
*/
|
|
18
|
+
import fs from 'node:fs/promises';
|
|
19
|
+
import path from 'node:path';
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Constants
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
/** Extensions handled by this loader, each with a leading dot. */
|
|
24
|
+
const SUPPORTED_EXTENSIONS = ['.pdf']; // lower-case with a leading dot, so entries compare directly against extOf() output
|
|
25
|
+
/**
|
|
26
|
+
* Minimum average character count per page below which the primary extraction
|
|
27
|
+
* result is considered sparse and a fallback strategy is engaged.
|
|
28
|
+
*/
|
|
29
|
+
const SPARSE_THRESHOLD_CHARS_PER_PAGE = 50; // isSparse() compares the average with a strict `<`, so exactly 50 chars/page is not sparse
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
// Helpers
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
/**
 * Extracts the file extension from a path and normalises it to lower case.
 *
 * The leading dot is kept (e.g. `'.pdf'`); a path without an extension
 * yields the empty string.
 *
 * @param filePath - Absolute or relative file path.
 * @returns The lower-cased extension including its dot, or `''`.
 */
function extOf(filePath) {
    const rawExtension = path.extname(filePath);
    return rawExtension.toLowerCase();
}
|
|
41
|
+
/**
 * Decides whether an extraction result is too thin for its page count.
 *
 * The average number of characters per page is compared against
 * `SPARSE_THRESHOLD_CHARS_PER_PAGE`; falling below it suggests a scanned PDF
 * with no embedded text layer, for which OCR is needed.
 *
 * @param text - Full extracted text.
 * @param pageCount - Number of pages in the document.
 * @returns `true` when the average is below the threshold; always `false`
 *   when `pageCount` is zero or negative.
 */
function isSparse(text, pageCount) {
    // A non-positive page count short-circuits before `text` is touched.
    const hasPages = !(pageCount <= 0);
    return hasPages && text.length / pageCount < SPARSE_THRESHOLD_CHARS_PER_PAGE;
}
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
// PdfLoader
|
|
58
|
+
// ---------------------------------------------------------------------------
|
|
59
|
+
/**
|
|
60
|
+
* Document loader for PDF files.
|
|
61
|
+
*
|
|
62
|
+
* ### Extraction tiers
|
|
63
|
+
* 1. **unpdf** — always used as the primary extraction engine. Performs
|
|
64
|
+
* pure-JS PDF text layer extraction with no native binaries required.
|
|
65
|
+
* 2. **OcrPdfLoader** (optional) — supplied at construction time and engaged
|
|
66
|
+
* automatically when unpdf yields sparse text (< 50 chars per page on
|
|
67
|
+
* average), indicating a scanned document.
|
|
68
|
+
* 3. **DoclingLoader** (optional) — when provided, takes precedence over both
|
|
69
|
+
* unpdf and OCR, yielding the highest-fidelity extraction at the cost of
|
|
70
|
+
* requiring a Python runtime.
|
|
71
|
+
*
|
|
72
|
+
* @implements {IDocumentLoader}
|
|
73
|
+
*
|
|
74
|
+
* @example
|
|
75
|
+
* ```ts
|
|
76
|
+
* const ocrLoader = createOcrPdfLoader(); // null if tesseract.js absent
|
|
77
|
+
* const doclingLoader = createDoclingLoader(); // null if docling absent
|
|
78
|
+
* const pdfLoader = new PdfLoader(ocrLoader, doclingLoader);
|
|
79
|
+
* const doc = await pdfLoader.load('/reports/q3.pdf');
|
|
80
|
+
* ```
|
|
81
|
+
*/
|
|
82
|
+
export class PdfLoader {
    /**
     * Creates a new PdfLoader.
     *
     * @param ocrLoader - Optional OCR fallback (e.g. from {@link createOcrPdfLoader}).
     *   When `null`, sparse extractions are returned as-is.
     * @param doclingLoader - Optional Docling loader (e.g. from {@link createDoclingLoader}).
     *   When non-null, every `load()` call is delegated to it.
     */
    constructor(ocrLoader = null, doclingLoader = null) {
        /** @inheritdoc */
        this.supportedExtensions = [...SUPPORTED_EXTENSIONS];
        this._ocrLoader = ocrLoader;
        this._doclingLoader = doclingLoader;
    }
    // -------------------------------------------------------------------------
    // canLoad
    // -------------------------------------------------------------------------
    /**
     * Reports whether this loader can handle `source`.
     *
     * - Buffers are accepted when they start with the PDF magic bytes `%PDF`.
     * - Strings are treated as file paths and matched by extension.
     * - Any other value is rejected with `false` instead of being passed to
     *   the path-based extension helper.
     *
     * @param source - File path or raw bytes.
     * @returns `true` when the source looks like a PDF.
     */
    canLoad(source) {
        if (Buffer.isBuffer(source)) {
            // Detect the PDF magic bytes "%PDF" (0x25 0x50 0x44 0x46) at offset 0.
            return source.length >= 4 && source[0] === 0x25 && source[1] === 0x50 &&
                source[2] === 0x44 && source[3] === 0x46;
        }
        if (typeof source !== 'string') {
            // Neither bytes nor a path: nothing to inspect.
            return false;
        }
        return SUPPORTED_EXTENSIONS.includes(extOf(source));
    }
    // -------------------------------------------------------------------------
    // load
    // -------------------------------------------------------------------------
    /**
     * Loads a PDF and returns its extracted text plus metadata.
     *
     * Strategy, in order:
     * 1. Delegate to the Docling loader when one was supplied.
     * 2. Extract text with `unpdf`.
     * 3. If that text is sparse and an OCR loader was supplied, delegate to it.
     *
     * @param source - File path or raw PDF bytes.
     * @param options - Load options, forwarded unchanged to delegated loaders.
     * @returns A document with `format: 'pdf'`; `metadata.source` is only set
     *   when `source` was a path, `metadata.title` only when the PDF has one.
     */
    async load(source, options) {
        // Prefer Docling when available — highest fidelity.
        if (this._doclingLoader !== null) {
            return this._doclingLoader.load(source, options);
        }
        // Read bytes from disk if a path was supplied.
        let buffer;
        let resolvedPath;
        if (Buffer.isBuffer(source)) {
            buffer = source;
        }
        else {
            resolvedPath = source;
            buffer = await fs.readFile(resolvedPath);
        }
        // Primary extraction via unpdf.
        const { content, pageCount, title } = await this._extractWithUnpdf(buffer);
        // If primary extraction is sparse and OCR is available, delegate entirely
        // to the OCR loader which has access to image-level content. The original
        // `source` is forwarded so the OCR loader sees the same input the caller gave.
        if (isSparse(content, pageCount) && this._ocrLoader !== null) {
            return this._ocrLoader.load(source, options);
        }
        const meta = {
            pageCount,
            ...(title ? { title } : {}),
            ...(resolvedPath ? { source: resolvedPath } : {}),
        };
        return {
            content,
            metadata: meta,
            format: 'pdf',
        };
    }
    // -------------------------------------------------------------------------
    // Private: unpdf extraction
    // -------------------------------------------------------------------------
    /**
     * Extract text from a PDF buffer using the `unpdf` package.
     *
     * Returns the concatenated page text, the page count, and an optional title
     * string extracted from the PDF metadata dictionary when available. The
     * document proxy is released before returning, whether extraction succeeded
     * or threw.
     *
     * @param buffer - Raw PDF bytes.
     * @returns `{ content, pageCount, title }` where `title` may be `undefined`.
     */
    async _extractWithUnpdf(buffer) {
        // Dynamic import keeps unpdf tree-shakeable and avoids module-resolution
        // errors in environments that don't bundle the package.
        const { getDocumentProxy, extractText } = await import('unpdf');
        // getDocumentProxy accepts a Uint8Array — wrap the Node Buffer.
        const doc = await getDocumentProxy(new Uint8Array(buffer));
        try {
            // Extract all pages at once.
            const { text } = await extractText(doc, { mergePages: true });
            // Attempt to read the Title field from the PDF info dictionary.
            let title;
            try {
                const metadata = await doc.getMetadata();
                const info = metadata?.info;
                if (info && typeof info['Title'] === 'string' && info['Title'].trim()) {
                    title = info['Title'].trim();
                }
            }
            catch {
                // Metadata access is optional — swallow errors silently.
            }
            const pageCount = doc.numPages ?? 0;
            return { content: text, pageCount, title };
        }
        finally {
            // Release the document proxy so repeated loads do not accumulate
            // open documents. Cleanup is best-effort: a failure here must not
            // mask the extraction result or the original error.
            try {
                if (typeof doc.destroy === 'function') {
                    await doc.destroy();
                }
            }
            catch {
                // Ignore cleanup failures.
            }
        }
    }
}
|
|
179
|
+
//# sourceMappingURL=PdfLoader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"PdfLoader.js","sourceRoot":"","sources":["../../../src/memory/ingestion/PdfLoader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,OAAO,EAAE,MAAM,kBAAkB,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAI7B,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E,kEAAkE;AAClE,MAAM,oBAAoB,GAAG,CAAC,MAAM,CAAU,CAAC;AAE/C;;;GAGG;AACH,MAAM,+BAA+B,GAAG,EAAE,CAAC;AAE3C,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E;;;;GAIG;AACH,SAAS,KAAK,CAAC,QAAgB;IAC7B,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;AAC9C,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,QAAQ,CAAC,IAAY,EAAE,SAAiB;IAC/C,IAAI,SAAS,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IACjC,MAAM,eAAe,GAAG,IAAI,CAAC,MAAM,GAAG,SAAS,CAAC;IAChD,OAAO,eAAe,GAAG,+BAA+B,CAAC;AAC3D,CAAC;AAED,8EAA8E;AAC9E,YAAY;AACZ,8EAA8E;AAE9E;;;;;;;;;;;;;;;;;;;;;;GAsBG;AACH,MAAM,OAAO,SAAS;IAiBpB;;;;;OAKG;IACH,YACE,YAAoC,IAAI,EACxC,gBAAwC,IAAI;QAxB9C,kBAAkB;QACT,wBAAmB,GAAa,CAAC,GAAG,oBAAoB,CAAC,CAAC;QAyBjE,IAAI,CAAC,UAAU,GAAG,SAAS,CAAC;QAC5B,IAAI,CAAC,cAAc,GAAG,aAAa,CAAC;IACtC,CAAC;IAED,4EAA4E;IAC5E,UAAU;IACV,4EAA4E;IAE5E,kBAAkB;IAClB,OAAO,CAAC,MAAuB;QAC7B,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5B,6CAA6C;YAC7C,OAAO,MAAM,CAAC,MAAM,IAAI,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI;gBACnE,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC;QAC7C,CAAC;QACD,OAAQ,oBAA0C,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAW,CAAC,CAAC;IACvF,CAAC;IAED,4EAA4E;IAC5E,OAAO;IACP,4EAA4E;IAE5E,kBAAkB;IAClB,KAAK,CAAC,IAAI,CAAC,MAAuB,EAAE,OAAqB;QACvD,oDAAoD;QACpD,IAAI,IAAI,CAAC,cAAc,KAAK,IAAI,EAAE,CAAC;YACjC,OAAO,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACnD,CAAC;QAED,+CAA+C;QAC/C,IAAI,MAAc,CAAC;QACnB,IAAI,YAAgC,CAAC;QAErC,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC5B,MAAM,GAAG,MAAM,CAAC;QAClB,CAAC;aAAM,CAAC;YACN,YAAY,GAAG,MAAM,CAAC;YACtB,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;QAC3C,CAAC;QAED,gCAAgC;QAChC,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,MAAM,CAAC,CAAC;QAE3E,0EAA0E;QAC1E,6DAA6D;QAC7D,IAAI,Q
AAQ,CAAC,OAAO,EAAE,SAAS,CAAC,IAAI,IAAI,CAAC,UAAU,KAAK,IAAI,EAAE,CAAC;YAC7D,OAAO,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC/C,CAAC;QAED,MAAM,IAAI,GAAqB;YAC7B,SAAS;YACT,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC3B,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SAClD,CAAC;QAEF,OAAO;YACL,OAAO;YACP,QAAQ,EAAE,IAAI;YACd,MAAM,EAAE,KAAK;SACd,CAAC;IACJ,CAAC;IAED,4EAA4E;IAC5E,4BAA4B;IAC5B,4EAA4E;IAE5E;;;;;;;OAOG;IACK,KAAK,CAAC,iBAAiB,CAAC,MAAc;QAK5C,yEAAyE;QACzE,wDAAwD;QACxD,MAAM,EAAE,gBAAgB,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;QAEhE,gEAAgE;QAChE,MAAM,GAAG,GAAG,MAAM,gBAAgB,CAAC,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC;QAE3D,6BAA6B;QAC7B,MAAM,EAAE,IAAI,EAAE,GAAG,MAAM,WAAW,CAAC,GAAG,EAAE,EAAE,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC;QAE9D,gEAAgE;QAChE,IAAI,KAAyB,CAAC;QAC9B,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC;YACzC,MAAM,IAAI,GAAG,QAAQ,EAAE,IAA2C,CAAC;YACnE,IAAI,IAAI,IAAI,OAAO,IAAI,CAAC,OAAO,CAAC,KAAK,QAAQ,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC;gBACtE,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;YAC/B,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,yDAAyD;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,GAAG,CAAC,QAAQ,IAAI,CAAC,CAAC;QAEpC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC;IAC7C,CAAC;CACF"}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview TextLoader — loads plain-text, CSV/TSV, JSON, and YAML files.
|
|
3
|
+
*
|
|
4
|
+
* This is the most general-purpose loader in the AgentOS ingestion pipeline.
|
|
5
|
+
* It handles six extensions that all share the same fundamental operation:
|
|
6
|
+
* read raw text and attach lightweight metadata derived from the file content
|
|
7
|
+
* and extension.
|
|
8
|
+
*
|
|
9
|
+
* Supported extensions: `.txt`, `.csv`, `.tsv`, `.json`, `.yaml`, `.yml`
|
|
10
|
+
*
|
|
11
|
+
* @module memory/ingestion/TextLoader
|
|
12
|
+
*/
|
|
13
|
+
import type { IDocumentLoader } from './IDocumentLoader.js';
|
|
14
|
+
import type { LoadOptions, LoadedDocument } from '../facade/types.js';
|
|
15
|
+
/**
|
|
16
|
+
* Loader for plain-text, CSV, TSV, JSON, and YAML files.
|
|
17
|
+
*
|
|
18
|
+
* The loader performs minimal transformation:
|
|
19
|
+
* - **`.json`** — re-serialises with pretty-printing so stored content is
|
|
20
|
+
* consistently formatted.
|
|
21
|
+
* - **`.yaml` / `.yml`** — the `yaml` package is used to parse and re-dump
|
|
22
|
+
* for consistent formatting; falls back to raw text on parse error.
|
|
23
|
+
* - All other extensions — content is returned as-is.
|
|
24
|
+
*
|
|
25
|
+
* Metadata includes the approximate `wordCount` and a `format` label derived
|
|
26
|
+
* from the file extension.
|
|
27
|
+
*
|
|
28
|
+
* @implements {IDocumentLoader}
|
|
29
|
+
*
|
|
30
|
+
* @example
|
|
31
|
+
* ```ts
|
|
32
|
+
* const loader = new TextLoader();
|
|
33
|
+
* const doc = await loader.load('/data/notes.txt');
|
|
34
|
+
* console.log(doc.metadata.wordCount); // e.g. 312
|
|
35
|
+
* ```
|
|
36
|
+
*/
|
|
37
|
+
export declare class TextLoader implements IDocumentLoader {
    /** @inheritdoc */
    readonly supportedExtensions: string[];
    /** @inheritdoc */
    canLoad(source: string | Buffer): boolean;
    /** @inheritdoc */
    load(source: string | Buffer, _options?: LoadOptions): Promise<LoadedDocument>;
    /**
     * Applies extension-specific normalisation to raw file content.
     *
     * - `.json` content is pretty-printed.
     * - `.yaml` / `.yml` content is parsed and dumped again so formatting is
     *   consistent.
     * - Content for every other extension passes through unchanged.
     *
     * @param raw - Raw UTF-8 string read from the source.
     * @param ext - Lower-cased extension, including the leading dot.
     */
    private _normalise;
    /**
     * Round-trips YAML through a parser to obtain consistent formatting.
     *
     * Relies on the `yaml` package, which is already a production dependency
     * of `@framers/agentos`. If parsing fails the original string is returned,
     * so malformed YAML never causes the loader to throw.
     *
     * @param raw - Raw YAML string.
     */
    private _prettyYaml;
}
|
|
66
|
+
//# sourceMappingURL=TextLoader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"TextLoader.d.ts","sourceRoot":"","sources":["../../../src/memory/ingestion/TextLoader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAKH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAC5D,OAAO,KAAK,EAAE,WAAW,EAAE,cAAc,EAAoB,MAAM,oBAAoB,CAAC;AAsFxF;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,qBAAa,UAAW,YAAW,eAAe;IAChD,kBAAkB;IAClB,QAAQ,CAAC,mBAAmB,EAAE,MAAM,EAAE,CAA6B;IAMnE,kBAAkB;IAClB,OAAO,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO;IAazC,kBAAkB;IACZ,IAAI,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,QAAQ,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,cAAc,CAAC;IAsCpF;;;;;;;;;OASG;IACH,OAAO,CAAC,UAAU;IAelB;;;;;;;;OAQG;IACH,OAAO,CAAC,WAAW;CAQpB"}
|