@framers/agentos 0.1.101 → 0.1.103
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -0
- package/dist/api/agency.js +1 -1
- package/dist/api/agency.js.map +1 -1
- package/dist/api/strategies/graph.d.ts.map +1 -1
- package/dist/api/strategies/graph.js +1 -0
- package/dist/api/strategies/graph.js.map +1 -1
- package/dist/api/strategies/sequential.d.ts.map +1 -1
- package/dist/api/strategies/sequential.js +1 -0
- package/dist/api/strategies/sequential.js.map +1 -1
- package/dist/memory/config.d.ts +39 -0
- package/dist/memory/config.d.ts.map +1 -1
- package/dist/memory/config.js.map +1 -1
- package/dist/memory/consolidation/ConsolidationLoop.d.ts +177 -0
- package/dist/memory/consolidation/ConsolidationLoop.d.ts.map +1 -0
- package/dist/memory/consolidation/ConsolidationLoop.js +517 -0
- package/dist/memory/consolidation/ConsolidationLoop.js.map +1 -0
- package/dist/memory/consolidation/ConsolidationPipeline.d.ts.map +1 -1
- package/dist/memory/consolidation/ConsolidationPipeline.js +7 -0
- package/dist/memory/consolidation/ConsolidationPipeline.js.map +1 -1
- package/dist/memory/consolidation/index.d.ts +8 -0
- package/dist/memory/consolidation/index.d.ts.map +1 -0
- package/dist/memory/consolidation/index.js +7 -0
- package/dist/memory/consolidation/index.js.map +1 -0
- package/dist/memory/decay/DecayModel.d.ts +33 -0
- package/dist/memory/decay/DecayModel.d.ts.map +1 -1
- package/dist/memory/decay/DecayModel.js +31 -0
- package/dist/memory/decay/DecayModel.js.map +1 -1
- package/dist/memory/facade/Memory.d.ts +228 -0
- package/dist/memory/facade/Memory.d.ts.map +1 -0
- package/dist/memory/facade/Memory.js +823 -0
- package/dist/memory/facade/Memory.js.map +1 -0
- package/dist/memory/facade/index.d.ts +13 -0
- package/dist/memory/facade/index.d.ts.map +1 -0
- package/dist/memory/facade/index.js +11 -0
- package/dist/memory/facade/index.js.map +1 -0
- package/dist/memory/facade/types.d.ts +606 -0
- package/dist/memory/facade/types.d.ts.map +1 -0
- package/dist/memory/facade/types.js +11 -0
- package/dist/memory/facade/types.js.map +1 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.d.ts +132 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.d.ts.map +1 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.js +178 -0
- package/dist/memory/feedback/RetrievalFeedbackSignal.js.map +1 -0
- package/dist/memory/feedback/index.d.ts +13 -0
- package/dist/memory/feedback/index.d.ts.map +1 -0
- package/dist/memory/feedback/index.js +12 -0
- package/dist/memory/feedback/index.js.map +1 -0
- package/dist/memory/index.d.ts +22 -0
- package/dist/memory/index.d.ts.map +1 -1
- package/dist/memory/index.js +24 -0
- package/dist/memory/index.js.map +1 -1
- package/dist/memory/ingestion/ChunkingEngine.d.ts +143 -0
- package/dist/memory/ingestion/ChunkingEngine.d.ts.map +1 -0
- package/dist/memory/ingestion/ChunkingEngine.js +508 -0
- package/dist/memory/ingestion/ChunkingEngine.js.map +1 -0
- package/dist/memory/ingestion/DoclingLoader.d.ts +44 -0
- package/dist/memory/ingestion/DoclingLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/DoclingLoader.js +228 -0
- package/dist/memory/ingestion/DoclingLoader.js.map +1 -0
- package/dist/memory/ingestion/DocxLoader.d.ts +37 -0
- package/dist/memory/ingestion/DocxLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/DocxLoader.js +111 -0
- package/dist/memory/ingestion/DocxLoader.js.map +1 -0
- package/dist/memory/ingestion/FolderScanner.d.ts +116 -0
- package/dist/memory/ingestion/FolderScanner.d.ts.map +1 -0
- package/dist/memory/ingestion/FolderScanner.js +127 -0
- package/dist/memory/ingestion/FolderScanner.js.map +1 -0
- package/dist/memory/ingestion/HtmlLoader.d.ts +49 -0
- package/dist/memory/ingestion/HtmlLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/HtmlLoader.js +202 -0
- package/dist/memory/ingestion/HtmlLoader.js.map +1 -0
- package/dist/memory/ingestion/IDocumentLoader.d.ts +63 -0
- package/dist/memory/ingestion/IDocumentLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/IDocumentLoader.js +11 -0
- package/dist/memory/ingestion/IDocumentLoader.js.map +1 -0
- package/dist/memory/ingestion/LoaderRegistry.d.ts +140 -0
- package/dist/memory/ingestion/LoaderRegistry.d.ts.map +1 -0
- package/dist/memory/ingestion/LoaderRegistry.js +229 -0
- package/dist/memory/ingestion/LoaderRegistry.js.map +1 -0
- package/dist/memory/ingestion/MarkdownLoader.d.ts +50 -0
- package/dist/memory/ingestion/MarkdownLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/MarkdownLoader.js +169 -0
- package/dist/memory/ingestion/MarkdownLoader.js.map +1 -0
- package/dist/memory/ingestion/MultimodalAggregator.d.ts +88 -0
- package/dist/memory/ingestion/MultimodalAggregator.d.ts.map +1 -0
- package/dist/memory/ingestion/MultimodalAggregator.js +96 -0
- package/dist/memory/ingestion/MultimodalAggregator.js.map +1 -0
- package/dist/memory/ingestion/OcrPdfLoader.d.ts +41 -0
- package/dist/memory/ingestion/OcrPdfLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/OcrPdfLoader.js +149 -0
- package/dist/memory/ingestion/OcrPdfLoader.js.map +1 -0
- package/dist/memory/ingestion/PdfLoader.d.ts +78 -0
- package/dist/memory/ingestion/PdfLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/PdfLoader.js +179 -0
- package/dist/memory/ingestion/PdfLoader.js.map +1 -0
- package/dist/memory/ingestion/TextLoader.d.ts +66 -0
- package/dist/memory/ingestion/TextLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/TextLoader.js +207 -0
- package/dist/memory/ingestion/TextLoader.js.map +1 -0
- package/dist/memory/ingestion/UrlLoader.d.ts +95 -0
- package/dist/memory/ingestion/UrlLoader.d.ts.map +1 -0
- package/dist/memory/ingestion/UrlLoader.js +174 -0
- package/dist/memory/ingestion/UrlLoader.js.map +1 -0
- package/dist/memory/io/ChatGptImporter.d.ts +85 -0
- package/dist/memory/io/ChatGptImporter.d.ts.map +1 -0
- package/dist/memory/io/ChatGptImporter.js +231 -0
- package/dist/memory/io/ChatGptImporter.js.map +1 -0
- package/dist/memory/io/JsonExporter.d.ts +67 -0
- package/dist/memory/io/JsonExporter.d.ts.map +1 -0
- package/dist/memory/io/JsonExporter.js +132 -0
- package/dist/memory/io/JsonExporter.js.map +1 -0
- package/dist/memory/io/JsonImporter.d.ts +84 -0
- package/dist/memory/io/JsonImporter.d.ts.map +1 -0
- package/dist/memory/io/JsonImporter.js +234 -0
- package/dist/memory/io/JsonImporter.js.map +1 -0
- package/dist/memory/io/MarkdownExporter.d.ts +95 -0
- package/dist/memory/io/MarkdownExporter.d.ts.map +1 -0
- package/dist/memory/io/MarkdownExporter.js +130 -0
- package/dist/memory/io/MarkdownExporter.js.map +1 -0
- package/dist/memory/io/MarkdownImporter.d.ts +84 -0
- package/dist/memory/io/MarkdownImporter.d.ts.map +1 -0
- package/dist/memory/io/MarkdownImporter.js +166 -0
- package/dist/memory/io/MarkdownImporter.js.map +1 -0
- package/dist/memory/io/ObsidianExporter.d.ts +80 -0
- package/dist/memory/io/ObsidianExporter.d.ts.map +1 -0
- package/dist/memory/io/ObsidianExporter.js +127 -0
- package/dist/memory/io/ObsidianExporter.js.map +1 -0
- package/dist/memory/io/ObsidianImporter.d.ts +93 -0
- package/dist/memory/io/ObsidianImporter.d.ts.map +1 -0
- package/dist/memory/io/ObsidianImporter.js +221 -0
- package/dist/memory/io/ObsidianImporter.js.map +1 -0
- package/dist/memory/io/SqliteExporter.d.ts +47 -0
- package/dist/memory/io/SqliteExporter.d.ts.map +1 -0
- package/dist/memory/io/SqliteExporter.js +56 -0
- package/dist/memory/io/SqliteExporter.js.map +1 -0
- package/dist/memory/io/SqliteImporter.d.ts +82 -0
- package/dist/memory/io/SqliteImporter.d.ts.map +1 -0
- package/dist/memory/io/SqliteImporter.js +232 -0
- package/dist/memory/io/SqliteImporter.js.map +1 -0
- package/dist/memory/io/index.d.ts +31 -0
- package/dist/memory/io/index.d.ts.map +1 -0
- package/dist/memory/io/index.js +31 -0
- package/dist/memory/io/index.js.map +1 -0
- package/dist/memory/store/SqliteBrain.d.ts +125 -0
- package/dist/memory/store/SqliteBrain.d.ts.map +1 -0
- package/dist/memory/store/SqliteBrain.js +407 -0
- package/dist/memory/store/SqliteBrain.js.map +1 -0
- package/dist/memory/store/SqliteKnowledgeGraph.d.ts +259 -0
- package/dist/memory/store/SqliteKnowledgeGraph.d.ts.map +1 -0
- package/dist/memory/store/SqliteKnowledgeGraph.js +1062 -0
- package/dist/memory/store/SqliteKnowledgeGraph.js.map +1 -0
- package/dist/memory/store/SqliteMemoryGraph.d.ts +251 -0
- package/dist/memory/store/SqliteMemoryGraph.d.ts.map +1 -0
- package/dist/memory/store/SqliteMemoryGraph.js +637 -0
- package/dist/memory/store/SqliteMemoryGraph.js.map +1 -0
- package/dist/memory/tools/MemoryAddTool.d.ts +98 -0
- package/dist/memory/tools/MemoryAddTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryAddTool.js +131 -0
- package/dist/memory/tools/MemoryAddTool.js.map +1 -0
- package/dist/memory/tools/MemoryDeleteTool.d.ts +83 -0
- package/dist/memory/tools/MemoryDeleteTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryDeleteTool.js +96 -0
- package/dist/memory/tools/MemoryDeleteTool.js.map +1 -0
- package/dist/memory/tools/MemoryMergeTool.d.ts +95 -0
- package/dist/memory/tools/MemoryMergeTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryMergeTool.js +164 -0
- package/dist/memory/tools/MemoryMergeTool.js.map +1 -0
- package/dist/memory/tools/MemoryReflectTool.d.ts +86 -0
- package/dist/memory/tools/MemoryReflectTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryReflectTool.js +102 -0
- package/dist/memory/tools/MemoryReflectTool.js.map +1 -0
- package/dist/memory/tools/MemorySearchTool.d.ts +117 -0
- package/dist/memory/tools/MemorySearchTool.d.ts.map +1 -0
- package/dist/memory/tools/MemorySearchTool.js +162 -0
- package/dist/memory/tools/MemorySearchTool.js.map +1 -0
- package/dist/memory/tools/MemoryUpdateTool.d.ts +92 -0
- package/dist/memory/tools/MemoryUpdateTool.d.ts.map +1 -0
- package/dist/memory/tools/MemoryUpdateTool.js +125 -0
- package/dist/memory/tools/MemoryUpdateTool.js.map +1 -0
- package/dist/memory/tools/index.d.ts +32 -0
- package/dist/memory/tools/index.d.ts.map +1 -0
- package/dist/memory/tools/index.js +26 -0
- package/dist/memory/tools/index.js.map +1 -0
- package/package.json +6 -1
|
@@ -0,0 +1,508 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview ChunkingEngine — splits raw document text into `DocumentChunk`
|
|
3
|
+
* slices ready for embedding and vector-store ingestion.
|
|
4
|
+
*
|
|
5
|
+
* Four strategies are supported:
|
|
6
|
+
*
|
|
7
|
+
* - **fixed** — split at a fixed character count with word-boundary
|
|
8
|
+
* awareness and configurable overlap.
|
|
9
|
+
* - **semantic** — embed individual sentences and split where cosine
|
|
10
|
+
* similarity drops below a threshold (topic boundaries).
|
|
11
|
+
* Falls back to `fixed` when no `embedFn` is supplied.
|
|
12
|
+
* - **hierarchical**— honour Markdown heading structure; each heading creates
|
|
13
|
+
* a new chunk boundary with the heading stored in metadata.
|
|
14
|
+
* Long sections are sub-split with `fixed`.
|
|
15
|
+
* - **layout** — preserve fenced code blocks and pipe-delimited tables as
|
|
16
|
+
* atomic chunks; surrounding prose is split with `fixed`.
|
|
17
|
+
*
|
|
18
|
+
* @module memory/ingestion/ChunkingEngine
|
|
19
|
+
*/
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Internal constants
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
/** Target number of characters per chunk when the caller supplies no `chunkSize`. */
const DEFAULT_CHUNK_SIZE = 512;

/** Number of characters shared between neighbouring chunks when the caller supplies no `chunkOverlap`. */
const DEFAULT_CHUNK_OVERLAP = 64;

/**
 * Topic-boundary cut-off for the semantic strategy: when the cosine
 * similarity of two neighbouring sentence embeddings is lower than this
 * value, a split point is placed between the two sentences.
 */
const SEMANTIC_SPLIT_THRESHOLD = 0.3;

/**
 * Size cap for a semantic sentence group, as a multiple of `chunkSize`.
 * Groups longer than `chunkSize` times this factor are sub-split with the
 * fixed strategy.
 */
const SEMANTIC_MAX_CHUNK_MULTIPLIER = 2;
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
// Helper — cosine similarity
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
/**
 * Cosine similarity of two dense numeric vectors.
 *
 * The loop runs over the indices of `a`, so `b` is expected to have the same
 * length. The result lies in [-1, 1]: 1 for vectors pointing the same way,
 * 0 for orthogonal vectors. If either vector has zero magnitude the divisor
 * falls back to 1, which makes the result 0 instead of NaN.
 *
 * @param a - First vector.
 * @param b - Second vector (same length as `a`).
 * @returns Cosine similarity scalar.
 */
function cosineSimilarity(a, b) {
    let dotProduct = 0;
    let sumSquaresA = 0;
    let sumSquaresB = 0;
    for (let idx = 0; idx < a.length; idx += 1) {
        const x = a[idx];
        const y = b[idx];
        dotProduct += x * y;
        sumSquaresA += x * x;
        sumSquaresB += y * y;
    }
    const denominator = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
    // A zero denominator means at least one zero vector; divide by 1 instead.
    return dotProduct / (denominator || 1);
}
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
// Helper — fixed strategy (used internally by other strategies)
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
/**
 * Splits `content` into windows of at most `chunkSize` characters, cutting at
 * the last whitespace inside each window so words are not broken where a
 * boundary exists. A window that contains no whitespace is hard-cut at
 * `chunkSize`. Every emitted chunk is trimmed; empty slices are skipped.
 *
 * The scan is guaranteed to terminate: after each window the read position
 * always moves forward. When stepping back by `chunkOverlap` would not advance
 * past the start of the window just emitted (overlap >= chunk size, or a word
 * boundary found very early in the window), the overlap is dropped for that
 * step and scanning resumes at the end of the window.
 *
 * @param content - Full text to split.
 * @param chunkSize - Target character count per chunk. Truncated to an
 *   integer; must be >= 1 (`Infinity` is allowed and yields a single chunk).
 * @param chunkOverlap - Number of trailing characters of the previous window
 *   re-read at the start of the next one. Truncated to an integer; negative
 *   or NaN values are treated as 0 so that no text is skipped.
 * @param startIndex - The `DocumentChunk.index` to assign to the first
 *   produced chunk. Useful when merging partial results.
 * @param baseMetadata - Extra metadata fields; a shallow copy is attached to
 *   every produced chunk. When omitted, chunks carry no `metadata` key.
 * @returns Array of `DocumentChunk` objects in order.
 * @throws {RangeError} If `chunkSize` is not a number >= 1.
 */
function fixedChunks(content, chunkSize, chunkOverlap, startIndex = 0, baseMetadata) {
    const size = Math.trunc(chunkSize);
    if (!(size >= 1)) {
        // Zero, negative and NaN sizes can never make progress through the text.
        throw new RangeError(`fixedChunks: chunkSize must be a number >= 1 (received ${String(chunkSize)})`);
    }
    // `NaN > 0` is false, so NaN and negative overlaps both collapse to 0.
    const overlap = chunkOverlap > 0 ? Math.trunc(chunkOverlap) : 0;
    const whitespace = /\s/;
    const chunks = [];
    let chunkIndex = startIndex;
    /** Trims `text` and appends it as the next chunk unless it is empty. */
    const pushChunk = (text) => {
        const trimmed = text.trim();
        if (trimmed.length === 0) {
            return;
        }
        chunks.push({
            content: trimmed,
            index: chunkIndex++,
            ...(baseMetadata ? { metadata: { ...baseMetadata } } : {}),
        });
    };
    let pos = 0;
    while (pos < content.length) {
        // Desired end position for this window.
        let end = pos + size;
        if (end >= content.length) {
            // The window reaches the end of the text: take whatever remains.
            pushChunk(content.slice(pos));
            break;
        }
        // Walk backwards from `end` to the nearest whitespace so the cut does
        // not fall inside a word.
        while (end > pos && !whitespace.test(content[end])) {
            end--;
        }
        // No whitespace anywhere in the window: hard-cut at the full size.
        if (end === pos) {
            end = pos + size;
        }
        pushChunk(content.slice(pos, end));
        // Step back by the overlap, but only if that still moves forward;
        // otherwise the same window would be produced again, forever.
        const next = end - overlap;
        pos = next > pos ? next : end;
    }
    return chunks;
}
|
|
122
|
+
// ---------------------------------------------------------------------------
|
|
123
|
+
// ChunkingEngine
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
/**
 * Splits raw document text into an ordered array of `DocumentChunk` objects
 * suitable for embedding and storage in a vector index.
 *
 * Each chunk is a plain object with `content` and a sequential `index`, plus
 * optional `heading` / `metadata` depending on the strategy that produced it.
 *
 * @example
 * ```typescript
 * const engine = new ChunkingEngine();
 * const chunks = await engine.chunk(content, { strategy: 'fixed', chunkSize: 512 });
 * ```
 */
export class ChunkingEngine {
    // -------------------------------------------------------------------------
    // Public API
    // -------------------------------------------------------------------------
    /**
     * Chunks the provided `content` string according to the given `options`.
     *
     * The method is async because the semantic strategy may await the
     * caller-supplied `embedFn`; the other strategies resolve synchronously.
     *
     * @param content - Full document text to split.
     * @param options - Chunking strategy and tuning parameters. `chunkSize`
     *   and `chunkOverlap` fall back to the module defaults when nullish.
     * @returns Ordered array of `DocumentChunk` objects with sequential indices.
     * @throws {Error} If `options.strategy` is not one of the known strategies.
     */
    async chunk(content, options) {
        const chunkSize = options.chunkSize ?? DEFAULT_CHUNK_SIZE;
        const chunkOverlap = options.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
        switch (options.strategy) {
            case 'fixed':
                return this._chunkFixed(content, chunkSize, chunkOverlap);
            case 'semantic':
                return this._chunkSemantic(content, chunkSize, chunkOverlap, options.embedFn);
            case 'hierarchical':
                return this._chunkHierarchical(content, chunkSize, chunkOverlap);
            case 'layout':
                return this._chunkLayout(content, chunkSize, chunkOverlap);
            default: {
                // Exhaustiveness guard: reached only for values outside the
                // declared strategy union.
                const never = options.strategy;
                throw new Error(`ChunkingEngine: unknown strategy "${String(never)}"`);
            }
        }
    }
    // -------------------------------------------------------------------------
    // Strategy: fixed
    // -------------------------------------------------------------------------
    /**
     * Splits content at a fixed character count with word-boundary awareness
     * and configurable overlap between consecutive chunks.
     *
     * @param content - Text to split.
     * @param chunkSize - Target character count per chunk.
     * @param chunkOverlap - Overlap in characters between consecutive chunks.
     * @returns Ordered `DocumentChunk[]` with indices starting at 0.
     */
    _chunkFixed(content, chunkSize, chunkOverlap) {
        return fixedChunks(content, chunkSize, chunkOverlap, 0);
    }
    // -------------------------------------------------------------------------
    // Strategy: semantic
    // -------------------------------------------------------------------------
    /**
     * Embeds individual sentences and inserts split points wherever the cosine
     * similarity between consecutive sentence embeddings drops below
     * {@link SEMANTIC_SPLIT_THRESHOLD} (topic boundary heuristic).
     *
     * When `embedFn` is not supplied, or no non-blank sentence is found, the
     * method falls back to `_chunkFixed`. A single sentence is never embedded
     * (there is nothing to compare it with) but is still subject to the size
     * cap below.
     *
     * Any resulting group that exceeds `2 × chunkSize` characters is further
     * sub-split with the fixed strategy.
     *
     * @param content - Text to split.
     * @param chunkSize - Target character count per chunk.
     * @param chunkOverlap - Overlap used when sub-splitting oversized groups.
     * @param embedFn - Optional batch embedding function; called once with the
     *   array of sentences and expected to return one vector per sentence.
     * @returns Ordered `DocumentChunk[]`.
     * @throws {Error} If `embedFn` returns fewer embeddings than sentences.
     */
    async _chunkSemantic(content, chunkSize, chunkOverlap, embedFn) {
        // No embedding function → fall back to fixed.
        if (!embedFn) {
            return this._chunkFixed(content, chunkSize, chunkOverlap);
        }
        // Sentence boundary: terminal punctuation, whitespace, then a capital.
        // The pattern is built with the RegExp constructor so that an engine
        // without lookbehind support raises a catchable SyntaxError here; a
        // regex literal would fail when the module is parsed, before any
        // try/catch could run.
        let sentenceBoundary;
        try {
            sentenceBoundary = new RegExp('(?<=[.!?])\\s+(?=[A-Z])');
        }
        catch {
            // Simpler split for engines without lookbehind (drops the
            // terminal punctuation of each sentence).
            sentenceBoundary = /[.!?]\s+/;
        }
        const sentences = content.split(sentenceBoundary).filter((s) => s.trim().length > 0);
        // Degenerate case: no meaningful sentences.
        if (sentences.length === 0) {
            return this._chunkFixed(content, chunkSize, chunkOverlap);
        }
        // Group consecutive sentences, closing a group at each topic boundary.
        const groups = [];
        if (sentences.length === 1) {
            groups.push(sentences);
        }
        else {
            // Batch-embed all sentences.
            const embeddings = await embedFn(sentences);
            if (!embeddings || embeddings.length < sentences.length) {
                throw new Error(`ChunkingEngine: embedFn returned ${String(embeddings?.length)} embeddings for ${sentences.length} sentences`);
            }
            let currentGroup = [];
            const lastIndex = sentences.length - 1;
            for (let i = 0; i <= lastIndex; i++) {
                currentGroup.push(sentences[i]);
                const isBoundary = i === lastIndex ||
                    cosineSimilarity(embeddings[i], embeddings[i + 1]) < SEMANTIC_SPLIT_THRESHOLD;
                if (isBoundary) {
                    groups.push(currentGroup);
                    currentGroup = [];
                }
            }
        }
        // Convert groups to DocumentChunks, sub-splitting oversized ones.
        // `result.length` is always the next free chunk index.
        const maxGroupSize = chunkSize * SEMANTIC_MAX_CHUNK_MULTIPLIER;
        const result = [];
        for (const group of groups) {
            const groupText = group.join(' ').trim();
            if (groupText.length === 0) {
                continue;
            }
            if (groupText.length > maxGroupSize) {
                result.push(...fixedChunks(groupText, chunkSize, chunkOverlap, result.length));
            }
            else {
                result.push({ content: groupText, index: result.length });
            }
        }
        return result;
    }
    // -------------------------------------------------------------------------
    // Strategy: hierarchical
    // -------------------------------------------------------------------------
    /**
     * Recognises Markdown heading lines (`# H1`, `## H2`, …, `###### H6`) and
     * creates a new chunk boundary at each heading. The heading text is stored
     * in `DocumentChunk.heading`, its level in `metadata.headingLevel`, and the
     * texts of its enclosing shallower headings in `metadata.ancestorHeadings`
     * (omitted when there are none).
     *
     * Text before the first heading becomes a heading-less section. A heading
     * with no body yields a chunk whose content is the heading text itself.
     * Sections whose text exceeds `chunkSize` are sub-split with the fixed
     * strategy while preserving the heading metadata.
     *
     * @param content - Markdown-formatted text.
     * @param chunkSize - Maximum characters per output chunk.
     * @param chunkOverlap - Overlap used when sub-splitting oversized sections.
     * @returns Ordered `DocumentChunk[]`.
     */
    _chunkHierarchical(content, chunkSize, chunkOverlap) {
        // Collect every heading line with its position, level and text.
        const headingRegex = /^(#{1,6})\s+(.+)$/gm;
        const matches = [];
        for (const m of content.matchAll(headingRegex)) {
            matches.push({
                index: m.index,
                end: m.index + m[0].length,
                level: m[1].length,
                text: m[2].trim(),
            });
        }
        const sections = [];
        if (matches.length === 0) {
            // No headings — treat entire content as a single section.
            sections.push({
                heading: undefined,
                headingLevel: undefined,
                ancestorHeadings: [],
                text: content,
            });
        }
        else {
            // Text before the first heading (preamble).
            const preamble = content.slice(0, matches[0].index).trim();
            if (preamble.length > 0) {
                sections.push({
                    heading: undefined,
                    headingLevel: undefined,
                    ancestorHeadings: [],
                    text: preamble,
                });
            }
            // Stack of currently open headings, shallowest first.
            const headingStack = [];
            for (let i = 0; i < matches.length; i++) {
                const match = matches[i];
                const nextIndex = i + 1 < matches.length ? matches[i + 1].index : content.length;
                // The body of this section is the text after the heading line.
                const body = content.slice(match.end, nextIndex).trim();
                // A new heading closes every open heading at its level or deeper.
                while (headingStack.length > 0 && headingStack[headingStack.length - 1].level >= match.level) {
                    headingStack.pop();
                }
                const ancestors = headingStack.map((h) => h.text);
                headingStack.push({ level: match.level, text: match.text });
                sections.push({
                    heading: match.text,
                    headingLevel: match.level,
                    ancestorHeadings: ancestors,
                    text: body,
                });
            }
        }
        // Materialise sections into DocumentChunks. `result.length` is always
        // the next free chunk index.
        const result = [];
        for (const section of sections) {
            const { heading, headingLevel, ancestorHeadings, text } = section;
            if (text.length === 0 && heading === undefined) {
                continue;
            }
            // Build metadata common to all chunks from this section.
            const sectionMeta = {};
            if (headingLevel !== undefined) {
                sectionMeta.headingLevel = headingLevel;
            }
            if (ancestorHeadings.length > 0) {
                sectionMeta.ancestorHeadings = ancestorHeadings;
            }
            // Heading-less sections carry no metadata at all.
            const metadata = Object.keys(sectionMeta).length > 0 ? sectionMeta : undefined;
            if (text.length === 0) {
                // Heading with no body — the heading text becomes the content.
                result.push({
                    content: heading ?? '',
                    index: result.length,
                    heading,
                    metadata,
                });
                continue;
            }
            if (text.length <= chunkSize) {
                // Fits in a single chunk.
                result.push({
                    content: text,
                    index: result.length,
                    heading,
                    metadata,
                });
                continue;
            }
            // Sub-split the section body with the fixed strategy, preserving
            // heading metadata on every produced sub-chunk.
            for (const sub of fixedChunks(text, chunkSize, chunkOverlap, result.length, metadata)) {
                result.push({ ...sub, heading });
            }
        }
        return result;
    }
    // -------------------------------------------------------------------------
    // Strategy: layout
    // -------------------------------------------------------------------------
    /**
     * Detects fenced code blocks (``` … ```) and pipe-delimited tables and
     * emits each as an atomic chunk (never split mid-block). Surrounding prose
     * is split with the fixed strategy.
     *
     * Detection is line based: a code block starts at a line beginning with
     * three or more backticks and runs to the next line starting with the same
     * fence (or to the end of the text if the fence is never closed); a table
     * is a run of consecutive lines that each contain a `|` character.
     *
     * Chunk metadata:
     * - Code blocks: `{ type: 'code' }`
     * - Tables: `{ type: 'table' }`
     * - Prose: no special metadata.
     *
     * @param content - Text potentially containing code blocks and tables.
     * @param chunkSize - Target character count for prose chunks.
     * @param chunkOverlap - Overlap for prose fixed-splits.
     * @returns Ordered `DocumentChunk[]`.
     */
    _chunkLayout(content, chunkSize, chunkOverlap) {
        const isFenceLine = (line) => /^```/.test(line);
        const hasPipe = (line) => /\|/.test(line);
        const segments = [];
        const lines = content.split('\n');
        let i = 0;
        while (i < lines.length) {
            // ── Fenced code block ────────────────────────────────────────────
            if (isFenceLine(lines[i])) {
                // Remember the exact opening fence so a longer fence is only
                // closed by a line starting with the same run of backticks.
                const fence = lines[i].match(/^(`{3,})/)?.[1] ?? '```';
                const blockLines = [lines[i]];
                i++;
                while (i < lines.length && !lines[i].startsWith(fence)) {
                    blockLines.push(lines[i]);
                    i++;
                }
                // Include the closing fence if present.
                if (i < lines.length) {
                    blockLines.push(lines[i]);
                    i++;
                }
                segments.push({ kind: 'code', text: blockLines.join('\n') });
                continue;
            }
            // ── Table block ──────────────────────────────────────────────────
            // A table is a contiguous run of lines that each contain a pipe;
            // the first line without one (including a blank line) ends it.
            if (hasPipe(lines[i])) {
                const tableLines = [];
                while (i < lines.length && hasPipe(lines[i])) {
                    tableLines.push(lines[i]);
                    i++;
                }
                segments.push({ kind: 'table', text: tableLines.join('\n') });
                continue;
            }
            // ── Prose ────────────────────────────────────────────────────────
            // Accumulate lines until we hit a code fence or table. The current
            // line is neither, so at least one line is always consumed.
            const proseLines = [];
            while (i < lines.length && !isFenceLine(lines[i]) && !hasPipe(lines[i])) {
                proseLines.push(lines[i]);
                i++;
            }
            const proseText = proseLines.join('\n').trim();
            if (proseText.length > 0) {
                segments.push({ kind: 'prose', text: proseText });
            }
        }
        // Convert segments to DocumentChunks. `result.length` is always the
        // next free chunk index.
        const result = [];
        for (const seg of segments) {
            if (seg.text.trim().length === 0) {
                continue;
            }
            if (seg.kind === 'prose') {
                // Split prose with the fixed strategy.
                result.push(...fixedChunks(seg.text, chunkSize, chunkOverlap, result.length));
            }
            else {
                // Code blocks and tables are emitted whole, tagged by kind.
                result.push({
                    content: seg.text,
                    index: result.length,
                    metadata: { type: seg.kind },
                });
            }
        }
        return result;
    }
}
|
|
508
|
+
//# sourceMappingURL=ChunkingEngine.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ChunkingEngine.js","sourceRoot":"","sources":["../../../src/memory/ingestion/ChunkingEngine.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAuDH,8EAA8E;AAC9E,qBAAqB;AACrB,8EAA8E;AAE9E,+CAA+C;AAC/C,MAAM,kBAAkB,GAAG,GAAG,CAAC;AAE/B,gEAAgE;AAChE,MAAM,qBAAqB,GAAG,EAAE,CAAC;AAEjC;;;;GAIG;AACH,MAAM,wBAAwB,GAAG,GAAG,CAAC;AAErC;;;;GAIG;AACH,MAAM,6BAA6B,GAAG,CAAC,CAAC;AAExC,8EAA8E;AAC9E,6BAA6B;AAC7B,8EAA8E;AAE9E;;;;;;;;;GASG;AACH,SAAS,gBAAgB,CAAC,CAAW,EAAE,CAAW;IAChD,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACnB,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACpB,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACtB,CAAC;IACD,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;AACxD,CAAC;AAED,8EAA8E;AAC9E,gEAAgE;AAChE,8EAA8E;AAE9E;;;;;;;;;;;;GAYG;AACH,SAAS,WAAW,CAClB,OAAe,EACf,SAAiB,EACjB,YAAoB,EACpB,aAAqB,CAAC,EACtB,YAAsC;IAEtC,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,IAAI,UAAU,GAAG,UAAU,CAAC;IAE5B,OAAO,GAAG,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;QAC5B,wCAAwC;QACxC,IAAI,GAAG,GAAG,GAAG,GAAG,SAAS,CAAC;QAE1B,IAAI,GAAG,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;YAC1B,+DAA+D;YAC/D,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YACxC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,MAAM,CAAC,IAAI,CAAC;oBACV,OAAO,EAAE,KAAK;oBACd,KAAK,EAAE,UAAU,EAAE;oBACnB,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,EAAE,GAAG,YAAY,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;iBAC3D,CAAC,CAAC;YACL,CAAC;YACD,MAAM;QACR,CAAC;QAED,yEAAyE;QACzE,oCAAoC;QACpC,OAAO,GAAG,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YAC7C,GAAG,EAAE,CAAC;QACR,CAAC;QAED,iEAAiE;QACjE,IAAI,GAAG,KAAK,GAAG,EAAE,CAAC;YAChB,GAAG,GAAG,GAAG,GAAG,SAAS,CAAC;QACxB,CAAC;QAED,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,
CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QAC7C,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrB,MAAM,CAAC,IAAI,CAAC;gBACV,OAAO,EAAE,KAAK;gBACd,KAAK,EAAE,UAAU,EAAE;gBACnB,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,EAAE,GAAG,YAAY,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aAC3D,CAAC,CAAC;QACL,CAAC;QAED,gEAAgE;QAChE,GAAG,GAAG,GAAG,GAAG,YAAY,CAAC;QACzB,IAAI,GAAG,IAAI,CAAC;YAAE,GAAG,GAAG,GAAG,CAAC,CAAC,+CAA+C;IAC1E,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,8EAA8E;AAC9E,iBAAiB;AACjB,8EAA8E;AAE9E;;;;;;;;;GASG;AACH,MAAM,OAAO,cAAc;IACzB,4EAA4E;IAC5E,aAAa;IACb,4EAA4E;IAE5E;;;;;;;;;OASG;IACH,KAAK,CAAC,KAAK,CAAC,OAAe,EAAE,OAAqB;QAChD,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,kBAAkB,CAAC;QAC1D,MAAM,YAAY,GAAG,OAAO,CAAC,YAAY,IAAI,qBAAqB,CAAC;QAEnE,QAAQ,OAAO,CAAC,QAAQ,EAAE,CAAC;YACzB,KAAK,OAAO;gBACV,OAAO,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;YAE5D,KAAK,UAAU;gBACb,OAAO,IAAI,CAAC,cAAc,CAAC,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;YAEhF,KAAK,cAAc;gBACjB,OAAO,IAAI,CAAC,kBAAkB,CAAC,OAAO,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;YAEnE,KAAK,QAAQ;gBACX,OAAO,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;YAE7D,OAAO,CAAC,CAAC,CAAC;gBACR,mCAAmC;gBACnC,MAAM,KAAK,GAAU,OAAO,CAAC,QAAQ,CAAC;gBACtC,MAAM,IAAI,KAAK,CAAC,qCAAqC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;IACH,CAAC;IAED,4EAA4E;IAC5E,kBAAkB;IAClB,4EAA4E;IAE5E;;;;;;;;OAQG;IACK,WAAW,CACjB,OAAe,EACf,SAAiB,EACjB,YAAoB;QAEpB,OAAO,WAAW,CAAC,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC;IAC1D,CAAC;IAED,4EAA4E;IAC5E,qBAAqB;IACrB,4EAA4E;IAE5E;;;;;;;;;;;;;;;OAeG;IACK,KAAK,CAAC,cAAc,CAC1B,OAAe,EACf,SAAiB,EACjB,YAAoB,EACpB,OAAkD;QAElD,8CAA8C;QAC9C,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;QAC5D,CAAC;QAED,yEAAyE;QACzE,mEAAmE;QACnE,sEAAsE;QACtE,IAAI,SAAmB,CAAC;QACxB,IAAI,CAAC;YACH,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,wBAAwB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACzF,CAAC;QAAC,MAAM,CAAC;YACP,yEAAyE;YACzE,SAAS,GAAG,OAAO,CAAC,KAAK,
CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAC3E,CAAC;QAED,4CAA4C;QAC5C,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;QAC5D,CAAC;QAED,sCAAsC;QACtC,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3B,OAAO,CAAC,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC;QACtD,CAAC;QAED,6BAA6B;QAC7B,MAAM,UAAU,GAAG,MAAM,OAAO,CAAC,SAAS,CAAC,CAAC;QAE5C,yEAAyE;QACzE,8CAA8C;QAC9C,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAC;QACrC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC9C,MAAM,GAAG,GAAG,gBAAgB,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YAC/D,IAAI,GAAG,GAAG,wBAAwB,EAAE,CAAC;gBACnC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YACpB,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,MAAM,MAAM,GAAe,EAAE,CAAC;QAC9B,IAAI,YAAY,GAAa,EAAE,CAAC;QAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;YAChC,IAAI,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACpD,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;gBAC1B,YAAY,GAAG,EAAE,CAAC;YACpB,CAAC;QACH,CAAC;QAED,kEAAkE;QAClE,MAAM,YAAY,GAAG,SAAS,GAAG,6BAA6B,CAAC;QAC/D,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YACzC,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAS;YAErC,IAAI,SAAS,CAAC,MAAM,GAAG,YAAY,EAAE,CAAC;gBACpC,qDAAqD;gBACrD,MAAM,SAAS,GAAG,WAAW,CAAC,SAAS,EAAE,SAAS,EAAE,YAAY,EAAE,UAAU,CAAC,CAAC;gBAC9E,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;gBAC1B,UAAU,IAAI,SAAS,CAAC,MAAM,CAAC;YACjC,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,UAAU,EAAE,EAAE,CAAC,CAAC;YAC3D,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,4EAA4E;IAC5E,yBAAyB;IACzB,4EAA4E;IAE5E;;;;;;;;;;;;OAYG;IACK,kBAAkB,CACxB,OAAe,EACf,SAAiB,EACjB
,YAAoB;QAEpB,MAAM,YAAY,GAAG,qBAAqB,CAAC;QAW3C,MAAM,QAAQ,GAAc,EAAE,CAAC;QAE/B,sDAAsD;QACtD,MAAM,YAAY,GAA2C,EAAE,CAAC;QAEhE,2DAA2D;QAC3D,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,IAAI,cAAkC,CAAC;QACvC,IAAI,mBAAuC,CAAC;QAC5C,IAAI,gBAAgB,GAAa,EAAE,CAAC;QAEpC,0DAA0D;QAC1D,MAAM,OAAO,GAAuE,EAAE,CAAC;QACvF,IAAI,CAAyB,CAAC;QAC9B,OAAO,CAAC,CAAC,GAAG,YAAY,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACjD,OAAO,CAAC,IAAI,CAAC;gBACX,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM;gBAC1B,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM;gBAClB,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;aAClB,CAAC,CAAC;QACL,CAAC;QAED,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACzB,0DAA0D;YAC1D,QAAQ,CAAC,IAAI,CAAC;gBACZ,OAAO,EAAE,SAAS;gBAClB,YAAY,EAAE,SAAS;gBACvB,gBAAgB,EAAE,EAAE;gBACpB,IAAI,EAAE,OAAO;aACd,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,4CAA4C;YAC5C,MAAM,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;YAC3D,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,QAAQ,CAAC,IAAI,CAAC;oBACZ,OAAO,EAAE,SAAS;oBAClB,YAAY,EAAE,SAAS;oBACvB,gBAAgB,EAAE,EAAE;oBACpB,IAAI,EAAE,QAAQ;iBACf,CAAC,CAAC;YACL,CAAC;YAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACxC,MAAM,KAAK,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;gBACzB,MAAM,SAAS,GAAG,CAAC,GAAG,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC;gBACjF,+DAA+D;gBAC/D,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,EAAE,SAAS,CAAC,CAAC,IAAI,EAAE,CAAC;gBAExD,iEAAiE;gBACjE,OAAO,YAAY,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,KAAK,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;oBAC7F,YAAY,CAAC,GAAG,EAAE,CAAC;gBACrB,CAAC;gBACD,MAAM,SAAS,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;gBAClD,YAAY,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC;gBAE5D,QAAQ,CAAC,IAAI,CAAC;oBACZ,OAAO,EAAE,KAAK,CAAC,IAAI;oBACnB,YAAY,EAAE,KAAK,CAAC,KAAK;oBAC
zB,gBAAgB,EAAE,SAAS;oBAC3B,IAAI,EAAE,IAAI;iBACX,CAAC,CAAC;gBAEH,KAAK,YAAY,CAAC,CAAC,gCAAgC;gBACnD,KAAK,cAAc,CAAC;gBACpB,KAAK,mBAAmB,CAAC;gBACzB,KAAK,gBAAgB,CAAC;gBACtB,YAAY,GAAG,SAAS,CAAC;gBACzB,cAAc,GAAG,KAAK,CAAC,IAAI,CAAC;gBAC5B,mBAAmB,GAAG,KAAK,CAAC,KAAK,CAAC;gBAClC,gBAAgB,GAAG,CAAC,GAAG,SAAS,CAAC,CAAC;YACpC,CAAC;QACH,CAAC;QAED,4CAA4C;QAC5C,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC;YAC1B,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,IAAI,OAAO,CAAC,OAAO,KAAK,SAAS;gBAAE,SAAS;YAEjE,yDAAyD;YACzD,MAAM,WAAW,GAA4B,EAAE,CAAC;YAChD,IAAI,OAAO,CAAC,YAAY,KAAK,SAAS,EAAE,CAAC;gBACvC,WAAW,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;YAClD,CAAC;YACD,IAAI,OAAO,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxC,WAAW,CAAC,gBAAgB,GAAG,OAAO,CAAC,gBAAgB,CAAC;YAC1D,CAAC;YAED,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACtB,sDAAsD;gBACtD,MAAM,CAAC,IAAI,CAAC;oBACV,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE;oBAC9B,KAAK,EAAE,UAAU,EAAE;oBACnB,OAAO,EAAE,OAAO,CAAC,OAAO;oBACxB,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;iBACxE,CAAC,CAAC;gBACH,SAAS;YACX,CAAC;YAED,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;gBAC7B,0BAA0B;gBAC1B,MAAM,CAAC,IAAI,CAAC;oBACV,OAAO,EAAE,IAAI;oBACb,KAAK,EAAE,UAAU,EAAE;oBACnB,OAAO,EAAE,OAAO,CAAC,OAAO;oBACxB,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;iBACxE,CAAC,CAAC;YACL,CAAC;iBAAM,CAAC;gBACN,yEAAyE;gBACzE,wCAAwC;gBACxC,MAAM,SAAS,GAAG,WAAW,CAAC,IAAI,EAAE,SAAS,EAAE,YAAY,EAAE,UAAU,EAAE,WAAW,CAAC,CAAC;gBACtF,KAAK,MAAM,EAAE,IAAI,SAAS,EAAE,CAAC;oBAC3B,MAAM,CAAC,IAAI,CAAC;wBACV,GAAG,EAAE;wBACL,OAAO,EAAE,OAAO,CAAC,OAAO;qBACzB,CAAC,CAAC;oBACH,UAAU,EAAE,CAAC;gBACf,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,4EAA4E;IAC5E,mBAAmB;IACnB,4EAA4E;IAE5E;;;;;;;;;;;;;;OAcG;IACK,YAAY,CAClB,OAAe,EACf,SAAiB,EACjB,YAAoB;QAWpB,MAAM,QAAQ,GAAc,EAAE,CAAC;QAC/B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC
,IAAI,CAAC,GAAG,CAAC,CAAC;QAEV,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YACxB,wEAAwE;YACxE,IAAI,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gBAC1B,kEAAkE;gBAClE,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC;gBACvD,MAAM,UAAU,GAAa,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxC,CAAC,EAAE,CAAC;gBACJ,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;oBACvD,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;oBAC1B,CAAC,EAAE,CAAC;gBACN,CAAC;gBACD,wCAAwC;gBACxC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;oBACrB,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;oBAC1B,CAAC,EAAE,CAAC;gBACN,CAAC;gBACD,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBAC7D,SAAS;YACX,CAAC;YAED,yEAAyE;YACzE,kEAAkE;YAClE,4CAA4C;YAC5C,IAAI,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gBACxB,MAAM,UAAU,GAAa,EAAE,CAAC;gBAChC,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;oBAC3E,gEAAgE;oBAChE,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBACpD,MAAM;oBACR,CAAC;oBACD,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;oBAC1B,CAAC,EAAE,CAAC;gBACN,CAAC;gBACD,IAAI,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;oBACzC,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBAChE,CAAC;qBAAM,CAAC;oBACN,2CAA2C;oBAC3C,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBAChE,CAAC;gBACD,SAAS;YACX,CAAC;YAED,wEAAwE;YACxE,uDAAuD;YACvD,MAAM,UAAU,GAAa,EAAE,CAAC;YAChC,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gBAC1E,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC1B,CAAC,EAAE,
CAAC;YACN,CAAC;YACD,MAAM,SAAS,GAAG,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;YAC/C,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACzB,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC,CAAC;YACpD,CAAC;QACH,CAAC;QAED,sCAAsC;QACtC,MAAM,MAAM,GAAoB,EAAE,CAAC;QACnC,IAAI,UAAU,GAAG,CAAC,CAAC;QAEnB,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;YAC3B,IAAI,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;gBAAE,SAAS;YAE3C,QAAQ,GAAG,CAAC,IAAI,EAAE,CAAC;gBACjB,KAAK,MAAM;oBACT,MAAM,CAAC,IAAI,CAAC;wBACV,OAAO,EAAE,GAAG,CAAC,IAAI;wBACjB,KAAK,EAAE,UAAU,EAAE;wBACnB,QAAQ,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE;qBAC3B,CAAC,CAAC;oBACH,MAAM;gBAER,KAAK,OAAO;oBACV,MAAM,CAAC,IAAI,CAAC;wBACV,OAAO,EAAE,GAAG,CAAC,IAAI;wBACjB,KAAK,EAAE,UAAU,EAAE;wBACnB,QAAQ,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE;qBAC5B,CAAC,CAAC;oBACH,MAAM;gBAER,KAAK,OAAO,CAAC,CAAC,CAAC;oBACb,uCAAuC;oBACvC,MAAM,WAAW,GAAG,WAAW,CAAC,GAAG,CAAC,IAAI,EAAE,SAAS,EAAE,YAAY,EAAE,UAAU,CAAC,CAAC;oBAC/E,MAAM,CAAC,IAAI,CAAC,GAAG,WAAW,CAAC,CAAC;oBAC5B,UAAU,IAAI,WAAW,CAAC,MAAM,CAAC;oBACjC,MAAM;gBACR,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;CACF"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview DoclingLoader — high-fidelity PDF/DOCX extraction via Python Docling.
|
|
3
|
+
*
|
|
4
|
+
* Docling (https://github.com/DS4SD/docling) is an IBM Research open-source
|
|
5
|
+
* library that converts PDFs and office documents to structured JSON, preserving
|
|
6
|
+
* tables, figures, and layout information far beyond what pure-JS text extraction
|
|
7
|
+
* can achieve.
|
|
8
|
+
*
|
|
9
|
+
* This module provides a factory function {@link createDoclingLoader} that:
|
|
10
|
+
* 1. Checks whether `python3 -m docling --version` succeeds in the current PATH.
|
|
11
|
+
* 2. If it does, returns a {@link DoclingLoader} instance that spawns a
|
|
12
|
+
* `python3 -m docling` subprocess for each document.
|
|
13
|
+
* 3. If Docling is not installed, returns `null` gracefully.
|
|
14
|
+
*
|
|
15
|
+
* ### Opting in
|
|
16
|
+
* ```sh
|
|
17
|
+
* pip install docling
|
|
18
|
+
* ```
|
|
19
|
+
*
|
|
20
|
+
* @module memory/ingestion/DoclingLoader
|
|
21
|
+
*/
|
|
22
|
+
import type { IDocumentLoader } from './IDocumentLoader.js';
|
|
23
|
+
/**
|
|
24
|
+
* Checks whether `python3 -m docling` is available in the current environment
|
|
25
|
+
* and, if so, returns a new {@link DoclingLoader} instance; otherwise returns
|
|
26
|
+
* `null`.
|
|
27
|
+
*
|
|
28
|
+
* The availability check runs `python3 -m docling --version` synchronously
|
|
29
|
+
* via `spawnSync` — it exits quickly and is only called once during registry
|
|
30
|
+
* initialisation.
|
|
31
|
+
*
|
|
32
|
+
* ### Usage
|
|
33
|
+
* ```ts
|
|
34
|
+
* import { createDoclingLoader } from './DoclingLoader.js';
|
|
35
|
+
* import { PdfLoader } from './PdfLoader.js';
|
|
36
|
+
*
|
|
37
|
+
* const doclingLoader = createDoclingLoader();
|
|
38
|
+
* const loader = new PdfLoader(null, doclingLoader);
|
|
39
|
+
* ```
|
|
40
|
+
*
|
|
41
|
+
* @returns A `DoclingLoader` instance when Docling is installed, or `null`.
|
|
42
|
+
*/
|
|
43
|
+
export declare function createDoclingLoader(): IDocumentLoader | null;
|
|
44
|
+
//# sourceMappingURL=DoclingLoader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"DoclingLoader.d.ts","sourceRoot":"","sources":["../../../src/memory/ingestion/DoclingLoader.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAMH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,sBAAsB,CAAC;AAgO5D;;;;;;;;;;;;;;;;;;;GAmBG;AACH,wBAAgB,mBAAmB,IAAI,eAAe,GAAG,IAAI,CAgB5D"}
|