unrag 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/cli/index.js +408 -50
- package/package.json +3 -1
- package/registry/config/unrag.config.ts +164 -7
- package/registry/connectors/notion/render.ts +78 -0
- package/registry/connectors/notion/sync.ts +12 -3
- package/registry/connectors/notion/types.ts +3 -1
- package/registry/core/assets.ts +54 -0
- package/registry/core/config.ts +150 -0
- package/registry/core/context-engine.ts +69 -1
- package/registry/core/index.ts +15 -2
- package/registry/core/ingest.ts +743 -17
- package/registry/core/types.ts +606 -0
- package/registry/docs/unrag.md +6 -0
- package/registry/embedding/ai.ts +89 -8
- package/registry/extractors/_shared/fetch.ts +113 -0
- package/registry/extractors/_shared/media.ts +14 -0
- package/registry/extractors/_shared/text.ts +11 -0
- package/registry/extractors/audio-transcribe/index.ts +75 -0
- package/registry/extractors/file-docx/index.ts +53 -0
- package/registry/extractors/file-pptx/index.ts +92 -0
- package/registry/extractors/file-text/index.ts +85 -0
- package/registry/extractors/file-xlsx/index.ts +58 -0
- package/registry/extractors/image-caption-llm/index.ts +60 -0
- package/registry/extractors/image-ocr/index.ts +60 -0
- package/registry/extractors/pdf-llm/index.ts +84 -0
- package/registry/extractors/pdf-ocr/index.ts +125 -0
- package/registry/extractors/pdf-text-layer/index.ts +76 -0
- package/registry/extractors/video-frames/index.ts +126 -0
- package/registry/extractors/video-transcribe/index.ts +78 -0
- package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
package/registry/core/ingest.ts
CHANGED
|
@@ -1,12 +1,75 @@
|
|
|
1
1
|
import type {
|
|
2
|
+
AssetInput,
|
|
3
|
+
AssetProcessingConfig,
|
|
4
|
+
AssetProcessingPlanItem,
|
|
5
|
+
IngestPlanResult,
|
|
6
|
+
AssetExtractor,
|
|
7
|
+
AssetExtractorContext,
|
|
2
8
|
Chunk,
|
|
3
9
|
IngestInput,
|
|
4
10
|
IngestResult,
|
|
11
|
+
IngestWarning,
|
|
5
12
|
ResolvedContextEngineConfig,
|
|
6
13
|
} from "./types";
|
|
7
14
|
|
|
8
15
|
const now = () => performance.now();
|
|
9
16
|
|
|
17
|
+
/**
 * Recursively layers `overrides` on top of `base` and returns a new value;
 * neither argument is mutated.
 *
 * Rules:
 * - A falsy `overrides` returns `base` itself (no copy is made).
 * - Keys whose override value is `undefined` keep the base value.
 * - Only pairs of *plain* objects (prototype is `Object.prototype` or `null`)
 *   are merged recursively. Arrays and every other kind of object (`Date`,
 *   `Map`, typed arrays, class instances, ...) are replaced wholesale by the
 *   override, because copying their enumerable own keys would not preserve them.
 * - An own `__proto__` key in `overrides` is ignored, so parsed JSON cannot
 *   swap the prototype of the merged result.
 *
 * @param base - The default values.
 * @param overrides - Partial values that take precedence over `base`.
 * @returns The merged value, typed as `T`.
 */
const mergeDeep = <T extends Record<string, any>>(
  base: T,
  overrides: any | undefined
): T => {
  if (!overrides) return base;

  // True only for object literals / Object.create(null) style values.
  const isPlainObject = (value: unknown): value is Record<string, any> => {
    if (value === null || typeof value !== "object") return false;
    const proto = Object.getPrototypeOf(value);
    return proto === Object.prototype || proto === null;
  };

  const out: any = Array.isArray(base) ? [...base] : { ...base };

  for (const key of Object.keys(overrides)) {
    // Assigning to "__proto__" would invoke the prototype setter on `out`.
    if (key === "__proto__") continue;

    const nextVal = overrides[key];
    if (nextVal === undefined) continue;

    const baseVal = (base as any)[key];
    out[key] =
      isPlainObject(baseVal) && isPlainObject(nextVal)
        ? mergeDeep(baseVal, nextVal)
        : nextVal;
  }

  return out as T;
};
|
|
42
|
+
|
|
43
|
+
/**
 * Converts an arbitrary thrown value into a human-readable string.
 *
 * - `Error` instances yield their `message`.
 * - Strings are returned unchanged.
 * - Everything else is JSON-encoded when possible, otherwise stringified.
 *
 * @param err - The caught value (anything can be thrown in JavaScript).
 * @returns A string in every case; this function never throws.
 */
const asMessage = (err: unknown): string => {
  if (err instanceof Error) return err.message;
  if (typeof err === "string") return err;

  try {
    // JSON.stringify yields `undefined` (not a string) for undefined,
    // functions and symbols, so the result has to be checked.
    const json: string | undefined = JSON.stringify(err);
    if (typeof json === "string") return json;
  } catch {
    // Circular structures and BigInt values throw; fall through to String().
  }

  try {
    return String(err);
  } catch {
    // String() throws for objects that cannot be converted to a primitive
    // (e.g. created with Object.create(null)).
    return Object.prototype.toString.call(err);
  }
};
|
|
51
|
+
|
|
52
|
+
/**
 * Maps `items` through the async `fn`, running at most `concurrency` calls at
 * the same time, and resolves with the results in input order.
 *
 * `concurrency` is floored and clamped to a minimum of 1; `0`, `NaN` and
 * negative values therefore behave like 1.
 *
 * If any call fails, the returned promise rejects with that error and no
 * further items are started. Calls that are already in flight are not
 * cancelled, but their results are discarded.
 *
 * @param items - Values to process.
 * @param concurrency - Maximum number of simultaneous `fn` calls.
 * @param fn - Async mapper receiving the item and its index.
 * @returns Results positioned at the same index as their input item.
 */
const mapWithConcurrency = async <T, R>(
  items: T[],
  concurrency: number,
  fn: (item: T, idx: number) => Promise<R>
): Promise<R[]> => {
  const limit = Math.max(1, Math.floor(concurrency || 1));
  const results: R[] = new Array(items.length);
  let nextIdx = 0;
  // Set once any task fails so idle workers stop pulling new work.
  let aborted = false;

  const worker = async (): Promise<void> => {
    while (!aborted) {
      // Claiming an index is synchronous, so two workers never share one.
      const i = nextIdx++;
      if (i >= items.length) return;
      try {
        results[i] = await fn(items[i]!, i);
      } catch (err) {
        aborted = true;
        throw err;
      }
    }
  };

  const workers = Array.from(
    { length: Math.min(limit, items.length) },
    () => worker()
  );

  await Promise.all(workers);
  return results;
};
|
|
72
|
+
|
|
10
73
|
export const ingest = async (
|
|
11
74
|
config: ResolvedContextEngineConfig,
|
|
12
75
|
input: IngestInput
|
|
@@ -14,6 +77,10 @@ export const ingest = async (
|
|
|
14
77
|
const totalStart = now();
|
|
15
78
|
const chunkingStart = now();
|
|
16
79
|
|
|
80
|
+
const storeChunkContent = config.storage.storeChunkContent;
|
|
81
|
+
const storeDocumentContent = config.storage.storeDocumentContent;
|
|
82
|
+
const storedDocumentContent = storeDocumentContent ? input.content : "";
|
|
83
|
+
|
|
17
84
|
const chunkingOptions = {
|
|
18
85
|
...config.defaults,
|
|
19
86
|
...input.chunking,
|
|
@@ -22,36 +89,473 @@ export const ingest = async (
|
|
|
22
89
|
const metadata = input.metadata ?? {};
|
|
23
90
|
const documentId = config.idGenerator();
|
|
24
91
|
|
|
25
|
-
const
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
92
|
+
const assetProcessing: AssetProcessingConfig = mergeDeep(
|
|
93
|
+
config.assetProcessing,
|
|
94
|
+
input.assetProcessing
|
|
95
|
+
);
|
|
96
|
+
|
|
97
|
+
type PreparedChunk = {
|
|
98
|
+
chunk: Chunk;
|
|
99
|
+
embed:
|
|
100
|
+
| { kind: "text"; text: string }
|
|
101
|
+
| { kind: "image"; data: Uint8Array | string; mediaType?: string; assetId?: string };
|
|
102
|
+
};
|
|
103
|
+
|
|
104
|
+
const prepared: PreparedChunk[] = [];
|
|
105
|
+
const warnings: IngestWarning[] = [];
|
|
106
|
+
|
|
107
|
+
const baseTextChunks = config.chunker(input.content, chunkingOptions);
|
|
108
|
+
for (const c of baseTextChunks) {
|
|
109
|
+
prepared.push({
|
|
110
|
+
chunk: {
|
|
111
|
+
id: config.idGenerator(),
|
|
112
|
+
documentId,
|
|
113
|
+
sourceId: input.sourceId,
|
|
114
|
+
index: c.index,
|
|
115
|
+
content: storeChunkContent ? c.content : "",
|
|
116
|
+
tokenCount: storeChunkContent ? c.tokenCount : 0,
|
|
117
|
+
metadata,
|
|
118
|
+
documentContent: storedDocumentContent,
|
|
119
|
+
},
|
|
120
|
+
embed: { kind: "text", text: c.content },
|
|
121
|
+
});
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
const assets: AssetInput[] = Array.isArray(input.assets) ? input.assets : [];
|
|
125
|
+
type PreparedChunkSpec = Omit<Chunk, "id" | "index"> & {
|
|
126
|
+
metadata: Record<string, any>;
|
|
127
|
+
embed:
|
|
128
|
+
| { kind: "text"; text: string }
|
|
129
|
+
| { kind: "image"; data: Uint8Array | string; mediaType?: string; assetId?: string };
|
|
130
|
+
storedContent: string;
|
|
131
|
+
storedTokenCount: number;
|
|
132
|
+
};
|
|
133
|
+
|
|
134
|
+
const extractorCtx: AssetExtractorContext = {
|
|
135
|
+
sourceId: input.sourceId,
|
|
136
|
+
documentId,
|
|
137
|
+
documentMetadata: metadata,
|
|
138
|
+
assetProcessing,
|
|
139
|
+
};
|
|
140
|
+
|
|
141
|
+
const runExtractors = async (args: {
|
|
142
|
+
asset: AssetInput;
|
|
143
|
+
assetMeta: Record<string, any>;
|
|
144
|
+
assetUri?: string;
|
|
145
|
+
assetMediaType?: string;
|
|
146
|
+
extractors: AssetExtractor[];
|
|
147
|
+
stopOnFirstNonEmpty: boolean;
|
|
148
|
+
}): Promise<{
|
|
149
|
+
specs: PreparedChunkSpec[];
|
|
150
|
+
warnings: IngestWarning[];
|
|
151
|
+
attemptedExtractors: string[];
|
|
152
|
+
}> => {
|
|
153
|
+
const outSpecs: PreparedChunkSpec[] = [];
|
|
154
|
+
const outWarnings: IngestWarning[] = [];
|
|
155
|
+
const attemptedExtractors: string[] = [];
|
|
156
|
+
|
|
157
|
+
for (const ex of args.extractors) {
|
|
158
|
+
attemptedExtractors.push(ex.name);
|
|
159
|
+
const start = now();
|
|
160
|
+
assetProcessing.hooks?.onEvent?.({
|
|
161
|
+
type: "extractor:start",
|
|
162
|
+
sourceId: input.sourceId,
|
|
163
|
+
documentId,
|
|
164
|
+
assetId: args.asset.assetId,
|
|
165
|
+
assetKind: args.asset.kind,
|
|
166
|
+
extractor: ex.name,
|
|
167
|
+
});
|
|
168
|
+
|
|
169
|
+
try {
|
|
170
|
+
const res = await ex.extract({ asset: args.asset, ctx: extractorCtx });
|
|
171
|
+
const durationMs = now() - start;
|
|
172
|
+
const items = Array.isArray(res?.texts) ? res.texts : [];
|
|
173
|
+
assetProcessing.hooks?.onEvent?.({
|
|
174
|
+
type: "extractor:success",
|
|
175
|
+
sourceId: input.sourceId,
|
|
176
|
+
documentId,
|
|
177
|
+
assetId: args.asset.assetId,
|
|
178
|
+
assetKind: args.asset.kind,
|
|
179
|
+
extractor: ex.name,
|
|
180
|
+
durationMs,
|
|
181
|
+
textItemCount: items.length,
|
|
182
|
+
});
|
|
183
|
+
|
|
184
|
+
const nonEmptyItems = items
|
|
185
|
+
.map((t) => ({ ...t, content: (t.content ?? "").toString() }))
|
|
186
|
+
.filter((t) => t.content.trim().length > 0);
|
|
187
|
+
|
|
188
|
+
for (const item of nonEmptyItems) {
|
|
189
|
+
const chunks = config.chunker(item.content, chunkingOptions);
|
|
190
|
+
for (const c of chunks) {
|
|
191
|
+
outSpecs.push({
|
|
192
|
+
documentId,
|
|
193
|
+
sourceId: input.sourceId,
|
|
194
|
+
content: storeChunkContent ? c.content : "",
|
|
195
|
+
tokenCount: storeChunkContent ? c.tokenCount : 0,
|
|
196
|
+
documentContent: storedDocumentContent,
|
|
197
|
+
metadata: {
|
|
198
|
+
...args.assetMeta,
|
|
199
|
+
...(res?.metadata ?? {}),
|
|
200
|
+
extractor: ex.name,
|
|
201
|
+
extractorLabel: item.label,
|
|
202
|
+
...(item.confidence !== undefined
|
|
203
|
+
? { extractorConfidence: item.confidence }
|
|
204
|
+
: {}),
|
|
205
|
+
...(item.pageRange ? { extractorPageRange: item.pageRange } : {}),
|
|
206
|
+
...(item.timeRangeSec ? { extractorTimeRangeSec: item.timeRangeSec } : {}),
|
|
207
|
+
},
|
|
208
|
+
embed: { kind: "text", text: c.content },
|
|
209
|
+
storedContent: storeChunkContent ? c.content : "",
|
|
210
|
+
storedTokenCount: storeChunkContent ? c.tokenCount : 0,
|
|
211
|
+
});
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
if (outSpecs.length > 0 && args.stopOnFirstNonEmpty) {
|
|
216
|
+
break;
|
|
217
|
+
}
|
|
218
|
+
} catch (err) {
|
|
219
|
+
const durationMs = now() - start;
|
|
220
|
+
assetProcessing.hooks?.onEvent?.({
|
|
221
|
+
type: "extractor:error",
|
|
222
|
+
sourceId: input.sourceId,
|
|
223
|
+
documentId,
|
|
224
|
+
assetId: args.asset.assetId,
|
|
225
|
+
assetKind: args.asset.kind,
|
|
226
|
+
extractor: ex.name,
|
|
227
|
+
durationMs,
|
|
228
|
+
errorMessage: asMessage(err),
|
|
229
|
+
});
|
|
230
|
+
|
|
231
|
+
if (assetProcessing.onError === "fail") throw err;
|
|
232
|
+
outWarnings.push({
|
|
233
|
+
code: "asset_processing_error",
|
|
234
|
+
message: `Asset processing failed but was skipped due to onError="skip": ${asMessage(err)}`,
|
|
235
|
+
assetId: args.asset.assetId,
|
|
236
|
+
assetKind: args.asset.kind,
|
|
237
|
+
stage: "extract",
|
|
238
|
+
...(args.assetUri ? { assetUri: args.assetUri } : {}),
|
|
239
|
+
...(args.assetMediaType ? { assetMediaType: args.assetMediaType } : {}),
|
|
240
|
+
});
|
|
241
|
+
|
|
242
|
+
// try next extractor as fallback
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
return { specs: outSpecs, warnings: outWarnings, attemptedExtractors };
|
|
247
|
+
};
|
|
248
|
+
|
|
249
|
+
const processAsset = async (
|
|
250
|
+
asset: AssetInput
|
|
251
|
+
): Promise<{ specs: PreparedChunkSpec[]; warnings: IngestWarning[] }> => {
|
|
252
|
+
const assetUri =
|
|
253
|
+
asset.uri ?? (asset.data.kind === "url" ? asset.data.url : undefined);
|
|
254
|
+
const assetMediaType =
|
|
255
|
+
asset.data.kind === "bytes" ? asset.data.mediaType : asset.data.mediaType;
|
|
256
|
+
|
|
257
|
+
const assetMeta = {
|
|
258
|
+
...metadata,
|
|
259
|
+
...(asset.metadata ?? {}),
|
|
260
|
+
assetKind: asset.kind,
|
|
261
|
+
assetId: asset.assetId,
|
|
262
|
+
...(assetUri ? { assetUri } : {}),
|
|
263
|
+
...(assetMediaType ? { assetMediaType } : {}),
|
|
264
|
+
};
|
|
265
|
+
|
|
266
|
+
assetProcessing.hooks?.onEvent?.({
|
|
267
|
+
type: "asset:start",
|
|
29
268
|
sourceId: input.sourceId,
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
})
|
|
269
|
+
documentId,
|
|
270
|
+
assetId: asset.assetId,
|
|
271
|
+
assetKind: asset.kind,
|
|
272
|
+
...(assetUri ? { assetUri } : {}),
|
|
273
|
+
...(assetMediaType ? { assetMediaType } : {}),
|
|
274
|
+
});
|
|
275
|
+
|
|
276
|
+
const shouldFailForWarning = (w: IngestWarning): boolean => {
|
|
277
|
+
if (w.code === "asset_processing_error") {
|
|
278
|
+
return assetProcessing.onError === "fail";
|
|
279
|
+
}
|
|
280
|
+
if (w.code === "asset_skipped_pdf_empty_extraction") {
|
|
281
|
+
return assetProcessing.onError === "fail";
|
|
282
|
+
}
|
|
283
|
+
if (w.code === "asset_skipped_extraction_empty") {
|
|
284
|
+
return assetProcessing.onError === "fail";
|
|
285
|
+
}
|
|
286
|
+
return assetProcessing.onUnsupportedAsset === "fail";
|
|
287
|
+
};
|
|
288
|
+
|
|
289
|
+
const skip = (w: IngestWarning) => {
|
|
290
|
+
assetProcessing.hooks?.onEvent?.({
|
|
291
|
+
type: "asset:skipped",
|
|
292
|
+
sourceId: input.sourceId,
|
|
293
|
+
documentId,
|
|
294
|
+
...w,
|
|
295
|
+
});
|
|
296
|
+
if (shouldFailForWarning(w)) {
|
|
297
|
+
throw new Error(w.message);
|
|
298
|
+
}
|
|
299
|
+
return { specs: [], warnings: [w] };
|
|
300
|
+
};
|
|
301
|
+
|
|
302
|
+
// Image handling stays in core for now (direct embed or caption fallback).
|
|
303
|
+
if (asset.kind === "image") {
|
|
304
|
+
const caption = (asset.text ?? "").trim();
|
|
305
|
+
const storedCaption = storeChunkContent ? caption : "";
|
|
306
|
+
const storedCaptionTokenCount = storedCaption
|
|
307
|
+
? storedCaption.split(/\s+/).filter(Boolean).length
|
|
308
|
+
: 0;
|
|
309
|
+
|
|
310
|
+
const specs: PreparedChunkSpec[] = [];
|
|
311
|
+
const warnings: IngestWarning[] = [];
|
|
312
|
+
|
|
313
|
+
if (config.embedding.embedImage) {
|
|
314
|
+
const data =
|
|
315
|
+
asset.data.kind === "bytes" ? asset.data.bytes : asset.data.url;
|
|
316
|
+
const mediaType =
|
|
317
|
+
asset.data.kind === "bytes"
|
|
318
|
+
? asset.data.mediaType
|
|
319
|
+
: asset.data.mediaType;
|
|
320
|
+
|
|
321
|
+
specs.push({
|
|
322
|
+
documentId,
|
|
323
|
+
sourceId: input.sourceId,
|
|
324
|
+
content: storedCaption,
|
|
325
|
+
tokenCount: storedCaptionTokenCount,
|
|
326
|
+
metadata: { ...assetMeta, extractor: "image:embed" },
|
|
327
|
+
documentContent: storedDocumentContent,
|
|
328
|
+
embed: { kind: "image", data, mediaType, assetId: asset.assetId },
|
|
329
|
+
storedContent: storedCaption,
|
|
330
|
+
storedTokenCount: storedCaptionTokenCount,
|
|
331
|
+
});
|
|
332
|
+
} else if (caption) {
|
|
333
|
+
const captionChunks = config.chunker(caption, chunkingOptions);
|
|
334
|
+
for (const c of captionChunks) {
|
|
335
|
+
specs.push({
|
|
336
|
+
documentId,
|
|
337
|
+
sourceId: input.sourceId,
|
|
338
|
+
content: storeChunkContent ? c.content : "",
|
|
339
|
+
tokenCount: storeChunkContent ? c.tokenCount : 0,
|
|
340
|
+
metadata: { ...assetMeta, extractor: "image:caption" },
|
|
341
|
+
documentContent: storedDocumentContent,
|
|
342
|
+
embed: { kind: "text", text: c.content },
|
|
343
|
+
storedContent: storeChunkContent ? c.content : "",
|
|
344
|
+
storedTokenCount: storeChunkContent ? c.tokenCount : 0,
|
|
345
|
+
});
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
const matching = config.extractors.filter((ex) =>
|
|
350
|
+
ex.supports({ asset, ctx: extractorCtx })
|
|
351
|
+
);
|
|
352
|
+
|
|
353
|
+
if (matching.length > 0) {
|
|
354
|
+
const r = await runExtractors({
|
|
355
|
+
asset,
|
|
356
|
+
assetMeta,
|
|
357
|
+
assetUri,
|
|
358
|
+
assetMediaType,
|
|
359
|
+
extractors: matching,
|
|
360
|
+
stopOnFirstNonEmpty: true,
|
|
361
|
+
});
|
|
362
|
+
specs.push(...r.specs);
|
|
363
|
+
warnings.push(...r.warnings);
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
if (specs.length > 0) {
|
|
367
|
+
return { specs, warnings };
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
return skip({
|
|
371
|
+
code: "asset_skipped_image_no_multimodal_and_no_caption",
|
|
372
|
+
message:
|
|
373
|
+
"Image skipped because embedding provider does not support embedImage(), assets[].text (caption/alt) is empty, and no enabled image extractors are configured.",
|
|
374
|
+
assetId: asset.assetId,
|
|
375
|
+
assetKind: "image",
|
|
376
|
+
...(assetUri ? { assetUri } : {}),
|
|
377
|
+
...(assetMediaType ? { assetMediaType } : {}),
|
|
378
|
+
});
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
// PDF handling uses extractors when enabled.
|
|
382
|
+
if (asset.kind === "pdf") {
|
|
383
|
+
const matching = config.extractors.filter((ex) =>
|
|
384
|
+
ex.supports({ asset, ctx: extractorCtx })
|
|
385
|
+
);
|
|
386
|
+
if (matching.length === 0) {
|
|
387
|
+
// If ALL configured PDF extraction approaches are disabled, emit a specific warning.
|
|
388
|
+
if (
|
|
389
|
+
!assetProcessing.pdf.llmExtraction.enabled &&
|
|
390
|
+
!assetProcessing.pdf.textLayer.enabled &&
|
|
391
|
+
!assetProcessing.pdf.ocr.enabled
|
|
392
|
+
) {
|
|
393
|
+
return skip({
|
|
394
|
+
code: "asset_skipped_pdf_llm_extraction_disabled",
|
|
395
|
+
message:
|
|
396
|
+
"PDF skipped because no PDF extraction strategy is enabled (assetProcessing.pdf.*.enabled are all false).",
|
|
397
|
+
assetId: asset.assetId,
|
|
398
|
+
assetKind: "pdf",
|
|
399
|
+
...(assetUri ? { assetUri } : {}),
|
|
400
|
+
...(assetMediaType ? { assetMediaType } : {}),
|
|
401
|
+
});
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
return skip({
|
|
405
|
+
code: "asset_skipped_unsupported_kind",
|
|
406
|
+
message:
|
|
407
|
+
'PDF extraction is enabled but no installed extractor supports this asset. Install/configure a PDF extractor module (e.g. "pdf-llm", "pdf-text-layer").',
|
|
408
|
+
assetId: asset.assetId,
|
|
409
|
+
assetKind: "pdf",
|
|
410
|
+
...(assetUri ? { assetUri } : {}),
|
|
411
|
+
...(assetMediaType ? { assetMediaType } : {}),
|
|
412
|
+
});
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
const { specs, warnings: w } = await runExtractors({
|
|
416
|
+
asset,
|
|
417
|
+
assetMeta,
|
|
418
|
+
assetUri,
|
|
419
|
+
assetMediaType,
|
|
420
|
+
extractors: matching,
|
|
421
|
+
stopOnFirstNonEmpty: true,
|
|
422
|
+
});
|
|
423
|
+
|
|
424
|
+
if (specs.length === 0) {
|
|
425
|
+
return skip({
|
|
426
|
+
code: "asset_skipped_pdf_empty_extraction",
|
|
427
|
+
message:
|
|
428
|
+
"PDF extraction returned empty text. The PDF may be scanned/image-only or the extractor failed to extract readable content.",
|
|
429
|
+
assetId: asset.assetId,
|
|
430
|
+
assetKind: "pdf",
|
|
431
|
+
...(assetUri ? { assetUri } : {}),
|
|
432
|
+
...(assetMediaType ? { assetMediaType } : {}),
|
|
433
|
+
});
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
return { specs, warnings: w };
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
// Audio/video/file: attempt extractors if any, otherwise treat as unsupported.
|
|
440
|
+
const matching = config.extractors.filter((ex) =>
|
|
441
|
+
ex.supports({ asset, ctx: extractorCtx })
|
|
442
|
+
);
|
|
443
|
+
if (matching.length === 0) {
|
|
444
|
+
// Distinguish "disabled by config" vs "no extractor installed".
|
|
445
|
+
const disabledByConfig =
|
|
446
|
+
(asset.kind === "audio" && !assetProcessing.audio.transcription.enabled) ||
|
|
447
|
+
(asset.kind === "video" &&
|
|
448
|
+
!assetProcessing.video.transcription.enabled &&
|
|
449
|
+
!assetProcessing.video.frames.enabled) ||
|
|
450
|
+
(asset.kind === "file" &&
|
|
451
|
+
!assetProcessing.file.text.enabled &&
|
|
452
|
+
!assetProcessing.file.docx.enabled &&
|
|
453
|
+
!assetProcessing.file.pptx.enabled &&
|
|
454
|
+
!assetProcessing.file.xlsx.enabled);
|
|
455
|
+
|
|
456
|
+
if (disabledByConfig) {
|
|
457
|
+
return skip({
|
|
458
|
+
code: "asset_skipped_extraction_disabled",
|
|
459
|
+
message: `Asset skipped because extraction for kind "${asset.kind}" is disabled by config.`,
|
|
460
|
+
assetId: asset.assetId,
|
|
461
|
+
assetKind: asset.kind,
|
|
462
|
+
...(assetUri ? { assetUri } : {}),
|
|
463
|
+
...(assetMediaType ? { assetMediaType } : {}),
|
|
464
|
+
});
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
return skip({
|
|
468
|
+
code: "asset_skipped_unsupported_kind",
|
|
469
|
+
message: `Asset skipped because kind "${asset.kind}" is not supported by the built-in pipeline.`,
|
|
470
|
+
assetId: asset.assetId,
|
|
471
|
+
assetKind: asset.kind,
|
|
472
|
+
...(assetUri ? { assetUri } : {}),
|
|
473
|
+
...(assetMediaType ? { assetMediaType } : {}),
|
|
474
|
+
});
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
const { specs, warnings: w } = await runExtractors({
|
|
478
|
+
asset,
|
|
479
|
+
assetMeta,
|
|
480
|
+
assetUri,
|
|
481
|
+
assetMediaType,
|
|
482
|
+
extractors: matching,
|
|
483
|
+
stopOnFirstNonEmpty: true,
|
|
484
|
+
});
|
|
485
|
+
|
|
486
|
+
if (specs.length === 0) {
|
|
487
|
+
return skip({
|
|
488
|
+
code: "asset_skipped_extraction_empty",
|
|
489
|
+
message:
|
|
490
|
+
"All configured extractors returned empty text outputs for this asset.",
|
|
491
|
+
assetId: asset.assetId,
|
|
492
|
+
assetKind: asset.kind,
|
|
493
|
+
...(assetUri ? { assetUri } : {}),
|
|
494
|
+
...(assetMediaType ? { assetMediaType } : {}),
|
|
495
|
+
});
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
return { specs, warnings: w };
|
|
499
|
+
};
|
|
500
|
+
|
|
501
|
+
const assetResults = await mapWithConcurrency(
|
|
502
|
+
assets,
|
|
503
|
+
assetProcessing.concurrency,
|
|
504
|
+
async (asset) => processAsset(asset)
|
|
36
505
|
);
|
|
37
506
|
|
|
507
|
+
let nextIndex = baseTextChunks.length;
|
|
508
|
+
for (const r of assetResults) {
|
|
509
|
+
for (let i = 0; i < r.specs.length; i++) {
|
|
510
|
+
const spec = r.specs[i]!;
|
|
511
|
+
prepared.push({
|
|
512
|
+
chunk: {
|
|
513
|
+
id: config.idGenerator(),
|
|
514
|
+
documentId: spec.documentId,
|
|
515
|
+
sourceId: spec.sourceId,
|
|
516
|
+
index: nextIndex++,
|
|
517
|
+
content: spec.storedContent,
|
|
518
|
+
tokenCount: spec.storedTokenCount,
|
|
519
|
+
metadata: spec.metadata,
|
|
520
|
+
documentContent: spec.documentContent,
|
|
521
|
+
},
|
|
522
|
+
embed: spec.embed,
|
|
523
|
+
});
|
|
524
|
+
}
|
|
525
|
+
warnings.push(...r.warnings);
|
|
526
|
+
}
|
|
527
|
+
|
|
38
528
|
const chunkingMs = now() - chunkingStart;
|
|
39
529
|
const embeddingStart = now();
|
|
40
530
|
|
|
41
531
|
const embeddedChunks = await Promise.all(
|
|
42
|
-
|
|
532
|
+
prepared.map(async ({ chunk, embed }) => {
|
|
533
|
+
if (embed.kind === "image") {
|
|
534
|
+
const embedImage = config.embedding.embedImage;
|
|
535
|
+
if (!embedImage) {
|
|
536
|
+
throw new Error("Image embedding requested but provider does not support embedImage()");
|
|
537
|
+
}
|
|
538
|
+
const embedding = await embedImage({
|
|
539
|
+
data: embed.data,
|
|
540
|
+
mediaType: embed.mediaType,
|
|
541
|
+
metadata: chunk.metadata,
|
|
542
|
+
position: chunk.index,
|
|
543
|
+
sourceId: chunk.sourceId,
|
|
544
|
+
documentId: chunk.documentId,
|
|
545
|
+
assetId: embed.assetId,
|
|
546
|
+
});
|
|
547
|
+
return { ...chunk, embedding };
|
|
548
|
+
}
|
|
549
|
+
|
|
43
550
|
const embedding = await config.embedding.embed({
|
|
44
|
-
text:
|
|
45
|
-
metadata,
|
|
551
|
+
text: embed.text,
|
|
552
|
+
metadata: chunk.metadata,
|
|
46
553
|
position: chunk.index,
|
|
47
554
|
sourceId: chunk.sourceId,
|
|
48
555
|
documentId: chunk.documentId,
|
|
49
556
|
});
|
|
50
557
|
|
|
51
|
-
return {
|
|
52
|
-
...chunk,
|
|
53
|
-
embedding,
|
|
54
|
-
};
|
|
558
|
+
return { ...chunk, embedding };
|
|
55
559
|
})
|
|
56
560
|
);
|
|
57
561
|
|
|
@@ -67,6 +571,7 @@ export const ingest = async (
|
|
|
67
571
|
documentId,
|
|
68
572
|
chunkCount: embeddedChunks.length,
|
|
69
573
|
embeddingModel: config.embedding.name,
|
|
574
|
+
warnings,
|
|
70
575
|
durations: {
|
|
71
576
|
totalMs,
|
|
72
577
|
chunkingMs,
|
|
@@ -76,4 +581,225 @@ export const ingest = async (
|
|
|
76
581
|
};
|
|
77
582
|
};
|
|
78
583
|
|
|
584
|
+
/**
 * Dry-run for ingestion. Reports, per asset, whether it would be processed
 * (and by which extractors) or skipped (and why), without running any
 * extractor's `extract()`, computing embeddings, or writing to the store.
 *
 * What it does call:
 * - `config.idGenerator()` once, for the `documentId` reported in the result
 *   and in hook events.
 * - `supports()` on every configured extractor, once per asset.
 * - `assetProcessing.hooks.onEvent` (when configured) with an `asset:start`
 *   event per asset and an `asset:skipped` event per asset that would be skipped.
 *
 * Skips are always recorded as warnings and plan entries here; nothing is thrown
 * for them by this function itself.
 *
 * @param config - Resolved engine configuration (extractors, embedding provider,
 *   default asset-processing settings).
 * @param input - The ingest request; `input.assetProcessing` is deep-merged over
 *   `config.assetProcessing` for this call only.
 * @returns The generated `documentId`, the `sourceId`, one plan entry per asset
 *   in input order, and the collected warnings.
 */
export const planIngest = async (
  config: ResolvedContextEngineConfig,
  input: IngestInput
): Promise<IngestPlanResult> => {
  const documentId = config.idGenerator();
  const metadata = input.metadata ?? {};

  const assetProcessing: AssetProcessingConfig = mergeDeep(
    config.assetProcessing,
    input.assetProcessing
  );

  const assets: AssetInput[] = Array.isArray(input.assets) ? input.assets : [];
  const warnings: IngestWarning[] = [];
  const plan: AssetProcessingPlanItem[] = [];

  // Identical for every asset, so it is built once instead of once per iteration.
  const extractorCtx: AssetExtractorContext = {
    sourceId: input.sourceId,
    documentId,
    documentMetadata: metadata,
    assetProcessing,
  };

  for (const asset of assets) {
    // Prefer the explicit uri; fall back to the download URL for url-backed assets.
    const assetUri =
      asset.uri ?? (asset.data.kind === "url" ? asset.data.url : undefined);
    const assetMediaType = asset.data.mediaType;

    // Optional locator fields, attached to events/warnings only when present.
    const locator = {
      ...(assetUri ? { assetUri } : {}),
      ...(assetMediaType ? { assetMediaType } : {}),
    };

    // Records a skip: warning first, then the hook event, then the plan entry.
    const willSkip = (w: IngestWarning): void => {
      warnings.push(w);
      assetProcessing.hooks?.onEvent?.({
        type: "asset:skipped",
        sourceId: input.sourceId,
        documentId,
        ...w,
      });
      plan.push({
        assetId: asset.assetId,
        kind: asset.kind,
        uri: asset.uri,
        status: "will_skip",
        reason: w.code,
      });
    };

    // Records that the asset would be handled by the given extractors.
    const willProcess = (extractors: string[]): void => {
      plan.push({
        assetId: asset.assetId,
        kind: asset.kind,
        uri: asset.uri,
        status: "will_process",
        extractors,
      });
    };

    assetProcessing.hooks?.onEvent?.({
      type: "asset:start",
      sourceId: input.sourceId,
      documentId,
      assetId: asset.assetId,
      assetKind: asset.kind,
      ...locator,
    });

    const matchingExtractors = config.extractors.filter((ex) =>
      ex.supports({ asset, ctx: extractorCtx })
    );

    if (asset.kind === "pdf") {
      if (matchingExtractors.length > 0) {
        willProcess(matchingExtractors.map((e) => e.name));
        continue;
      }

      const allPdfStrategiesDisabled =
        !assetProcessing.pdf.llmExtraction.enabled &&
        !assetProcessing.pdf.textLayer.enabled &&
        !assetProcessing.pdf.ocr.enabled;

      if (allPdfStrategiesDisabled) {
        willSkip({
          code: "asset_skipped_pdf_llm_extraction_disabled",
          message:
            "PDF would be skipped because no PDF extraction strategy is enabled (assetProcessing.pdf.*.enabled are all false).",
          assetId: asset.assetId,
          assetKind: "pdf",
          ...locator,
        });
      } else {
        // Something is enabled, but no installed extractor accepted the asset.
        willSkip({
          code: "asset_skipped_unsupported_kind",
          message:
            'PDF extraction is enabled but no installed extractor supports this asset. Install/configure a PDF extractor module (e.g. "pdf-llm", "pdf-text-layer").',
          assetId: asset.assetId,
          assetKind: "pdf",
          ...locator,
        });
      }
      continue;
    }

    if (asset.kind === "image") {
      const extractors: string[] = [];

      if (config.embedding.embedImage) {
        extractors.push("image:embed");
      } else if ((asset.text ?? "").trim()) {
        // No image embedding available: the caption is only used as a fallback.
        extractors.push("image:caption");
      }

      extractors.push(...matchingExtractors.map((e) => e.name));

      if (extractors.length > 0) {
        willProcess(extractors);
        continue;
      }

      willSkip({
        code: "asset_skipped_image_no_multimodal_and_no_caption",
        message:
          "Image would be skipped because embedding provider does not support embedImage(), assets[].text is empty, and no enabled image extractors are configured.",
        assetId: asset.assetId,
        assetKind: "image",
        ...locator,
      });
      continue;
    }

    // Audio / video / file: rely entirely on extractors.
    if (matchingExtractors.length > 0) {
      willProcess(matchingExtractors.map((e) => e.name));
      continue;
    }

    // Distinguish "disabled by config" from "no extractor accepted the asset".
    const disabledByConfig =
      (asset.kind === "audio" && !assetProcessing.audio.transcription.enabled) ||
      (asset.kind === "video" &&
        !assetProcessing.video.transcription.enabled &&
        !assetProcessing.video.frames.enabled) ||
      (asset.kind === "file" &&
        !assetProcessing.file.text.enabled &&
        !assetProcessing.file.docx.enabled &&
        !assetProcessing.file.pptx.enabled &&
        !assetProcessing.file.xlsx.enabled);

    if (disabledByConfig) {
      willSkip({
        code: "asset_skipped_extraction_disabled",
        message: `Asset would be skipped because extraction for kind "${asset.kind}" is disabled by config.`,
        assetId: asset.assetId,
        assetKind: asset.kind,
        ...locator,
      });
    } else {
      willSkip({
        code: "asset_skipped_unsupported_kind",
        message: `Asset would be skipped because kind "${asset.kind}" is not supported by the built-in pipeline.`,
        assetId: asset.assetId,
        assetKind: asset.kind,
        ...locator,
      });
    }
  }

  return {
    documentId,
    sourceId: input.sourceId,
    assets: plan,
    warnings,
  };
};
|
|
804
|
+
|
|
79
805
|
|