unrag 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. package/README.md +2 -2
  2. package/dist/cli/index.js +251 -42
  3. package/package.json +2 -1
  4. package/registry/config/unrag.config.ts +140 -7
  5. package/registry/connectors/notion/render.ts +78 -0
  6. package/registry/connectors/notion/sync.ts +12 -3
  7. package/registry/connectors/notion/types.ts +3 -1
  8. package/registry/core/assets.ts +54 -0
  9. package/registry/core/config.ts +150 -0
  10. package/registry/core/context-engine.ts +69 -1
  11. package/registry/core/index.ts +15 -2
  12. package/registry/core/ingest.ts +743 -17
  13. package/registry/core/types.ts +606 -0
  14. package/registry/docs/unrag.md +6 -0
  15. package/registry/embedding/ai.ts +89 -8
  16. package/registry/extractors/_shared/fetch.ts +113 -0
  17. package/registry/extractors/_shared/media.ts +14 -0
  18. package/registry/extractors/_shared/text.ts +11 -0
  19. package/registry/extractors/audio-transcribe/index.ts +75 -0
  20. package/registry/extractors/file-docx/index.ts +53 -0
  21. package/registry/extractors/file-pptx/index.ts +92 -0
  22. package/registry/extractors/file-text/index.ts +85 -0
  23. package/registry/extractors/file-xlsx/index.ts +58 -0
  24. package/registry/extractors/image-caption-llm/index.ts +60 -0
  25. package/registry/extractors/image-ocr/index.ts +60 -0
  26. package/registry/extractors/pdf-llm/index.ts +84 -0
  27. package/registry/extractors/pdf-ocr/index.ts +125 -0
  28. package/registry/extractors/pdf-text-layer/index.ts +76 -0
  29. package/registry/extractors/video-frames/index.ts +126 -0
  30. package/registry/extractors/video-transcribe/index.ts +78 -0
  31. package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
@@ -1,12 +1,75 @@
1
1
  import type {
2
+ AssetInput,
3
+ AssetProcessingConfig,
4
+ AssetProcessingPlanItem,
5
+ IngestPlanResult,
6
+ AssetExtractor,
7
+ AssetExtractorContext,
2
8
  Chunk,
3
9
  IngestInput,
4
10
  IngestResult,
11
+ IngestWarning,
5
12
  ResolvedContextEngineConfig,
6
13
  } from "./types";
7
14
 
8
15
  const now = () => performance.now();
9
16
 
17
/**
 * Recursively merges `overrides` into a copy of `base` and returns the copy.
 *
 * - Neither argument is mutated. A falsy `overrides` returns `base` itself.
 * - Override keys whose value is `undefined` are ignored (the base value is kept).
 * - The merge only recurses when the base value is a non-array object and the
 *   override value is a plain object. Arrays, primitives and non-plain objects
 *   (Date, typed arrays, Map, ...) replace the base value as-is instead of being
 *   spread into it.
 * - An own `__proto__` key on `overrides` (possible after `JSON.parse`) is skipped,
 *   because assigning it would replace the prototype of the merged object.
 *
 * @param base - Object providing the default values.
 * @param overrides - Partial structure whose defined values take precedence.
 * @returns The merged copy, typed as `T`.
 */
const mergeDeep = <T extends Record<string, any>>(
  base: T,
  overrides: any | undefined
): T => {
  if (!overrides) return base;

  // True for object literals and Object.create(null) objects only.
  const isPlainObject = (value: unknown): boolean => {
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
      return false;
    }
    const proto = Object.getPrototypeOf(value);
    return proto === Object.prototype || proto === null;
  };

  const out: any = Array.isArray(base) ? [...base] : { ...base };
  for (const key of Object.keys(overrides)) {
    // `out["__proto__"] = x` goes through the prototype setter; never honour it.
    if (key === "__proto__") continue;

    const nextVal = overrides[key];
    if (nextVal === undefined) continue;

    const baseVal = (base as any)[key];
    const baseIsMergeable =
      Boolean(baseVal) && typeof baseVal === "object" && !Array.isArray(baseVal);

    out[key] =
      baseIsMergeable && isPlainObject(nextVal)
        ? mergeDeep(baseVal, nextVal)
        : nextVal;
  }
  return out as T;
};
42
+
43
/**
 * Converts an arbitrary thrown value into a human-readable message string.
 *
 * - `Error` instances yield their `message`.
 * - Strings are returned unchanged.
 * - Everything else is JSON-serialised. When serialisation throws (e.g. a
 *   circular structure or a bigint) or produces no output, `String(err)` is used.
 *
 * @param err - The caught value.
 * @returns Always a string.
 */
const asMessage = (err: unknown): string => {
  if (err instanceof Error) return err.message;
  if (typeof err === "string") return err;
  try {
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols, so fall back to String() to keep the return value a string.
    const json: string | undefined = JSON.stringify(err);
    return json ?? String(err);
  } catch {
    return String(err);
  }
};
51
+
52
/**
 * Maps `items` through the async `fn` with at most `concurrency` calls in flight,
 * returning results in input order.
 *
 * `concurrency` is floored and clamped to at least 1; `0` and `NaN` are treated as 1.
 *
 * If any call rejects, the returned promise rejects with that error and no further
 * items are started. Calls already in flight are left to settle on their own.
 *
 * @param items - Values to process.
 * @param concurrency - Maximum number of simultaneous `fn` calls.
 * @param fn - Async mapper receiving the item and its index.
 * @returns Results positioned by input index.
 */
const mapWithConcurrency = async <T, R>(
  items: T[],
  concurrency: number,
  fn: (item: T, idx: number) => Promise<R>
): Promise<R[]> => {
  const limit = Math.max(1, Math.floor(concurrency || 1));
  const results: R[] = new Array(items.length);
  let nextIdx = 0;
  // Set once any task fails so idle workers stop pulling new items.
  let aborted = false;

  const worker = async (): Promise<void> => {
    while (!aborted) {
      const i = nextIdx++;
      if (i >= items.length) return;
      try {
        results[i] = await fn(items[i] as T, i);
      } catch (err) {
        aborted = true;
        throw err;
      }
    }
  };

  const workers = Array.from(
    { length: Math.min(limit, items.length) },
    () => worker()
  );

  await Promise.all(workers);
  return results;
};
72
+
10
73
  export const ingest = async (
11
74
  config: ResolvedContextEngineConfig,
12
75
  input: IngestInput
@@ -14,6 +77,10 @@ export const ingest = async (
14
77
  const totalStart = now();
15
78
  const chunkingStart = now();
16
79
 
80
+ const storeChunkContent = config.storage.storeChunkContent;
81
+ const storeDocumentContent = config.storage.storeDocumentContent;
82
+ const storedDocumentContent = storeDocumentContent ? input.content : "";
83
+
17
84
  const chunkingOptions = {
18
85
  ...config.defaults,
19
86
  ...input.chunking,
@@ -22,36 +89,473 @@ export const ingest = async (
22
89
  const metadata = input.metadata ?? {};
23
90
  const documentId = config.idGenerator();
24
91
 
25
- const chunks = config.chunker(input.content, chunkingOptions).map<Chunk>(
26
- (chunk) => ({
27
- id: config.idGenerator(),
28
- documentId,
92
+ const assetProcessing: AssetProcessingConfig = mergeDeep(
93
+ config.assetProcessing,
94
+ input.assetProcessing
95
+ );
96
+
97
+ type PreparedChunk = {
98
+ chunk: Chunk;
99
+ embed:
100
+ | { kind: "text"; text: string }
101
+ | { kind: "image"; data: Uint8Array | string; mediaType?: string; assetId?: string };
102
+ };
103
+
104
+ const prepared: PreparedChunk[] = [];
105
+ const warnings: IngestWarning[] = [];
106
+
107
+ const baseTextChunks = config.chunker(input.content, chunkingOptions);
108
+ for (const c of baseTextChunks) {
109
+ prepared.push({
110
+ chunk: {
111
+ id: config.idGenerator(),
112
+ documentId,
113
+ sourceId: input.sourceId,
114
+ index: c.index,
115
+ content: storeChunkContent ? c.content : "",
116
+ tokenCount: storeChunkContent ? c.tokenCount : 0,
117
+ metadata,
118
+ documentContent: storedDocumentContent,
119
+ },
120
+ embed: { kind: "text", text: c.content },
121
+ });
122
+ }
123
+
124
+ const assets: AssetInput[] = Array.isArray(input.assets) ? input.assets : [];
125
+ type PreparedChunkSpec = Omit<Chunk, "id" | "index"> & {
126
+ metadata: Record<string, any>;
127
+ embed:
128
+ | { kind: "text"; text: string }
129
+ | { kind: "image"; data: Uint8Array | string; mediaType?: string; assetId?: string };
130
+ storedContent: string;
131
+ storedTokenCount: number;
132
+ };
133
+
134
+ const extractorCtx: AssetExtractorContext = {
135
+ sourceId: input.sourceId,
136
+ documentId,
137
+ documentMetadata: metadata,
138
+ assetProcessing,
139
+ };
140
+
141
/**
 * Runs `args.extractors` against one asset, in the order given, and converts every
 * non-blank text item they return into chunk specs using the ingest chunker.
 *
 * - Emits "extractor:start", "extractor:success" and "extractor:error" through
 *   `assetProcessing.hooks.onEvent` when that hook is set.
 * - When an extractor throws and `assetProcessing.onError === "fail"`, the error is
 *   rethrown. Otherwise it is recorded as an "asset_processing_error" warning and
 *   the next extractor is tried.
 * - With `stopOnFirstNonEmpty`, the loop stops as soon as any spec has been collected.
 *
 * Returns the collected specs, the warnings, and the name of every extractor attempted.
 */
const runExtractors = async (args: {
  asset: AssetInput;
  assetMeta: Record<string, any>;
  assetUri?: string;
  assetMediaType?: string;
  extractors: AssetExtractor[];
  stopOnFirstNonEmpty: boolean;
}): Promise<{
  specs: PreparedChunkSpec[];
  warnings: IngestWarning[];
  attemptedExtractors: string[];
}> => {
  const { asset, assetMeta, assetUri, assetMediaType } = args;
  const specs: PreparedChunkSpec[] = [];
  const extractionWarnings: IngestWarning[] = [];
  const attemptedExtractors: string[] = [];

  for (const extractor of args.extractors) {
    attemptedExtractors.push(extractor.name);
    const startedAt = now();
    assetProcessing.hooks?.onEvent?.({
      type: "extractor:start",
      sourceId: input.sourceId,
      documentId,
      assetId: asset.assetId,
      assetKind: asset.kind,
      extractor: extractor.name,
    });

    try {
      const result = await extractor.extract({ asset, ctx: extractorCtx });
      const durationMs = now() - startedAt;
      const textItems = Array.isArray(result?.texts) ? result.texts : [];
      assetProcessing.hooks?.onEvent?.({
        type: "extractor:success",
        sourceId: input.sourceId,
        documentId,
        assetId: asset.assetId,
        assetKind: asset.kind,
        extractor: extractor.name,
        durationMs,
        textItemCount: textItems.length,
      });

      // Coerce each item's content to a string, then drop whitespace-only items.
      const usableItems = textItems
        .map((raw) => ({ ...raw, content: (raw.content ?? "").toString() }))
        .filter((candidate) => candidate.content.trim().length > 0);

      for (const item of usableItems) {
        for (const piece of config.chunker(item.content, chunkingOptions)) {
          // Persisted text/token count are blanked when chunk storage is off;
          // the text sent for embedding is always the real chunk content.
          const persistedContent = storeChunkContent ? piece.content : "";
          const persistedTokenCount = storeChunkContent ? piece.tokenCount : 0;

          const chunkMeta: Record<string, any> = {
            ...assetMeta,
            ...(result?.metadata ?? {}),
            extractor: extractor.name,
            extractorLabel: item.label,
          };
          if (item.confidence !== undefined) {
            chunkMeta.extractorConfidence = item.confidence;
          }
          if (item.pageRange) {
            chunkMeta.extractorPageRange = item.pageRange;
          }
          if (item.timeRangeSec) {
            chunkMeta.extractorTimeRangeSec = item.timeRangeSec;
          }

          specs.push({
            documentId,
            sourceId: input.sourceId,
            content: persistedContent,
            tokenCount: persistedTokenCount,
            documentContent: storedDocumentContent,
            metadata: chunkMeta,
            embed: { kind: "text", text: piece.content },
            storedContent: persistedContent,
            storedTokenCount: persistedTokenCount,
          });
        }
      }

      if (args.stopOnFirstNonEmpty && specs.length > 0) {
        break;
      }
    } catch (err) {
      const durationMs = now() - startedAt;
      assetProcessing.hooks?.onEvent?.({
        type: "extractor:error",
        sourceId: input.sourceId,
        documentId,
        assetId: asset.assetId,
        assetKind: asset.kind,
        extractor: extractor.name,
        durationMs,
        errorMessage: asMessage(err),
      });

      if (assetProcessing.onError === "fail") throw err;

      // Record the failure and fall through to the next extractor as a fallback.
      extractionWarnings.push({
        code: "asset_processing_error",
        message: `Asset processing failed but was skipped due to onError="skip": ${asMessage(err)}`,
        assetId: asset.assetId,
        assetKind: asset.kind,
        stage: "extract",
        ...(assetUri ? { assetUri } : {}),
        ...(assetMediaType ? { assetMediaType } : {}),
      });
    }
  }

  return { specs, warnings: extractionWarnings, attemptedExtractors };
};
248
+
249
/**
 * Turns one asset into chunk specs, or into warnings explaining why it was skipped.
 *
 * - Images: embedded directly when the embedding provider has `embedImage`;
 *   otherwise the caption (`asset.text`) is chunked as text. Matching extractors
 *   are run in addition.
 * - PDFs and all other kinds: handled only by matching extractors.
 *
 * A skip emits an "asset:skipped" event and returns the skip warning. It throws
 * instead when the relevant policy (`onError` for processing/empty-extraction
 * warnings, `onUnsupportedAsset` for everything else) is "fail".
 *
 * Warnings collected while extractors ran (e.g. "asset_processing_error") are kept
 * in the result even when the asset ends up skipped, so the underlying failure
 * stays visible next to the skip warning.
 */
const processAsset = async (
  asset: AssetInput
): Promise<{ specs: PreparedChunkSpec[]; warnings: IngestWarning[] }> => {
  const assetUri =
    asset.uri ?? (asset.data.kind === "url" ? asset.data.url : undefined);
  // mediaType is read the same way for "bytes" and "url" data.
  const assetMediaType = asset.data.mediaType;

  const assetMeta = {
    ...metadata,
    ...(asset.metadata ?? {}),
    assetKind: asset.kind,
    assetId: asset.assetId,
    ...(assetUri ? { assetUri } : {}),
    ...(assetMediaType ? { assetMediaType } : {}),
  };

  assetProcessing.hooks?.onEvent?.({
    type: "asset:start",
    sourceId: input.sourceId,
    documentId,
    assetId: asset.assetId,
    assetKind: asset.kind,
    ...(assetUri ? { assetUri } : {}),
    ...(assetMediaType ? { assetMediaType } : {}),
  });

  // Processing and empty-extraction warnings follow `onError`; every other skip
  // reason follows `onUnsupportedAsset`.
  const shouldFailForWarning = (w: IngestWarning): boolean => {
    switch (w.code) {
      case "asset_processing_error":
      case "asset_skipped_pdf_empty_extraction":
      case "asset_skipped_extraction_empty":
        return assetProcessing.onError === "fail";
      default:
        return assetProcessing.onUnsupportedAsset === "fail";
    }
  };

  // Emits the skip event, applies the fail policy, and returns an empty result
  // carrying `earlier` warnings followed by the skip warning itself.
  const skip = (
    w: IngestWarning,
    earlier: IngestWarning[] = []
  ): { specs: PreparedChunkSpec[]; warnings: IngestWarning[] } => {
    assetProcessing.hooks?.onEvent?.({
      type: "asset:skipped",
      sourceId: input.sourceId,
      documentId,
      ...w,
    });
    if (shouldFailForWarning(w)) {
      throw new Error(w.message);
    }
    return { specs: [], warnings: [...earlier, w] };
  };

  const matching = config.extractors.filter((ex) =>
    ex.supports({ asset, ctx: extractorCtx })
  );

  const extract = () =>
    runExtractors({
      asset,
      assetMeta,
      assetUri,
      assetMediaType,
      extractors: matching,
      stopOnFirstNonEmpty: true,
    });

  // Image handling stays in core for now (direct embed or caption fallback).
  if (asset.kind === "image") {
    const caption = (asset.text ?? "").trim();
    const specs: PreparedChunkSpec[] = [];
    const imageWarnings: IngestWarning[] = [];

    if (config.embedding.embedImage) {
      const storedCaption = storeChunkContent ? caption : "";
      // Whitespace-separated word count stands in for a token count here.
      const storedCaptionTokenCount = storedCaption
        ? storedCaption.split(/\s+/).filter(Boolean).length
        : 0;
      const data =
        asset.data.kind === "bytes" ? asset.data.bytes : asset.data.url;

      specs.push({
        documentId,
        sourceId: input.sourceId,
        content: storedCaption,
        tokenCount: storedCaptionTokenCount,
        metadata: { ...assetMeta, extractor: "image:embed" },
        documentContent: storedDocumentContent,
        embed: {
          kind: "image",
          data,
          mediaType: assetMediaType,
          assetId: asset.assetId,
        },
        storedContent: storedCaption,
        storedTokenCount: storedCaptionTokenCount,
      });
    } else if (caption) {
      for (const c of config.chunker(caption, chunkingOptions)) {
        const storedContent = storeChunkContent ? c.content : "";
        const storedTokenCount = storeChunkContent ? c.tokenCount : 0;
        specs.push({
          documentId,
          sourceId: input.sourceId,
          content: storedContent,
          tokenCount: storedTokenCount,
          metadata: { ...assetMeta, extractor: "image:caption" },
          documentContent: storedDocumentContent,
          embed: { kind: "text", text: c.content },
          storedContent,
          storedTokenCount,
        });
      }
    }

    if (matching.length > 0) {
      const r = await extract();
      specs.push(...r.specs);
      imageWarnings.push(...r.warnings);
    }

    if (specs.length > 0) {
      return { specs, warnings: imageWarnings };
    }

    return skip(
      {
        code: "asset_skipped_image_no_multimodal_and_no_caption",
        message:
          "Image skipped because embedding provider does not support embedImage(), assets[].text (caption/alt) is empty, and no enabled image extractors are configured.",
        assetId: asset.assetId,
        assetKind: "image",
        ...(assetUri ? { assetUri } : {}),
        ...(assetMediaType ? { assetMediaType } : {}),
      },
      imageWarnings
    );
  }

  // PDF handling uses extractors when enabled.
  if (asset.kind === "pdf") {
    if (matching.length === 0) {
      // If ALL configured PDF extraction approaches are disabled, emit a specific warning.
      if (
        !assetProcessing.pdf.llmExtraction.enabled &&
        !assetProcessing.pdf.textLayer.enabled &&
        !assetProcessing.pdf.ocr.enabled
      ) {
        return skip({
          code: "asset_skipped_pdf_llm_extraction_disabled",
          message:
            "PDF skipped because no PDF extraction strategy is enabled (assetProcessing.pdf.*.enabled are all false).",
          assetId: asset.assetId,
          assetKind: "pdf",
          ...(assetUri ? { assetUri } : {}),
          ...(assetMediaType ? { assetMediaType } : {}),
        });
      }

      return skip({
        code: "asset_skipped_unsupported_kind",
        message:
          'PDF extraction is enabled but no installed extractor supports this asset. Install/configure a PDF extractor module (e.g. "pdf-llm", "pdf-text-layer").',
        assetId: asset.assetId,
        assetKind: "pdf",
        ...(assetUri ? { assetUri } : {}),
        ...(assetMediaType ? { assetMediaType } : {}),
      });
    }

    const { specs, warnings: w } = await extract();

    if (specs.length === 0) {
      return skip(
        {
          code: "asset_skipped_pdf_empty_extraction",
          message:
            "PDF extraction returned empty text. The PDF may be scanned/image-only or the extractor failed to extract readable content.",
          assetId: asset.assetId,
          assetKind: "pdf",
          ...(assetUri ? { assetUri } : {}),
          ...(assetMediaType ? { assetMediaType } : {}),
        },
        w
      );
    }

    return { specs, warnings: w };
  }

  // Audio/video/file: attempt extractors if any, otherwise treat as unsupported.
  if (matching.length === 0) {
    // Distinguish "disabled by config" vs "no extractor installed".
    const disabledByConfig =
      (asset.kind === "audio" && !assetProcessing.audio.transcription.enabled) ||
      (asset.kind === "video" &&
        !assetProcessing.video.transcription.enabled &&
        !assetProcessing.video.frames.enabled) ||
      (asset.kind === "file" &&
        !assetProcessing.file.text.enabled &&
        !assetProcessing.file.docx.enabled &&
        !assetProcessing.file.pptx.enabled &&
        !assetProcessing.file.xlsx.enabled);

    if (disabledByConfig) {
      return skip({
        code: "asset_skipped_extraction_disabled",
        message: `Asset skipped because extraction for kind "${asset.kind}" is disabled by config.`,
        assetId: asset.assetId,
        assetKind: asset.kind,
        ...(assetUri ? { assetUri } : {}),
        ...(assetMediaType ? { assetMediaType } : {}),
      });
    }

    return skip({
      code: "asset_skipped_unsupported_kind",
      message: `Asset skipped because kind "${asset.kind}" is not supported by the built-in pipeline.`,
      assetId: asset.assetId,
      assetKind: asset.kind,
      ...(assetUri ? { assetUri } : {}),
      ...(assetMediaType ? { assetMediaType } : {}),
    });
  }

  const { specs, warnings: w } = await extract();

  if (specs.length === 0) {
    return skip(
      {
        code: "asset_skipped_extraction_empty",
        message:
          "All configured extractors returned empty text outputs for this asset.",
        assetId: asset.assetId,
        assetKind: asset.kind,
        ...(assetUri ? { assetUri } : {}),
        ...(assetMediaType ? { assetMediaType } : {}),
      },
      w
    );
  }

  return { specs, warnings: w };
};
500
+
501
+ const assetResults = await mapWithConcurrency(
502
+ assets,
503
+ assetProcessing.concurrency,
504
+ async (asset) => processAsset(asset)
36
505
  );
37
506
 
507
+ let nextIndex = baseTextChunks.length;
508
+ for (const r of assetResults) {
509
+ for (let i = 0; i < r.specs.length; i++) {
510
+ const spec = r.specs[i]!;
511
+ prepared.push({
512
+ chunk: {
513
+ id: config.idGenerator(),
514
+ documentId: spec.documentId,
515
+ sourceId: spec.sourceId,
516
+ index: nextIndex++,
517
+ content: spec.storedContent,
518
+ tokenCount: spec.storedTokenCount,
519
+ metadata: spec.metadata,
520
+ documentContent: spec.documentContent,
521
+ },
522
+ embed: spec.embed,
523
+ });
524
+ }
525
+ warnings.push(...r.warnings);
526
+ }
527
+
38
528
  const chunkingMs = now() - chunkingStart;
39
529
  const embeddingStart = now();
40
530
 
41
531
  const embeddedChunks = await Promise.all(
42
- chunks.map(async (chunk) => {
532
+ prepared.map(async ({ chunk, embed }) => {
533
+ if (embed.kind === "image") {
534
+ const embedImage = config.embedding.embedImage;
535
+ if (!embedImage) {
536
+ throw new Error("Image embedding requested but provider does not support embedImage()");
537
+ }
538
+ const embedding = await embedImage({
539
+ data: embed.data,
540
+ mediaType: embed.mediaType,
541
+ metadata: chunk.metadata,
542
+ position: chunk.index,
543
+ sourceId: chunk.sourceId,
544
+ documentId: chunk.documentId,
545
+ assetId: embed.assetId,
546
+ });
547
+ return { ...chunk, embedding };
548
+ }
549
+
43
550
  const embedding = await config.embedding.embed({
44
- text: chunk.content,
45
- metadata,
551
+ text: embed.text,
552
+ metadata: chunk.metadata,
46
553
  position: chunk.index,
47
554
  sourceId: chunk.sourceId,
48
555
  documentId: chunk.documentId,
49
556
  });
50
557
 
51
- return {
52
- ...chunk,
53
- embedding,
54
- };
558
+ return { ...chunk, embedding };
55
559
  })
56
560
  );
57
561
 
@@ -67,6 +571,7 @@ export const ingest = async (
67
571
  documentId,
68
572
  chunkCount: embeddedChunks.length,
69
573
  embeddingModel: config.embedding.name,
574
+ warnings,
70
575
  durations: {
71
576
  totalMs,
72
577
  chunkingMs,
@@ -76,4 +581,225 @@ export const ingest = async (
76
581
  };
77
582
  };
78
583
 
584
/**
 * Dry-run for ingestion. Reports, per asset, whether it would be processed (and by
 * which extractors) or skipped (and why). It never runs an extractor's `extract`,
 * never calls the embedding provider, and never touches the store.
 *
 * It does call each extractor's `supports()` and fires the "asset:start" and
 * "asset:skipped" hooks, so configured hooks observe the dry run.
 *
 * Unlike `ingest`, a skipped asset never throws here: the `onError` and
 * `onUnsupportedAsset` policies are not applied, and the skip is only reported.
 *
 * @param config - Resolved engine configuration (extractors, embedding, defaults).
 * @param input - The ingest input whose `assets` are to be planned.
 * @returns A freshly generated document id, the source id, one plan item per asset,
 *   and the warnings that the skips would produce.
 */
export const planIngest = async (
  config: ResolvedContextEngineConfig,
  input: IngestInput
): Promise<IngestPlanResult> => {
  const documentId = config.idGenerator();
  const metadata = input.metadata ?? {};

  const assetProcessing: AssetProcessingConfig = mergeDeep(
    config.assetProcessing,
    input.assetProcessing
  );

  const assets: AssetInput[] = Array.isArray(input.assets) ? input.assets : [];
  const warnings: IngestWarning[] = [];
  const plan: AssetProcessingPlanItem[] = [];

  // The extractor context does not depend on the asset, so build it once.
  const extractorCtx: AssetExtractorContext = {
    sourceId: input.sourceId,
    documentId,
    documentMetadata: metadata,
    assetProcessing,
  };

  for (const asset of assets) {
    const assetUri =
      asset.uri ?? (asset.data.kind === "url" ? asset.data.url : undefined);
    // mediaType is read the same way for "bytes" and "url" data.
    const assetMediaType = asset.data.mediaType;

    // Records a skip in one place: warning, "asset:skipped" event, and a plan item
    // whose reason is the warning code, so the two cannot drift apart.
    const willSkip = (w: IngestWarning) => {
      warnings.push(w);
      assetProcessing.hooks?.onEvent?.({
        type: "asset:skipped",
        sourceId: input.sourceId,
        documentId,
        ...w,
      });
      plan.push({
        assetId: asset.assetId,
        kind: asset.kind,
        uri: asset.uri,
        status: "will_skip",
        reason: w.code,
      });
    };

    const willProcess = (extractors: string[]) => {
      plan.push({
        assetId: asset.assetId,
        kind: asset.kind,
        uri: asset.uri,
        status: "will_process",
        extractors,
      });
    };

    assetProcessing.hooks?.onEvent?.({
      type: "asset:start",
      sourceId: input.sourceId,
      documentId,
      assetId: asset.assetId,
      assetKind: asset.kind,
      ...(assetUri ? { assetUri } : {}),
      ...(assetMediaType ? { assetMediaType } : {}),
    });

    const matchingExtractors = config.extractors.filter((ex) =>
      ex.supports({ asset, ctx: extractorCtx })
    );
    const matchingNames = matchingExtractors.map((e) => e.name);

    if (asset.kind === "pdf") {
      if (matchingExtractors.length > 0) {
        willProcess(matchingNames);
        continue;
      }

      const allPdfStrategiesDisabled =
        !assetProcessing.pdf.llmExtraction.enabled &&
        !assetProcessing.pdf.textLayer.enabled &&
        !assetProcessing.pdf.ocr.enabled;

      if (allPdfStrategiesDisabled) {
        willSkip({
          code: "asset_skipped_pdf_llm_extraction_disabled",
          message:
            "PDF would be skipped because no PDF extraction strategy is enabled (assetProcessing.pdf.*.enabled are all false).",
          assetId: asset.assetId,
          assetKind: "pdf",
          ...(assetUri ? { assetUri } : {}),
          ...(assetMediaType ? { assetMediaType } : {}),
        });
        continue;
      }

      willSkip({
        code: "asset_skipped_unsupported_kind",
        message:
          'PDF extraction is enabled but no installed extractor supports this asset. Install/configure a PDF extractor module (e.g. "pdf-llm", "pdf-text-layer").',
        assetId: asset.assetId,
        assetKind: "pdf",
        ...(assetUri ? { assetUri } : {}),
        ...(assetMediaType ? { assetMediaType } : {}),
      });
      continue;
    }

    if (asset.kind === "image") {
      // Core image handling comes first (direct embed, else caption), followed by
      // any extractor that reports support for the asset.
      const extractors: string[] = [];
      if (config.embedding.embedImage) {
        extractors.push("image:embed");
      } else if ((asset.text ?? "").trim()) {
        extractors.push("image:caption");
      }
      extractors.push(...matchingNames);

      if (extractors.length > 0) {
        willProcess(extractors);
        continue;
      }

      willSkip({
        code: "asset_skipped_image_no_multimodal_and_no_caption",
        message:
          "Image would be skipped because embedding provider does not support embedImage(), assets[].text is empty, and no enabled image extractors are configured.",
        assetId: asset.assetId,
        assetKind: "image",
        ...(assetUri ? { assetUri } : {}),
        ...(assetMediaType ? { assetMediaType } : {}),
      });
      continue;
    }

    // Audio/video/file.
    if (matchingExtractors.length > 0) {
      willProcess(matchingNames);
      continue;
    }

    // Distinguish "disabled by config" from "no extractor installed".
    const disabledByConfig =
      (asset.kind === "audio" && !assetProcessing.audio.transcription.enabled) ||
      (asset.kind === "video" &&
        !assetProcessing.video.transcription.enabled &&
        !assetProcessing.video.frames.enabled) ||
      (asset.kind === "file" &&
        !assetProcessing.file.text.enabled &&
        !assetProcessing.file.docx.enabled &&
        !assetProcessing.file.pptx.enabled &&
        !assetProcessing.file.xlsx.enabled);

    if (disabledByConfig) {
      willSkip({
        code: "asset_skipped_extraction_disabled",
        message: `Asset would be skipped because extraction for kind "${asset.kind}" is disabled by config.`,
        assetId: asset.assetId,
        assetKind: asset.kind,
        ...(assetUri ? { assetUri } : {}),
        ...(assetMediaType ? { assetMediaType } : {}),
      });
      continue;
    }

    willSkip({
      code: "asset_skipped_unsupported_kind",
      message: `Asset would be skipped because kind "${asset.kind}" is not supported by the built-in pipeline.`,
      assetId: asset.assetId,
      assetKind: asset.kind,
      ...(assetUri ? { assetUri } : {}),
      ...(assetMediaType ? { assetMediaType } : {}),
    });
  }

  return {
    documentId,
    sourceId: input.sourceId,
    assets: plan,
    warnings,
  };
};
804
+
79
805