unrag 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31):
  1. package/README.md +2 -2
  2. package/dist/cli/index.js +408 -50
  3. package/package.json +3 -1
  4. package/registry/config/unrag.config.ts +164 -7
  5. package/registry/connectors/notion/render.ts +78 -0
  6. package/registry/connectors/notion/sync.ts +12 -3
  7. package/registry/connectors/notion/types.ts +3 -1
  8. package/registry/core/assets.ts +54 -0
  9. package/registry/core/config.ts +150 -0
  10. package/registry/core/context-engine.ts +69 -1
  11. package/registry/core/index.ts +15 -2
  12. package/registry/core/ingest.ts +743 -17
  13. package/registry/core/types.ts +606 -0
  14. package/registry/docs/unrag.md +6 -0
  15. package/registry/embedding/ai.ts +89 -8
  16. package/registry/extractors/_shared/fetch.ts +113 -0
  17. package/registry/extractors/_shared/media.ts +14 -0
  18. package/registry/extractors/_shared/text.ts +11 -0
  19. package/registry/extractors/audio-transcribe/index.ts +75 -0
  20. package/registry/extractors/file-docx/index.ts +53 -0
  21. package/registry/extractors/file-pptx/index.ts +92 -0
  22. package/registry/extractors/file-text/index.ts +85 -0
  23. package/registry/extractors/file-xlsx/index.ts +58 -0
  24. package/registry/extractors/image-caption-llm/index.ts +60 -0
  25. package/registry/extractors/image-ocr/index.ts +60 -0
  26. package/registry/extractors/pdf-llm/index.ts +84 -0
  27. package/registry/extractors/pdf-ocr/index.ts +125 -0
  28. package/registry/extractors/pdf-text-layer/index.ts +76 -0
  29. package/registry/extractors/video-frames/index.ts +126 -0
  30. package/registry/extractors/video-transcribe/index.ts +78 -0
  31. package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
@@ -2,18 +2,21 @@
2
2
  * Root Unrag config (generated).
3
3
  *
4
4
  * This file is meant to be the single place you tweak:
5
+ * - Defaults (chunking + retrieval)
6
+ * - Engine settings (storage, asset processing, extractors)
5
7
  * - Embedding provider/model/timeouts
6
- * - Chunking defaults
7
- * - Retrieval defaults
8
- * - How you construct your DB client (Pool/Prisma/etc)
8
+ * - How you construct your DB client (Pool/Prisma/etc) and vector store adapter
9
9
  *
10
10
  * The files under your install dir (e.g. `lib/unrag/**`) are intended to be
11
11
  * treated like vendored source code.
12
12
  */
13
13
 
14
+ // @ts-nocheck
15
+
14
16
  // __UNRAG_IMPORTS__
15
17
 
16
- export const unragConfig = {
18
+ export const unrag = defineUnragConfig({
19
+ defaults: {
17
20
  chunking: {
18
21
  chunkSize: 200,
19
22
  chunkOverlap: 40,
@@ -21,11 +24,165 @@ export const unragConfig = {
21
24
  retrieval: {
22
25
  topK: 8,
23
26
  },
27
+ },
24
28
  embedding: {
25
- model: "openai/text-embedding-3-small",
26
- timeoutMs: 15_000,
29
+ provider: "ai",
30
+ config: {
31
+ type: "text", // __UNRAG_EMBEDDING_TYPE__
32
+ model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__
33
+ timeoutMs: 15_000,
34
+ },
35
+ },
36
+ engine: {
37
+ /**
38
+ * Storage controls.
39
+ *
40
+ * - storeChunkContent: whether `chunk.content` is persisted and returned by retrieval.
41
+ * - storeDocumentContent: whether the full original document text is stored in `documents.content`.
42
+ */
43
+ storage: {
44
+ storeChunkContent: true,
45
+ storeDocumentContent: true,
46
+ },
47
+ /**
48
+ * Optional extractor modules that can process non-text assets into text outputs.
49
+ *
50
+ * To install:
51
+ * - `unrag add extractor pdf-llm`
52
+ *
53
+ * Then import it in this file and add it here, for example:
54
+ * - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
55
+ * - `extractors: [createPdfLlmExtractor()]`
56
+ */
57
+ extractors: [
58
+ // __UNRAG_EXTRACTORS__
59
+ ],
60
+ /**
61
+ * Rich media processing controls.
62
+ *
63
+ * Notes:
64
+ * - This generated config is cost-safe by default (all extraction is off).
65
+ * - `unrag init` can enable rich media + multimodal embeddings for you.
66
+ * - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
67
+ */
68
+ assetProcessing: {
69
+ onUnsupportedAsset: "skip",
70
+ onError: "skip",
71
+ concurrency: 4,
72
+ fetch: {
73
+ enabled: true,
74
+ maxBytes: 15 * 1024 * 1024,
75
+ timeoutMs: 20_000,
76
+ // allowedHosts: ["..."], // recommended to mitigate SSRF
77
+ },
78
+ pdf: {
79
+ // Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
80
+ textLayer: {
81
+ enabled: false, // __UNRAG_FLAG_pdf_textLayer__
82
+ maxBytes: 15 * 1024 * 1024,
83
+ maxOutputChars: 200_000,
84
+ minChars: 200,
85
+ // maxPages: 200,
86
+ },
87
+ llmExtraction: {
88
+ enabled: false, // __UNRAG_FLAG_pdf_llmExtraction__
89
+ model: "google/gemini-2.0-flash",
90
+ prompt:
91
+ "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
92
+ timeoutMs: 60_000,
93
+ maxBytes: 15 * 1024 * 1024,
94
+ maxOutputChars: 200_000,
95
+ },
96
+ // Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
97
+ ocr: {
98
+ enabled: false, // __UNRAG_FLAG_pdf_ocr__
99
+ maxBytes: 15 * 1024 * 1024,
100
+ maxOutputChars: 200_000,
101
+ minChars: 200,
102
+ // maxPages: 200,
103
+ // pdftoppmPath: "/usr/bin/pdftoppm",
104
+ // tesseractPath: "/usr/bin/tesseract",
105
+ // dpi: 200,
106
+ // lang: "eng",
107
+ },
108
+ },
109
+ image: {
110
+ ocr: {
111
+ enabled: false, // __UNRAG_FLAG_image_ocr__
112
+ model: "google/gemini-2.0-flash",
113
+ prompt:
114
+ "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
115
+ timeoutMs: 60_000,
116
+ maxBytes: 10 * 1024 * 1024,
117
+ maxOutputChars: 50_000,
118
+ },
119
+ captionLlm: {
120
+ enabled: false, // __UNRAG_FLAG_image_captionLlm__
121
+ model: "google/gemini-2.0-flash",
122
+ prompt:
123
+ "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
124
+ timeoutMs: 60_000,
125
+ maxBytes: 10 * 1024 * 1024,
126
+ maxOutputChars: 10_000,
127
+ },
128
+ },
129
+ audio: {
130
+ transcription: {
131
+ enabled: false, // __UNRAG_FLAG_audio_transcription__
132
+ model: "openai/whisper-1",
133
+ timeoutMs: 120_000,
134
+ maxBytes: 25 * 1024 * 1024,
135
+ },
136
+ },
137
+ video: {
138
+ transcription: {
139
+ enabled: false, // __UNRAG_FLAG_video_transcription__
140
+ model: "openai/whisper-1",
141
+ timeoutMs: 120_000,
142
+ maxBytes: 50 * 1024 * 1024,
143
+ },
144
+ frames: {
145
+ enabled: false, // __UNRAG_FLAG_video_frames__
146
+ sampleFps: 0.2,
147
+ maxFrames: 50,
148
+ // ffmpegPath: "/usr/bin/ffmpeg",
149
+ maxBytes: 50 * 1024 * 1024,
150
+ model: "google/gemini-2.0-flash",
151
+ prompt:
152
+ "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
153
+ timeoutMs: 60_000,
154
+ maxOutputChars: 50_000,
155
+ },
156
+ },
157
+ file: {
158
+ text: {
159
+ enabled: false, // __UNRAG_FLAG_file_text__
160
+ maxBytes: 5 * 1024 * 1024,
161
+ maxOutputChars: 200_000,
162
+ minChars: 50,
163
+ },
164
+ docx: {
165
+ enabled: false, // __UNRAG_FLAG_file_docx__
166
+ maxBytes: 15 * 1024 * 1024,
167
+ maxOutputChars: 200_000,
168
+ minChars: 50,
169
+ },
170
+ pptx: {
171
+ enabled: false, // __UNRAG_FLAG_file_pptx__
172
+ maxBytes: 30 * 1024 * 1024,
173
+ maxOutputChars: 200_000,
174
+ minChars: 50,
175
+ },
176
+ xlsx: {
177
+ enabled: false, // __UNRAG_FLAG_file_xlsx__
178
+ maxBytes: 30 * 1024 * 1024,
179
+ maxOutputChars: 200_000,
180
+ minChars: 50,
181
+ },
182
+ },
183
+ },
27
184
  },
28
- } as const;
185
+ } as const);
29
186
 
30
187
  // __UNRAG_CREATE_ENGINE__
31
188
 
@@ -1,3 +1,5 @@
1
+ import type { AssetInput, AssetKind, Metadata } from "../../core";
2
+
1
3
  type RichText = { plain_text?: string };
2
4
 
3
5
  export type NotionBlock = {
@@ -20,6 +22,82 @@ const rt = (value: unknown): string => {
20
22
 
21
23
  const indent = (n: number) => (n > 0 ? " ".repeat(n) : "");
22
24
 
25
/** Stringifies any value and trims it; `null` and `undefined` become the empty string. */
const asString = (v: unknown) => {
  const text = v == null ? "" : String(v);
  return text.trim();
};
26
+
27
/** Block type names that are accepted as asset kinds when walking Notion blocks. */
const supportedAssetKinds = new Set<AssetKind>(["image", "pdf", "audio", "video", "file"]);
34
+
35
/**
 * Maps a Notion block type name to an asset kind.
 *
 * @returns the same name typed as `AssetKind` when it is listed in
 * `supportedAssetKinds`, otherwise `null`.
 */
const toAssetKind = (notionType: string): AssetKind | null => {
  if (!supportedAssetKinds.has(notionType as AssetKind)) return null;
  return notionType as AssetKind;
};
39
+
40
/**
 * Reads the URL out of a Notion file-like payload.
 *
 * Only the `external` and `file` payload types are handled; their URL is read
 * from `payload.external.url` / `payload.file.url` respectively and trimmed.
 *
 * @returns the trimmed URL, or `undefined` when the payload type is not
 * handled or the URL is missing/blank. A blank URL is normalised to
 * `undefined` so that "no URL" has a single representation matching the
 * declared return type.
 */
const pickUrl = (payload: any): string | undefined => {
  const type = String(payload?.type ?? "");
  if (type === "external") return asString(payload?.external?.url) || undefined;
  if (type === "file") return asString(payload?.file?.url) || undefined;
  return undefined;
};
46
+
47
/**
 * Returns the payload's caption as a string by passing `payload.caption` to `rt`.
 * Notion captions are typically an array of rich text items.
 */
const pickCaption = (payload: any): string => {
  const caption = payload?.caption;
  return rt(caption);
};
51
+
52
/**
 * Chooses a media type for an asset.
 *
 * PDFs are always reported as `application/pdf`. For every other kind the
 * payload's `media_type` is used when it is a non-blank value, otherwise the
 * media type is left `undefined`.
 */
const inferMediaType = (assetKind: AssetKind, payload: any): string | undefined => {
  if (assetKind !== "pdf") {
    // Notion does not consistently include media types; keep it optional.
    const declared = asString(payload?.media_type);
    return declared ? declared : undefined;
  }
  return "application/pdf";
};
57
+
58
/** Re-types a plain record as `Metadata`; the object is returned as-is, not copied. */
const asMetadata = (obj: Record<string, unknown>): Metadata => {
  return obj as any;
};
59
+
60
/**
 * Collects URL-backed assets from a tree of Notion blocks.
 *
 * The tree is walked depth-first in document order (a node before its
 * children). Root nodes are at depth 0; nodes deeper than `opts.maxDepth`
 * (default 6) are not visited. A block produces an asset only when its type is
 * a supported asset kind and `pickUrl` finds a URL in the block's payload
 * (`block[kind]`).
 *
 * @param nodes root nodes of the block tree
 * @param opts  optional traversal limit
 * @returns one `AssetInput` per matching block, in traversal order
 */
export function extractNotionAssets(
  nodes: NotionBlockNode[],
  opts: { maxDepth?: number } = {}
): AssetInput[] {
  const depthLimit = opts.maxDepth ?? 6;
  const collected: AssetInput[] = [];

  // Builds the asset for a single block, or null when the block is not an asset.
  const toAsset = (block: any): AssetInput | null => {
    const kind = toAssetKind(String(block.type ?? ""));
    if (!kind) return null;

    const payload = block[kind];
    const url = pickUrl(payload);
    if (!url) return null;

    const caption = pickCaption(payload).trim();
    const mediaType = inferMediaType(kind, payload);
    const blockId = String(block.id);

    return {
      assetId: blockId,
      kind,
      data: { kind: "url", url, ...(mediaType ? { mediaType } : {}) },
      uri: url,
      // A non-empty caption travels with the asset as its text.
      ...(caption ? { text: caption } : {}),
      metadata: asMetadata({
        connector: "notion",
        notionBlockId: blockId,
        notionBlockType: String(block.type),
      }),
    };
  };

  const visit = (node: NotionBlockNode, level: number): void => {
    if (level > depthLimit) return;

    const asset = toAsset(node.block as any);
    if (asset) collected.push(asset);

    for (const child of node.children) {
      visit(child, level + 1);
    }
  };

  for (const root of nodes) {
    visit(root, 0);
  }
  return collected;
}
100
+
23
101
  export function renderNotionBlocksToText(
24
102
  nodes: NotionBlockNode[],
25
103
  opts: { maxDepth?: number } = {}
@@ -1,8 +1,12 @@
1
- import type { ContextEngine } from "../../core";
2
- import type { IngestResult } from "../../core/types";
1
+ import type { IngestResult } from "../../core";
3
2
  import { createNotionClient, type NotionClient } from "./client";
4
3
  import { normalizeNotionPageId32, toUuidHyphenated } from "./ids";
5
- import { renderNotionBlocksToText, type NotionBlock, type NotionBlockNode } from "./render";
4
+ import {
5
+ extractNotionAssets,
6
+ renderNotionBlocksToText,
7
+ type NotionBlock,
8
+ type NotionBlockNode,
9
+ } from "./render";
6
10
  import type {
7
11
  BuildNotionPageIngestInputArgs,
8
12
  NotionPageDocument,
@@ -29,6 +33,7 @@ export function buildNotionPageIngestInput(
29
33
  sourceId,
30
34
  content: args.content,
31
35
  metadata: args.metadata ?? {},
36
+ assets: args.assets ?? [],
32
37
  };
33
38
  }
34
39
 
@@ -108,6 +113,7 @@ export async function loadNotionPageDocument(args: {
108
113
  const tree = await buildBlockTree(args.notion, apiId, 0, args.maxDepth ?? 4);
109
114
  const body = renderNotionBlocksToText(tree);
110
115
  const content = [title.trim(), body.trim()].filter(Boolean).join("\n\n");
116
+ const assets = extractNotionAssets(tree);
111
117
 
112
118
  const metadata = {
113
119
  connector: "notion",
@@ -121,6 +127,7 @@ export async function loadNotionPageDocument(args: {
121
127
  const ingest = buildNotionPageIngestInput({
122
128
  pageId,
123
129
  content,
130
+ assets,
124
131
  metadata: metadata as any,
125
132
  sourceIdPrefix: args.sourceIdPrefix,
126
133
  });
@@ -129,6 +136,7 @@ export async function loadNotionPageDocument(args: {
129
136
  sourceId: ingest.sourceId,
130
137
  content: ingest.content,
131
138
  metadata: ingest.metadata ?? {},
139
+ assets: ingest.assets ?? [],
132
140
  };
133
141
  }
134
142
 
@@ -178,6 +186,7 @@ export async function syncNotionPages(
178
186
  const result: IngestResult = await input.engine.ingest({
179
187
  sourceId: doc.sourceId,
180
188
  content: doc.content,
189
+ assets: doc.assets,
181
190
  metadata: doc.metadata as any,
182
191
  });
183
192
 
@@ -1,5 +1,5 @@
1
1
  import type { ContextEngine } from "../../core";
2
- import type { IngestInput } from "../../core/types";
2
+ import type { AssetInput, IngestInput } from "../../core";
3
3
 
4
4
  export type NotionSyncProgressEvent =
5
5
  | { type: "page:start"; pageId: string; sourceId: string }
@@ -42,11 +42,13 @@ export type NotionPageDocument = {
42
42
  sourceId: string;
43
43
  content: string;
44
44
  metadata: Record<string, unknown>;
45
+ assets: AssetInput[];
45
46
  };
46
47
 
47
48
  export type BuildNotionPageIngestInputArgs = {
48
49
  pageId: string; // normalized 32-hex (no dashes)
49
50
  content: string;
51
+ assets?: AssetInput[];
50
52
  metadata?: Record<string, unknown>;
51
53
  sourceIdPrefix?: string;
52
54
  };
@@ -0,0 +1,54 @@
1
+ import type { AssetKind, Chunk } from "./types";
2
+
3
/**
 * Asset reference recovered from a chunk's `metadata`.
 *
 * `assetId` and `assetKind` are always present; the remaining fields are
 * included only when the metadata carries them.
 */
export type ChunkAssetRef = {
  /** Identifier of the asset, as found in `metadata.assetId`. */
  assetId: string;
  /** Kind of the asset, as found in `metadata.assetKind`. */
  assetKind: AssetKind;
  /** Optional asset URI, from `metadata.assetUri`. */
  assetUri?: string;
  /** Optional media type, from `metadata.assetMediaType`. */
  assetMediaType?: string;
  /** Optional extractor name, from `metadata.extractor` (presumably the extractor that produced the chunk text; confirm against the ingester). */
  extractor?: string;
};
10
+
11
/** Values of `metadata.assetKind` that identify a chunk as an asset chunk. */
const assetKinds = new Set<AssetKind>(["image", "pdf", "audio", "video", "file"]);

/**
 * Reads the asset reference stored in a retrieved chunk's metadata.
 *
 * Asset chunks are ordinary text chunks whose `metadata` carries:
 * - `assetKind`: "image" | "pdf" | "audio" | "video" | "file"
 * - `assetId`: a non-empty string identifier
 * - optionally `assetUri`, `assetMediaType` and `extractor`
 *
 * @returns the reference, or `null` when `assetKind` is not one of the known
 * kinds or `assetId` is not a non-empty string. Optional fields are copied
 * only when they are non-empty strings.
 */
export function getChunkAssetRef(
  chunk: Pick<Chunk, "metadata">
): ChunkAssetRef | null {
  const meta = chunk.metadata as any;
  const kind = meta?.assetKind;
  const id = meta?.assetId;

  const hasKnownKind = typeof kind === "string" && assetKinds.has(kind as AssetKind);
  if (!hasKnownKind) return null;

  const hasId = typeof id === "string" && id !== "";
  if (!hasId) return null;

  // Anything that is not a non-empty string is treated as absent.
  const nonEmptyString = (value: unknown): string | undefined =>
    typeof value === "string" && value !== "" ? value : undefined;

  const ref: ChunkAssetRef = { assetId: id, assetKind: kind as AssetKind };

  const assetUri = nonEmptyString(meta.assetUri);
  if (assetUri) ref.assetUri = assetUri;

  const assetMediaType = nonEmptyString(meta.assetMediaType);
  if (assetMediaType) ref.assetMediaType = assetMediaType;

  const extractor = nonEmptyString(meta.extractor);
  if (extractor) ref.extractor = extractor;

  return ref;
}
49
+
50
/** Reports whether `getChunkAssetRef` finds an asset reference in the chunk's metadata. */
export function isAssetChunk(chunk: Pick<Chunk, "metadata">): boolean {
  const ref = getChunkAssetRef(chunk);
  return ref !== null;
}
53
+
54
+
@@ -2,6 +2,9 @@ import type {
2
2
  Chunker,
3
3
  ContextEngineConfig,
4
4
  ResolvedContextEngineConfig,
5
+ AssetProcessingConfig,
6
+ DeepPartial,
7
+ ContentStorageConfig,
5
8
  } from "./types";
6
9
  import { defaultChunker, resolveChunkingOptions } from "./chunking";
7
10
 
@@ -10,6 +13,150 @@ export const defineConfig = (config: ContextEngineConfig): ContextEngineConfig =
10
13
 
11
14
  const defaultIdGenerator = () => crypto.randomUUID();
12
15
 
16
// Default model identifiers used by the asset-processing defaults below.
const DEFAULT_PDF_LLM_MODEL = "google/gemini-2.0-flash";
const DEFAULT_IMAGE_OCR_MODEL = "google/gemini-2.0-flash";
const DEFAULT_IMAGE_CAPTION_MODEL = "google/gemini-2.0-flash";
const DEFAULT_AUDIO_TRANSCRIBE_MODEL = "openai/whisper-1";
const DEFAULT_VIDEO_TRANSCRIBE_MODEL = "openai/whisper-1";

/**
 * Library defaults for asset processing.
 *
 * Every extraction step ships with `enabled: false`; only URL fetching is
 * enabled. Unsupported assets and per-asset errors are both set to "skip".
 * Byte limits are written as `N * 1024 * 1024` (N megabytes).
 */
export const defaultAssetProcessingConfig: AssetProcessingConfig = {
  onUnsupportedAsset: "skip",
  onError: "skip",
  concurrency: 4,
  hooks: { onEvent: undefined },

  // Fetching of URL-based assets.
  fetch: {
    enabled: true,
    allowedHosts: undefined,
    maxBytes: 15 * 1024 * 1024,
    timeoutMs: 20_000,
    headers: undefined,
  },

  pdf: {
    textLayer: {
      enabled: false,
      maxBytes: 15 * 1024 * 1024,
      maxOutputChars: 200_000,
      minChars: 200,
      maxPages: undefined,
    },
    llmExtraction: {
      // Off by default so the library is cost-safe out of the box.
      enabled: false,
      model: DEFAULT_PDF_LLM_MODEL,
      prompt:
        "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
      timeoutMs: 60_000,
      maxBytes: 15 * 1024 * 1024,
      maxOutputChars: 200_000,
    },
    ocr: {
      enabled: false,
      maxBytes: 15 * 1024 * 1024,
      maxOutputChars: 200_000,
      minChars: 200,
      maxPages: undefined,
      pdftoppmPath: undefined,
      tesseractPath: undefined,
      dpi: 200,
      lang: "eng",
    },
  },

  image: {
    ocr: {
      enabled: false,
      model: DEFAULT_IMAGE_OCR_MODEL,
      prompt:
        "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
      timeoutMs: 60_000,
      maxBytes: 10 * 1024 * 1024,
      maxOutputChars: 50_000,
    },
    captionLlm: {
      enabled: false,
      model: DEFAULT_IMAGE_CAPTION_MODEL,
      prompt:
        "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
      timeoutMs: 60_000,
      maxBytes: 10 * 1024 * 1024,
      maxOutputChars: 10_000,
    },
  },

  audio: {
    transcription: {
      enabled: false,
      model: DEFAULT_AUDIO_TRANSCRIBE_MODEL,
      timeoutMs: 120_000,
      maxBytes: 25 * 1024 * 1024,
    },
  },

  video: {
    transcription: {
      enabled: false,
      model: DEFAULT_VIDEO_TRANSCRIBE_MODEL,
      timeoutMs: 120_000,
      maxBytes: 50 * 1024 * 1024,
    },
    frames: {
      enabled: false,
      sampleFps: 0.2,
      maxFrames: 50,
      ffmpegPath: undefined,
      maxBytes: 50 * 1024 * 1024,
      // Written inline here; the other models above use named constants.
      model: "google/gemini-2.0-flash",
      prompt:
        "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
      timeoutMs: 60_000,
      maxOutputChars: 50_000,
    },
  },

  file: {
    text: {
      enabled: false,
      maxBytes: 5 * 1024 * 1024,
      maxOutputChars: 200_000,
      minChars: 50,
    },
    docx: {
      enabled: false,
      maxBytes: 15 * 1024 * 1024,
      maxOutputChars: 200_000,
      minChars: 50,
    },
    pptx: {
      enabled: false,
      maxBytes: 30 * 1024 * 1024,
      maxOutputChars: 200_000,
      minChars: 50,
    },
    xlsx: {
      enabled: false,
      maxBytes: 30 * 1024 * 1024,
      maxOutputChars: 200_000,
      minChars: 50,
    },
  },
};
120
+
121
/** Library defaults for content storage: both chunk content and document content are stored. */
export const defaultContentStorageConfig: ContentStorageConfig = {
  storeDocumentContent: true,
  storeChunkContent: true,
};
125
+
126
/**
 * Layers `overrides` on top of `base` and returns the merged result.
 *
 * Merge rules:
 * - an override of `undefined` is ignored (the base value is kept);
 * - when both the base value and the override are non-array objects they are
 *   merged recursively;
 * - any other override (primitives, `null`, arrays, functions) replaces the
 *   base value and is stored by reference.
 *
 * The result never aliases `base`: plain objects and arrays reachable from
 * `base` are copied, so mutating the result cannot change shared defaults.
 * This also holds when `overrides` is omitted. Functions and class instances
 * are kept by reference. Plain-object data in `base` must be acyclic.
 *
 * Override keys `__proto__`, `constructor` and `prototype` are skipped so that
 * parsed JSON such as `{"__proto__": {...}}` cannot rewrite the result's
 * prototype.
 *
 * @param base      values to start from; never mutated
 * @param overrides partial values to apply, or `undefined`
 * @returns a new object of the same shape as `base`
 */
const mergeDeep = <T extends Record<string, any>>(
  base: T,
  overrides: DeepPartial<T> | undefined
): T => {
  const isUnsafeKey = (key: string): boolean =>
    key === "__proto__" || key === "constructor" || key === "prototype";

  // Same test the merge uses to decide whether to recurse.
  const isMergeable = (value: unknown): value is Record<string, any> =>
    Boolean(value) && typeof value === "object" && !Array.isArray(value);

  const isPlainObject = (value: unknown): value is Record<string, any> => {
    if (!isMergeable(value)) return false;
    const proto = Object.getPrototypeOf(value);
    return proto === Object.prototype || proto === null;
  };

  // Copies arrays and plain objects; everything else is returned as-is.
  const detach = (value: any): any => {
    if (Array.isArray(value)) return value.map(detach);
    if (!isPlainObject(value)) return value;
    const copy: Record<string, any> = { ...value };
    for (const key of Object.keys(copy)) copy[key] = detach(copy[key]);
    return copy;
  };

  // The top level is always copied, even when `base` is not a plain object.
  const out: any = detach(Array.isArray(base) ? base : { ...base });
  if (!overrides) return out as T;

  for (const key of Object.keys(overrides)) {
    if (isUnsafeKey(key)) continue;
    const nextVal = (overrides as any)[key];
    if (nextVal === undefined) continue;
    const baseVal = (base as any)[key];
    out[key] =
      isMergeable(baseVal) && isMergeable(nextVal)
        ? mergeDeep(baseVal, nextVal as any)
        : nextVal;
  }
  return out as T;
};
151
+
152
/**
 * Resolves the effective asset-processing settings by applying `overrides`
 * on top of `defaultAssetProcessingConfig` with `mergeDeep`.
 */
export const resolveAssetProcessingConfig = (
  overrides?: DeepPartial<AssetProcessingConfig>
): AssetProcessingConfig => {
  const resolved = mergeDeep(defaultAssetProcessingConfig, overrides);
  return resolved;
};
155
+
156
/**
 * Resolves the effective content-storage settings by applying `overrides`
 * on top of `defaultContentStorageConfig` with `mergeDeep`.
 */
export const resolveContentStorageConfig = (
  overrides?: DeepPartial<ContentStorageConfig>
): ContentStorageConfig => {
  const resolved = mergeDeep(defaultContentStorageConfig, overrides);
  return resolved;
};
159
+
13
160
  export const resolveConfig = (
14
161
  config: ContextEngineConfig
15
162
  ): ResolvedContextEngineConfig => {
@@ -21,6 +168,9 @@ export const resolveConfig = (
21
168
  defaults: resolveChunkingOptions(config.defaults),
22
169
  chunker,
23
170
  idGenerator: config.idGenerator ?? defaultIdGenerator,
171
+ extractors: config.extractors ?? [],
172
+ storage: resolveContentStorageConfig(config.storage),
173
+ assetProcessing: resolveAssetProcessingConfig(config.assetProcessing),
24
174
  };
25
175
  };
26
176
 
@@ -1,15 +1,21 @@
1
1
  import { deleteDocuments } from "./delete";
2
- import { ingest } from "./ingest";
2
+ import { ingest, planIngest } from "./ingest";
3
3
  import { retrieve } from "./retrieve";
4
4
  import { defineConfig, resolveConfig } from "./config";
5
+ import { createAiEmbeddingProvider } from "../embedding/ai";
5
6
  import type {
7
+ AssetExtractor,
6
8
  ContextEngineConfig,
7
9
  DeleteInput,
10
+ DefineUnragConfigInput,
11
+ EmbeddingProvider,
8
12
  IngestInput,
9
13
  IngestResult,
14
+ IngestPlanResult,
10
15
  ResolvedContextEngineConfig,
11
16
  RetrieveInput,
12
17
  RetrieveResult,
18
+ UnragCreateEngineRuntime,
13
19
  } from "./types";
14
20
 
15
21
  export class ContextEngine {
@@ -23,6 +29,16 @@ export class ContextEngine {
23
29
  return ingest(this.config, input);
24
30
  }
25
31
 
32
/**
 * Ingestion dry-run: delegates to `planIngest` with this engine's config.
 *
 * Reports which assets would be processed and by which extractors, without
 * calling external services. Chunk counts and embeddings are not produced.
 */
async planIngest(input: IngestInput): Promise<IngestPlanResult> {
  const plan = await planIngest(this.config, input);
  return plan;
}
41
+
26
42
  async retrieve(input: RetrieveInput): Promise<RetrieveResult> {
27
43
  return retrieve(this.config, input);
28
44
  }
@@ -37,4 +53,56 @@ export const createContextEngine = (config: ContextEngineConfig) =>
37
53
 
38
54
  export { defineConfig };
39
55
 
56
/**
 * Higher-level config wrapper that lets `unrag.config.ts` stay the single
 * source of truth while the DB client/store and optional extractors are wired
 * in at runtime.
 *
 * @param config declarative settings: `defaults`, `embedding`, optional `engine`
 * @returns
 * - `defaults`: the chunking defaults (or `{}`) and retrieval `topK` (default 8);
 * - `createEngineConfig(runtime)`: builds a `ContextEngineConfig` from
 *   `config.engine`, the chunking defaults, the embedding provider,
 *   `runtime.store` and the resolved extractors;
 * - `createEngine(runtime)`: a `ContextEngine` built from that config.
 */
export const defineUnragConfig = <T extends DefineUnragConfigInput>(config: T) => {
  // Created on first use and shared by every engine config built afterwards.
  let cachedProvider: EmbeddingProvider | undefined;

  const resolveEmbeddingProvider = () => {
    if (!cachedProvider) {
      cachedProvider =
        config.embedding.provider === "ai"
          ? createAiEmbeddingProvider(config.embedding.config)
          : config.embedding.create();
    }
    return cachedProvider;
  };

  const chunking = config.defaults?.chunking ?? {};
  const topK = config.defaults?.retrieval?.topK ?? 8;
  const defaults = { chunking, retrieval: { topK } } as const;

  const createEngineConfig = (runtime: UnragCreateEngineRuntime): ContextEngineConfig => {
    const configured = (config.engine?.extractors ?? []) as AssetExtractor[];

    // `runtime.extractors` may be a function of the configured list, a
    // replacement list, or absent (then the configured list is used).
    let extractors;
    if (typeof runtime.extractors === "function") {
      extractors = runtime.extractors(configured);
    } else {
      extractors = runtime.extractors ?? configured;
    }

    return defineConfig({
      ...(config.engine ?? {}),
      defaults: defaults.chunking,
      embedding: resolveEmbeddingProvider(),
      store: runtime.store,
      extractors,
    });
  };

  const createEngine = (runtime: UnragCreateEngineRuntime) =>
    new ContextEngine(createEngineConfig(runtime));

  return { defaults, createEngineConfig, createEngine };
};
107
+
40
108
 
@@ -1,8 +1,21 @@
1
- export { ContextEngine, createContextEngine, defineConfig } from "./context-engine";
1
+ export {
2
+ ContextEngine,
3
+ createContextEngine,
4
+ defineConfig,
5
+ defineUnragConfig,
6
+ } from "./context-engine";
2
7
  export { deleteDocuments } from "./delete";
3
- export { ingest } from "./ingest";
8
+ export { ingest, planIngest } from "./ingest";
4
9
  export { retrieve } from "./retrieve";
5
10
  export { defaultChunker, resolveChunkingOptions } from "./chunking";
11
+ export {
12
+ defaultAssetProcessingConfig,
13
+ defaultContentStorageConfig,
14
+ resolveAssetProcessingConfig,
15
+ resolveContentStorageConfig,
16
+ } from "./config";
17
+ export { getChunkAssetRef, isAssetChunk } from "./assets";
18
+ export type { ChunkAssetRef } from "./assets";
6
19
  export * from "./types";
7
20
 
8
21