unrag 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/cli/index.js +408 -50
- package/package.json +3 -1
- package/registry/config/unrag.config.ts +164 -7
- package/registry/connectors/notion/render.ts +78 -0
- package/registry/connectors/notion/sync.ts +12 -3
- package/registry/connectors/notion/types.ts +3 -1
- package/registry/core/assets.ts +54 -0
- package/registry/core/config.ts +150 -0
- package/registry/core/context-engine.ts +69 -1
- package/registry/core/index.ts +15 -2
- package/registry/core/ingest.ts +743 -17
- package/registry/core/types.ts +606 -0
- package/registry/docs/unrag.md +6 -0
- package/registry/embedding/ai.ts +89 -8
- package/registry/extractors/_shared/fetch.ts +113 -0
- package/registry/extractors/_shared/media.ts +14 -0
- package/registry/extractors/_shared/text.ts +11 -0
- package/registry/extractors/audio-transcribe/index.ts +75 -0
- package/registry/extractors/file-docx/index.ts +53 -0
- package/registry/extractors/file-pptx/index.ts +92 -0
- package/registry/extractors/file-text/index.ts +85 -0
- package/registry/extractors/file-xlsx/index.ts +58 -0
- package/registry/extractors/image-caption-llm/index.ts +60 -0
- package/registry/extractors/image-ocr/index.ts +60 -0
- package/registry/extractors/pdf-llm/index.ts +84 -0
- package/registry/extractors/pdf-ocr/index.ts +125 -0
- package/registry/extractors/pdf-text-layer/index.ts +76 -0
- package/registry/extractors/video-frames/index.ts +126 -0
- package/registry/extractors/video-transcribe/index.ts +78 -0
- package/registry/store/drizzle-postgres-pgvector/store.ts +1 -1
|
@@ -2,18 +2,21 @@
|
|
|
2
2
|
* Root Unrag config (generated).
|
|
3
3
|
*
|
|
4
4
|
* This file is meant to be the single place you tweak:
|
|
5
|
+
* - Defaults (chunking + retrieval)
|
|
6
|
+
* - Engine settings (storage, asset processing, extractors)
|
|
5
7
|
* - Embedding provider/model/timeouts
|
|
6
|
-
* -
|
|
7
|
-
* - Retrieval defaults
|
|
8
|
-
* - How you construct your DB client (Pool/Prisma/etc)
|
|
8
|
+
* - How you construct your DB client (Pool/Prisma/etc) and vector store adapter
|
|
9
9
|
*
|
|
10
10
|
* The files under your install dir (e.g. `lib/unrag/**`) are intended to be
|
|
11
11
|
* treated like vendored source code.
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
+
// @ts-nocheck
|
|
15
|
+
|
|
14
16
|
// __UNRAG_IMPORTS__
|
|
15
17
|
|
|
16
|
-
export const
|
|
18
|
+
export const unrag = defineUnragConfig({
|
|
19
|
+
defaults: {
|
|
17
20
|
chunking: {
|
|
18
21
|
chunkSize: 200,
|
|
19
22
|
chunkOverlap: 40,
|
|
@@ -21,11 +24,165 @@ export const unragConfig = {
|
|
|
21
24
|
retrieval: {
|
|
22
25
|
topK: 8,
|
|
23
26
|
},
|
|
27
|
+
},
|
|
24
28
|
embedding: {
|
|
25
|
-
|
|
26
|
-
|
|
29
|
+
provider: "ai",
|
|
30
|
+
config: {
|
|
31
|
+
type: "text", // __UNRAG_EMBEDDING_TYPE__
|
|
32
|
+
model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__
|
|
33
|
+
timeoutMs: 15_000,
|
|
34
|
+
},
|
|
35
|
+
},
|
|
36
|
+
engine: {
|
|
37
|
+
/**
|
|
38
|
+
* Storage controls.
|
|
39
|
+
*
|
|
40
|
+
* - storeChunkContent: whether `chunk.content` is persisted and returned by retrieval.
|
|
41
|
+
* - storeDocumentContent: whether the full original document text is stored in `documents.content`.
|
|
42
|
+
*/
|
|
43
|
+
storage: {
|
|
44
|
+
storeChunkContent: true,
|
|
45
|
+
storeDocumentContent: true,
|
|
46
|
+
},
|
|
47
|
+
/**
|
|
48
|
+
* Optional extractor modules that can process non-text assets into text outputs.
|
|
49
|
+
*
|
|
50
|
+
* To install:
|
|
51
|
+
* - `unrag add extractor pdf-llm`
|
|
52
|
+
*
|
|
53
|
+
* Then import it in this file and add it here, for example:
|
|
54
|
+
* - `import { createPdfLlmExtractor } from "./lib/unrag/extractors/pdf-llm";`
|
|
55
|
+
* - `extractors: [createPdfLlmExtractor()]`
|
|
56
|
+
*/
|
|
57
|
+
extractors: [
|
|
58
|
+
// __UNRAG_EXTRACTORS__
|
|
59
|
+
],
|
|
60
|
+
/**
|
|
61
|
+
* Rich media processing controls.
|
|
62
|
+
*
|
|
63
|
+
* Notes:
|
|
64
|
+
* - This generated config is cost-safe by default (all extraction is off).
|
|
65
|
+
* - `unrag init` can enable rich media + multimodal embeddings for you.
|
|
66
|
+
* - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
|
|
67
|
+
*/
|
|
68
|
+
assetProcessing: {
|
|
69
|
+
onUnsupportedAsset: "skip",
|
|
70
|
+
onError: "skip",
|
|
71
|
+
concurrency: 4,
|
|
72
|
+
fetch: {
|
|
73
|
+
enabled: true,
|
|
74
|
+
maxBytes: 15 * 1024 * 1024,
|
|
75
|
+
timeoutMs: 20_000,
|
|
76
|
+
// allowedHosts: ["..."], // recommended to mitigate SSRF
|
|
77
|
+
},
|
|
78
|
+
pdf: {
|
|
79
|
+
// Fast/cheap text-layer extraction (requires installing a PDF text-layer extractor module).
|
|
80
|
+
textLayer: {
|
|
81
|
+
enabled: false, // __UNRAG_FLAG_pdf_textLayer__
|
|
82
|
+
maxBytes: 15 * 1024 * 1024,
|
|
83
|
+
maxOutputChars: 200_000,
|
|
84
|
+
minChars: 200,
|
|
85
|
+
// maxPages: 200,
|
|
86
|
+
},
|
|
87
|
+
llmExtraction: {
|
|
88
|
+
enabled: false, // __UNRAG_FLAG_pdf_llmExtraction__
|
|
89
|
+
model: "google/gemini-2.0-flash",
|
|
90
|
+
prompt:
|
|
91
|
+
"Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
|
|
92
|
+
timeoutMs: 60_000,
|
|
93
|
+
maxBytes: 15 * 1024 * 1024,
|
|
94
|
+
maxOutputChars: 200_000,
|
|
95
|
+
},
|
|
96
|
+
// Worker-only OCR pipelines typically require native binaries (poppler/tesseract) or external services.
|
|
97
|
+
ocr: {
|
|
98
|
+
enabled: false, // __UNRAG_FLAG_pdf_ocr__
|
|
99
|
+
maxBytes: 15 * 1024 * 1024,
|
|
100
|
+
maxOutputChars: 200_000,
|
|
101
|
+
minChars: 200,
|
|
102
|
+
// maxPages: 200,
|
|
103
|
+
// pdftoppmPath: "/usr/bin/pdftoppm",
|
|
104
|
+
// tesseractPath: "/usr/bin/tesseract",
|
|
105
|
+
// dpi: 200,
|
|
106
|
+
// lang: "eng",
|
|
107
|
+
},
|
|
108
|
+
},
|
|
109
|
+
image: {
|
|
110
|
+
ocr: {
|
|
111
|
+
enabled: false, // __UNRAG_FLAG_image_ocr__
|
|
112
|
+
model: "google/gemini-2.0-flash",
|
|
113
|
+
prompt:
|
|
114
|
+
"Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
|
|
115
|
+
timeoutMs: 60_000,
|
|
116
|
+
maxBytes: 10 * 1024 * 1024,
|
|
117
|
+
maxOutputChars: 50_000,
|
|
118
|
+
},
|
|
119
|
+
captionLlm: {
|
|
120
|
+
enabled: false, // __UNRAG_FLAG_image_captionLlm__
|
|
121
|
+
model: "google/gemini-2.0-flash",
|
|
122
|
+
prompt:
|
|
123
|
+
"Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
|
|
124
|
+
timeoutMs: 60_000,
|
|
125
|
+
maxBytes: 10 * 1024 * 1024,
|
|
126
|
+
maxOutputChars: 10_000,
|
|
127
|
+
},
|
|
128
|
+
},
|
|
129
|
+
audio: {
|
|
130
|
+
transcription: {
|
|
131
|
+
enabled: false, // __UNRAG_FLAG_audio_transcription__
|
|
132
|
+
model: "openai/whisper-1",
|
|
133
|
+
timeoutMs: 120_000,
|
|
134
|
+
maxBytes: 25 * 1024 * 1024,
|
|
135
|
+
},
|
|
136
|
+
},
|
|
137
|
+
video: {
|
|
138
|
+
transcription: {
|
|
139
|
+
enabled: false, // __UNRAG_FLAG_video_transcription__
|
|
140
|
+
model: "openai/whisper-1",
|
|
141
|
+
timeoutMs: 120_000,
|
|
142
|
+
maxBytes: 50 * 1024 * 1024,
|
|
143
|
+
},
|
|
144
|
+
frames: {
|
|
145
|
+
enabled: false, // __UNRAG_FLAG_video_frames__
|
|
146
|
+
sampleFps: 0.2,
|
|
147
|
+
maxFrames: 50,
|
|
148
|
+
// ffmpegPath: "/usr/bin/ffmpeg",
|
|
149
|
+
maxBytes: 50 * 1024 * 1024,
|
|
150
|
+
model: "google/gemini-2.0-flash",
|
|
151
|
+
prompt:
|
|
152
|
+
"Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
|
|
153
|
+
timeoutMs: 60_000,
|
|
154
|
+
maxOutputChars: 50_000,
|
|
155
|
+
},
|
|
156
|
+
},
|
|
157
|
+
file: {
|
|
158
|
+
text: {
|
|
159
|
+
enabled: false, // __UNRAG_FLAG_file_text__
|
|
160
|
+
maxBytes: 5 * 1024 * 1024,
|
|
161
|
+
maxOutputChars: 200_000,
|
|
162
|
+
minChars: 50,
|
|
163
|
+
},
|
|
164
|
+
docx: {
|
|
165
|
+
enabled: false, // __UNRAG_FLAG_file_docx__
|
|
166
|
+
maxBytes: 15 * 1024 * 1024,
|
|
167
|
+
maxOutputChars: 200_000,
|
|
168
|
+
minChars: 50,
|
|
169
|
+
},
|
|
170
|
+
pptx: {
|
|
171
|
+
enabled: false, // __UNRAG_FLAG_file_pptx__
|
|
172
|
+
maxBytes: 30 * 1024 * 1024,
|
|
173
|
+
maxOutputChars: 200_000,
|
|
174
|
+
minChars: 50,
|
|
175
|
+
},
|
|
176
|
+
xlsx: {
|
|
177
|
+
enabled: false, // __UNRAG_FLAG_file_xlsx__
|
|
178
|
+
maxBytes: 30 * 1024 * 1024,
|
|
179
|
+
maxOutputChars: 200_000,
|
|
180
|
+
minChars: 50,
|
|
181
|
+
},
|
|
182
|
+
},
|
|
183
|
+
},
|
|
27
184
|
},
|
|
28
|
-
} as const;
|
|
185
|
+
} as const);
|
|
29
186
|
|
|
30
187
|
// __UNRAG_CREATE_ENGINE__
|
|
31
188
|
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import type { AssetInput, AssetKind, Metadata } from "../../core";
|
|
2
|
+
|
|
1
3
|
type RichText = { plain_text?: string };
|
|
2
4
|
|
|
3
5
|
export type NotionBlock = {
|
|
@@ -20,6 +22,82 @@ const rt = (value: unknown): string => {
|
|
|
20
22
|
|
|
21
23
|
const indent = (n: number) => (n > 0 ? " ".repeat(n) : "");
|
|
22
24
|
|
|
25
|
+
/** Coerces any value to a trimmed string; `null` and `undefined` become `""`. */
const asString = (v: unknown) => String(v ?? "").trim();

/** Notion block types that are treated as ingestible assets. */
const supportedAssetKinds = new Set<AssetKind>([
  "image",
  "pdf",
  "audio",
  "video",
  "file",
]);

/** Maps a Notion block type onto an `AssetKind`, or `null` when the type is not an asset. */
const toAssetKind = (notionType: string): AssetKind | null => {
  const t = notionType as AssetKind;
  return supportedAssetKinds.has(t) ? t : null;
};

/**
 * Reads the URL from a Notion file payload.
 *
 * Only `external` and `file` payloads are understood; anything else, and any
 * payload whose URL is missing or blank, yields `undefined`.
 *
 * NOTE(review): Notion-hosted (`type: "file"`) URLs are reportedly signed and
 * short-lived — confirm against the Notion API docs before persisting them.
 */
const pickUrl = (payload: any): string | undefined => {
  const type = String(payload?.type ?? "");
  // `|| undefined` keeps the declared contract: never hand back an empty string.
  if (type === "external") return asString(payload?.external?.url) || undefined;
  if (type === "file") return asString(payload?.file?.url) || undefined;
  return undefined;
};

/** Renders the payload's `caption` through the module's rich-text helper `rt`. */
const pickCaption = (payload: any): string => {
  // Notion captions are typically an array of rich text items.
  return rt(payload?.caption);
};

/**
 * Best-effort media type for an asset: PDFs are always `application/pdf`,
 * otherwise the payload's `media_type` is used when it is non-blank.
 */
const inferMediaType = (assetKind: AssetKind, payload: any): string | undefined => {
  if (assetKind === "pdf") return "application/pdf";
  // Notion does not consistently include media types; keep it optional.
  return asString(payload?.media_type) || undefined;
};

/** Casts a plain record to the core `Metadata` type (no runtime validation). */
const asMetadata = (obj: Record<string, unknown>): Metadata => obj as any;

/**
 * Walks a Notion block tree and collects one `AssetInput` per supported asset
 * block (image, pdf, audio, video, file) that exposes a URL.
 *
 * Malformed input is tolerated rather than thrown on: nodes without a `block`,
 * blocks without an `id`, and nodes without `children` are skipped or treated
 * as leaves. Skipping id-less blocks avoids emitting the literal asset id
 * `"undefined"`, which would make unrelated assets collide.
 *
 * @param nodes Root nodes of the block tree.
 * @param opts.maxDepth Deepest level visited, with roots at depth 0 (default 6).
 * @returns Assets in depth-first, document order.
 */
export function extractNotionAssets(
  nodes: NotionBlockNode[],
  opts: { maxDepth?: number } = {}
): AssetInput[] {
  const maxDepth = opts.maxDepth ?? 6;
  const out: AssetInput[] = [];

  const walk = (node: NotionBlockNode, depth: number) => {
    if (!node || depth > maxDepth) return;

    const b = node.block as any;
    const blockId = b?.id == null ? "" : String(b.id);
    const kind = toAssetKind(String(b?.type ?? ""));

    if (kind && blockId) {
      // Notion nests the file payload under a key named after the block type.
      const payload = b[kind];
      const url = pickUrl(payload);
      if (url) {
        const caption = pickCaption(payload).trim();
        const mediaType = inferMediaType(kind, payload);
        out.push({
          assetId: blockId,
          kind,
          data: { kind: "url", url, ...(mediaType ? { mediaType } : {}) },
          uri: url,
          ...(caption ? { text: caption } : {}),
          metadata: asMetadata({
            connector: "notion",
            notionBlockId: blockId,
            notionBlockType: String(b.type),
          }),
        });
      }
    }

    // A node without a children array is treated as a leaf.
    for (const child of node.children ?? []) {
      walk(child, depth + 1);
    }
  };

  for (const n of nodes) walk(n, 0);
  return out;
}
|
|
100
|
+
|
|
23
101
|
export function renderNotionBlocksToText(
|
|
24
102
|
nodes: NotionBlockNode[],
|
|
25
103
|
opts: { maxDepth?: number } = {}
|
|
@@ -1,8 +1,12 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
import type { IngestResult } from "../../core/types";
|
|
1
|
+
import type { IngestResult } from "../../core";
|
|
3
2
|
import { createNotionClient, type NotionClient } from "./client";
|
|
4
3
|
import { normalizeNotionPageId32, toUuidHyphenated } from "./ids";
|
|
5
|
-
import {
|
|
4
|
+
import {
|
|
5
|
+
extractNotionAssets,
|
|
6
|
+
renderNotionBlocksToText,
|
|
7
|
+
type NotionBlock,
|
|
8
|
+
type NotionBlockNode,
|
|
9
|
+
} from "./render";
|
|
6
10
|
import type {
|
|
7
11
|
BuildNotionPageIngestInputArgs,
|
|
8
12
|
NotionPageDocument,
|
|
@@ -29,6 +33,7 @@ export function buildNotionPageIngestInput(
|
|
|
29
33
|
sourceId,
|
|
30
34
|
content: args.content,
|
|
31
35
|
metadata: args.metadata ?? {},
|
|
36
|
+
assets: args.assets ?? [],
|
|
32
37
|
};
|
|
33
38
|
}
|
|
34
39
|
|
|
@@ -108,6 +113,7 @@ export async function loadNotionPageDocument(args: {
|
|
|
108
113
|
const tree = await buildBlockTree(args.notion, apiId, 0, args.maxDepth ?? 4);
|
|
109
114
|
const body = renderNotionBlocksToText(tree);
|
|
110
115
|
const content = [title.trim(), body.trim()].filter(Boolean).join("\n\n");
|
|
116
|
+
const assets = extractNotionAssets(tree);
|
|
111
117
|
|
|
112
118
|
const metadata = {
|
|
113
119
|
connector: "notion",
|
|
@@ -121,6 +127,7 @@ export async function loadNotionPageDocument(args: {
|
|
|
121
127
|
const ingest = buildNotionPageIngestInput({
|
|
122
128
|
pageId,
|
|
123
129
|
content,
|
|
130
|
+
assets,
|
|
124
131
|
metadata: metadata as any,
|
|
125
132
|
sourceIdPrefix: args.sourceIdPrefix,
|
|
126
133
|
});
|
|
@@ -129,6 +136,7 @@ export async function loadNotionPageDocument(args: {
|
|
|
129
136
|
sourceId: ingest.sourceId,
|
|
130
137
|
content: ingest.content,
|
|
131
138
|
metadata: ingest.metadata ?? {},
|
|
139
|
+
assets: ingest.assets ?? [],
|
|
132
140
|
};
|
|
133
141
|
}
|
|
134
142
|
|
|
@@ -178,6 +186,7 @@ export async function syncNotionPages(
|
|
|
178
186
|
const result: IngestResult = await input.engine.ingest({
|
|
179
187
|
sourceId: doc.sourceId,
|
|
180
188
|
content: doc.content,
|
|
189
|
+
assets: doc.assets,
|
|
181
190
|
metadata: doc.metadata as any,
|
|
182
191
|
});
|
|
183
192
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { ContextEngine } from "../../core";
|
|
2
|
-
import type { IngestInput } from "../../core
|
|
2
|
+
import type { AssetInput, IngestInput } from "../../core";
|
|
3
3
|
|
|
4
4
|
export type NotionSyncProgressEvent =
|
|
5
5
|
| { type: "page:start"; pageId: string; sourceId: string }
|
|
@@ -42,11 +42,13 @@ export type NotionPageDocument = {
|
|
|
42
42
|
sourceId: string;
|
|
43
43
|
content: string;
|
|
44
44
|
metadata: Record<string, unknown>;
|
|
45
|
+
assets: AssetInput[];
|
|
45
46
|
};
|
|
46
47
|
|
|
47
48
|
export type BuildNotionPageIngestInputArgs = {
|
|
48
49
|
pageId: string; // normalized 32-hex (no dashes)
|
|
49
50
|
content: string;
|
|
51
|
+
assets?: AssetInput[];
|
|
50
52
|
metadata?: Record<string, unknown>;
|
|
51
53
|
sourceIdPrefix?: string;
|
|
52
54
|
};
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import type { AssetKind, Chunk } from "./types";
|
|
2
|
+
|
|
3
|
+
/** Asset pointer recovered from a chunk's metadata. */
export type ChunkAssetRef = {
  assetId: string;
  assetKind: AssetKind;
  assetUri?: string;
  assetMediaType?: string;
  extractor?: string;
};

/** Metadata `assetKind` values accepted as valid. */
const assetKinds = new Set<AssetKind>(["image", "pdf", "audio", "video", "file"]);

/**
 * Reads the asset reference stored in a retrieved chunk's `metadata`.
 *
 * A chunk counts as an asset chunk when its metadata has:
 * - `assetKind`: one of "image" | "pdf" | "audio" | "video" | "file"
 * - `assetId`: a non-empty string
 *
 * `assetUri`, `assetMediaType` and `extractor` are copied over only when they
 * are non-empty strings.
 *
 * @returns The reference, or `null` when `assetKind` or `assetId` is missing or invalid.
 */
export function getChunkAssetRef(
  chunk: Pick<Chunk, "metadata">
): ChunkAssetRef | null {
  const meta = chunk.metadata as any;
  const rawKind: unknown = meta?.assetKind;
  const rawId: unknown = meta?.assetId;

  const kindIsValid =
    typeof rawKind === "string" && assetKinds.has(rawKind as AssetKind);
  const idIsValid = typeof rawId === "string" && rawId.length > 0;
  if (!kindIsValid || !idIsValid) {
    return null;
  }

  const ref: ChunkAssetRef = {
    assetId: rawId as string,
    assetKind: rawKind as AssetKind,
  };

  // Optional fields: keep only non-empty strings, drop everything else.
  for (const field of ["assetUri", "assetMediaType", "extractor"] as const) {
    const value: unknown = meta?.[field];
    if (typeof value === "string" && value) {
      ref[field] = value;
    }
  }

  return ref;
}
|
|
49
|
+
|
|
50
|
+
/** Reports whether `getChunkAssetRef` finds a valid asset reference on the chunk. */
export function isAssetChunk(chunk: Pick<Chunk, "metadata">): boolean {
  const ref = getChunkAssetRef(chunk);
  return ref !== null;
}
|
|
53
|
+
|
|
54
|
+
|
package/registry/core/config.ts
CHANGED
|
@@ -2,6 +2,9 @@ import type {
|
|
|
2
2
|
Chunker,
|
|
3
3
|
ContextEngineConfig,
|
|
4
4
|
ResolvedContextEngineConfig,
|
|
5
|
+
AssetProcessingConfig,
|
|
6
|
+
DeepPartial,
|
|
7
|
+
ContentStorageConfig,
|
|
5
8
|
} from "./types";
|
|
6
9
|
import { defaultChunker, resolveChunkingOptions } from "./chunking";
|
|
7
10
|
|
|
@@ -10,6 +13,150 @@ export const defineConfig = (config: ContextEngineConfig): ContextEngineConfig =
|
|
|
10
13
|
|
|
11
14
|
const defaultIdGenerator = () => crypto.randomUUID();
|
|
12
15
|
|
|
16
|
+
// Default model ids, one constant per extractor so each can be changed in one place.
const DEFAULT_PDF_LLM_MODEL = "google/gemini-2.0-flash";
const DEFAULT_IMAGE_OCR_MODEL = "google/gemini-2.0-flash";
const DEFAULT_IMAGE_CAPTION_MODEL = "google/gemini-2.0-flash";
const DEFAULT_AUDIO_TRANSCRIBE_MODEL = "openai/whisper-1";
const DEFAULT_VIDEO_TRANSCRIBE_MODEL = "openai/whisper-1";
const DEFAULT_VIDEO_FRAMES_MODEL = "google/gemini-2.0-flash";

/** Bytes per mebibyte; keeps the size limits below readable. */
const BYTES_PER_MB = 1024 * 1024;

/**
 * Library defaults for asset processing.
 *
 * Every extractor ships with `enabled: false`; only URL fetching is on by
 * default. Unsupported assets and extraction errors are skipped.
 */
export const defaultAssetProcessingConfig: AssetProcessingConfig = {
  onUnsupportedAsset: "skip",
  onError: "skip",
  concurrency: 4,
  hooks: {
    onEvent: undefined,
  },
  fetch: {
    enabled: true,
    allowedHosts: undefined,
    maxBytes: 15 * BYTES_PER_MB,
    timeoutMs: 20_000,
    headers: undefined,
  },
  pdf: {
    textLayer: {
      enabled: false,
      maxBytes: 15 * BYTES_PER_MB,
      maxOutputChars: 200_000,
      minChars: 200,
      maxPages: undefined,
    },
    llmExtraction: {
      enabled: false, // library default (cost-safe)
      model: DEFAULT_PDF_LLM_MODEL,
      prompt:
        "Extract all readable text from this PDF as faithfully as possible. Preserve structure with headings and lists when obvious. Output plain text or markdown only. Do not add commentary.",
      timeoutMs: 60_000,
      maxBytes: 15 * BYTES_PER_MB,
      maxOutputChars: 200_000,
    },
    ocr: {
      enabled: false,
      maxBytes: 15 * BYTES_PER_MB,
      maxOutputChars: 200_000,
      minChars: 200,
      maxPages: undefined,
      pdftoppmPath: undefined,
      tesseractPath: undefined,
      dpi: 200,
      lang: "eng",
    },
  },
  image: {
    ocr: {
      enabled: false,
      model: DEFAULT_IMAGE_OCR_MODEL,
      prompt:
        "Extract all readable text from this image as faithfully as possible. Output plain text only. Do not add commentary.",
      timeoutMs: 60_000,
      maxBytes: 10 * BYTES_PER_MB,
      maxOutputChars: 50_000,
    },
    captionLlm: {
      enabled: false,
      model: DEFAULT_IMAGE_CAPTION_MODEL,
      prompt:
        "Write a concise, information-dense caption for this image. Include names, numbers, and labels if visible. Output plain text only.",
      timeoutMs: 60_000,
      maxBytes: 10 * BYTES_PER_MB,
      maxOutputChars: 10_000,
    },
  },
  audio: {
    transcription: {
      enabled: false,
      model: DEFAULT_AUDIO_TRANSCRIBE_MODEL,
      timeoutMs: 120_000,
      maxBytes: 25 * BYTES_PER_MB,
    },
  },
  video: {
    transcription: {
      enabled: false,
      model: DEFAULT_VIDEO_TRANSCRIBE_MODEL,
      timeoutMs: 120_000,
      maxBytes: 50 * BYTES_PER_MB,
    },
    frames: {
      enabled: false,
      sampleFps: 0.2,
      maxFrames: 50,
      ffmpegPath: undefined,
      maxBytes: 50 * BYTES_PER_MB,
      model: DEFAULT_VIDEO_FRAMES_MODEL,
      prompt:
        "Extract all readable text from this video frame as faithfully as possible. Output plain text only. Do not add commentary.",
      timeoutMs: 60_000,
      maxOutputChars: 50_000,
    },
  },
  file: {
    text: { enabled: false, maxBytes: 5 * BYTES_PER_MB, maxOutputChars: 200_000, minChars: 50 },
    docx: { enabled: false, maxBytes: 15 * BYTES_PER_MB, maxOutputChars: 200_000, minChars: 50 },
    pptx: { enabled: false, maxBytes: 30 * BYTES_PER_MB, maxOutputChars: 200_000, minChars: 50 },
    xlsx: { enabled: false, maxBytes: 30 * BYTES_PER_MB, maxOutputChars: 200_000, minChars: 50 },
  },
};
|
|
120
|
+
|
|
121
|
+
/**
 * Library defaults for content storage: both chunk text and full document
 * text are stored unless overridden.
 */
export const defaultContentStorageConfig: ContentStorageConfig = {
  storeChunkContent: true,
  storeDocumentContent: true,
};
|
|
125
|
+
|
|
126
|
+
/** True for object literals and null-prototype objects; false for arrays, class instances, Date, Map, etc. */
const isPlainObject = (value: unknown): value is Record<string, any> => {
  if (value === null || typeof value !== "object" || Array.isArray(value)) return false;
  const proto = Object.getPrototypeOf(value);
  return proto === Object.prototype || proto === null;
};

/**
 * Recursively layers `overrides` on top of `base` without mutating either.
 *
 * - `undefined` override values are ignored, so the base value is kept.
 * - Two plain objects are merged key by key; anything else (primitives,
 *   arrays, `null`, class instances) replaces the base value wholesale.
 * - An own `__proto__` key in `overrides` (possible after `JSON.parse`) is
 *   skipped: assigning it would go through the `__proto__` setter and swap the
 *   result's prototype instead of setting a property.
 *
 * When `overrides` is falsy, `base` itself is returned, and branches that are
 * not overridden are shared by reference with `base`, so callers should treat
 * the result as read-only.
 *
 * @param base Complete value supplying defaults.
 * @param overrides Partial value layered on top.
 * @returns `base` when there are no overrides, otherwise a new merged value.
 */
const mergeDeep = <T extends Record<string, any>>(
  base: T,
  overrides: DeepPartial<T> | undefined
): T => {
  if (!overrides) return base;

  const out: any = Array.isArray(base) ? [...base] : { ...base };
  for (const key of Object.keys(overrides)) {
    if (key === "__proto__") continue;

    const nextVal = (overrides as Record<string, unknown>)[key];
    if (nextVal === undefined) continue;

    const baseVal = (base as Record<string, unknown>)[key];
    out[key] =
      isPlainObject(baseVal) && isPlainObject(nextVal)
        ? mergeDeep(baseVal, nextVal as any)
        : nextVal;
  }
  return out as T;
};
|
|
151
|
+
|
|
152
|
+
/**
 * Produces the effective asset-processing config by merging `overrides` onto
 * `defaultAssetProcessingConfig` with `mergeDeep`.
 */
export const resolveAssetProcessingConfig = (
  overrides?: DeepPartial<AssetProcessingConfig>
): AssetProcessingConfig => {
  return mergeDeep(defaultAssetProcessingConfig, overrides);
};
|
|
155
|
+
|
|
156
|
+
/**
 * Produces the effective content-storage config by merging `overrides` onto
 * `defaultContentStorageConfig` with `mergeDeep`.
 */
export const resolveContentStorageConfig = (
  overrides?: DeepPartial<ContentStorageConfig>
): ContentStorageConfig => {
  return mergeDeep(defaultContentStorageConfig, overrides);
};
|
|
159
|
+
|
|
13
160
|
export const resolveConfig = (
|
|
14
161
|
config: ContextEngineConfig
|
|
15
162
|
): ResolvedContextEngineConfig => {
|
|
@@ -21,6 +168,9 @@ export const resolveConfig = (
|
|
|
21
168
|
defaults: resolveChunkingOptions(config.defaults),
|
|
22
169
|
chunker,
|
|
23
170
|
idGenerator: config.idGenerator ?? defaultIdGenerator,
|
|
171
|
+
extractors: config.extractors ?? [],
|
|
172
|
+
storage: resolveContentStorageConfig(config.storage),
|
|
173
|
+
assetProcessing: resolveAssetProcessingConfig(config.assetProcessing),
|
|
24
174
|
};
|
|
25
175
|
};
|
|
26
176
|
|
|
@@ -1,15 +1,21 @@
|
|
|
1
1
|
import { deleteDocuments } from "./delete";
|
|
2
|
-
import { ingest } from "./ingest";
|
|
2
|
+
import { ingest, planIngest } from "./ingest";
|
|
3
3
|
import { retrieve } from "./retrieve";
|
|
4
4
|
import { defineConfig, resolveConfig } from "./config";
|
|
5
|
+
import { createAiEmbeddingProvider } from "../embedding/ai";
|
|
5
6
|
import type {
|
|
7
|
+
AssetExtractor,
|
|
6
8
|
ContextEngineConfig,
|
|
7
9
|
DeleteInput,
|
|
10
|
+
DefineUnragConfigInput,
|
|
11
|
+
EmbeddingProvider,
|
|
8
12
|
IngestInput,
|
|
9
13
|
IngestResult,
|
|
14
|
+
IngestPlanResult,
|
|
10
15
|
ResolvedContextEngineConfig,
|
|
11
16
|
RetrieveInput,
|
|
12
17
|
RetrieveResult,
|
|
18
|
+
UnragCreateEngineRuntime,
|
|
13
19
|
} from "./types";
|
|
14
20
|
|
|
15
21
|
export class ContextEngine {
|
|
@@ -23,6 +29,16 @@ export class ContextEngine {
|
|
|
23
29
|
return ingest(this.config, input);
|
|
24
30
|
}
|
|
25
31
|
|
|
32
|
+
/**
 * Ingestion dry-run: delegates to the module-level `planIngest` with this
 * engine's resolved config.
 *
 * Reports which assets would be processed and by which extractors, without
 * calling external services. Chunk counts and embeddings are not produced.
 */
async planIngest(input: IngestInput): Promise<IngestPlanResult> {
  const plan = await planIngest(this.config, input);
  return plan;
}
|
|
41
|
+
|
|
26
42
|
async retrieve(input: RetrieveInput): Promise<RetrieveResult> {
|
|
27
43
|
return retrieve(this.config, input);
|
|
28
44
|
}
|
|
@@ -37,4 +53,56 @@ export const createContextEngine = (config: ContextEngineConfig) =>
|
|
|
37
53
|
|
|
38
54
|
export { defineConfig };
|
|
39
55
|
|
|
56
|
+
/**
 * Higher-level config wrapper intended for `unrag.config.ts`.
 *
 * Takes the declarative config (defaults, embedding, engine settings) and
 * returns:
 * - `defaults`: chunking options (default `{}`) and retrieval `topK` (default 8),
 * - `createEngineConfig(runtime)`: a `ContextEngineConfig` combining the engine
 *   settings with the runtime store and extractors,
 * - `createEngine(runtime)`: a `ContextEngine` built from that config.
 *
 * The embedding provider is created on first use and reused afterwards.
 */
export const defineUnragConfig = <T extends DefineUnragConfigInput>(config: T) => {
  let cachedProvider: EmbeddingProvider | undefined;

  // Lazily builds the provider: the built-in "ai" provider, or the custom factory.
  const getEmbeddingProvider = () => {
    if (!cachedProvider) {
      cachedProvider =
        config.embedding.provider === "ai"
          ? createAiEmbeddingProvider(config.embedding.config)
          : config.embedding.create();
    }
    return cachedProvider;
  };

  const chunking = config.defaults?.chunking ?? {};
  const topK = config.defaults?.retrieval?.topK ?? 8;
  const defaults = {
    chunking,
    retrieval: {
      topK,
    },
  } as const;

  const createEngineConfig = (runtime: UnragCreateEngineRuntime): ContextEngineConfig => {
    const configured = (config.engine?.extractors ?? []) as AssetExtractor[];
    // Runtime extractors may be a function of the configured list, a
    // replacement list, or absent (in which case the configured list is used).
    const runtimeExtractors = runtime.extractors;
    const extractors =
      typeof runtimeExtractors === "function"
        ? runtimeExtractors(configured)
        : runtimeExtractors ?? configured;

    const engineSettings = config.engine ?? {};
    return defineConfig({
      ...engineSettings,
      defaults: defaults.chunking,
      embedding: getEmbeddingProvider(),
      store: runtime.store,
      extractors,
    });
  };

  const createEngine = (runtime: UnragCreateEngineRuntime) =>
    new ContextEngine(createEngineConfig(runtime));

  return {
    defaults,
    createEngineConfig,
    createEngine,
  };
};
|
|
107
|
+
|
|
40
108
|
|
package/registry/core/index.ts
CHANGED
|
@@ -1,8 +1,21 @@
|
|
|
1
|
-
export {
|
|
1
|
+
export {
|
|
2
|
+
ContextEngine,
|
|
3
|
+
createContextEngine,
|
|
4
|
+
defineConfig,
|
|
5
|
+
defineUnragConfig,
|
|
6
|
+
} from "./context-engine";
|
|
2
7
|
export { deleteDocuments } from "./delete";
|
|
3
|
-
export { ingest } from "./ingest";
|
|
8
|
+
export { ingest, planIngest } from "./ingest";
|
|
4
9
|
export { retrieve } from "./retrieve";
|
|
5
10
|
export { defaultChunker, resolveChunkingOptions } from "./chunking";
|
|
11
|
+
export {
|
|
12
|
+
defaultAssetProcessingConfig,
|
|
13
|
+
defaultContentStorageConfig,
|
|
14
|
+
resolveAssetProcessingConfig,
|
|
15
|
+
resolveContentStorageConfig,
|
|
16
|
+
} from "./config";
|
|
17
|
+
export { getChunkAssetRef, isAssetChunk } from "./assets";
|
|
18
|
+
export type { ChunkAssetRef } from "./assets";
|
|
6
19
|
export * from "./types";
|
|
7
20
|
|
|
8
21
|
|