unrag 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/registry/connectors/google-drive/_api-types.ts +60 -0
- package/registry/connectors/google-drive/client.ts +99 -38
- package/registry/connectors/google-drive/sync.ts +97 -69
- package/registry/connectors/google-drive/types.ts +76 -37
- package/registry/connectors/notion/client.ts +12 -3
- package/registry/connectors/notion/render.ts +62 -23
- package/registry/connectors/notion/sync.ts +30 -23
- package/registry/core/assets.ts +11 -10
- package/registry/core/config.ts +10 -25
- package/registry/core/context-engine.ts +5 -0
- package/registry/core/deep-merge.ts +45 -0
- package/registry/core/ingest.ts +117 -44
- package/registry/core/types.ts +52 -0
- package/registry/embedding/_shared.ts +6 -1
- package/registry/embedding/ai.ts +2 -3
- package/registry/embedding/azure.ts +11 -2
- package/registry/embedding/bedrock.ts +11 -2
- package/registry/embedding/cohere.ts +11 -2
- package/registry/embedding/google.ts +11 -2
- package/registry/embedding/mistral.ts +11 -2
- package/registry/embedding/ollama.ts +18 -3
- package/registry/embedding/openai.ts +11 -2
- package/registry/embedding/openrouter.ts +53 -11
- package/registry/embedding/together.ts +15 -5
- package/registry/embedding/vertex.ts +11 -2
- package/registry/embedding/voyage.ts +16 -6
- package/registry/extractors/audio-transcribe/index.ts +39 -23
- package/registry/extractors/file-docx/index.ts +8 -1
- package/registry/extractors/file-pptx/index.ts +22 -1
- package/registry/extractors/file-xlsx/index.ts +24 -1
- package/registry/extractors/image-caption-llm/index.ts +8 -3
- package/registry/extractors/image-ocr/index.ts +9 -4
- package/registry/extractors/pdf-llm/index.ts +9 -4
- package/registry/extractors/pdf-text-layer/index.ts +23 -2
- package/registry/extractors/video-frames/index.ts +8 -3
- package/registry/extractors/video-transcribe/index.ts +40 -24
- package/registry/manifest.json +6 -6
- package/registry/store/drizzle-postgres-pgvector/store.ts +24 -7
|
@@ -1,5 +1,78 @@
|
|
|
1
1
|
import type { ContextEngine, AssetInput, IngestInput } from "../../core";
|
|
2
2
|
|
|
3
|
+
/**
|
|
4
|
+
* Service account credentials structure.
|
|
5
|
+
*/
|
|
6
|
+
export interface ServiceAccountCredentials {
|
|
7
|
+
client_email: string;
|
|
8
|
+
private_key: string;
|
|
9
|
+
[key: string]: unknown;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* OAuth auth with an existing client instance.
|
|
14
|
+
*/
|
|
15
|
+
export type GoogleDriveOAuthClientAuth = {
|
|
16
|
+
/** Use an existing OAuth2 client instance (recommended if your app already has one). */
|
|
17
|
+
kind: "oauth";
|
|
18
|
+
oauthClient: unknown;
|
|
19
|
+
clientId?: never;
|
|
20
|
+
clientSecret?: never;
|
|
21
|
+
redirectUri?: never;
|
|
22
|
+
refreshToken?: never;
|
|
23
|
+
accessToken?: never;
|
|
24
|
+
};
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* OAuth auth with credentials for building a client.
|
|
28
|
+
*/
|
|
29
|
+
export type GoogleDriveOAuthConfigAuth = {
|
|
30
|
+
/**
|
|
31
|
+
* Convenience form for OAuth2: the connector will construct an OAuth2 client
|
|
32
|
+
* and set credentials including the refresh token.
|
|
33
|
+
*/
|
|
34
|
+
kind: "oauth";
|
|
35
|
+
clientId: string;
|
|
36
|
+
clientSecret: string;
|
|
37
|
+
redirectUri: string;
|
|
38
|
+
refreshToken: string;
|
|
39
|
+
/** Optional access token if you already have one. */
|
|
40
|
+
accessToken?: string;
|
|
41
|
+
oauthClient?: never;
|
|
42
|
+
};
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* OAuth auth (either form).
|
|
46
|
+
*/
|
|
47
|
+
export type GoogleDriveOAuthAuth = GoogleDriveOAuthClientAuth | GoogleDriveOAuthConfigAuth;
|
|
48
|
+
|
|
49
|
+
/**
|
|
50
|
+
* Service account auth.
|
|
51
|
+
*/
|
|
52
|
+
export type GoogleDriveServiceAccountAuth = {
|
|
53
|
+
/**
|
|
54
|
+
* Service account credentials. This supports both:
|
|
55
|
+
* - direct service-account access (files must be shared to the service account)
|
|
56
|
+
* - Workspace domain-wide delegation (DWD) when `subject` is provided
|
|
57
|
+
*/
|
|
58
|
+
kind: "service_account";
|
|
59
|
+
credentialsJson: string | ServiceAccountCredentials;
|
|
60
|
+
/**
|
|
61
|
+
* DWD impersonation subject email (Workspace only).
|
|
62
|
+
* When provided, the service account will impersonate this user.
|
|
63
|
+
*/
|
|
64
|
+
subject?: string;
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* Google Auth escape hatch.
|
|
69
|
+
*/
|
|
70
|
+
export type GoogleDriveGoogleAuthAuth = {
|
|
71
|
+
/** Escape hatch: provide a pre-configured GoogleAuth (or equivalent) instance. */
|
|
72
|
+
kind: "google_auth";
|
|
73
|
+
auth: unknown;
|
|
74
|
+
};
|
|
75
|
+
|
|
3
76
|
/**
|
|
4
77
|
* A plug-and-play auth input for Google Drive.
|
|
5
78
|
*
|
|
@@ -8,43 +81,9 @@ import type { ContextEngine, AssetInput, IngestInput } from "../../core";
|
|
|
8
81
|
* by the CLI (`unrag add google-drive`).
|
|
9
82
|
*/
|
|
10
83
|
export type GoogleDriveAuth =
|
|
11
|
-
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
oauthClient: unknown;
|
|
15
|
-
}
|
|
16
|
-
| {
|
|
17
|
-
/**
|
|
18
|
-
* Convenience form for OAuth2: the connector will construct an OAuth2 client
|
|
19
|
-
* and set credentials including the refresh token.
|
|
20
|
-
*/
|
|
21
|
-
kind: "oauth";
|
|
22
|
-
clientId: string;
|
|
23
|
-
clientSecret: string;
|
|
24
|
-
redirectUri: string;
|
|
25
|
-
refreshToken: string;
|
|
26
|
-
/** Optional access token if you already have one. */
|
|
27
|
-
accessToken?: string;
|
|
28
|
-
}
|
|
29
|
-
| {
|
|
30
|
-
/**
|
|
31
|
-
* Service account credentials. This supports both:
|
|
32
|
-
* - direct service-account access (files must be shared to the service account)
|
|
33
|
-
* - Workspace domain-wide delegation (DWD) when `subject` is provided
|
|
34
|
-
*/
|
|
35
|
-
kind: "service_account";
|
|
36
|
-
credentialsJson: string | Record<string, unknown>;
|
|
37
|
-
/**
|
|
38
|
-
* DWD impersonation subject email (Workspace only).
|
|
39
|
-
* When provided, the service account will impersonate this user.
|
|
40
|
-
*/
|
|
41
|
-
subject?: string;
|
|
42
|
-
}
|
|
43
|
-
| {
|
|
44
|
-
/** Escape hatch: provide a pre-configured GoogleAuth (or equivalent) instance. */
|
|
45
|
-
kind: "google_auth";
|
|
46
|
-
auth: unknown;
|
|
47
|
-
};
|
|
84
|
+
| GoogleDriveOAuthAuth
|
|
85
|
+
| GoogleDriveServiceAccountAuth
|
|
86
|
+
| GoogleDriveGoogleAuthAuth;
|
|
48
87
|
|
|
49
88
|
export type GoogleDriveSyncProgressEvent =
|
|
50
89
|
| { type: "file:start"; fileId: string; sourceId: string }
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Client } from "@notionhq/client";
|
|
1
|
+
import { Client, type ClientOptions } from "@notionhq/client";
|
|
2
2
|
|
|
3
3
|
export type NotionClient = Client;
|
|
4
4
|
|
|
@@ -7,16 +7,25 @@ export type CreateNotionClientInput = {
|
|
|
7
7
|
timeoutMs?: number;
|
|
8
8
|
};
|
|
9
9
|
|
|
10
|
+
/**
|
|
11
|
+
* Extended client options that include timeoutMs (supported by @notionhq/client).
|
|
12
|
+
*/
|
|
13
|
+
type NotionClientOptions = ClientOptions & {
|
|
14
|
+
timeoutMs?: number;
|
|
15
|
+
};
|
|
16
|
+
|
|
10
17
|
export function createNotionClient(input: CreateNotionClientInput): NotionClient {
|
|
11
18
|
const token = input.token?.trim();
|
|
12
19
|
if (!token) throw new Error("NOTION token is required");
|
|
13
20
|
|
|
14
|
-
|
|
21
|
+
const options: NotionClientOptions = {
|
|
15
22
|
auth: token,
|
|
16
23
|
// @notionhq/client uses undici/fetch under the hood; timeout is supported.
|
|
17
24
|
// If unsupported in a future version, callers can wrap requests.
|
|
18
25
|
timeoutMs: input.timeoutMs ?? 30_000,
|
|
19
|
-
}
|
|
26
|
+
};
|
|
27
|
+
|
|
28
|
+
return new Client(options);
|
|
20
29
|
}
|
|
21
30
|
|
|
22
31
|
|
|
@@ -15,6 +15,31 @@ export type NotionBlockNode = {
|
|
|
15
15
|
children: NotionBlockNode[];
|
|
16
16
|
};
|
|
17
17
|
|
|
18
|
+
/**
|
|
19
|
+
* Notion block content payload that may contain rich_text and other properties.
|
|
20
|
+
*/
|
|
21
|
+
interface BlockPayload {
|
|
22
|
+
rich_text?: RichText[];
|
|
23
|
+
checked?: boolean;
|
|
24
|
+
language?: string;
|
|
25
|
+
caption?: RichText[];
|
|
26
|
+
type?: string;
|
|
27
|
+
external?: { url?: string };
|
|
28
|
+
file?: { url?: string };
|
|
29
|
+
media_type?: string;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Get block-type-specific payload from a Notion block.
|
|
34
|
+
*/
|
|
35
|
+
const getBlockPayload = (block: NotionBlock, type: string): BlockPayload | undefined => {
|
|
36
|
+
const payload = block[type];
|
|
37
|
+
if (typeof payload === "object" && payload !== null) {
|
|
38
|
+
return payload as BlockPayload;
|
|
39
|
+
}
|
|
40
|
+
return undefined;
|
|
41
|
+
};
|
|
42
|
+
|
|
18
43
|
const rt = (value: unknown): string => {
|
|
19
44
|
const items = Array.isArray(value) ? (value as RichText[]) : [];
|
|
20
45
|
return items.map((t) => t?.plain_text ?? "").join("");
|
|
@@ -37,25 +62,30 @@ const toAssetKind = (notionType: string): AssetKind | null => {
|
|
|
37
62
|
return supportedAssetKinds.has(t) ? t : null;
|
|
38
63
|
};
|
|
39
64
|
|
|
40
|
-
const pickUrl = (payload:
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
if (type === "
|
|
65
|
+
const pickUrl = (payload: BlockPayload | undefined): string | undefined => {
|
|
66
|
+
if (!payload) return undefined;
|
|
67
|
+
const type = String(payload.type ?? "");
|
|
68
|
+
if (type === "external") return asString(payload.external?.url);
|
|
69
|
+
if (type === "file") return asString(payload.file?.url);
|
|
44
70
|
return undefined;
|
|
45
71
|
};
|
|
46
72
|
|
|
47
|
-
const pickCaption = (payload:
|
|
73
|
+
const pickCaption = (payload: BlockPayload | undefined): string => {
|
|
48
74
|
// Notion captions are typically an array of rich text items.
|
|
49
75
|
return rt(payload?.caption);
|
|
50
76
|
};
|
|
51
77
|
|
|
52
|
-
const inferMediaType = (assetKind: AssetKind, payload:
|
|
78
|
+
const inferMediaType = (assetKind: AssetKind, payload: BlockPayload | undefined): string | undefined => {
|
|
53
79
|
if (assetKind === "pdf") return "application/pdf";
|
|
54
80
|
// Notion does not consistently include media types; keep it optional.
|
|
55
81
|
return asString(payload?.media_type) || undefined;
|
|
56
82
|
};
|
|
57
83
|
|
|
58
|
-
|
|
84
|
+
/**
|
|
85
|
+
* Type a plain string-valued object as Metadata (identity at runtime; the object is returned unchanged).
|
|
86
|
+
* The Metadata type allows string, number, boolean, null values.
|
|
87
|
+
*/
|
|
88
|
+
const toMetadata = (obj: Record<string, string>): Metadata => obj;
|
|
59
89
|
|
|
60
90
|
export function extractNotionAssets(
|
|
61
91
|
nodes: NotionBlockNode[],
|
|
@@ -66,10 +96,10 @@ export function extractNotionAssets(
|
|
|
66
96
|
|
|
67
97
|
const walk = (node: NotionBlockNode, depth: number) => {
|
|
68
98
|
if (depth > maxDepth) return;
|
|
69
|
-
const b = node.block
|
|
99
|
+
const b = node.block;
|
|
70
100
|
const kind = toAssetKind(String(b.type ?? ""));
|
|
71
101
|
if (kind) {
|
|
72
|
-
const payload = b
|
|
102
|
+
const payload = getBlockPayload(b, kind);
|
|
73
103
|
const url = pickUrl(payload);
|
|
74
104
|
if (url) {
|
|
75
105
|
const caption = pickCaption(payload).trim();
|
|
@@ -80,7 +110,7 @@ export function extractNotionAssets(
|
|
|
80
110
|
data: { kind: "url", url, ...(mediaType ? { mediaType } : {}) },
|
|
81
111
|
uri: url,
|
|
82
112
|
...(caption ? { text: caption } : {}),
|
|
83
|
-
metadata:
|
|
113
|
+
metadata: toMetadata({
|
|
84
114
|
connector: "notion",
|
|
85
115
|
notionBlockId: String(b.id),
|
|
86
116
|
notionBlockType: String(b.type),
|
|
@@ -108,40 +138,49 @@ export function renderNotionBlocksToText(
|
|
|
108
138
|
const walk = (node: NotionBlockNode, depth: number, listDepth: number) => {
|
|
109
139
|
if (depth > maxDepth) return;
|
|
110
140
|
const b = node.block;
|
|
111
|
-
|
|
112
141
|
const t = b.type;
|
|
113
142
|
|
|
114
143
|
if (t === "paragraph") {
|
|
115
|
-
const
|
|
144
|
+
const payload = getBlockPayload(b, "paragraph");
|
|
145
|
+
const text = rt(payload?.rich_text);
|
|
116
146
|
if (text.trim()) lines.push(text);
|
|
117
147
|
} else if (t === "heading_1") {
|
|
118
|
-
const
|
|
148
|
+
const payload = getBlockPayload(b, "heading_1");
|
|
149
|
+
const text = rt(payload?.rich_text);
|
|
119
150
|
if (text.trim()) lines.push(`# ${text}`);
|
|
120
151
|
} else if (t === "heading_2") {
|
|
121
|
-
const
|
|
152
|
+
const payload = getBlockPayload(b, "heading_2");
|
|
153
|
+
const text = rt(payload?.rich_text);
|
|
122
154
|
if (text.trim()) lines.push(`## ${text}`);
|
|
123
155
|
} else if (t === "heading_3") {
|
|
124
|
-
const
|
|
156
|
+
const payload = getBlockPayload(b, "heading_3");
|
|
157
|
+
const text = rt(payload?.rich_text);
|
|
125
158
|
if (text.trim()) lines.push(`### ${text}`);
|
|
126
159
|
} else if (t === "bulleted_list_item") {
|
|
127
|
-
const
|
|
160
|
+
const payload = getBlockPayload(b, "bulleted_list_item");
|
|
161
|
+
const text = rt(payload?.rich_text);
|
|
128
162
|
if (text.trim()) lines.push(`${indent(listDepth)}- ${text}`);
|
|
129
163
|
} else if (t === "numbered_list_item") {
|
|
130
|
-
const
|
|
164
|
+
const payload = getBlockPayload(b, "numbered_list_item");
|
|
165
|
+
const text = rt(payload?.rich_text);
|
|
131
166
|
if (text.trim()) lines.push(`${indent(listDepth)}- ${text}`);
|
|
132
167
|
} else if (t === "to_do") {
|
|
133
|
-
const
|
|
134
|
-
const
|
|
168
|
+
const payload = getBlockPayload(b, "to_do");
|
|
169
|
+
const text = rt(payload?.rich_text);
|
|
170
|
+
const checked = Boolean(payload?.checked);
|
|
135
171
|
if (text.trim()) lines.push(`${indent(listDepth)}- [${checked ? "x" : " "}] ${text}`);
|
|
136
172
|
} else if (t === "quote") {
|
|
137
|
-
const
|
|
173
|
+
const payload = getBlockPayload(b, "quote");
|
|
174
|
+
const text = rt(payload?.rich_text);
|
|
138
175
|
if (text.trim()) lines.push(`> ${text}`);
|
|
139
176
|
} else if (t === "callout") {
|
|
140
|
-
const
|
|
177
|
+
const payload = getBlockPayload(b, "callout");
|
|
178
|
+
const text = rt(payload?.rich_text);
|
|
141
179
|
if (text.trim()) lines.push(text);
|
|
142
180
|
} else if (t === "code") {
|
|
143
|
-
const
|
|
144
|
-
const
|
|
181
|
+
const payload = getBlockPayload(b, "code");
|
|
182
|
+
const text = rt(payload?.rich_text);
|
|
183
|
+
const lang = String(payload?.language ?? "").trim();
|
|
145
184
|
lines.push("```" + lang);
|
|
146
185
|
if (text.trim()) lines.push(text);
|
|
147
186
|
lines.push("```");
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
-
import type { IngestResult } from "../../core";
|
|
1
|
+
import type { IngestResult, Metadata } from "../../core";
|
|
2
|
+
import { isFullPage } from "@notionhq/client";
|
|
3
|
+
import type {
|
|
4
|
+
GetPageResponse,
|
|
5
|
+
ListBlockChildrenResponse,
|
|
6
|
+
RichTextItemResponse,
|
|
7
|
+
} from "@notionhq/client/build/src/api-endpoints";
|
|
2
8
|
import { createNotionClient, type NotionClient } from "./client";
|
|
3
9
|
import { normalizeNotionPageId32, toUuidHyphenated } from "./ids";
|
|
4
10
|
import {
|
|
@@ -37,17 +43,16 @@ export function buildNotionPageIngestInput(
|
|
|
37
43
|
};
|
|
38
44
|
}
|
|
39
45
|
|
|
40
|
-
const richTextToText = (richText:
|
|
41
|
-
(
|
|
42
|
-
.map((t) => String(t?.plain_text ?? ""))
|
|
43
|
-
.join("");
|
|
46
|
+
const richTextToText = (richText: RichTextItemResponse[] | undefined): string =>
|
|
47
|
+
(richText ?? []).map((t) => t.plain_text).join("");
|
|
44
48
|
|
|
45
|
-
const getNotionPageTitle = (page:
|
|
46
|
-
|
|
49
|
+
const getNotionPageTitle = (page: GetPageResponse): string => {
|
|
50
|
+
if (!isFullPage(page)) return "";
|
|
51
|
+
const props = page.properties;
|
|
47
52
|
for (const key of Object.keys(props)) {
|
|
48
53
|
const p = props[key];
|
|
49
|
-
if (p
|
|
50
|
-
return richTextToText(p
|
|
54
|
+
if (p.type === "title") {
|
|
55
|
+
return richTextToText(p.title);
|
|
51
56
|
}
|
|
52
57
|
}
|
|
53
58
|
return "";
|
|
@@ -61,15 +66,15 @@ async function listAllBlockChildren(
|
|
|
61
66
|
let cursor: string | undefined = undefined;
|
|
62
67
|
|
|
63
68
|
while (true) {
|
|
64
|
-
const res:
|
|
69
|
+
const res: ListBlockChildrenResponse = await notion.blocks.children.list({
|
|
65
70
|
block_id: blockId,
|
|
66
71
|
start_cursor: cursor,
|
|
67
72
|
page_size: 100,
|
|
68
73
|
});
|
|
69
74
|
|
|
70
|
-
blocks.push(...(
|
|
71
|
-
if (!res
|
|
72
|
-
cursor = res
|
|
75
|
+
blocks.push(...(res.results as NotionBlock[]));
|
|
76
|
+
if (!res.has_more) break;
|
|
77
|
+
cursor = res.next_cursor ?? undefined;
|
|
73
78
|
if (!cursor) break;
|
|
74
79
|
}
|
|
75
80
|
|
|
@@ -105,30 +110,30 @@ export async function loadNotionPageDocument(args: {
|
|
|
105
110
|
const pageId = normalizeNotionPageId32(args.pageIdOrUrl);
|
|
106
111
|
const apiId = toUuidHyphenated(pageId);
|
|
107
112
|
|
|
108
|
-
const page:
|
|
113
|
+
const page: GetPageResponse = await args.notion.pages.retrieve({ page_id: apiId });
|
|
109
114
|
const title = getNotionPageTitle(page);
|
|
110
|
-
const url =
|
|
111
|
-
const lastEditedTime =
|
|
115
|
+
const url = isFullPage(page) ? page.url : "";
|
|
116
|
+
const lastEditedTime = isFullPage(page) ? page.last_edited_time : "";
|
|
112
117
|
|
|
113
118
|
const tree = await buildBlockTree(args.notion, apiId, 0, args.maxDepth ?? 4);
|
|
114
119
|
const body = renderNotionBlocksToText(tree);
|
|
115
120
|
const content = [title.trim(), body.trim()].filter(Boolean).join("\n\n");
|
|
116
121
|
const assets = extractNotionAssets(tree);
|
|
117
122
|
|
|
118
|
-
const metadata = {
|
|
123
|
+
const metadata: Metadata = {
|
|
119
124
|
connector: "notion",
|
|
120
125
|
kind: "page",
|
|
121
126
|
pageId,
|
|
122
127
|
url,
|
|
123
128
|
title,
|
|
124
129
|
lastEditedTime,
|
|
125
|
-
}
|
|
130
|
+
};
|
|
126
131
|
|
|
127
132
|
const ingest = buildNotionPageIngestInput({
|
|
128
133
|
pageId,
|
|
129
134
|
content,
|
|
130
135
|
assets,
|
|
131
|
-
metadata
|
|
136
|
+
metadata,
|
|
132
137
|
sourceIdPrefix: args.sourceIdPrefix,
|
|
133
138
|
});
|
|
134
139
|
|
|
@@ -140,10 +145,12 @@ export async function loadNotionPageDocument(args: {
|
|
|
140
145
|
};
|
|
141
146
|
}
|
|
142
147
|
|
|
143
|
-
const isNotFound = (err:
|
|
144
|
-
|
|
148
|
+
const isNotFound = (err: unknown): boolean => {
|
|
149
|
+
if (typeof err !== "object" || err === null) return false;
|
|
150
|
+
const e = err as Record<string, unknown>;
|
|
151
|
+
const status = Number(e.status ?? e.statusCode ?? e.code ?? 0);
|
|
145
152
|
if (status === 404) return true;
|
|
146
|
-
const msg = String(
|
|
153
|
+
const msg = String(e.message ?? "");
|
|
147
154
|
return msg.toLowerCase().includes("could not find");
|
|
148
155
|
};
|
|
149
156
|
|
|
@@ -187,7 +194,7 @@ export async function syncNotionPages(
|
|
|
187
194
|
sourceId: doc.sourceId,
|
|
188
195
|
content: doc.content,
|
|
189
196
|
assets: doc.assets,
|
|
190
|
-
metadata: doc.metadata
|
|
197
|
+
metadata: doc.metadata,
|
|
191
198
|
});
|
|
192
199
|
|
|
193
200
|
succeeded += 1;
|
package/registry/core/assets.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { AssetKind, Chunk } from "./types";
|
|
2
|
+
import { hasAssetMetadata } from "./types";
|
|
2
3
|
|
|
3
4
|
export type ChunkAssetRef = {
|
|
4
5
|
assetId: string;
|
|
@@ -21,26 +22,26 @@ const assetKinds = new Set<AssetKind>(["image", "pdf", "audio", "video", "file"]
|
|
|
21
22
|
export function getChunkAssetRef(
|
|
22
23
|
chunk: Pick<Chunk, "metadata">
|
|
23
24
|
): ChunkAssetRef | null {
|
|
24
|
-
const meta = chunk.metadata
|
|
25
|
-
const kind = meta?.assetKind;
|
|
26
|
-
const id = meta?.assetId;
|
|
25
|
+
const meta = chunk.metadata;
|
|
27
26
|
|
|
28
|
-
if (
|
|
27
|
+
if (!hasAssetMetadata(meta)) {
|
|
29
28
|
return null;
|
|
30
29
|
}
|
|
31
|
-
|
|
30
|
+
|
|
31
|
+
const kind = meta.assetKind;
|
|
32
|
+
if (!assetKinds.has(kind)) {
|
|
32
33
|
return null;
|
|
33
34
|
}
|
|
34
35
|
|
|
35
|
-
const assetUri = typeof meta
|
|
36
|
+
const assetUri = typeof meta.assetUri === "string" ? meta.assetUri : undefined;
|
|
36
37
|
const assetMediaType =
|
|
37
|
-
typeof meta
|
|
38
|
+
typeof meta.assetMediaType === "string" ? meta.assetMediaType : undefined;
|
|
38
39
|
const extractor =
|
|
39
|
-
typeof meta
|
|
40
|
+
typeof meta.extractor === "string" ? meta.extractor : undefined;
|
|
40
41
|
|
|
41
42
|
return {
|
|
42
|
-
assetId:
|
|
43
|
-
assetKind: kind
|
|
43
|
+
assetId: meta.assetId,
|
|
44
|
+
assetKind: kind,
|
|
44
45
|
...(assetUri ? { assetUri } : {}),
|
|
45
46
|
...(assetMediaType ? { assetMediaType } : {}),
|
|
46
47
|
...(extractor ? { extractor } : {}),
|
package/registry/core/config.ts
CHANGED
|
@@ -3,10 +3,11 @@ import type {
|
|
|
3
3
|
ContextEngineConfig,
|
|
4
4
|
ResolvedContextEngineConfig,
|
|
5
5
|
AssetProcessingConfig,
|
|
6
|
-
DeepPartial,
|
|
7
6
|
ContentStorageConfig,
|
|
7
|
+
EmbeddingProcessingConfig,
|
|
8
8
|
} from "./types";
|
|
9
9
|
import { defaultChunker, resolveChunkingOptions } from "./chunking";
|
|
10
|
+
import { mergeDeep } from "./deep-merge";
|
|
10
11
|
|
|
11
12
|
export const defineConfig = (config: ContextEngineConfig): ContextEngineConfig =>
|
|
12
13
|
config;
|
|
@@ -123,30 +124,9 @@ export const defaultContentStorageConfig: ContentStorageConfig = {
|
|
|
123
124
|
storeDocumentContent: true,
|
|
124
125
|
};
|
|
125
126
|
|
|
126
|
-
const
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
): T => {
|
|
130
|
-
if (!overrides) return base;
|
|
131
|
-
const out: any = Array.isArray(base) ? [...base] : { ...base };
|
|
132
|
-
for (const key of Object.keys(overrides) as Array<keyof T>) {
|
|
133
|
-
const nextVal = overrides[key];
|
|
134
|
-
if (nextVal === undefined) continue;
|
|
135
|
-
const baseVal = base[key];
|
|
136
|
-
if (
|
|
137
|
-
baseVal &&
|
|
138
|
-
typeof baseVal === "object" &&
|
|
139
|
-
!Array.isArray(baseVal) &&
|
|
140
|
-
nextVal &&
|
|
141
|
-
typeof nextVal === "object" &&
|
|
142
|
-
!Array.isArray(nextVal)
|
|
143
|
-
) {
|
|
144
|
-
out[key] = mergeDeep(baseVal, nextVal as any);
|
|
145
|
-
} else {
|
|
146
|
-
out[key] = nextVal as any;
|
|
147
|
-
}
|
|
148
|
-
}
|
|
149
|
-
return out as T;
|
|
127
|
+
export const defaultEmbeddingProcessingConfig: EmbeddingProcessingConfig = {
|
|
128
|
+
concurrency: 4,
|
|
129
|
+
batchSize: 32,
|
|
150
130
|
};
|
|
151
131
|
|
|
152
132
|
export const resolveAssetProcessingConfig = (
|
|
@@ -157,6 +137,10 @@ export const resolveContentStorageConfig = (
|
|
|
157
137
|
overrides?: DeepPartial<ContentStorageConfig>
|
|
158
138
|
): ContentStorageConfig => mergeDeep(defaultContentStorageConfig, overrides);
|
|
159
139
|
|
|
140
|
+
export const resolveEmbeddingProcessingConfig = (
|
|
141
|
+
overrides?: DeepPartial<EmbeddingProcessingConfig>
|
|
142
|
+
): EmbeddingProcessingConfig => mergeDeep(defaultEmbeddingProcessingConfig, overrides);
|
|
143
|
+
|
|
160
144
|
export const resolveConfig = (
|
|
161
145
|
config: ContextEngineConfig
|
|
162
146
|
): ResolvedContextEngineConfig => {
|
|
@@ -171,6 +155,7 @@ export const resolveConfig = (
|
|
|
171
155
|
extractors: config.extractors ?? [],
|
|
172
156
|
storage: resolveContentStorageConfig(config.storage),
|
|
173
157
|
assetProcessing: resolveAssetProcessingConfig(config.assetProcessing),
|
|
158
|
+
embeddingProcessing: resolveEmbeddingProcessingConfig(config.embeddingProcessing),
|
|
174
159
|
};
|
|
175
160
|
};
|
|
176
161
|
|
|
@@ -142,6 +142,7 @@ export const defineUnragConfig = <T extends DefineUnragConfigInput>(config: T) =
|
|
|
142
142
|
|
|
143
143
|
const defaults = {
|
|
144
144
|
chunking: config.defaults?.chunking ?? {},
|
|
145
|
+
embedding: config.defaults?.embedding ?? {},
|
|
145
146
|
retrieval: {
|
|
146
147
|
topK: config.defaults?.retrieval?.topK ?? 8,
|
|
147
148
|
},
|
|
@@ -157,6 +158,10 @@ export const defineUnragConfig = <T extends DefineUnragConfigInput>(config: T) =
|
|
|
157
158
|
return defineConfig({
|
|
158
159
|
...(config.engine ?? {}),
|
|
159
160
|
defaults: defaults.chunking,
|
|
161
|
+
embeddingProcessing: {
|
|
162
|
+
...(defaults.embedding ?? {}),
|
|
163
|
+
...(config.engine?.embeddingProcessing ?? {}),
|
|
164
|
+
},
|
|
160
165
|
embedding: getEmbeddingProvider(),
|
|
161
166
|
store: runtime.store,
|
|
162
167
|
extractors,
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import type { DeepPartial } from "./types";
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Type guard to check if a value is a non-null, non-array object (any `typeof "object"` value, so class instances such as Date or Map also pass).
|
|
5
|
+
*/
|
|
6
|
+
export function isRecord(value: unknown): value is Record<string, unknown> {
|
|
7
|
+
return value !== null && typeof value === "object" && !Array.isArray(value);
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Deep merge utility that recursively merges overrides into base.
|
|
12
|
+
* Arrays are replaced, not merged. Undefined values in overrides are skipped; if `overrides` itself is missing, `base` is returned as-is (not copied).
|
|
13
|
+
*/
|
|
14
|
+
export function mergeDeep<T extends Record<string, unknown>>(
|
|
15
|
+
base: T,
|
|
16
|
+
overrides: DeepPartial<T> | undefined
|
|
17
|
+
): T {
|
|
18
|
+
if (!overrides) return base;
|
|
19
|
+
|
|
20
|
+
const out = (Array.isArray(base) ? [...base] : { ...base }) as T;
|
|
21
|
+
|
|
22
|
+
for (const key of Object.keys(overrides) as Array<keyof T>) {
|
|
23
|
+
const nextVal = overrides[key as keyof typeof overrides];
|
|
24
|
+
if (nextVal === undefined) continue;
|
|
25
|
+
|
|
26
|
+
const baseVal = base[key];
|
|
27
|
+
|
|
28
|
+
if (
|
|
29
|
+
isRecord(baseVal) &&
|
|
30
|
+
isRecord(nextVal) &&
|
|
31
|
+
!Array.isArray(baseVal) &&
|
|
32
|
+
!Array.isArray(nextVal)
|
|
33
|
+
) {
|
|
34
|
+
out[key] = mergeDeep(
|
|
35
|
+
baseVal,
|
|
36
|
+
nextVal as DeepPartial<typeof baseVal>
|
|
37
|
+
) as T[keyof T];
|
|
38
|
+
} else {
|
|
39
|
+
out[key] = nextVal as T[keyof T];
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
return out;
|
|
44
|
+
}
|
|
45
|
+
|