unrag 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +611 -174
- package/package.json +12 -6
- package/registry/config/unrag.config.ts +9 -8
- package/registry/connectors/google-drive/_api-types.ts +60 -0
- package/registry/connectors/google-drive/client.ts +99 -38
- package/registry/connectors/google-drive/sync.ts +97 -69
- package/registry/connectors/google-drive/types.ts +76 -37
- package/registry/connectors/notion/client.ts +12 -3
- package/registry/connectors/notion/render.ts +62 -23
- package/registry/connectors/notion/sync.ts +30 -23
- package/registry/core/assets.ts +11 -10
- package/registry/core/config.ts +10 -25
- package/registry/core/context-engine.ts +71 -2
- package/registry/core/deep-merge.ts +45 -0
- package/registry/core/ingest.ts +117 -44
- package/registry/core/types.ts +96 -2
- package/registry/docs/unrag.md +6 -1
- package/registry/embedding/_shared.ts +25 -0
- package/registry/embedding/ai.ts +8 -68
- package/registry/embedding/azure.ts +88 -0
- package/registry/embedding/bedrock.ts +88 -0
- package/registry/embedding/cohere.ts +88 -0
- package/registry/embedding/google.ts +102 -0
- package/registry/embedding/mistral.ts +71 -0
- package/registry/embedding/ollama.ts +90 -0
- package/registry/embedding/openai.ts +88 -0
- package/registry/embedding/openrouter.ts +127 -0
- package/registry/embedding/together.ts +77 -0
- package/registry/embedding/vertex.ts +111 -0
- package/registry/embedding/voyage.ts +169 -0
- package/registry/extractors/audio-transcribe/index.ts +39 -23
- package/registry/extractors/file-docx/index.ts +8 -1
- package/registry/extractors/file-pptx/index.ts +22 -1
- package/registry/extractors/file-xlsx/index.ts +24 -1
- package/registry/extractors/image-caption-llm/index.ts +8 -3
- package/registry/extractors/image-ocr/index.ts +9 -4
- package/registry/extractors/pdf-llm/index.ts +9 -4
- package/registry/extractors/pdf-text-layer/index.ts +23 -2
- package/registry/extractors/video-frames/index.ts +8 -3
- package/registry/extractors/video-transcribe/index.ts +40 -24
- package/registry/manifest.json +346 -0
- package/registry/store/drizzle-postgres-pgvector/store.ts +26 -6
package/package.json
CHANGED
|
@@ -6,18 +6,24 @@
|
|
|
6
6
|
"bin": {
|
|
7
7
|
"unrag": "./dist/cli/index.js"
|
|
8
8
|
},
|
|
9
|
-
"version": "0.2.5",
|
|
9
|
+
"version": "0.2.7",
|
|
10
10
|
"private": false,
|
|
11
11
|
"license": "Apache-2.0",
|
|
12
12
|
"devDependencies": {
|
|
13
|
+
"@ai-sdk/amazon-bedrock": "^3.0.72",
|
|
14
|
+
"@ai-sdk/cohere": "^3.0.1",
|
|
15
|
+
"@ai-sdk/google": "^3.0.1",
|
|
16
|
+
"@ai-sdk/openai": "^3.0.1",
|
|
17
|
+
"@openrouter/sdk": "^0.3.10",
|
|
18
|
+
"@prisma/client": "^6.0.0",
|
|
13
19
|
"@types/bun": "latest",
|
|
14
20
|
"@types/pg": "^8.16.0",
|
|
15
|
-
"
|
|
16
|
-
"prisma": "^6.0.0",
|
|
17
|
-
"drizzle-orm": "^0.45.1",
|
|
21
|
+
"ai": "^6.0.3",
|
|
18
22
|
"drizzle-kit": "^0.31.8",
|
|
19
|
-
"
|
|
20
|
-
"pg": "^8.16.3"
|
|
23
|
+
"drizzle-orm": "^0.45.1",
|
|
24
|
+
"pg": "^8.16.3",
|
|
25
|
+
"prisma": "^6.0.0",
|
|
26
|
+
"voyage-ai-provider": "^3.0.0"
|
|
21
27
|
},
|
|
22
28
|
"dependencies": {
|
|
23
29
|
"@clack/prompts": "^0.11.0",
|
|
@@ -18,19 +18,18 @@
|
|
|
18
18
|
export const unrag = defineUnragConfig({
|
|
19
19
|
defaults: {
|
|
20
20
|
chunking: {
|
|
21
|
-
chunkSize: 200,
|
|
22
|
-
chunkOverlap: 40,
|
|
21
|
+
chunkSize: 200, // __UNRAG_DEFAULT_chunkSize__
|
|
22
|
+
chunkOverlap: 40, // __UNRAG_DEFAULT_chunkOverlap__
|
|
23
23
|
},
|
|
24
24
|
retrieval: {
|
|
25
|
-
topK: 8,
|
|
25
|
+
topK: 8, // __UNRAG_DEFAULT_topK__
|
|
26
26
|
},
|
|
27
27
|
},
|
|
28
28
|
embedding: {
|
|
29
29
|
provider: "ai",
|
|
30
30
|
config: {
|
|
31
|
-
type: "text", // __UNRAG_EMBEDDING_TYPE__
|
|
32
31
|
model: "openai/text-embedding-3-small", // __UNRAG_EMBEDDING_MODEL__
|
|
33
|
-
timeoutMs: 15_000,
|
|
32
|
+
timeoutMs: 15_000, // __UNRAG_EMBEDDING_TIMEOUT__
|
|
34
33
|
},
|
|
35
34
|
},
|
|
36
35
|
engine: {
|
|
@@ -41,8 +40,8 @@ export const unrag = defineUnragConfig({
|
|
|
41
40
|
* - storeDocumentContent: whether the full original document text is stored in `documents.content`.
|
|
42
41
|
*/
|
|
43
42
|
storage: {
|
|
44
|
-
storeChunkContent: true,
|
|
45
|
-
storeDocumentContent: true,
|
|
43
|
+
storeChunkContent: true, // __UNRAG_STORAGE_storeChunkContent__
|
|
44
|
+
storeDocumentContent: true, // __UNRAG_STORAGE_storeDocumentContent__
|
|
46
45
|
},
|
|
47
46
|
/**
|
|
48
47
|
* Optional extractor modules that can process non-text assets into text outputs.
|
|
@@ -62,9 +61,10 @@ export const unrag = defineUnragConfig({
|
|
|
62
61
|
*
|
|
63
62
|
* Notes:
|
|
64
63
|
* - This generated config is cost-safe by default (all extraction is off).
|
|
65
|
-
* - `unrag init` can enable rich media
|
|
64
|
+
* - `unrag init --rich-media` can enable rich media ingestion for you (extractors + assetProcessing flags).
|
|
66
65
|
* - Tighten fetch allowlists/limits in production if you ingest URL-based assets.
|
|
67
66
|
*/
|
|
67
|
+
// __UNRAG_ASSET_PROCESSING_BLOCK_START__
|
|
68
68
|
assetProcessing: {
|
|
69
69
|
onUnsupportedAsset: "skip",
|
|
70
70
|
onError: "skip",
|
|
@@ -181,6 +181,7 @@ export const unrag = defineUnragConfig({
|
|
|
181
181
|
},
|
|
182
182
|
},
|
|
183
183
|
},
|
|
184
|
+
// __UNRAG_ASSET_PROCESSING_BLOCK_END__
|
|
184
185
|
},
|
|
185
186
|
} as const);
|
|
186
187
|
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structural types for the Google Drive API.
|
|
3
|
+
*
|
|
4
|
+
* These are minimal interfaces that match the googleapis API structure,
|
|
5
|
+
* allowing the connector to work without depending on googleapis types at compile time.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
export interface DriveFile {
|
|
9
|
+
id?: string | null;
|
|
10
|
+
name?: string | null;
|
|
11
|
+
mimeType?: string | null;
|
|
12
|
+
size?: string | null;
|
|
13
|
+
webViewLink?: string | null;
|
|
14
|
+
modifiedTime?: string | null;
|
|
15
|
+
parents?: string[] | null;
|
|
16
|
+
shortcutDetails?: {
|
|
17
|
+
targetId?: string | null;
|
|
18
|
+
targetMimeType?: string | null;
|
|
19
|
+
} | null;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
export interface DriveFileList {
|
|
23
|
+
files?: DriveFile[];
|
|
24
|
+
nextPageToken?: string | null;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface DriveFilesResource {
|
|
28
|
+
get(params: {
|
|
29
|
+
fileId: string;
|
|
30
|
+
fields?: string;
|
|
31
|
+
alt?: string;
|
|
32
|
+
supportsAllDrives?: boolean;
|
|
33
|
+
}): Promise<{ data: DriveFile | ArrayBuffer | string }>;
|
|
34
|
+
|
|
35
|
+
list(params: {
|
|
36
|
+
q?: string;
|
|
37
|
+
fields?: string;
|
|
38
|
+
pageToken?: string;
|
|
39
|
+
pageSize?: number;
|
|
40
|
+
supportsAllDrives?: boolean;
|
|
41
|
+
includeItemsFromAllDrives?: boolean;
|
|
42
|
+
}): Promise<{ data: DriveFileList }>;
|
|
43
|
+
|
|
44
|
+
export(params: {
|
|
45
|
+
fileId: string;
|
|
46
|
+
mimeType: string;
|
|
47
|
+
}): Promise<{ data: ArrayBuffer | string }>;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
export interface DriveClient {
|
|
51
|
+
files: DriveFilesResource;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* Auth client interface - minimal subset used by the connector.
|
|
56
|
+
*/
|
|
57
|
+
export interface AuthClient {
|
|
58
|
+
getAccessToken?(): Promise<{ token?: string | null }>;
|
|
59
|
+
}
|
|
60
|
+
|
|
@@ -1,4 +1,11 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type { DriveClient, AuthClient } from "./_api-types";
|
|
2
|
+
import type {
|
|
3
|
+
GoogleDriveAuth,
|
|
4
|
+
GoogleDriveOAuthAuth,
|
|
5
|
+
GoogleDriveServiceAccountAuth,
|
|
6
|
+
GoogleDriveGoogleAuthAuth,
|
|
7
|
+
ServiceAccountCredentials,
|
|
8
|
+
} from "./types";
|
|
2
9
|
|
|
3
10
|
export const DEFAULT_DRIVE_SCOPES = [
|
|
4
11
|
"https://www.googleapis.com/auth/drive.readonly",
|
|
@@ -17,36 +24,51 @@ type NormalizedAuth =
|
|
|
17
24
|
}
|
|
18
25
|
| {
|
|
19
26
|
kind: "service_account";
|
|
20
|
-
credentials:
|
|
27
|
+
credentials: ServiceAccountCredentials;
|
|
21
28
|
subject?: string;
|
|
22
29
|
}
|
|
23
30
|
| { kind: "google_auth"; auth: unknown };
|
|
24
31
|
|
|
32
|
+
/**
|
|
33
|
+
* Type guard for service account auth.
|
|
34
|
+
*/
|
|
35
|
+
function isServiceAccountAuth(auth: GoogleDriveAuth): auth is GoogleDriveServiceAccountAuth {
|
|
36
|
+
return auth.kind === "service_account";
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* Type guard for google auth.
|
|
41
|
+
*/
|
|
42
|
+
function isGoogleAuth(auth: GoogleDriveAuth): auth is GoogleDriveGoogleAuthAuth {
|
|
43
|
+
return auth.kind === "google_auth";
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* Type guard for oauth.
|
|
48
|
+
*/
|
|
49
|
+
function isOAuthAuth(auth: GoogleDriveAuth): auth is GoogleDriveOAuthAuth {
|
|
50
|
+
return auth.kind === "oauth";
|
|
51
|
+
}
|
|
52
|
+
|
|
25
53
|
export function normalizeGoogleDriveAuth(auth: GoogleDriveAuth): NormalizedAuth {
|
|
26
54
|
if (!auth || typeof auth !== "object") {
|
|
27
55
|
throw new Error("Google Drive auth is required");
|
|
28
56
|
}
|
|
29
57
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
if (kind === "google_auth") {
|
|
36
|
-
const a = (auth as any).auth;
|
|
37
|
-
if (!a) throw new Error('Google Drive auth.kind="google_auth" requires auth');
|
|
38
|
-
return { kind: "google_auth", auth: a };
|
|
58
|
+
if (isGoogleAuth(auth)) {
|
|
59
|
+
if (!auth.auth) throw new Error('Google Drive auth.kind="google_auth" requires auth');
|
|
60
|
+
return { kind: "google_auth", auth: auth.auth };
|
|
39
61
|
}
|
|
40
62
|
|
|
41
|
-
if (
|
|
42
|
-
const raw =
|
|
63
|
+
if (isServiceAccountAuth(auth)) {
|
|
64
|
+
const raw = auth.credentialsJson;
|
|
43
65
|
if (!raw) {
|
|
44
66
|
throw new Error(
|
|
45
67
|
'Google Drive auth.kind="service_account" requires credentialsJson'
|
|
46
68
|
);
|
|
47
69
|
}
|
|
48
|
-
const credentials =
|
|
49
|
-
typeof raw === "string" ? (JSON.parse(raw) as
|
|
70
|
+
const credentials: ServiceAccountCredentials =
|
|
71
|
+
typeof raw === "string" ? (JSON.parse(raw) as ServiceAccountCredentials) : raw;
|
|
50
72
|
if (!credentials?.client_email || !credentials?.private_key) {
|
|
51
73
|
throw new Error(
|
|
52
74
|
'Google Drive service account credentials must include "client_email" and "private_key".'
|
|
@@ -55,29 +77,33 @@ export function normalizeGoogleDriveAuth(auth: GoogleDriveAuth): NormalizedAuth
|
|
|
55
77
|
return {
|
|
56
78
|
kind: "service_account",
|
|
57
79
|
credentials,
|
|
58
|
-
subject:
|
|
80
|
+
subject: auth.subject ? String(auth.subject) : undefined,
|
|
59
81
|
};
|
|
60
82
|
}
|
|
61
83
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
84
|
+
if (isOAuthAuth(auth)) {
|
|
85
|
+
// oauth
|
|
86
|
+
if (auth.oauthClient) {
|
|
87
|
+
return { kind: "oauth_client", oauthClient: auth.oauthClient };
|
|
88
|
+
}
|
|
66
89
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
90
|
+
const { clientId, clientSecret, redirectUri, refreshToken, accessToken } = auth;
|
|
91
|
+
if (!clientId || !clientSecret || !redirectUri || !refreshToken) {
|
|
92
|
+
throw new Error(
|
|
93
|
+
'Google Drive auth.kind="oauth" requires either oauthClient or { clientId, clientSecret, redirectUri, refreshToken }'
|
|
94
|
+
);
|
|
95
|
+
}
|
|
96
|
+
return {
|
|
97
|
+
kind: "oauth_config",
|
|
98
|
+
clientId: String(clientId),
|
|
99
|
+
clientSecret: String(clientSecret),
|
|
100
|
+
redirectUri: String(redirectUri),
|
|
101
|
+
refreshToken: String(refreshToken),
|
|
102
|
+
...(accessToken ? { accessToken: String(accessToken) } : {}),
|
|
103
|
+
};
|
|
72
104
|
}
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
clientId: String(clientId),
|
|
76
|
-
clientSecret: String(clientSecret),
|
|
77
|
-
redirectUri: String(redirectUri),
|
|
78
|
-
refreshToken: String(refreshToken),
|
|
79
|
-
...(accessToken ? { accessToken: String(accessToken) } : {}),
|
|
80
|
-
};
|
|
105
|
+
|
|
106
|
+
throw new Error(`Unknown Google Drive auth kind: ${String((auth as Record<string, unknown>).kind)}`);
|
|
81
107
|
}
|
|
82
108
|
|
|
83
109
|
const asMessage = (err: unknown) => {
|
|
@@ -89,6 +115,41 @@ const asMessage = (err: unknown) => {
|
|
|
89
115
|
}
|
|
90
116
|
};
|
|
91
117
|
|
|
118
|
+
/**
|
|
119
|
+
* Google Auth Library module shape for dynamic import.
|
|
120
|
+
*/
|
|
121
|
+
interface GoogleAuthLibraryModule {
|
|
122
|
+
OAuth2Client?: new (
|
|
123
|
+
clientId: string,
|
|
124
|
+
clientSecret: string,
|
|
125
|
+
redirectUri: string
|
|
126
|
+
) => {
|
|
127
|
+
setCredentials(credentials: Record<string, string>): void;
|
|
128
|
+
};
|
|
129
|
+
OAuth2?: new (
|
|
130
|
+
clientId: string,
|
|
131
|
+
clientSecret: string,
|
|
132
|
+
redirectUri: string
|
|
133
|
+
) => {
|
|
134
|
+
setCredentials(credentials: Record<string, string>): void;
|
|
135
|
+
};
|
|
136
|
+
JWT?: new (options: {
|
|
137
|
+
email: string;
|
|
138
|
+
key: string;
|
|
139
|
+
scopes: string[];
|
|
140
|
+
subject?: string;
|
|
141
|
+
}) => unknown;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/**
|
|
145
|
+
* Googleapis module shape for dynamic import.
|
|
146
|
+
*/
|
|
147
|
+
interface GoogleApisModule {
|
|
148
|
+
google: {
|
|
149
|
+
drive(options: { version: string; auth: unknown }): DriveClient;
|
|
150
|
+
};
|
|
151
|
+
}
|
|
152
|
+
|
|
92
153
|
/**
|
|
93
154
|
* Creates a Google Drive API client from a plug-and-play auth input.
|
|
94
155
|
*
|
|
@@ -98,11 +159,11 @@ const asMessage = (err: unknown) => {
|
|
|
98
159
|
export async function createGoogleDriveClient(args: {
|
|
99
160
|
auth: GoogleDriveAuth;
|
|
100
161
|
scopes?: string[];
|
|
101
|
-
}): Promise<{ drive:
|
|
162
|
+
}): Promise<{ drive: DriveClient; authClient: AuthClient }> {
|
|
102
163
|
const normalized = normalizeGoogleDriveAuth(args.auth);
|
|
103
164
|
const scopes = (args.scopes?.length ? args.scopes : DEFAULT_DRIVE_SCOPES) as string[];
|
|
104
165
|
|
|
105
|
-
let authClient:
|
|
166
|
+
let authClient: unknown;
|
|
106
167
|
|
|
107
168
|
try {
|
|
108
169
|
if (normalized.kind === "oauth_client") {
|
|
@@ -111,7 +172,7 @@ export async function createGoogleDriveClient(args: {
|
|
|
111
172
|
authClient = normalized.auth;
|
|
112
173
|
} else {
|
|
113
174
|
// google-auth-library (dynamic)
|
|
114
|
-
const gal
|
|
175
|
+
const gal = (await import("google-auth-library")) as GoogleAuthLibraryModule;
|
|
115
176
|
|
|
116
177
|
if (normalized.kind === "oauth_config") {
|
|
117
178
|
const OAuth2Client = gal.OAuth2Client ?? gal.OAuth2;
|
|
@@ -143,7 +204,7 @@ export async function createGoogleDriveClient(args: {
|
|
|
143
204
|
}
|
|
144
205
|
}
|
|
145
206
|
|
|
146
|
-
const { google }
|
|
207
|
+
const { google } = (await import("googleapis")) as GoogleApisModule;
|
|
147
208
|
if (!google?.drive) {
|
|
148
209
|
throw new Error("googleapis.google.drive not found");
|
|
149
210
|
}
|
|
@@ -153,7 +214,7 @@ export async function createGoogleDriveClient(args: {
|
|
|
153
214
|
auth: authClient,
|
|
154
215
|
});
|
|
155
216
|
|
|
156
|
-
return { drive, authClient };
|
|
217
|
+
return { drive, authClient: authClient as AuthClient };
|
|
157
218
|
} catch (err) {
|
|
158
219
|
const msg = asMessage(err);
|
|
159
220
|
if (
|