unrag 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/registry/connectors/google-drive/_api-types.ts +60 -0
- package/registry/connectors/google-drive/client.ts +99 -38
- package/registry/connectors/google-drive/sync.ts +97 -69
- package/registry/connectors/google-drive/types.ts +76 -37
- package/registry/connectors/notion/client.ts +12 -3
- package/registry/connectors/notion/render.ts +62 -23
- package/registry/connectors/notion/sync.ts +30 -23
- package/registry/core/assets.ts +11 -10
- package/registry/core/config.ts +10 -25
- package/registry/core/context-engine.ts +5 -0
- package/registry/core/deep-merge.ts +45 -0
- package/registry/core/ingest.ts +117 -44
- package/registry/core/types.ts +52 -0
- package/registry/embedding/_shared.ts +6 -1
- package/registry/embedding/ai.ts +2 -3
- package/registry/embedding/azure.ts +11 -2
- package/registry/embedding/bedrock.ts +11 -2
- package/registry/embedding/cohere.ts +11 -2
- package/registry/embedding/google.ts +11 -2
- package/registry/embedding/mistral.ts +11 -2
- package/registry/embedding/ollama.ts +18 -3
- package/registry/embedding/openai.ts +11 -2
- package/registry/embedding/openrouter.ts +53 -11
- package/registry/embedding/together.ts +15 -5
- package/registry/embedding/vertex.ts +11 -2
- package/registry/embedding/voyage.ts +16 -6
- package/registry/extractors/audio-transcribe/index.ts +39 -23
- package/registry/extractors/file-docx/index.ts +8 -1
- package/registry/extractors/file-pptx/index.ts +22 -1
- package/registry/extractors/file-xlsx/index.ts +24 -1
- package/registry/extractors/image-caption-llm/index.ts +8 -3
- package/registry/extractors/image-ocr/index.ts +9 -4
- package/registry/extractors/pdf-llm/index.ts +9 -4
- package/registry/extractors/pdf-text-layer/index.ts +23 -2
- package/registry/extractors/video-frames/index.ts +8 -3
- package/registry/extractors/video-transcribe/index.ts +40 -24
- package/registry/manifest.json +6 -6
- package/registry/store/drizzle-postgres-pgvector/store.ts +24 -7
package/package.json
CHANGED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structural types for the Google Drive API.
|
|
3
|
+
*
|
|
4
|
+
* These are minimal interfaces that match the googleapis API structure,
|
|
5
|
+
* allowing the connector to work without depending on googleapis types at compile time.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
export interface DriveFile {
|
|
9
|
+
id?: string | null;
|
|
10
|
+
name?: string | null;
|
|
11
|
+
mimeType?: string | null;
|
|
12
|
+
size?: string | null;
|
|
13
|
+
webViewLink?: string | null;
|
|
14
|
+
modifiedTime?: string | null;
|
|
15
|
+
parents?: string[] | null;
|
|
16
|
+
shortcutDetails?: {
|
|
17
|
+
targetId?: string | null;
|
|
18
|
+
targetMimeType?: string | null;
|
|
19
|
+
} | null;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
export interface DriveFileList {
|
|
23
|
+
files?: DriveFile[];
|
|
24
|
+
nextPageToken?: string | null;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface DriveFilesResource {
|
|
28
|
+
get(params: {
|
|
29
|
+
fileId: string;
|
|
30
|
+
fields?: string;
|
|
31
|
+
alt?: string;
|
|
32
|
+
supportsAllDrives?: boolean;
|
|
33
|
+
}): Promise<{ data: DriveFile | ArrayBuffer | string }>;
|
|
34
|
+
|
|
35
|
+
list(params: {
|
|
36
|
+
q?: string;
|
|
37
|
+
fields?: string;
|
|
38
|
+
pageToken?: string;
|
|
39
|
+
pageSize?: number;
|
|
40
|
+
supportsAllDrives?: boolean;
|
|
41
|
+
includeItemsFromAllDrives?: boolean;
|
|
42
|
+
}): Promise<{ data: DriveFileList }>;
|
|
43
|
+
|
|
44
|
+
export(params: {
|
|
45
|
+
fileId: string;
|
|
46
|
+
mimeType: string;
|
|
47
|
+
}): Promise<{ data: ArrayBuffer | string }>;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
export interface DriveClient {
|
|
51
|
+
files: DriveFilesResource;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* Auth client interface - minimal subset used by the connector.
|
|
56
|
+
*/
|
|
57
|
+
export interface AuthClient {
|
|
58
|
+
getAccessToken?(): Promise<{ token?: string | null }>;
|
|
59
|
+
}
|
|
60
|
+
|
|
@@ -1,4 +1,11 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type { DriveClient, AuthClient } from "./_api-types";
|
|
2
|
+
import type {
|
|
3
|
+
GoogleDriveAuth,
|
|
4
|
+
GoogleDriveOAuthAuth,
|
|
5
|
+
GoogleDriveServiceAccountAuth,
|
|
6
|
+
GoogleDriveGoogleAuthAuth,
|
|
7
|
+
ServiceAccountCredentials,
|
|
8
|
+
} from "./types";
|
|
2
9
|
|
|
3
10
|
export const DEFAULT_DRIVE_SCOPES = [
|
|
4
11
|
"https://www.googleapis.com/auth/drive.readonly",
|
|
@@ -17,36 +24,51 @@ type NormalizedAuth =
|
|
|
17
24
|
}
|
|
18
25
|
| {
|
|
19
26
|
kind: "service_account";
|
|
20
|
-
credentials:
|
|
27
|
+
credentials: ServiceAccountCredentials;
|
|
21
28
|
subject?: string;
|
|
22
29
|
}
|
|
23
30
|
| { kind: "google_auth"; auth: unknown };
|
|
24
31
|
|
|
32
|
+
/**
|
|
33
|
+
* Type guard for service account auth.
|
|
34
|
+
*/
|
|
35
|
+
function isServiceAccountAuth(auth: GoogleDriveAuth): auth is GoogleDriveServiceAccountAuth {
|
|
36
|
+
return auth.kind === "service_account";
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* Type guard for google auth.
|
|
41
|
+
*/
|
|
42
|
+
function isGoogleAuth(auth: GoogleDriveAuth): auth is GoogleDriveGoogleAuthAuth {
|
|
43
|
+
return auth.kind === "google_auth";
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* Type guard for oauth.
|
|
48
|
+
*/
|
|
49
|
+
function isOAuthAuth(auth: GoogleDriveAuth): auth is GoogleDriveOAuthAuth {
|
|
50
|
+
return auth.kind === "oauth";
|
|
51
|
+
}
|
|
52
|
+
|
|
25
53
|
export function normalizeGoogleDriveAuth(auth: GoogleDriveAuth): NormalizedAuth {
|
|
26
54
|
if (!auth || typeof auth !== "object") {
|
|
27
55
|
throw new Error("Google Drive auth is required");
|
|
28
56
|
}
|
|
29
57
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
if (kind === "google_auth") {
|
|
36
|
-
const a = (auth as any).auth;
|
|
37
|
-
if (!a) throw new Error('Google Drive auth.kind="google_auth" requires auth');
|
|
38
|
-
return { kind: "google_auth", auth: a };
|
|
58
|
+
if (isGoogleAuth(auth)) {
|
|
59
|
+
if (!auth.auth) throw new Error('Google Drive auth.kind="google_auth" requires auth');
|
|
60
|
+
return { kind: "google_auth", auth: auth.auth };
|
|
39
61
|
}
|
|
40
62
|
|
|
41
|
-
if (
|
|
42
|
-
const raw =
|
|
63
|
+
if (isServiceAccountAuth(auth)) {
|
|
64
|
+
const raw = auth.credentialsJson;
|
|
43
65
|
if (!raw) {
|
|
44
66
|
throw new Error(
|
|
45
67
|
'Google Drive auth.kind="service_account" requires credentialsJson'
|
|
46
68
|
);
|
|
47
69
|
}
|
|
48
|
-
const credentials =
|
|
49
|
-
typeof raw === "string" ? (JSON.parse(raw) as
|
|
70
|
+
const credentials: ServiceAccountCredentials =
|
|
71
|
+
typeof raw === "string" ? (JSON.parse(raw) as ServiceAccountCredentials) : raw;
|
|
50
72
|
if (!credentials?.client_email || !credentials?.private_key) {
|
|
51
73
|
throw new Error(
|
|
52
74
|
'Google Drive service account credentials must include "client_email" and "private_key".'
|
|
@@ -55,29 +77,33 @@ export function normalizeGoogleDriveAuth(auth: GoogleDriveAuth): NormalizedAuth
|
|
|
55
77
|
return {
|
|
56
78
|
kind: "service_account",
|
|
57
79
|
credentials,
|
|
58
|
-
subject:
|
|
80
|
+
subject: auth.subject ? String(auth.subject) : undefined,
|
|
59
81
|
};
|
|
60
82
|
}
|
|
61
83
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
84
|
+
if (isOAuthAuth(auth)) {
|
|
85
|
+
// oauth
|
|
86
|
+
if (auth.oauthClient) {
|
|
87
|
+
return { kind: "oauth_client", oauthClient: auth.oauthClient };
|
|
88
|
+
}
|
|
66
89
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
90
|
+
const { clientId, clientSecret, redirectUri, refreshToken, accessToken } = auth;
|
|
91
|
+
if (!clientId || !clientSecret || !redirectUri || !refreshToken) {
|
|
92
|
+
throw new Error(
|
|
93
|
+
'Google Drive auth.kind="oauth" requires either oauthClient or { clientId, clientSecret, redirectUri, refreshToken }'
|
|
94
|
+
);
|
|
95
|
+
}
|
|
96
|
+
return {
|
|
97
|
+
kind: "oauth_config",
|
|
98
|
+
clientId: String(clientId),
|
|
99
|
+
clientSecret: String(clientSecret),
|
|
100
|
+
redirectUri: String(redirectUri),
|
|
101
|
+
refreshToken: String(refreshToken),
|
|
102
|
+
...(accessToken ? { accessToken: String(accessToken) } : {}),
|
|
103
|
+
};
|
|
72
104
|
}
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
clientId: String(clientId),
|
|
76
|
-
clientSecret: String(clientSecret),
|
|
77
|
-
redirectUri: String(redirectUri),
|
|
78
|
-
refreshToken: String(refreshToken),
|
|
79
|
-
...(accessToken ? { accessToken: String(accessToken) } : {}),
|
|
80
|
-
};
|
|
105
|
+
|
|
106
|
+
throw new Error(`Unknown Google Drive auth kind: ${String((auth as Record<string, unknown>).kind)}`);
|
|
81
107
|
}
|
|
82
108
|
|
|
83
109
|
const asMessage = (err: unknown) => {
|
|
@@ -89,6 +115,41 @@ const asMessage = (err: unknown) => {
|
|
|
89
115
|
}
|
|
90
116
|
};
|
|
91
117
|
|
|
118
|
+
/**
|
|
119
|
+
* Google Auth Library module shape for dynamic import.
|
|
120
|
+
*/
|
|
121
|
+
interface GoogleAuthLibraryModule {
|
|
122
|
+
OAuth2Client?: new (
|
|
123
|
+
clientId: string,
|
|
124
|
+
clientSecret: string,
|
|
125
|
+
redirectUri: string
|
|
126
|
+
) => {
|
|
127
|
+
setCredentials(credentials: Record<string, string>): void;
|
|
128
|
+
};
|
|
129
|
+
OAuth2?: new (
|
|
130
|
+
clientId: string,
|
|
131
|
+
clientSecret: string,
|
|
132
|
+
redirectUri: string
|
|
133
|
+
) => {
|
|
134
|
+
setCredentials(credentials: Record<string, string>): void;
|
|
135
|
+
};
|
|
136
|
+
JWT?: new (options: {
|
|
137
|
+
email: string;
|
|
138
|
+
key: string;
|
|
139
|
+
scopes: string[];
|
|
140
|
+
subject?: string;
|
|
141
|
+
}) => unknown;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/**
|
|
145
|
+
* Googleapis module shape for dynamic import.
|
|
146
|
+
*/
|
|
147
|
+
interface GoogleApisModule {
|
|
148
|
+
google: {
|
|
149
|
+
drive(options: { version: string; auth: unknown }): DriveClient;
|
|
150
|
+
};
|
|
151
|
+
}
|
|
152
|
+
|
|
92
153
|
/**
|
|
93
154
|
* Creates a Google Drive API client from a plug-and-play auth input.
|
|
94
155
|
*
|
|
@@ -98,11 +159,11 @@ const asMessage = (err: unknown) => {
|
|
|
98
159
|
export async function createGoogleDriveClient(args: {
|
|
99
160
|
auth: GoogleDriveAuth;
|
|
100
161
|
scopes?: string[];
|
|
101
|
-
}): Promise<{ drive:
|
|
162
|
+
}): Promise<{ drive: DriveClient; authClient: AuthClient }> {
|
|
102
163
|
const normalized = normalizeGoogleDriveAuth(args.auth);
|
|
103
164
|
const scopes = (args.scopes?.length ? args.scopes : DEFAULT_DRIVE_SCOPES) as string[];
|
|
104
165
|
|
|
105
|
-
let authClient:
|
|
166
|
+
let authClient: unknown;
|
|
106
167
|
|
|
107
168
|
try {
|
|
108
169
|
if (normalized.kind === "oauth_client") {
|
|
@@ -111,7 +172,7 @@ export async function createGoogleDriveClient(args: {
|
|
|
111
172
|
authClient = normalized.auth;
|
|
112
173
|
} else {
|
|
113
174
|
// google-auth-library (dynamic)
|
|
114
|
-
const gal
|
|
175
|
+
const gal = (await import("google-auth-library")) as GoogleAuthLibraryModule;
|
|
115
176
|
|
|
116
177
|
if (normalized.kind === "oauth_config") {
|
|
117
178
|
const OAuth2Client = gal.OAuth2Client ?? gal.OAuth2;
|
|
@@ -143,7 +204,7 @@ export async function createGoogleDriveClient(args: {
|
|
|
143
204
|
}
|
|
144
205
|
}
|
|
145
206
|
|
|
146
|
-
const { google }
|
|
207
|
+
const { google } = (await import("googleapis")) as GoogleApisModule;
|
|
147
208
|
if (!google?.drive) {
|
|
148
209
|
throw new Error("googleapis.google.drive not found");
|
|
149
210
|
}
|
|
@@ -153,7 +214,7 @@ export async function createGoogleDriveClient(args: {
|
|
|
153
214
|
auth: authClient,
|
|
154
215
|
});
|
|
155
216
|
|
|
156
|
-
return { drive, authClient };
|
|
217
|
+
return { drive, authClient: authClient as AuthClient };
|
|
157
218
|
} catch (err) {
|
|
158
219
|
const msg = asMessage(err);
|
|
159
220
|
if (
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import type { IngestResult } from "../../core";
|
|
1
|
+
import type { IngestResult, Metadata } from "../../core";
|
|
2
2
|
import type { AssetInput } from "../../core/types";
|
|
3
|
+
import type { DriveClient, DriveFile } from "./_api-types";
|
|
3
4
|
import { createGoogleDriveClient } from "./client";
|
|
4
5
|
import {
|
|
5
6
|
assetKindFromMediaType,
|
|
@@ -17,6 +18,25 @@ import type {
|
|
|
17
18
|
|
|
18
19
|
const DEFAULT_MAX_BYTES = 15 * 1024 * 1024; // 15MB
|
|
19
20
|
|
|
21
|
+
/**
|
|
22
|
+
* Internal metadata type for Google Drive documents.
|
|
23
|
+
*/
|
|
24
|
+
interface GoogleDriveMetadata extends Metadata {
|
|
25
|
+
connector: "google-drive";
|
|
26
|
+
kind: "file" | "folder" | "shortcut";
|
|
27
|
+
fileId: string;
|
|
28
|
+
name?: string;
|
|
29
|
+
mimeType?: string;
|
|
30
|
+
size?: number;
|
|
31
|
+
googleNativeKind?: string;
|
|
32
|
+
unsupportedGoogleMime?: boolean;
|
|
33
|
+
skippedTooLarge?: boolean;
|
|
34
|
+
exportedTooLarge?: boolean;
|
|
35
|
+
shortcutUnresolved?: boolean;
|
|
36
|
+
exportMimeType?: string;
|
|
37
|
+
exportFallback?: string;
|
|
38
|
+
}
|
|
39
|
+
|
|
20
40
|
const joinPrefix = (prefix: string | undefined, rest: string) => {
|
|
21
41
|
const p = (prefix ?? "").trim();
|
|
22
42
|
if (!p) return rest;
|
|
@@ -44,10 +64,10 @@ const asMessage = (err: unknown) => {
|
|
|
44
64
|
}
|
|
45
65
|
};
|
|
46
66
|
|
|
47
|
-
const toUint8Array = (data:
|
|
67
|
+
const toUint8Array = (data: unknown): Uint8Array => {
|
|
48
68
|
if (!data) return new Uint8Array();
|
|
49
69
|
if (data instanceof Uint8Array) return data;
|
|
50
|
-
if (typeof Buffer !== "undefined" && data
|
|
70
|
+
if (typeof Buffer !== "undefined" && Buffer.isBuffer(data)) {
|
|
51
71
|
return new Uint8Array(data);
|
|
52
72
|
}
|
|
53
73
|
if (data instanceof ArrayBuffer) return new Uint8Array(data);
|
|
@@ -61,51 +81,54 @@ const toUint8Array = (data: any): Uint8Array => {
|
|
|
61
81
|
return new Uint8Array();
|
|
62
82
|
};
|
|
63
83
|
|
|
64
|
-
const bytesToText = (bytes: Uint8Array) => {
|
|
84
|
+
const bytesToText = (bytes: Uint8Array): string => {
|
|
65
85
|
return new TextDecoder("utf-8", { fatal: false }).decode(bytes);
|
|
66
86
|
};
|
|
67
87
|
|
|
68
|
-
const isNotFound = (err:
|
|
88
|
+
const isNotFound = (err: unknown, treatForbiddenAsNotFound: boolean): boolean => {
|
|
89
|
+
if (typeof err !== "object" || err === null) return false;
|
|
90
|
+
const e = err as Record<string, unknown>;
|
|
91
|
+
const response = e.response as Record<string, unknown> | undefined;
|
|
69
92
|
const status =
|
|
70
|
-
Number(
|
|
71
|
-
Number(err?.response?.status);
|
|
93
|
+
Number(e.code ?? e.status ?? response?.status ?? e.statusCode ?? 0);
|
|
72
94
|
if (status === 404) return true;
|
|
73
95
|
if (treatForbiddenAsNotFound && status === 403) return true;
|
|
74
96
|
return false;
|
|
75
97
|
};
|
|
76
98
|
|
|
77
|
-
async function getFileMetadata(drive:
|
|
99
|
+
async function getFileMetadata(drive: DriveClient, fileId: string): Promise<DriveFile> {
|
|
78
100
|
const res = await drive.files.get({
|
|
79
101
|
fileId,
|
|
80
102
|
supportsAllDrives: true,
|
|
81
103
|
fields:
|
|
82
104
|
"id,name,mimeType,size,md5Checksum,modifiedTime,webViewLink,webContentLink,iconLink,shortcutDetails,driveId",
|
|
83
105
|
});
|
|
84
|
-
return res?.data ?? {};
|
|
106
|
+
return (res?.data ?? {}) as DriveFile;
|
|
85
107
|
}
|
|
86
108
|
|
|
87
|
-
async function downloadFileBytes(drive:
|
|
88
|
-
const res = await drive.files.get(
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
109
|
+
async function downloadFileBytes(drive: DriveClient, fileId: string): Promise<Uint8Array> {
|
|
110
|
+
const res = await drive.files.get({
|
|
111
|
+
fileId,
|
|
112
|
+
alt: "media",
|
|
113
|
+
supportsAllDrives: true,
|
|
114
|
+
});
|
|
92
115
|
return toUint8Array(res?.data);
|
|
93
116
|
}
|
|
94
117
|
|
|
95
118
|
async function exportFileBytes(
|
|
96
|
-
drive:
|
|
119
|
+
drive: DriveClient,
|
|
97
120
|
fileId: string,
|
|
98
121
|
mimeType: string
|
|
99
122
|
): Promise<Uint8Array> {
|
|
100
|
-
const res = await drive.files.export(
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
);
|
|
123
|
+
const res = await drive.files.export({
|
|
124
|
+
fileId,
|
|
125
|
+
mimeType,
|
|
126
|
+
});
|
|
104
127
|
return toUint8Array(res?.data);
|
|
105
128
|
}
|
|
106
129
|
|
|
107
130
|
export async function loadGoogleDriveFileDocument(args: {
|
|
108
|
-
drive:
|
|
131
|
+
drive: DriveClient;
|
|
109
132
|
fileId: string;
|
|
110
133
|
sourceIdPrefix?: string;
|
|
111
134
|
options?: {
|
|
@@ -128,21 +151,22 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
128
151
|
|
|
129
152
|
// Handle folders: return a document shape but with no content/assets; callers typically skip.
|
|
130
153
|
if (classification.kind === "folder") {
|
|
154
|
+
const folderMetadata: GoogleDriveMetadata = {
|
|
155
|
+
connector: "google-drive",
|
|
156
|
+
kind: "folder",
|
|
157
|
+
fileId,
|
|
158
|
+
name,
|
|
159
|
+
mimeType: DRIVE_MIME.folder,
|
|
160
|
+
...(meta?.webViewLink ? { webViewLink: String(meta.webViewLink) } : {}),
|
|
161
|
+
...(meta?.modifiedTime ? { modifiedTime: String(meta.modifiedTime) } : {}),
|
|
162
|
+
};
|
|
131
163
|
return buildGoogleDriveFileIngestInput({
|
|
132
164
|
fileId,
|
|
133
165
|
sourceIdPrefix: args.sourceIdPrefix,
|
|
134
166
|
content: "",
|
|
135
167
|
assets: [],
|
|
136
|
-
metadata:
|
|
137
|
-
|
|
138
|
-
kind: "folder",
|
|
139
|
-
fileId,
|
|
140
|
-
name,
|
|
141
|
-
mimeType: DRIVE_MIME.folder,
|
|
142
|
-
...(meta?.webViewLink ? { webViewLink: String(meta.webViewLink) } : {}),
|
|
143
|
-
...(meta?.modifiedTime ? { modifiedTime: String(meta.modifiedTime) } : {}),
|
|
144
|
-
},
|
|
145
|
-
}) as any;
|
|
168
|
+
metadata: folderMetadata,
|
|
169
|
+
});
|
|
146
170
|
}
|
|
147
171
|
|
|
148
172
|
// Shortcuts: resolve to target if possible (1-level), otherwise let caller decide.
|
|
@@ -150,20 +174,21 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
150
174
|
const visited = args._visited ?? new Set<string>();
|
|
151
175
|
if (visited.has(fileId)) {
|
|
152
176
|
// cycle
|
|
177
|
+
const cycleMetadata: GoogleDriveMetadata = {
|
|
178
|
+
connector: "google-drive",
|
|
179
|
+
kind: "shortcut",
|
|
180
|
+
fileId,
|
|
181
|
+
name,
|
|
182
|
+
mimeType: DRIVE_MIME.shortcut,
|
|
183
|
+
shortcutUnresolved: true,
|
|
184
|
+
};
|
|
153
185
|
return buildGoogleDriveFileIngestInput({
|
|
154
186
|
fileId,
|
|
155
187
|
sourceIdPrefix: args.sourceIdPrefix,
|
|
156
188
|
content: "",
|
|
157
189
|
assets: [],
|
|
158
|
-
metadata:
|
|
159
|
-
|
|
160
|
-
kind: "shortcut",
|
|
161
|
-
fileId,
|
|
162
|
-
name,
|
|
163
|
-
mimeType: DRIVE_MIME.shortcut,
|
|
164
|
-
shortcutUnresolved: true,
|
|
165
|
-
},
|
|
166
|
-
}) as any;
|
|
190
|
+
metadata: cycleMetadata,
|
|
191
|
+
});
|
|
167
192
|
}
|
|
168
193
|
visited.add(fileId);
|
|
169
194
|
|
|
@@ -172,20 +197,21 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
172
197
|
: "";
|
|
173
198
|
|
|
174
199
|
if (!targetId) {
|
|
200
|
+
const unresolvedMetadata: GoogleDriveMetadata = {
|
|
201
|
+
connector: "google-drive",
|
|
202
|
+
kind: "shortcut",
|
|
203
|
+
fileId,
|
|
204
|
+
name,
|
|
205
|
+
mimeType: DRIVE_MIME.shortcut,
|
|
206
|
+
shortcutUnresolved: true,
|
|
207
|
+
};
|
|
175
208
|
return buildGoogleDriveFileIngestInput({
|
|
176
209
|
fileId,
|
|
177
210
|
sourceIdPrefix: args.sourceIdPrefix,
|
|
178
211
|
content: "",
|
|
179
212
|
assets: [],
|
|
180
|
-
metadata:
|
|
181
|
-
|
|
182
|
-
kind: "shortcut",
|
|
183
|
-
fileId,
|
|
184
|
-
name,
|
|
185
|
-
mimeType: DRIVE_MIME.shortcut,
|
|
186
|
-
shortcutUnresolved: true,
|
|
187
|
-
},
|
|
188
|
-
}) as any;
|
|
213
|
+
metadata: unresolvedMetadata,
|
|
214
|
+
});
|
|
189
215
|
}
|
|
190
216
|
|
|
191
217
|
// Resolve target content/assets but keep sourceId stable to the shortcut file id.
|
|
@@ -209,7 +235,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
209
235
|
};
|
|
210
236
|
}
|
|
211
237
|
|
|
212
|
-
const baseMetadata = {
|
|
238
|
+
const baseMetadata: Record<string, unknown> = {
|
|
213
239
|
connector: "google-drive",
|
|
214
240
|
kind: "file",
|
|
215
241
|
fileId,
|
|
@@ -222,7 +248,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
222
248
|
...(meta?.webContentLink ? { webContentLink: String(meta.webContentLink) } : {}),
|
|
223
249
|
...(meta?.iconLink ? { iconLink: String(meta.iconLink) } : {}),
|
|
224
250
|
...(meta?.driveId ? { driveId: String(meta.driveId) } : {}),
|
|
225
|
-
}
|
|
251
|
+
};
|
|
226
252
|
|
|
227
253
|
// Google-native export path
|
|
228
254
|
if (classification.kind === "google_native") {
|
|
@@ -238,7 +264,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
238
264
|
googleNativeKind: classification.nativeKind,
|
|
239
265
|
unsupportedGoogleMime: true,
|
|
240
266
|
},
|
|
241
|
-
})
|
|
267
|
+
});
|
|
242
268
|
}
|
|
243
269
|
|
|
244
270
|
// For content export, enforce maxBytesPerFile by bytes length.
|
|
@@ -252,7 +278,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
252
278
|
content: "",
|
|
253
279
|
assets: [],
|
|
254
280
|
metadata: { ...baseMetadata, exportedTooLarge: true },
|
|
255
|
-
})
|
|
281
|
+
});
|
|
256
282
|
}
|
|
257
283
|
const content = bytesToText(bytes).trim();
|
|
258
284
|
return buildGoogleDriveFileIngestInput({
|
|
@@ -261,7 +287,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
261
287
|
content,
|
|
262
288
|
assets: [],
|
|
263
289
|
metadata: { ...baseMetadata, googleNativeKind: classification.nativeKind, exportMimeType: plan.mimeType },
|
|
264
|
-
})
|
|
290
|
+
});
|
|
265
291
|
} catch (err) {
|
|
266
292
|
// Slides can fail to export as text; fallback to PPTX unless strict.
|
|
267
293
|
if (classification.nativeKind === "slides" && !strictNativeExport) {
|
|
@@ -274,7 +300,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
274
300
|
content: "",
|
|
275
301
|
assets: [],
|
|
276
302
|
metadata: { ...baseMetadata, exportedTooLarge: true },
|
|
277
|
-
})
|
|
303
|
+
});
|
|
278
304
|
}
|
|
279
305
|
const asset: AssetInput = {
|
|
280
306
|
assetId: fileId,
|
|
@@ -286,7 +312,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
286
312
|
filename: name ? `${name}.pptx` : undefined,
|
|
287
313
|
},
|
|
288
314
|
uri: meta?.webViewLink ? String(meta.webViewLink) : undefined,
|
|
289
|
-
metadata: { connector: "google-drive", fileId, exportMimeType: EXPORT_MIME.pptx }
|
|
315
|
+
metadata: { connector: "google-drive", fileId, exportMimeType: EXPORT_MIME.pptx },
|
|
290
316
|
};
|
|
291
317
|
return buildGoogleDriveFileIngestInput({
|
|
292
318
|
fileId,
|
|
@@ -294,7 +320,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
294
320
|
content: "",
|
|
295
321
|
assets: [asset],
|
|
296
322
|
metadata: { ...baseMetadata, googleNativeKind: "slides", exportFallback: "pptx" },
|
|
297
|
-
})
|
|
323
|
+
});
|
|
298
324
|
} catch {
|
|
299
325
|
// fall through to strict error
|
|
300
326
|
}
|
|
@@ -314,7 +340,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
314
340
|
content: "",
|
|
315
341
|
assets: [],
|
|
316
342
|
metadata: { ...baseMetadata, exportedTooLarge: true },
|
|
317
|
-
})
|
|
343
|
+
});
|
|
318
344
|
}
|
|
319
345
|
|
|
320
346
|
const filename = name && plan.filenameExt ? `${name}.${plan.filenameExt}` : name || undefined;
|
|
@@ -323,7 +349,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
323
349
|
kind: plan.assetKind,
|
|
324
350
|
data: { kind: "bytes", bytes, mediaType: plan.mimeType, ...(filename ? { filename } : {}) },
|
|
325
351
|
uri: meta?.webViewLink ? String(meta.webViewLink) : undefined,
|
|
326
|
-
metadata: { connector: "google-drive", fileId, exportMimeType: plan.mimeType }
|
|
352
|
+
metadata: { connector: "google-drive", fileId, exportMimeType: plan.mimeType },
|
|
327
353
|
};
|
|
328
354
|
|
|
329
355
|
return buildGoogleDriveFileIngestInput({
|
|
@@ -332,7 +358,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
332
358
|
content: "",
|
|
333
359
|
assets: [asset],
|
|
334
360
|
metadata: { ...baseMetadata, googleNativeKind: classification.nativeKind, exportMimeType: plan.mimeType },
|
|
335
|
-
})
|
|
361
|
+
});
|
|
336
362
|
}
|
|
337
363
|
}
|
|
338
364
|
|
|
@@ -344,7 +370,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
344
370
|
content: "",
|
|
345
371
|
assets: [],
|
|
346
372
|
metadata: { ...baseMetadata, skippedTooLarge: true },
|
|
347
|
-
})
|
|
373
|
+
});
|
|
348
374
|
}
|
|
349
375
|
|
|
350
376
|
const bytes = await downloadFileBytes(args.drive, fileId);
|
|
@@ -355,7 +381,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
355
381
|
content: "",
|
|
356
382
|
assets: [],
|
|
357
383
|
metadata: { ...baseMetadata, skippedTooLarge: true },
|
|
358
|
-
})
|
|
384
|
+
});
|
|
359
385
|
}
|
|
360
386
|
|
|
361
387
|
const assetKind = assetKindFromMediaType(mimeType);
|
|
@@ -370,7 +396,7 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
370
396
|
...(filename ? { filename } : {}),
|
|
371
397
|
},
|
|
372
398
|
uri: meta?.webViewLink ? String(meta.webViewLink) : undefined,
|
|
373
|
-
metadata: { connector: "google-drive", fileId, name, mimeType }
|
|
399
|
+
metadata: { connector: "google-drive", fileId, name, mimeType },
|
|
374
400
|
};
|
|
375
401
|
|
|
376
402
|
// For pure binaries, keep content empty; extraction occurs via engine asset processing + extractors.
|
|
@@ -379,8 +405,8 @@ export async function loadGoogleDriveFileDocument(args: {
|
|
|
379
405
|
sourceIdPrefix: args.sourceIdPrefix,
|
|
380
406
|
content: "",
|
|
381
407
|
assets: [asset],
|
|
382
|
-
metadata: baseMetadata
|
|
383
|
-
})
|
|
408
|
+
metadata: baseMetadata,
|
|
409
|
+
});
|
|
384
410
|
}
|
|
385
411
|
|
|
386
412
|
export async function syncGoogleDriveFiles(
|
|
@@ -434,8 +460,10 @@ export async function syncGoogleDriveFiles(
|
|
|
434
460
|
},
|
|
435
461
|
});
|
|
436
462
|
|
|
463
|
+
const meta = doc.metadata as Record<string, unknown>;
|
|
464
|
+
|
|
437
465
|
// Skip folders explicitly (v1).
|
|
438
|
-
if (
|
|
466
|
+
if (meta.kind === "folder") {
|
|
439
467
|
emit({
|
|
440
468
|
type: "file:skipped",
|
|
441
469
|
fileId,
|
|
@@ -446,7 +474,7 @@ export async function syncGoogleDriveFiles(
|
|
|
446
474
|
continue;
|
|
447
475
|
}
|
|
448
476
|
|
|
449
|
-
if (
|
|
477
|
+
if (meta.unsupportedGoogleMime) {
|
|
450
478
|
emit({
|
|
451
479
|
type: "file:skipped",
|
|
452
480
|
fileId,
|
|
@@ -458,7 +486,7 @@ export async function syncGoogleDriveFiles(
|
|
|
458
486
|
continue;
|
|
459
487
|
}
|
|
460
488
|
|
|
461
|
-
if (
|
|
489
|
+
if (meta.skippedTooLarge || meta.exportedTooLarge) {
|
|
462
490
|
emit({
|
|
463
491
|
type: "file:skipped",
|
|
464
492
|
fileId,
|
|
@@ -469,7 +497,7 @@ export async function syncGoogleDriveFiles(
|
|
|
469
497
|
continue;
|
|
470
498
|
}
|
|
471
499
|
|
|
472
|
-
if (
|
|
500
|
+
if (meta.shortcutUnresolved) {
|
|
473
501
|
emit({
|
|
474
502
|
type: "file:skipped",
|
|
475
503
|
fileId,
|
|
@@ -484,7 +512,7 @@ export async function syncGoogleDriveFiles(
|
|
|
484
512
|
sourceId: doc.sourceId,
|
|
485
513
|
content: doc.content,
|
|
486
514
|
assets: doc.assets,
|
|
487
|
-
metadata: doc.metadata
|
|
515
|
+
metadata: doc.metadata,
|
|
488
516
|
});
|
|
489
517
|
|
|
490
518
|
succeeded += 1;
|