unrag 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/index.js +256 -51
- package/package.json +2 -1
- package/registry/config/unrag.config.ts +41 -17
- package/registry/connectors/google-drive/client.ts +171 -0
- package/registry/connectors/google-drive/index.ts +10 -0
- package/registry/connectors/google-drive/mime.ts +76 -0
- package/registry/connectors/google-drive/sync.ts +528 -0
- package/registry/connectors/google-drive/types.ts +127 -0
|
@@ -0,0 +1,528 @@
|
|
|
1
|
+
import type { IngestResult } from "../../core";
|
|
2
|
+
import type { AssetInput } from "../../core/types";
|
|
3
|
+
import { createGoogleDriveClient } from "./client";
|
|
4
|
+
import {
|
|
5
|
+
assetKindFromMediaType,
|
|
6
|
+
classifyDriveMimeType,
|
|
7
|
+
EXPORT_MIME,
|
|
8
|
+
getNativeExportPlan,
|
|
9
|
+
DRIVE_MIME,
|
|
10
|
+
} from "./mime";
|
|
11
|
+
import type {
|
|
12
|
+
BuildGoogleDriveFileIngestInputArgs,
|
|
13
|
+
GoogleDriveFileDocument,
|
|
14
|
+
GoogleDriveSyncProgressEvent,
|
|
15
|
+
SyncGoogleDriveFilesInput,
|
|
16
|
+
} from "./types";
|
|
17
|
+
|
|
18
|
+
/** Fallback per-file byte limit (15 * 1024 * 1024) used when `options.maxBytesPerFile` is not supplied. */
const DEFAULT_MAX_BYTES = 15 * 1024 * 1024; // 15MB
|
|
19
|
+
|
|
20
|
+
/**
 * Prepends an optional namespace prefix to `rest`, making sure the two parts
 * are separated by exactly one ":".
 *
 * The prefix is trimmed first; a missing or blank prefix yields `rest` unchanged.
 */
const joinPrefix = (prefix: string | undefined, rest: string) => {
  const trimmed = prefix?.trim() ?? "";
  if (trimmed.length === 0) {
    return rest;
  }
  // Only insert a separator when the prefix does not already end with one.
  const separator = trimmed.endsWith(":") ? "" : ":";
  return trimmed + separator + rest;
};
|
|
25
|
+
|
|
26
|
+
/**
 * Assembles the ingest payload for a single Drive file.
 *
 * The `sourceId` is `gdrive:file:<fileId>`, optionally namespaced with
 * `sourceIdPrefix`. Missing `metadata` / `assets` default to an empty object
 * and an empty array respectively.
 */
export function buildGoogleDriveFileIngestInput(
  args: BuildGoogleDriveFileIngestInputArgs
) {
  const { fileId, sourceIdPrefix, content, metadata, assets } = args;
  return {
    sourceId: joinPrefix(sourceIdPrefix, `gdrive:file:${fileId}`),
    content,
    metadata: metadata ?? {},
    assets: assets ?? [],
  };
}
|
|
37
|
+
|
|
38
|
+
/**
 * Converts an arbitrary thrown value into a human-readable string.
 *
 * - `Error` instances yield their `message`.
 * - Strings are returned unchanged.
 * - Everything else is JSON-serialised; values JSON cannot represent fall back
 *   to `String(err)`.
 *
 * @param err The caught value (anything can be thrown in JavaScript).
 * @returns Always a string.
 */
const asMessage = (err: unknown) => {
  if (err instanceof Error) return err.message;
  if (typeof err === "string") return err;
  try {
    // JSON.stringify returns `undefined` (not a string) for `undefined`,
    // functions and symbols, so fall back to String() in that case.
    return JSON.stringify(err) ?? String(err);
  } catch {
    // Circular structures and BigInt make JSON.stringify throw.
    return String(err);
  }
};
|
|
46
|
+
|
|
47
|
+
/**
 * Normalises a response payload into a `Uint8Array`.
 *
 * Accepted inputs:
 * - `Uint8Array` (including Node `Buffer`, which is a `Uint8Array` subclass): returned as-is.
 * - `ArrayBuffer`: wrapped without copying.
 * - Any other `ArrayBufferView` (e.g. `DataView`): a view over the same byte window.
 * - `string`: encoded as UTF-8.
 *
 * Falsy or unrecognised values yield an empty array.
 *
 * @param data Raw payload of unknown shape.
 * @returns The payload's bytes, or an empty `Uint8Array`.
 */
const toUint8Array = (data: unknown): Uint8Array => {
  if (!data) return new Uint8Array();
  // Covers Buffer too, so no separate Buffer branch is needed.
  if (data instanceof Uint8Array) return data;
  if (data instanceof ArrayBuffer) return new Uint8Array(data);
  if (ArrayBuffer.isView(data)) {
    // Respect the view's offset/length rather than exposing the whole backing buffer.
    return new Uint8Array(data.buffer, data.byteOffset, data.byteLength);
  }
  // Axios can hand back a string for some responseTypes; treat as utf-8 bytes.
  if (typeof data === "string") {
    return new TextEncoder().encode(data);
  }
  return new Uint8Array();
};
|
|
63
|
+
|
|
64
|
+
/**
 * Decodes bytes as UTF-8 text. Decoding is non-fatal: malformed sequences are
 * replaced rather than causing an exception.
 */
const bytesToText = (bytes: Uint8Array) => {
  const decoder = new TextDecoder("utf-8", { fatal: false });
  return decoder.decode(bytes);
};
|
|
67
|
+
|
|
68
|
+
/**
 * Decides whether an error should be treated as "file not found".
 *
 * The HTTP status is looked up on `err.code`, `err.status`,
 * `err.response.status` and `err.statusCode`, in that order. The first value
 * that parses to a valid HTTP status (100-599) wins, so a non-numeric `code`
 * such as `"ECONNRESET"` no longer hides a numeric status stored on another
 * property.
 *
 * @param err The caught value; may be anything.
 * @param treatForbiddenAsNotFound When true, a 403 counts as not-found as well.
 * @returns `true` for 404, or for 403 when `treatForbiddenAsNotFound` is set.
 */
const isNotFound = (err: any, treatForbiddenAsNotFound: boolean) => {
  const candidates: unknown[] = [
    err?.code,
    err?.status,
    err?.response?.status,
    err?.statusCode,
  ];
  const status = candidates
    .map((value) =>
      typeof value === "number" || typeof value === "string" ? Number(value) : NaN
    )
    .find((value) => Number.isInteger(value) && value >= 100 && value <= 599);

  if (status === 404) return true;
  if (treatForbiddenAsNotFound && status === 403) return true;
  return false;
};
|
|
76
|
+
|
|
77
|
+
/**
 * Fetches the metadata fields the connector relies on for one Drive file.
 *
 * @param drive Drive client exposing `files.get`.
 * @param fileId ID of the file to look up.
 * @returns The response's `data` object, or `{}` when the response has none.
 */
async function getFileMetadata(drive: any, fileId: string) {
  const requestedFields =
    "id,name,mimeType,size,md5Checksum,modifiedTime,webViewLink,webContentLink,iconLink,shortcutDetails,driveId";
  const response = await drive.files.get({
    fileId,
    supportsAllDrives: true,
    fields: requestedFields,
  });
  return response?.data ?? {};
}
|
|
86
|
+
|
|
87
|
+
/**
 * Downloads a file's raw content (`alt: "media"`) and returns it as bytes.
 *
 * @param drive Drive client exposing `files.get`.
 * @param fileId ID of the file to download.
 * @returns The response payload normalised to a `Uint8Array`.
 */
async function downloadFileBytes(drive: any, fileId: string): Promise<Uint8Array> {
  const params = { fileId, alt: "media", supportsAllDrives: true };
  const requestOptions = { responseType: "arraybuffer" };
  const response = await drive.files.get(params, requestOptions);
  return toUint8Array(response?.data);
}
|
|
94
|
+
|
|
95
|
+
/**
 * Exports a file to the requested MIME type via `files.export` and returns
 * the result as bytes.
 *
 * @param drive Drive client exposing `files.export`.
 * @param fileId ID of the file to export.
 * @param mimeType Target MIME type of the export.
 * @returns The response payload normalised to a `Uint8Array`.
 */
async function exportFileBytes(
  drive: any,
  fileId: string,
  mimeType: string
): Promise<Uint8Array> {
  const params = { fileId, mimeType };
  const requestOptions = { responseType: "arraybuffer" };
  const response = await drive.files.export(params, requestOptions);
  return toUint8Array(response?.data);
}
|
|
106
|
+
|
|
107
|
+
/**
 * Loads one Drive file and converts it into a document ready for ingestion.
 *
 * Behaviour by file classification:
 * - **folder**: returns an empty document with `metadata.kind === "folder"`.
 * - **shortcut**: resolves the target recursively (guarded against cycles via
 *   `_visited`) and returns the target's content/assets under the shortcut's
 *   own `sourceId`. If the target id is missing or a cycle is detected, returns
 *   an empty document flagged `shortcutUnresolved`.
 * - **google_native**: exports according to the plan from `getNativeExportPlan`
 *   (text content, or an asset). If a Slides content export throws and
 *   `strictNativeExport` is false, falls back to a PPTX asset export.
 * - anything else: downloads the binary and attaches it as a single asset.
 *
 * Files or exports larger than `maxBytesPerFile` produce an empty document
 * flagged `skippedTooLarge` (binary download) or `exportedTooLarge` (export).
 *
 * @param args.drive Drive client exposing `files.get` / `files.export`.
 * @param args.fileId ID of the file to load.
 * @param args.sourceIdPrefix Optional namespace prefix for the `sourceId`.
 * @param args.options.maxBytesPerFile Byte limit per file; defaults to `DEFAULT_MAX_BYTES`.
 * @param args.options.strictNativeExport Disables the Slides -> PPTX fallback.
 * @returns The document (sourceId, content, metadata, assets).
 * @throws Whatever the Drive client throws for metadata, download or export
 *   requests (apart from the Slides fallback case described above).
 */
export async function loadGoogleDriveFileDocument(args: {
  drive: any;
  fileId: string;
  sourceIdPrefix?: string;
  options?: {
    maxBytesPerFile?: number;
    strictNativeExport?: boolean;
  };
  /** internal: recursion guard for shortcuts */
  _visited?: Set<string>;
}): Promise<GoogleDriveFileDocument> {
  const maxBytesPerFile = args.options?.maxBytesPerFile ?? DEFAULT_MAX_BYTES;
  const strictNativeExport = Boolean(args.options?.strictNativeExport ?? false);

  const meta = await getFileMetadata(args.drive, args.fileId);
  const fileId = String(meta?.id ?? args.fileId);
  const name = String(meta?.name ?? "");
  const mimeType = String(meta?.mimeType ?? "");
  // `!= null` so an explicit `null` size is treated as unknown instead of
  // being coerced to 0 and reported as `size: 0`.
  const size = meta?.size != null ? Number(meta.size) : undefined;
  const webViewLink = meta?.webViewLink ? String(meta.webViewLink) : undefined;

  // Builds a document for this file; defaults to no content and no assets.
  const buildDoc = (
    metadata: Record<string, unknown>,
    assets: AssetInput[] = [],
    content = ""
  ): GoogleDriveFileDocument =>
    buildGoogleDriveFileIngestInput({
      fileId,
      sourceIdPrefix: args.sourceIdPrefix,
      content,
      assets,
      metadata,
    });

  const classification = classifyDriveMimeType(mimeType);

  // Folders: return a document shape with no content/assets; callers typically skip.
  if (classification.kind === "folder") {
    return buildDoc({
      connector: "google-drive",
      kind: "folder",
      fileId,
      name,
      mimeType: DRIVE_MIME.folder,
      ...(webViewLink ? { webViewLink } : {}),
      ...(meta?.modifiedTime ? { modifiedTime: String(meta.modifiedTime) } : {}),
    });
  }

  // Shortcuts: resolve to the target when possible, otherwise let the caller decide.
  if (classification.kind === "shortcut") {
    const unresolvedShortcut = () =>
      buildDoc({
        connector: "google-drive",
        kind: "shortcut",
        fileId,
        name,
        mimeType: DRIVE_MIME.shortcut,
        shortcutUnresolved: true,
      });

    const visited = args._visited ?? new Set<string>();
    if (visited.has(fileId)) {
      // Cycle: this shortcut was already visited during the current resolution.
      return unresolvedShortcut();
    }
    visited.add(fileId);

    const targetId = meta?.shortcutDetails?.targetId
      ? String(meta.shortcutDetails.targetId)
      : "";
    if (!targetId) {
      return unresolvedShortcut();
    }

    // Resolve target content/assets but keep sourceId stable to the shortcut file id.
    const targetDoc = await loadGoogleDriveFileDocument({
      drive: args.drive,
      fileId: targetId,
      sourceIdPrefix: args.sourceIdPrefix,
      options: args.options,
      _visited: visited,
    });

    return {
      ...targetDoc,
      sourceId: joinPrefix(args.sourceIdPrefix, `gdrive:file:${fileId}`),
      metadata: {
        ...(targetDoc.metadata ?? {}),
        connector: "google-drive",
        shortcutFileId: fileId,
        shortcutTargetId: targetId,
      },
    };
  }

  const baseMetadata = {
    connector: "google-drive",
    kind: "file",
    fileId,
    name,
    mimeType,
    ...(Number.isFinite(size) ? { size } : {}),
    ...(meta?.md5Checksum ? { md5Checksum: String(meta.md5Checksum) } : {}),
    ...(meta?.modifiedTime ? { modifiedTime: String(meta.modifiedTime) } : {}),
    ...(webViewLink ? { webViewLink } : {}),
    ...(meta?.webContentLink ? { webContentLink: String(meta.webContentLink) } : {}),
    ...(meta?.iconLink ? { iconLink: String(meta.iconLink) } : {}),
    ...(meta?.driveId ? { driveId: String(meta.driveId) } : {}),
  } as const;

  const exportTooLarge = () => buildDoc({ ...baseMetadata, exportedTooLarge: true });

  // Google-native export path
  if (classification.kind === "google_native") {
    const nativeKind = classification.nativeKind;
    const plan = getNativeExportPlan(nativeKind);

    if (plan.kind === "unsupported") {
      return buildDoc({
        ...baseMetadata,
        googleNativeKind: nativeKind,
        unsupportedGoogleMime: true,
      });
    }

    // Content export: the size limit can only be enforced on the exported bytes.
    if (plan.kind === "content") {
      try {
        const bytes = await exportFileBytes(args.drive, fileId, plan.mimeType);
        if (bytes.byteLength > maxBytesPerFile) {
          return exportTooLarge();
        }
        return buildDoc(
          { ...baseMetadata, googleNativeKind: nativeKind, exportMimeType: plan.mimeType },
          [],
          bytesToText(bytes).trim()
        );
      } catch (err) {
        // Slides can fail to export as text; fall back to PPTX unless strict.
        if (nativeKind === "slides" && !strictNativeExport) {
          try {
            const bytes = await exportFileBytes(args.drive, fileId, EXPORT_MIME.pptx);
            if (bytes.byteLength > maxBytesPerFile) {
              return exportTooLarge();
            }
            const asset: AssetInput = {
              assetId: fileId,
              kind: "file",
              data: {
                kind: "bytes",
                bytes,
                mediaType: EXPORT_MIME.pptx,
                filename: name ? `${name}.pptx` : undefined,
              },
              uri: webViewLink,
              metadata: { connector: "google-drive", fileId, exportMimeType: EXPORT_MIME.pptx } as any,
            };
            return buildDoc(
              { ...baseMetadata, googleNativeKind: "slides", exportFallback: "pptx" },
              [asset]
            );
          } catch {
            // Fallback failed too: surface the original export error below.
          }
        }

        throw err;
      }
    }

    // Asset export path (drawings -> PNG image)
    if (plan.kind === "asset") {
      const bytes = await exportFileBytes(args.drive, fileId, plan.mimeType);
      if (bytes.byteLength > maxBytesPerFile) {
        return exportTooLarge();
      }

      const filename =
        name && plan.filenameExt ? `${name}.${plan.filenameExt}` : name || undefined;
      const asset: AssetInput = {
        assetId: fileId,
        kind: plan.assetKind,
        data: { kind: "bytes", bytes, mediaType: plan.mimeType, ...(filename ? { filename } : {}) },
        uri: webViewLink,
        metadata: { connector: "google-drive", fileId, exportMimeType: plan.mimeType } as any,
      };

      return buildDoc(
        { ...baseMetadata, googleNativeKind: nativeKind, exportMimeType: plan.mimeType },
        [asset]
      );
    }
  }

  // Binary download path
  const skipTooLarge = () => buildDoc({ ...baseMetadata, skippedTooLarge: true });

  // Avoid the download entirely when the reported size already exceeds the limit.
  if (Number.isFinite(size) && (size as number) > maxBytesPerFile) {
    return skipTooLarge();
  }

  const bytes = await downloadFileBytes(args.drive, fileId);
  // The reported size may be absent, so re-check against the actual payload.
  if (bytes.byteLength > maxBytesPerFile) {
    return skipTooLarge();
  }

  const filename = name || undefined;
  const asset: AssetInput = {
    assetId: fileId,
    kind: assetKindFromMediaType(mimeType),
    data: {
      kind: "bytes",
      bytes,
      mediaType: mimeType || "application/octet-stream",
      ...(filename ? { filename } : {}),
    },
    uri: webViewLink,
    metadata: { connector: "google-drive", fileId, name, mimeType } as any,
  };

  // For pure binaries, keep content empty; extraction occurs via engine asset processing + extractors.
  return buildDoc({ ...baseMetadata }, [asset]);
}
|
|
385
|
+
|
|
386
|
+
/**
 * Syncs an explicit list of Drive files into the engine, one file at a time.
 *
 * For each non-blank file id the function emits `file:start`, loads the file,
 * and then either:
 * - emits `file:skipped` (folder, unsupported Google-native type, too large,
 *   unresolved shortcut),
 * - ingests the document and emits `file:success`,
 * - emits `file:not-found` when loading fails with 404 (or 403 when
 *   `options.treatForbiddenAsNotFound` is true, the default), additionally
 *   deleting the stored document when `deleteOnNotFound` is set, or
 * - records the failure and emits `file:error`.
 *
 * Not-found detection is applied only to errors from loading the Drive file.
 * Errors thrown by `engine.ingest` are always reported as failures, so an
 * unrelated 403/404 raised during ingestion can never trigger a delete.
 *
 * Exceptions thrown by `onProgress` are ignored.
 *
 * @param input Engine, auth, file ids and optional knobs.
 * @returns Counters plus the collected errors. `fileCount` is the length of
 *   `input.fileIds`, including blank entries that were skipped.
 * @throws If the Drive client cannot be created.
 */
export async function syncGoogleDriveFiles(
  input: SyncGoogleDriveFilesInput
): Promise<{
  fileCount: number;
  succeeded: number;
  failed: number;
  deleted: number;
  errors: Array<{ fileId: string; sourceId: string; error: unknown }>;
}> {
  type SkipReason = Extract<GoogleDriveSyncProgressEvent, { type: "file:skipped" }>["reason"];

  const deleteOnNotFound = input.deleteOnNotFound ?? false;
  const options = input.options ?? {};
  const maxBytesPerFile = options.maxBytesPerFile ?? DEFAULT_MAX_BYTES;
  const treatForbiddenAsNotFound = options.treatForbiddenAsNotFound ?? true;

  const { drive } = await createGoogleDriveClient({
    auth: input.auth,
    scopes: options.scopes,
  });

  const errors: Array<{ fileId: string; sourceId: string; error: unknown }> = [];
  let succeeded = 0;
  let failed = 0;
  let deleted = 0;

  // Progress reporting is best-effort: a throwing handler must not abort the sync.
  const emit = (event: GoogleDriveSyncProgressEvent) => {
    try {
      input.onProgress?.(event);
    } catch {
      // ignore progress handler errors
    }
  };

  const recordFailure = (fileId: string, sourceId: string, error: unknown) => {
    failed += 1;
    errors.push({ fileId, sourceId, error });
    emit({ type: "file:error", fileId, sourceId, error });
  };

  // Maps the marker flags set by loadGoogleDriveFileDocument to a skip reason.
  const skipFor = (
    metadata: Record<string, unknown> | undefined
  ): { reason: SkipReason; message: string } | undefined => {
    if (metadata?.kind === "folder") {
      return { reason: "is_folder", message: "Skipping folder (v1: files-only connector)." };
    }
    if (metadata?.unsupportedGoogleMime) {
      return {
        reason: "unsupported_google_mime",
        message: "Skipping Google-native file type because it has no supported export plan.",
      };
    }
    if (metadata?.skippedTooLarge || metadata?.exportedTooLarge) {
      return {
        reason: "too_large",
        message: `Skipping file because it exceeds maxBytesPerFile (${maxBytesPerFile}).`,
      };
    }
    if (metadata?.shortcutUnresolved) {
      return {
        reason: "shortcut_unresolved",
        message: "Skipping shortcut because target could not be resolved.",
      };
    }
    return undefined;
  };

  for (const fileIdRaw of input.fileIds) {
    const fileId = String(fileIdRaw ?? "").trim();
    if (!fileId) continue;

    const sourceId = joinPrefix(input.sourceIdPrefix, `gdrive:file:${fileId}`);
    emit({ type: "file:start", fileId, sourceId });

    // Step 1: load from Drive. Only errors from this step may be interpreted
    // as "file not found".
    let doc: GoogleDriveFileDocument;
    try {
      doc = await loadGoogleDriveFileDocument({
        drive,
        fileId,
        sourceIdPrefix: input.sourceIdPrefix,
        options: {
          maxBytesPerFile,
          strictNativeExport: options.strictNativeExport,
        },
      });
    } catch (err) {
      if (!isNotFound(err, treatForbiddenAsNotFound)) {
        recordFailure(fileId, sourceId, err);
        continue;
      }

      emit({ type: "file:not-found", fileId, sourceId });
      if (deleteOnNotFound) {
        try {
          await input.engine.delete({ sourceId });
          deleted += 1;
        } catch (deleteErr) {
          recordFailure(fileId, sourceId, deleteErr);
        }
      }
      continue;
    }

    // Step 2: honour the skip markers.
    const skip = skipFor(doc.metadata);
    if (skip) {
      emit({ type: "file:skipped", fileId, sourceId, ...skip });
      continue;
    }

    // Step 3: ingest. Failures here are never treated as not-found.
    try {
      const result: IngestResult = await input.engine.ingest({
        sourceId: doc.sourceId,
        content: doc.content,
        assets: doc.assets,
        metadata: doc.metadata as any,
      });

      succeeded += 1;
      emit({
        type: "file:success",
        fileId,
        sourceId,
        chunkCount: result.chunkCount,
      });
    } catch (err) {
      recordFailure(fileId, sourceId, err);
    }
  }

  return {
    fileCount: input.fileIds.length,
    succeeded,
    failed,
    deleted,
    errors,
  };
}
|
|
527
|
+
|
|
528
|
+
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import type { ContextEngine, AssetInput, IngestInput } from "../../core";
|
|
2
|
+
|
|
3
|
+
/**
 * Authentication input accepted by the Google Drive connector.
 *
 * The shape is deliberately structural (it does not depend on
 * google-auth-library types), because the connector code is vendored into
 * user projects and dependencies are added by the CLI
 * (`unrag add google-drive`).
 */
export type GoogleDriveAuth =
  | {
      /** Bring an existing OAuth2 client instance (recommended if your app already has one). */
      kind: "oauth";
      oauthClient: unknown;
    }
  | {
      /**
       * OAuth2 convenience form: the connector constructs the OAuth2 client
       * itself and sets credentials, including the refresh token.
       */
      kind: "oauth";
      clientId: string;
      clientSecret: string;
      redirectUri: string;
      refreshToken: string;
      /** Access token to use, if one is already available. */
      accessToken?: string;
    }
  | {
      /**
       * Service-account credentials. Two modes are supported:
       * - direct service-account access (files must be shared with the service account)
       * - Workspace domain-wide delegation (DWD), enabled by providing `subject`
       */
      kind: "service_account";
      credentialsJson: string | Record<string, unknown>;
      /**
       * Email of the user to impersonate via DWD (Workspace only).
       * When set, the service account acts as this user.
       */
      subject?: string;
    }
  | {
      /** Escape hatch: supply a pre-configured GoogleAuth (or equivalent) instance. */
      kind: "google_auth";
      auth: unknown;
    };
|
|
48
|
+
|
|
49
|
+
/**
 * Progress events passed to `onProgress` during a sync. Every event carries
 * the Drive `fileId` and the derived `sourceId`; `type` discriminates the variant.
 */
export type GoogleDriveSyncProgressEvent =
  | {
      /** Processing of the file has begun. */
      type: "file:start";
      fileId: string;
      sourceId: string;
    }
  | {
      /** The file was ingested; `chunkCount` comes from the ingest result. */
      type: "file:success";
      fileId: string;
      sourceId: string;
      chunkCount: number;
    }
  | {
      /** The file was deliberately not ingested; `reason` says why. */
      type: "file:skipped";
      fileId: string;
      sourceId: string;
      reason: "is_folder" | "unsupported_google_mime" | "too_large" | "shortcut_unresolved";
      message: string;
    }
  | {
      /** The file could not be found (or was treated as such). */
      type: "file:not-found";
      fileId: string;
      sourceId: string;
    }
  | {
      /** Processing failed; `error` is the caught value. */
      type: "file:error";
      fileId: string;
      sourceId: string;
      error: unknown;
    };
|
|
70
|
+
|
|
71
|
+
/** A Drive file converted into the pieces needed for ingestion. */
export type GoogleDriveFileDocument = {
  /** Stable identifier of the document (e.g. `gdrive:file:<id>`, optionally prefixed). */
  sourceId: string;
  /** Text content; may be empty when the file is represented only by assets. */
  content: string;
  /** Free-form metadata describing the file. */
  metadata: Record<string, unknown>;
  /** Assets attached to the document. */
  assets: AssetInput[];
};
|
|
77
|
+
|
|
78
|
+
/** Arguments accepted by `buildGoogleDriveFileIngestInput`. */
export type BuildGoogleDriveFileIngestInputArgs = {
  /** Drive file ID used to derive the `sourceId`. */
  fileId: string;
  /** Text content of the document. */
  content: string;
  /** Assets to attach, if any. */
  assets?: AssetInput[];
  /** Metadata to attach, if any. */
  metadata?: Record<string, unknown>;
  /** Optional namespace prefix for the `sourceId`. */
  sourceIdPrefix?: string;
};
|
|
85
|
+
|
|
86
|
+
/**
 * Alias of the core `IngestInput` type.
 * NOTE(review): presumably intended as the return type of
 * `buildGoogleDriveFileIngestInput`; confirm, since that function is not annotated with it.
 */
export type BuildGoogleDriveFileIngestInputResult = IngestInput;
|
|
87
|
+
|
|
88
|
+
/** Input for `syncGoogleDriveFiles`. */
export type SyncGoogleDriveFilesInput = {
  /** Engine that receives the ingested documents (and delete requests). */
  engine: ContextEngine;
  /** Credentials used to create the Drive client. */
  auth: GoogleDriveAuth;
  /** Explicit Drive file IDs to sync (Notion-like v1 behavior). */
  fileIds: string[];
  /**
   * Optional namespace prefix for source ids, useful for multi-tenant apps:
   * `tenant:acme:` -> `tenant:acme:gdrive:file:<id>`
   */
  sourceIdPrefix?: string;
  /**
   * When true, a file that is not found/accessible causes the previously
   * ingested document for that file (exact sourceId) to be deleted.
   */
  deleteOnNotFound?: boolean;
  /** Optional callback invoked with progress events. */
  onProgress?: (event: GoogleDriveSyncProgressEvent) => void;
  /** Optional connector-level knobs. */
  options?: SyncGoogleDriveFilesOptions;
};
|
|
108
|
+
|
|
109
|
+
/** Connector-level knobs for `syncGoogleDriveFiles`. */
export type SyncGoogleDriveFilesOptions = {
  /**
   * Max bytes to download/export per file. Default: 15MB.
   * Files or exports above the limit are skipped with reason `too_large`.
   */
  maxBytesPerFile?: number;
  /**
   * If true, treat 403 (forbidden) as not-found for cleanup purposes.
   * Default: true.
   */
  treatForbiddenAsNotFound?: boolean;
  /**
   * If true, a failed text export of Google Slides is NOT retried as a
   * binary (PPTX) export: the export error propagates and the file is
   * reported as failed (`file:error`), not skipped.
   * Default: false (best-effort fallback to a PPTX asset).
   */
  strictNativeExport?: boolean;
  /** Override Drive API scopes if desired. */
  scopes?: string[];
};
|
|
126
|
+
|
|
127
|
+
|