@cue-dev/retrieval-core 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1119 @@
1
+ import { createHash } from "node:crypto";
2
+ import { existsSync } from "node:fs";
3
+ import { readdir, readFile, stat, writeFile } from "node:fs/promises";
4
+ import { join, relative, resolve } from "node:path";
5
+ import { loadIndexingIgnoreMatcher, normalizeRepoRelativePath } from "./indexing-ignore.js";
6
+
7
/** Discriminator written into persisted sync-state files; state readers reject any other `mode`. */
export const REMOTE_SYNC_STATE_MODE = "remote_delta_v1" as const;

/** Default per-file size ceiling, in bytes, used by the project scan when no override is supplied. */
export const DEFAULT_REMOTE_SYNC_MAX_FILE_SIZE_BYTES = 1_000_000;
9
+
10
/**
 * Directory names (matched against a single path segment, not a full path)
 * that the project scan never descends into unless the caller overrides
 * `excluded_dirs`.
 */
const DEFAULT_EXCLUDED_DIRS = new Set<string>([
  // Tooling caches and VCS metadata.
  ".tmp", ".cache", ".git", ".hg", ".next", ".nuxt", ".pytest_cache",
  ".cue", ".cue-tool", ".svn", ".svelte-kit", ".turbo", ".venv", "__pycache__",
  // Build output and dependency directories.
  "build", "coverage", "dist", "node_modules", "out", "target", "vendor", "venv"
]);
34
+
35
/**
 * File names (lock files of common package managers) that the project scan
 * skips unless the caller overrides `excluded_files`.
 */
const DEFAULT_EXCLUDED_FILES = new Set<string>([
  "bun.lock", "bun.lockb", "Cargo.lock",
  "composer.lock", "Gemfile.lock", "package-lock.json",
  "Pipfile.lock", "pnpm-lock.yaml", "yarn.lock"
]);
46
+
47
/**
 * Lower-case file-name suffixes that the project scan skips unless the caller
 * overrides `excluded_file_suffixes`. Suffixes are compared against the
 * lower-cased file name.
 */
const DEFAULT_EXCLUDED_FILE_SUFFIXES = new Set<string>([
  // Minified bundles and source maps.
  ".min.js", ".min.css", ".map",
  // Native binaries and compiled artifacts.
  ".exe", ".dll", ".so", ".bin", ".dylib", ".pyc",
  // Images.
  ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg",
  // Video and documents.
  ".mp4", ".mov", ".avi", ".mkv", ".pdf"
]);
69
+
70
/** A text file selected for upload to the remote index. */
export interface RemoteSyncUploadCandidate {
  /** Normalized path of the file relative to the scanned root. */
  path: string;
  /** File text as decoded from UTF-8. */
  content: string;
  /** Language label derived from the file extension; absent when the extension is not recognized. */
  language?: string;
}
75
+
76
/** Metadata recorded for one file found by the project scan. */
export interface RemoteSyncProjectFileStat {
  /** Normalized path relative to the scanned root; also used as the map key. */
  path: string;
  /** Path used to read the file from disk (scan root joined with the relative path). */
  full_path: string;
  /** File size in bytes as reported by `stat`. */
  size: number;
  /** Modification time in milliseconds, truncated to an integer. */
  mtime_ms: number;
  /** Language label derived from the file extension, when recognized. */
  language?: string;
}
83
+
84
/** Per-file record kept in the local sync state to detect changes between runs. */
export interface RemoteSyncStateEntry {
  /** Hex SHA-256 digest of the file text at the time it was last read. */
  content_hash: string;
  /** File size in bytes observed at that time. */
  size: number;
  /** Truncated modification time in milliseconds observed at that time. */
  mtime_ms: number;
  /** Language label derived from the file extension, when recognized. */
  language?: string;
}
90
+
91
/** Shape of the persisted sync-state document. */
export interface RemoteSyncStateFile {
  /** Format discriminator; always `REMOTE_SYNC_STATE_MODE`. */
  mode: typeof REMOTE_SYNC_STATE_MODE;
  /** Workspace the state belongs to, once known. */
  workspace_id?: string;
  /** Index version returned by the last successful sync; sent as the base version of the next one. */
  last_index_version?: string;
  /** State entries keyed by repo-relative path. */
  files: Record<string, RemoteSyncStateEntry>;
  /** ISO-8601 timestamp of when the state was produced. */
  updated_at: string;
}
98
+
99
/** The set of changes to push: files to create or replace, and paths to remove. */
export interface RemoteSyncDeltaPayload {
  /** Files whose content must be uploaded. */
  upsert_files: RemoteSyncUploadCandidate[];
  /** Repo-relative paths known to the previous state but no longer found by the scan. */
  deleted_paths: string[];
}
103
+
104
/** One request-sized slice of a delta. */
export interface RemoteSyncDeltaBatch {
  /** Files uploaded by this batch. */
  upsert_files: RemoteSyncUploadCandidate[];
  /** Paths deleted by this batch. */
  deleted_paths: string[];
  /** Estimated UTF-8 size, in bytes, of the serialized request for this batch. */
  approx_bytes: number;
}
109
+
110
/** Output of comparing the current project scan with the previous sync state. */
export interface BuildRemoteSyncDeltaResult {
  /** Changes that must be sent to the remote. */
  delta: RemoteSyncDeltaPayload;
  /** State entries for exactly the files listed in `delta.upsert_files`, keyed by path. */
  upsert_state_entries: Record<string, RemoteSyncStateEntry>;
  /** Complete file map to persist once the delta has been applied. */
  next_files: Record<string, RemoteSyncStateEntry>;
}
115
+
116
/** Optional overrides for the project scan; every omitted field falls back to the module default. */
export interface RemoteSyncScanOptions {
  /** Files larger than this many bytes are skipped. */
  max_file_size_bytes?: number;
  /** Directory names that are never descended into. */
  excluded_dirs?: Set<string>;
  /** File names that are skipped (matched as-is or lower-cased). */
  excluded_files?: Set<string>;
  /** Suffixes that cause a file to be skipped (matched against the lower-cased name). */
  excluded_file_suffixes?: Set<string>;
}
122
+
123
/**
 * Error describing a non-successful HTTP response from the remote sync API.
 *
 * The classification helpers in this module inspect `status` and `message`
 * to decide whether to retry, fall back to another protocol, or re-upload
 * everything.
 */
export class RemoteSyncHttpResponseError extends Error {
  /**
   * @param message Human-readable description; matched case-insensitively by the error classifiers.
   * @param status HTTP status code of the response.
   * @param payload Optional response body, kept for diagnostics.
   */
  constructor(
    message: string,
    readonly status: number,
    readonly payload?: unknown
  ) {
    super(message);
    // Without this the inherited name is the generic "Error", which makes
    // logs and `toString()` output indistinguishable from any other failure.
    this.name = "RemoteSyncHttpResponseError";
  }
}
132
+
133
/** Parameters for a delta (v1) sync run. */
export interface RunRemoteDeltaSyncInput {
  /** Project root reported to the remote in every request. */
  project_root_path: string;
  /** Directory that is actually scanned; falls back to `project_root_path`. */
  scan_root_path?: string;
  /** Workspace to sync into; falls back to the one stored in `previous_state`. */
  workspace_id?: string;
  /** State persisted by the previous run, if any. */
  previous_state?: RemoteSyncStateFile;
  /** When true, every scanned file is uploaded regardless of the previous state. */
  force_full_upsert?: boolean;
  /** Request body limit used to size batches. */
  max_body_bytes: number;
  /** Number of attempts per request (defaults to 3). */
  retries?: number;
  /** Delay before the second attempt, doubled for each later attempt (defaults to 500 ms). */
  initial_delay_ms?: number;
  /** Custom detector for "stale base index version" failures; defaults to `isStaleBaseIndexError`. */
  stale_base_error?: (error: unknown) => boolean;
  /** Callback that stores the resulting state. */
  persist_state?: (state: RemoteSyncStateFile) => Promise<void>;
  /** Progress callback; presumably invoked once per pushed batch (call site not visible here). */
  on_batch_processed?: (event: {
    batch_index: number;
    batch_count: number;
    approx_bytes: number;
    upsert_files: number;
    deleted_paths: number;
    latency_ms: number;
  }) => void | Promise<void>;
  /** Transport function that sends one delta request and returns the remote's answer. */
  push_delta: (request: {
    workspace_id?: string;
    project_root_path: string;
    base_index_version?: string;
    upsert_files: RemoteSyncUploadCandidate[];
    deleted_paths: string[];
  }) => Promise<{
    workspace_id?: string;
    index_version?: string;
  }>;
}
163
+
164
/** Outcome of a sync run. */
export interface RunRemoteDeltaSyncResult {
  /** State that was handed to `persist_state`. */
  state: RemoteSyncStateFile;
  /** False when the scan found nothing to upload or delete. */
  changed: boolean;
  /** Workspace the run ended up targeting. */
  workspace_id?: string;
  /** Index version after the run (the previous one when nothing changed). */
  index_version?: string;
  /** Counts of the changes that were sent. */
  applied_delta: {
    upsert_files: number;
    deleted_paths: number;
  };
  /** Transfer statistics for the run. */
  stats: {
    batches_total: number;
    bytes_total: number;
    latency_ms: number;
  };
}
179
+
180
/** Limits and protocol support advertised for the remote. */
export interface RemoteSyncCapabilities {
  /** Request body limit for delta requests; also the fallback commit body limit. */
  max_body_bytes: number;
  /** Supported protocol names; the blob path is used only when this lists "blob_commit_v2". */
  sync_protocols?: string[];
  /** Largest single blob accepted (the adaptive sync assumes 128 KiB when absent). */
  max_blob_bytes?: number;
  /** Largest blob upload batch (the adaptive sync assumes max(max_blob_bytes, 1 MiB) when absent). */
  max_blob_batch_bytes?: number;
  /** Largest commit request body (the adaptive sync falls back to `max_body_bytes`). */
  max_commit_body_bytes?: number;
  /** Suggested number of parallel blob uploads (the adaptive sync assumes 4 and clamps to 1..16). */
  upload_concurrency_hint?: number;
}
188
+
189
/** Parameters for an adaptive sync run, which prefers blob/commit v2 and falls back to delta v1. */
export interface RunRemoteAdaptiveSyncInput {
  /** Project root reported to the remote in every request. */
  project_root_path: string;
  /** Directory that is actually scanned; falls back to `project_root_path`. */
  scan_root_path?: string;
  /** Workspace to sync into; falls back to the one stored in `previous_state`. */
  workspace_id?: string;
  /** State persisted by the previous run, if any. */
  previous_state?: RemoteSyncStateFile;
  /** When true, every scanned file is uploaded regardless of the previous state. */
  force_full_upsert?: boolean;
  /** Limits and protocol support of the remote. */
  capabilities: RemoteSyncCapabilities;
  /** Number of attempts per request (defaults to 3). */
  retries?: number;
  /** Delay before the second attempt, doubled for each later attempt (defaults to 500 ms). */
  initial_delay_ms?: number;
  /** Custom detector for "stale base index version" failures; defaults to `isStaleBaseIndexError`. */
  stale_base_error?: (error: unknown) => boolean;
  /** Callback that stores the resulting state. */
  persist_state?: (state: RemoteSyncStateFile) => Promise<void>;
  /** Delta v1 transport, used when blob/commit v2 is unavailable. */
  push_delta: RunRemoteDeltaSyncInput["push_delta"];
  /** Transport that uploads one batch of content-addressed blobs. */
  upload_blobs: (request: {
    workspace_id?: string;
    project_root_path: string;
    blobs: Array<{
      hash: string;
      content: string;
      size_bytes: number;
    }>;
  }) => Promise<{
    accepted_hashes: string[];
    already_present_hashes: string[];
    rejected: Array<{ hash: string; reason: string }>;
  }>;
  /** Transport that commits path-to-blob mappings and deletions in one request. */
  commit_v2: (request: {
    workspace_id?: string;
    project_root_path: string;
    base_index_version?: string;
    upsert_files: Array<{
      path: string;
      blob_hash: string;
      language?: string;
      generated?: boolean;
      binary?: boolean;
      updated_at?: string;
    }>;
    deleted_paths: string[];
  }) => Promise<{
    workspace_id?: string;
    index_version?: string;
  }>;
  /** Notified whenever the upload concurrency is raised ("success") or lowered ("error"). */
  on_upload_strategy_change?: (event: {
    previous_concurrency: number;
    next_concurrency: number;
    reason: "success" | "error";
  }) => void | Promise<void>;
}
237
+
238
/** Result of an adaptive sync run: the common sync result plus the protocol that produced it. */
export interface RunRemoteAdaptiveSyncResult extends RunRemoteDeltaSyncResult {
  /** "blob_commit_v2" for the blob path, "delta_v1" when the run fell back to delta pushes. */
  protocol: "delta_v1" | "blob_commit_v2";
}
241
+
242
/** Returns the current instant formatted as an ISO-8601 UTC timestamp. */
function nowIso(): string {
  const currentInstant = new Date();
  return currentInstant.toISOString();
}
245
+
246
/**
 * Maps a path to a language label based on its (case-sensitive) suffix.
 *
 * @param path File path or name.
 * @returns The language label, or `undefined` when the suffix is not recognized.
 */
function extensionToLanguage(path: string): string | undefined {
  const languagesBySuffix: ReadonlyArray<readonly [string, readonly string[]]> = [
    ["typescript", [".ts", ".tsx", ".mts", ".cts"]],
    ["javascript", [".js", ".jsx", ".mjs", ".cjs"]],
    ["python", [".py"]],
    ["go", [".go"]],
    ["rust", [".rs"]],
    ["java", [".java"]],
    ["json", [".json"]],
    ["markdown", [".md"]],
    ["yaml", [".yml", ".yaml"]]
  ];

  for (const [language, suffixes] of languagesBySuffix) {
    if (suffixes.some((suffix) => path.endsWith(suffix))) {
      return language;
    }
  }
  return undefined;
}
262
+
263
/** Heuristic binary check: text containing a NUL character is treated as binary. */
function looksBinary(content: string): boolean {
  const firstNulIndex = content.indexOf("\0");
  return firstNulIndex !== -1;
}
266
+
267
/** Returns the lower-case hex SHA-256 digest of `value`. */
function sha256Text(value: string): string {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
270
+
271
/**
 * Decides whether a file is excluded by name or by suffix.
 *
 * @param path File name to test (the scan passes the entry's base name).
 * @param excludedFiles Names that exclude a file when equal to `path` or its lower-cased form.
 * @param excludedSuffixes Suffixes that exclude a file when the lower-cased `path` ends with one of them.
 */
function shouldExcludeFile(path: string, excludedFiles: Set<string>, excludedSuffixes: Set<string>): boolean {
  const lowered = path.toLowerCase();
  const excludedByName = excludedFiles.has(path) || excludedFiles.has(lowered);
  return excludedByName || [...excludedSuffixes].some((suffix) => lowered.endsWith(suffix));
}
283
+
284
/** Fills in every scan option the caller left unset with the module-level default. */
function resolveScanOptions(options?: RemoteSyncScanOptions): Required<RemoteSyncScanOptions> {
  const provided: RemoteSyncScanOptions = options ?? {};
  const maxFileSizeBytes = provided.max_file_size_bytes ?? DEFAULT_REMOTE_SYNC_MAX_FILE_SIZE_BYTES;
  const excludedDirs = provided.excluded_dirs ?? DEFAULT_EXCLUDED_DIRS;
  const excludedFiles = provided.excluded_files ?? DEFAULT_EXCLUDED_FILES;
  const excludedSuffixes = provided.excluded_file_suffixes ?? DEFAULT_EXCLUDED_FILE_SUFFIXES;
  return {
    max_file_size_bytes: maxFileSizeBytes,
    excluded_dirs: excludedDirs,
    excluded_files: excludedFiles,
    excluded_file_suffixes: excludedSuffixes
  };
}
292
+
293
/**
 * Recursively scans a project directory and records size/mtime metadata for
 * every regular file that survives the exclusion rules.
 *
 * Exclusions, in the order applied: excluded directory names, the indexing
 * ignore matcher (for directories and files), excluded file names/suffixes,
 * and the maximum file size. Entries that are neither directories nor regular
 * files are skipped.
 *
 * @param project_root_path Directory to scan; resolved to an absolute path.
 * @param options Optional overrides for the default exclusion rules.
 * @returns Map from normalized repo-relative path to the file's metadata.
 * @throws If a directory cannot be listed or the ignore matcher fails to load.
 */
export async function collectProjectFileStats(
  project_root_path: string,
  options?: RemoteSyncScanOptions
): Promise<Map<string, RemoteSyncProjectFileStat>> {
  const root = resolve(project_root_path);
  const resolvedOptions = resolveScanOptions(options);
  const ignoreMatcher = await loadIndexingIgnoreMatcher(root);
  const output = new Map<string, RemoteSyncProjectFileStat>();

  async function walk(dir: string): Promise<void> {
    const entries = await readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = join(dir, entry.name);
      const repoPath = normalizeRepoRelativePath(relative(root, fullPath));

      if (entry.isDirectory()) {
        if (resolvedOptions.excluded_dirs.has(entry.name) || ignoreMatcher.shouldIgnorePath(repoPath, "dir")) {
          continue;
        }
        await walk(fullPath);
        continue;
      }

      if (!entry.isFile()) {
        continue;
      }

      if (shouldExcludeFile(entry.name, resolvedOptions.excluded_files, resolvedOptions.excluded_file_suffixes)) {
        continue;
      }
      if (ignoreMatcher.shouldIgnorePath(repoPath, "file")) {
        continue;
      }

      // A file can be removed or become unreadable between `readdir` and
      // `stat` (editors, build tools). Skip it instead of failing the whole
      // scan, matching how the content readers treat unreadable files.
      const fileStat = await stat(fullPath).catch(() => undefined);
      if (fileStat === undefined || fileStat.size > resolvedOptions.max_file_size_bytes) {
        continue;
      }

      output.set(repoPath, {
        path: repoPath,
        full_path: fullPath,
        size: fileStat.size,
        // Truncated so the value compares equal after a JSON round trip of the state file.
        mtime_ms: Math.trunc(fileStat.mtimeMs),
        language: extensionToLanguage(repoPath)
      });
    }
  }

  await walk(root);
  return output;
}
344
+
345
/**
 * Scans the project and reads every eligible text file.
 *
 * Files that cannot be read, or whose decoded text contains a NUL character,
 * are silently left out.
 *
 * @param project_root_path Directory to scan.
 * @param options Optional overrides for the scan's exclusion rules.
 * @returns Upload candidates ordered by path (`localeCompare`).
 */
export async function collectUploadCandidates(
  project_root_path: string,
  options?: RemoteSyncScanOptions
): Promise<RemoteSyncUploadCandidate[]> {
  const scanned = await collectProjectFileStats(project_root_path, options);
  const orderedPaths = Array.from(scanned.keys()).sort((left, right) => left.localeCompare(right));
  const candidates: RemoteSyncUploadCandidate[] = [];

  for (const path of orderedPaths) {
    const entry = scanned.get(path)!;

    let text: string | undefined;
    try {
      text = await readFile(entry.full_path, "utf8");
    } catch {
      // Unreadable files are skipped on a best-effort basis.
      text = undefined;
    }
    if (text === undefined || looksBinary(text)) {
      continue;
    }

    const candidate: RemoteSyncUploadCandidate = { path, content: text };
    if (entry.language) {
      candidate.language = entry.language;
    }
    candidates.push(candidate);
  }

  return candidates;
}
373
+
374
/**
 * Compares the current project scan with the previous sync state and derives
 * what has to be uploaded and deleted.
 *
 * A file is re-read only when it is new, its size or mtime differs from the
 * previous entry, or `force_full_upsert` is set. A re-read file is uploaded
 * when its content hash differs from the previous one (always, when forced).
 * Files that cannot be read or look binary keep their previous entry, if any,
 * and are never uploaded. Paths present in the previous state but missing
 * from the scan are reported as deleted.
 *
 * @param input.project_root_path Directory to scan.
 * @param input.previous_state State of the previous run, if any.
 * @param input.force_full_upsert Upload every readable text file regardless of state.
 * @param input.options Optional overrides for the scan's exclusion rules.
 */
export async function buildRemoteSyncDeltaFromState(input: {
  project_root_path: string;
  previous_state?: RemoteSyncStateFile;
  force_full_upsert: boolean;
  options?: RemoteSyncScanOptions;
}): Promise<BuildRemoteSyncDeltaResult> {
  const scanned = await collectProjectFileStats(input.project_root_path, input.options);
  const knownFiles = input.previous_state?.files ?? {};

  const toUpload: RemoteSyncUploadCandidate[] = [];
  const uploadedEntries: Record<string, RemoteSyncStateEntry> = {};
  const nextState: Record<string, RemoteSyncStateEntry> = {};

  const orderedPaths = Array.from(scanned.keys()).sort((left, right) => left.localeCompare(right));
  for (const repoPath of orderedPaths) {
    const current = scanned.get(repoPath)!;
    const known = knownFiles[repoPath];

    // Cheap check first: identical size and mtime means the file is assumed untouched.
    const looksUntouched =
      !!known &&
      known.size === current.size &&
      known.mtime_ms === current.mtime_ms &&
      !input.force_full_upsert;
    if (looksUntouched) {
      nextState[repoPath] = known;
      continue;
    }

    let text: string | undefined;
    try {
      text = await readFile(current.full_path, "utf8");
    } catch {
      text = undefined;
    }
    if (text === undefined || looksBinary(text)) {
      // Not uploadable right now: carry the old entry forward so the path is not forgotten.
      if (known) {
        nextState[repoPath] = known;
      }
      continue;
    }

    const entry: RemoteSyncStateEntry = {
      content_hash: sha256Text(text),
      size: current.size,
      mtime_ms: current.mtime_ms
    };
    if (current.language) {
      entry.language = current.language;
    }
    nextState[repoPath] = entry;

    // Metadata changed but content did not: refresh the state entry, skip the upload.
    const contentUnchanged = known?.content_hash === entry.content_hash;
    if (contentUnchanged && !input.force_full_upsert) {
      continue;
    }

    const candidate: RemoteSyncUploadCandidate = { path: repoPath, content: text };
    if (current.language) {
      candidate.language = current.language;
    }
    toUpload.push(candidate);
    uploadedEntries[repoPath] = entry;
  }

  const removedPaths = Object.keys(knownFiles).filter((knownPath) => !scanned.has(knownPath));

  return {
    delta: {
      upsert_files: toUpload,
      deleted_paths: removedPaths
    },
    upsert_state_entries: uploadedEntries,
    next_files: nextState
  };
}
455
+
456
/**
 * Estimates the size of a delta request by serializing an equivalent payload
 * to JSON and measuring its UTF-8 byte length.
 *
 * Empty or missing `workspace_id` / `base_index_version` values and missing
 * file languages are left out of the measured payload.
 *
 * @returns The byte length of the JSON-serialized payload.
 */
export function estimateRemoteSyncDeltaRequestSize(input: {
  project_root_path: string;
  workspace_id?: string;
  base_index_version?: string;
  upsert_files: RemoteSyncUploadCandidate[];
  deleted_paths: string[];
}): number {
  const body: Record<string, unknown> = { project_root_path: input.project_root_path };
  if (input.workspace_id) {
    body.workspace_id = input.workspace_id;
  }
  if (input.base_index_version) {
    body.base_index_version = input.base_index_version;
  }
  body.upsert_files = input.upsert_files.map(({ path, content, language }) =>
    language ? { path, content, language } : { path, content }
  );
  body.deleted_paths = input.deleted_paths;

  const serialized = JSON.stringify(body);
  return Buffer.byteLength(serialized, "utf8");
}
476
+
477
/**
 * Splits a delta into request-sized batches.
 *
 * The byte budget per batch is 80% of `max_body_bytes`, but never less than
 * 256 KiB. Upserts (sorted by path) are packed first, then de-duplicated
 * deletions (sorted); deletions may share the last upsert batch when they fit.
 *
 * @returns Batches in send order, each with its estimated request size.
 * @throws Error when a single upsert or a single deletion exceeds the budget on its own.
 */
export function splitRemoteSyncDeltaIntoBatches(input: {
  project_root_path: string;
  workspace_id?: string;
  base_index_version?: string;
  delta: RemoteSyncDeltaPayload;
  max_body_bytes: number;
}): RemoteSyncDeltaBatch[] {
  const byteBudget = Math.max(256 * 1024, Math.floor(input.max_body_bytes * 0.8));

  // Size of the request that would carry exactly these upserts and deletions.
  const measure = (upsert_files: RemoteSyncUploadCandidate[], deleted_paths: string[]): number =>
    estimateRemoteSyncDeltaRequestSize({
      project_root_path: input.project_root_path,
      workspace_id: input.workspace_id,
      base_index_version: input.base_index_version,
      upsert_files,
      deleted_paths
    });
  const withinBudget = (upsert_files: RemoteSyncUploadCandidate[], deleted_paths: string[]): boolean =>
    measure(upsert_files, deleted_paths) <= byteBudget;

  const orderedUpserts = input.delta.upsert_files.slice().sort((left, right) => left.path.localeCompare(right.path));
  const orderedDeletes = Array.from(new Set(input.delta.deleted_paths)).sort((left, right) =>
    left.localeCompare(right)
  );

  const result: RemoteSyncDeltaBatch[] = [];
  let openUpserts: RemoteSyncUploadCandidate[] = [];
  let openDeletes: string[] = [];

  // Emits the batch under construction (if it has any content) and starts a new one.
  const closeBatch = (): void => {
    if (openUpserts.length > 0 || openDeletes.length > 0) {
      result.push({
        upsert_files: openUpserts,
        deleted_paths: openDeletes,
        approx_bytes: measure(openUpserts, openDeletes)
      });
    }
    openUpserts = [];
    openDeletes = [];
  };

  for (const file of orderedUpserts) {
    const grown = [...openUpserts, file];
    if (!withinBudget(grown, openDeletes)) {
      closeBatch();
      if (!withinBudget([file], [])) {
        throw new Error(`delta upsert payload too large for path ${file.path}`);
      }
      openUpserts = [file];
    } else {
      openUpserts = grown;
    }
  }

  for (const path of orderedDeletes) {
    const grown = [...openDeletes, path];
    if (!withinBudget(openUpserts, grown)) {
      closeBatch();
      if (!withinBudget([], [path])) {
        throw new Error(`delta delete payload too large for path ${path}`);
      }
      openDeletes = [path];
    } else {
      openDeletes = grown;
    }
  }

  closeBatch();
  return result;
}
548
+
549
/**
 * Loads a persisted sync state.
 *
 * @param path Location of the state file.
 * @returns The parsed state, or `undefined` when the file is missing,
 *   unreadable, not valid JSON, has a different `mode`, or its `files` value
 *   is not a plain (non-array, non-null) object. Individual file entries are
 *   not validated. A missing or non-string `updated_at` is replaced with the
 *   current time.
 */
export async function readRemoteSyncState(path: string): Promise<RemoteSyncStateFile | undefined> {
  if (!existsSync(path)) {
    return undefined;
  }

  let parsed: Partial<RemoteSyncStateFile>;
  try {
    const raw = await readFile(path, "utf8");
    parsed = JSON.parse(raw) as Partial<RemoteSyncStateFile>;
  } catch {
    return undefined;
  }

  if (!parsed || parsed.mode !== REMOTE_SYNC_STATE_MODE) {
    return undefined;
  }
  const { files } = parsed;
  if (typeof files !== "object" || files === null || Array.isArray(files)) {
    return undefined;
  }

  const updatedAt = typeof parsed.updated_at === "string" ? parsed.updated_at : nowIso();
  return {
    mode: REMOTE_SYNC_STATE_MODE,
    workspace_id: parsed.workspace_id,
    last_index_version: parsed.last_index_version,
    files: files as Record<string, RemoteSyncStateEntry>,
    updated_at: updatedAt
  };
}
577
+
578
/**
 * Persists a sync state as pretty-printed (2-space) JSON.
 *
 * The stored document always carries the current `REMOTE_SYNC_STATE_MODE` and
 * a fresh `updated_at`, regardless of what `state` contains for those fields.
 *
 * @param path Destination file; overwritten if it exists.
 * @param state State to store.
 */
export async function writeRemoteSyncState(path: string, state: RemoteSyncStateFile): Promise<void> {
  const snapshot = {
    ...state,
    mode: REMOTE_SYNC_STATE_MODE,
    updated_at: nowIso()
  };
  const serialized = JSON.stringify(snapshot, null, 2);
  await writeFile(path, serialized);
}
592
+
593
/**
 * Default detector for "the base index version we sent is stale" failures.
 *
 * @returns True only for a `RemoteSyncHttpResponseError` with status 400 whose
 *   message contains both "invalid_argument" and "base index version"
 *   (case-insensitive).
 */
export function isStaleBaseIndexError(error: unknown): boolean {
  const isBadRequest = error instanceof RemoteSyncHttpResponseError && error.status === 400;
  if (!isBadRequest) {
    return false;
  }
  const text = error.message.toLowerCase();
  return ["invalid_argument", "base index version"].every((fragment) => text.includes(fragment));
}
603
+
604
/**
 * Detects failures that indicate the remote does not offer the push-delta endpoint.
 *
 * - `RemoteSyncHttpResponseError`: status 404, 405 or 501, or a message that
 *   contains both "not_found" and "push-delta".
 * - Any other `Error`: a message that contains "push-delta" together with
 *   "not found" or "404".
 * - Anything else: false.
 *
 * Message matching is case-insensitive.
 */
export function isDeltaUnsupportedError(error: unknown): boolean {
  if (error instanceof RemoteSyncHttpResponseError) {
    switch (error.status) {
      case 404:
      case 405:
      case 501:
        return true;
      default: {
        const text = error.message.toLowerCase();
        return text.includes("not_found") && text.includes("push-delta");
      }
    }
  }

  if (!(error instanceof Error)) {
    return false;
  }
  const text = error.message.toLowerCase();
  const looksMissing = text.includes("not found") || text.includes("404");
  return text.includes("push-delta") && looksMissing;
}
620
+
621
/**
 * Detects failures that indicate the remote does not offer the blob upload /
 * commit-v2 endpoints, so the caller should fall back to delta pushes.
 *
 * - `RemoteSyncHttpResponseError`: status 404, 405 or 501; otherwise the
 *   message must contain "not found" AND one of: "commit-v2", "blobs/upload",
 *   or both "blob_commit_v2" and "disabled".
 * - Any other `Error`: the message must contain "commit-v2" or "blobs/upload"
 *   together with "not found" or "404".
 * - Anything else: false.
 *
 * Message matching is case-insensitive.
 */
export function isBlobCommitV2UnsupportedError(error: unknown): boolean {
  if (error instanceof RemoteSyncHttpResponseError) {
    switch (error.status) {
      case 404:
      case 405:
      case 501:
        return true;
      default:
        break;
    }
    const text = error.message.toLowerCase();
    const protocolDisabled = text.includes("blob_commit_v2") && text.includes("disabled");
    const mentionsV2 = protocolDisabled || text.includes("commit-v2") || text.includes("blobs/upload");
    return mentionsV2 && text.includes("not found");
  }

  if (!(error instanceof Error)) {
    return false;
  }
  const text = error.message.toLowerCase();
  const mentionsEndpoint = text.includes("commit-v2") || text.includes("blobs/upload");
  const looksMissing = text.includes("not found") || text.includes("404");
  return mentionsEndpoint && looksMissing;
}
644
+
645
/**
 * Tells whether a failed blob upload is worth re-queuing: only HTTP response
 * errors with status 429 or any status of 500 and above qualify.
 */
function isRetryableAdaptiveError(error: unknown): boolean {
  return error instanceof RemoteSyncHttpResponseError && (error.status === 429 || error.status >= 500);
}
651
+
652
/**
 * Turns upload candidates into batches of content-addressed blobs.
 *
 * Blobs are de-duplicated by the SHA-256 of their content, ordered by hash,
 * and packed greedily so that the summed UTF-8 size of a batch does not
 * exceed `max_blob_batch_bytes`.
 *
 * @throws Error when a file's content is larger than `max_blob_bytes`, or a
 *   blob is larger than `max_blob_batch_bytes`.
 */
function computeBlobBatches(input: {
  upsert_files: RemoteSyncUploadCandidate[];
  max_blob_bytes: number;
  max_blob_batch_bytes: number;
}): Array<
  Array<{
    hash: string;
    content: string;
    size_bytes: number;
  }>
> {
  type Blob = { hash: string; content: string; size_bytes: number };

  // Identical content in several files is uploaded once.
  const uniqueByHash = new Map<string, Blob>();
  for (const { path, content } of input.upsert_files) {
    const size_bytes = Buffer.byteLength(content, "utf8");
    if (size_bytes > input.max_blob_bytes) {
      throw new Error(`blob payload too large for path ${path}`);
    }
    const hash = sha256Text(content);
    if (uniqueByHash.has(hash)) {
      continue;
    }
    uniqueByHash.set(hash, { hash, content, size_bytes });
  }

  const ordered = Array.from(uniqueByHash.values()).sort((left, right) => left.hash.localeCompare(right.hash));

  const grouped: Blob[][] = [];
  let open: Blob[] = [];
  let openBytes = 0;
  for (const blob of ordered) {
    if (blob.size_bytes > input.max_blob_batch_bytes) {
      throw new Error(`blob ${blob.hash} exceeds max blob batch bytes`);
    }
    const overflows = open.length > 0 && openBytes + blob.size_bytes > input.max_blob_batch_bytes;
    if (overflows) {
      grouped.push(open);
      open = [blob];
      openBytes = blob.size_bytes;
    } else {
      open.push(blob);
      openBytes += blob.size_bytes;
    }
  }
  if (open.length > 0) {
    grouped.push(open);
  }
  return grouped;
}
700
+
701
/** True when the advertised protocol list contains "blob_commit_v2"; a missing list counts as empty. */
function supportsBlobCommitV2(capabilities: RemoteSyncCapabilities): boolean {
  const advertised = capabilities.sync_protocols;
  if (advertised === undefined || advertised === null) {
    return false;
  }
  return advertised.includes("blob_commit_v2");
}
705
+
706
/**
 * Synchronizes the project with the remote index, preferring the
 * blob-upload + commit-v2 protocol and falling back to delta-v1 pushes.
 *
 * Flow of the blob path:
 * 1. Build the delta against `previous_state`; if it is empty, persist and
 *    return an unchanged result.
 * 2. Upload de-duplicated blobs in waves of `currentConcurrency` parallel
 *    batches. A wave with a retryable failure (429 / 5xx) halves the
 *    concurrency and re-queues the failed batches; a clean wave raises the
 *    concurrency by one, up to the advertised hint.
 * 3. Verify every file's blob hash was acknowledged, check the commit body
 *    size, then commit path-to-hash mappings and deletions.
 * 4. Persist and return the new state.
 *
 * Fallbacks: delta-v1 is used when the capabilities do not list
 * "blob_commit_v2" or when a failure looks like "endpoint unsupported". A
 * stale-base failure triggers one full re-upload without previous state.
 *
 * @throws The last upload error once a batch has been re-queued
 *   `max(1, retries)` times, any non-retryable upload/commit error, and
 *   `Error` for rejected blobs, unacknowledged hashes or an oversized commit.
 */
export async function runRemoteAdaptiveSync(input: RunRemoteAdaptiveSyncInput): Promise<RunRemoteAdaptiveSyncResult> {
  const runStartedAt = Date.now();
  const retries = input.retries ?? 3;
  const initialDelayMs = input.initial_delay_ms ?? 500;
  const staleBaseError = input.stale_base_error ?? isStaleBaseIndexError;
  // Upper bound on how often one blob batch may be put back on the queue.
  // Without it a remote that keeps answering 429/5xx would keep this loop alive forever.
  const maxRequeuesPerBatch = Math.max(1, retries);

  // Runs the whole sync through the delta-v1 protocol with the caller's original inputs.
  const runDeltaFallback = async (): Promise<RunRemoteAdaptiveSyncResult> => {
    const delta = await runRemoteDeltaSync({
      project_root_path: input.project_root_path,
      scan_root_path: input.scan_root_path,
      workspace_id: input.workspace_id,
      previous_state: input.previous_state,
      force_full_upsert: input.force_full_upsert,
      max_body_bytes: input.capabilities.max_body_bytes,
      retries,
      initial_delay_ms: initialDelayMs,
      stale_base_error: staleBaseError,
      persist_state: input.persist_state,
      push_delta: input.push_delta
    });
    return {
      ...delta,
      protocol: "delta_v1"
    };
  };

  if (!supportsBlobCommitV2(input.capabilities)) {
    return runDeltaFallback();
  }

  const execute = async (previousState: RemoteSyncStateFile | undefined, forceFullUpsert: boolean) => {
    const scanRootPath = input.scan_root_path ?? input.project_root_path;
    const deltaBuild = await buildRemoteSyncDeltaFromState({
      project_root_path: scanRootPath,
      previous_state: previousState,
      force_full_upsert: forceFullUpsert
    });

    const nextWorkspaceId = input.workspace_id ?? previousState?.workspace_id;
    if (deltaBuild.delta.upsert_files.length === 0 && deltaBuild.delta.deleted_paths.length === 0) {
      // Nothing to send; still persist so refreshed size/mtime entries are kept.
      const unchangedState: RemoteSyncStateFile = {
        mode: REMOTE_SYNC_STATE_MODE,
        workspace_id: nextWorkspaceId,
        last_index_version: previousState?.last_index_version,
        files: deltaBuild.next_files,
        updated_at: nowIso()
      };
      await input.persist_state?.(unchangedState);
      return {
        state: unchangedState,
        changed: false,
        workspace_id: unchangedState.workspace_id,
        index_version: unchangedState.last_index_version,
        applied_delta: {
          upsert_files: 0,
          deleted_paths: 0
        },
        stats: {
          batches_total: 0,
          bytes_total: 0,
          latency_ms: Date.now() - runStartedAt
        },
        protocol: "blob_commit_v2" as const
      };
    }

    const maxBlobBytes = input.capabilities.max_blob_bytes ?? 128 * 1024;
    const maxBlobBatchBytes = input.capabilities.max_blob_batch_bytes ?? Math.max(maxBlobBytes, 1024 * 1024);
    const maxCommitBodyBytes = input.capabilities.max_commit_body_bytes ?? input.capabilities.max_body_bytes;
    let currentConcurrency = Math.max(
      1,
      Math.min(input.capabilities.upload_concurrency_hint ?? 4, 16)
    );

    const blobBatches = computeBlobBatches({
      upsert_files: deltaBuild.delta.upsert_files,
      max_blob_bytes: maxBlobBytes,
      max_blob_batch_bytes: maxBlobBatchBytes
    });
    const pending = [...blobBatches];
    const requeueCounts = new Map<(typeof blobBatches)[number], number>();
    const acknowledged = new Set<string>();
    let bytesTotal = 0;

    while (pending.length > 0) {
      const wave = pending.splice(0, currentConcurrency);
      const settled = await Promise.allSettled(
        wave.map(async (batch) =>
          retryWithBackoff({
            retries,
            initial_delay_ms: initialDelayMs,
            fn: async () =>
              input.upload_blobs({
                workspace_id: nextWorkspaceId,
                project_root_path: input.project_root_path,
                blobs: batch
              })
          })
        )
      );

      let waveHadRetryableError = false;
      for (let idx = 0; idx < settled.length; idx += 1) {
        const result = settled[idx]!;
        const batch = wave[idx]!;
        if (result.status === "rejected") {
          if (isRetryableAdaptiveError(result.reason)) {
            const requeues = requeueCounts.get(batch) ?? 0;
            if (requeues >= maxRequeuesPerBatch) {
              // The remote kept failing this batch; give up instead of looping forever.
              throw result.reason;
            }
            requeueCounts.set(batch, requeues + 1);
            pending.push(batch);
            waveHadRetryableError = true;
            continue;
          }
          throw result.reason;
        }

        const payload = result.value;
        // Hashes are compared case-insensitively against the locally computed ones.
        for (const hash of payload.accepted_hashes) {
          acknowledged.add(hash.toLowerCase());
        }
        for (const hash of payload.already_present_hashes) {
          acknowledged.add(hash.toLowerCase());
        }
        if (payload.rejected.length > 0) {
          throw new Error(`blob upload rejected: ${payload.rejected[0]!.reason}`);
        }
        bytesTotal += batch.reduce((sum, blob) => sum + blob.size_bytes, 0);
      }

      if (waveHadRetryableError) {
        const previous = currentConcurrency;
        currentConcurrency = Math.max(1, Math.floor(currentConcurrency / 2));
        await input.on_upload_strategy_change?.({
          previous_concurrency: previous,
          next_concurrency: currentConcurrency,
          reason: "error"
        });
      } else if (currentConcurrency < Math.max(1, input.capabilities.upload_concurrency_hint ?? 4)) {
        const previous = currentConcurrency;
        currentConcurrency += 1;
        await input.on_upload_strategy_change?.({
          previous_concurrency: previous,
          next_concurrency: currentConcurrency,
          reason: "success"
        });
      }
    }

    // One commit entry per path (last occurrence wins), in path order.
    const dedupedUpserts = new Map<string, RemoteSyncUploadCandidate>();
    for (const file of deltaBuild.delta.upsert_files) {
      dedupedUpserts.set(file.path, file);
    }
    const commitUpserts = [...dedupedUpserts.values()]
      .sort((a, b) => a.path.localeCompare(b.path))
      .map((file) => ({
        path: file.path,
        blob_hash: sha256Text(file.content),
        ...(file.language ? { language: file.language } : {})
      }));
    const commitDeletes = [...new Set(deltaBuild.delta.deleted_paths)].sort((a, b) => a.localeCompare(b));

    const missingHash = commitUpserts.find((file) => !acknowledged.has(file.blob_hash.toLowerCase()));
    if (missingHash) {
      throw new Error(`missing uploaded blob hash for commit: ${missingHash.blob_hash}`);
    }

    if (
      Buffer.byteLength(
        JSON.stringify({
          project_root_path: input.project_root_path,
          workspace_id: nextWorkspaceId,
          base_index_version: previousState?.last_index_version,
          upsert_files: commitUpserts,
          deleted_paths: commitDeletes
        }),
        "utf8"
      ) > maxCommitBodyBytes
    ) {
      throw new Error("commit-v2 payload exceeds max_commit_body_bytes");
    }

    const commit = await retryWithBackoff({
      retries,
      initial_delay_ms: initialDelayMs,
      fn: async () =>
        input.commit_v2({
          workspace_id: nextWorkspaceId,
          project_root_path: input.project_root_path,
          ...(previousState?.last_index_version ? { base_index_version: previousState.last_index_version } : {}),
          upsert_files: commitUpserts,
          deleted_paths: commitDeletes
        })
    });

    const finalWorkspace = commit.workspace_id ?? nextWorkspaceId;
    const finalIndexVersion = commit.index_version ?? previousState?.last_index_version;
    const finalState: RemoteSyncStateFile = {
      mode: REMOTE_SYNC_STATE_MODE,
      workspace_id: finalWorkspace,
      last_index_version: finalIndexVersion,
      files: deltaBuild.next_files,
      updated_at: nowIso()
    };
    await input.persist_state?.(finalState);

    return {
      state: finalState,
      changed: true,
      workspace_id: finalWorkspace,
      index_version: finalIndexVersion,
      applied_delta: {
        upsert_files: commitUpserts.length,
        deleted_paths: commitDeletes.length
      },
      stats: {
        batches_total: blobBatches.length,
        bytes_total: bytesTotal,
        latency_ms: Date.now() - runStartedAt
      },
      protocol: "blob_commit_v2" as const
    };
  };

  const forceFullUpsert = input.force_full_upsert ?? false;
  try {
    return await execute(input.previous_state, forceFullUpsert);
  } catch (error) {
    if (isBlobCommitV2UnsupportedError(error)) {
      return runDeltaFallback();
    }
    // The remote no longer knows our base version: start over with a full upload.
    if (!forceFullUpsert && input.previous_state && staleBaseError(error)) {
      return execute(undefined, true);
    }
    throw error;
  }
}
951
+
952
/**
 * Calls `fn` until it succeeds or the attempt budget is used up, sleeping
 * with exponential backoff between attempts.
 *
 * @param input.fn Operation to run.
 * @param input.retries Total number of attempts (not "extra" attempts). Values
 *   below 1 and NaN are treated as 1, so `fn` is always invoked at least once.
 * @param input.initial_delay_ms Delay before the second attempt; doubled before each later attempt.
 * @returns The value of the first successful attempt.
 * @throws The error of the last failed attempt.
 */
export async function retryWithBackoff<T>(input: {
  fn: () => Promise<T>;
  retries: number;
  initial_delay_ms: number;
}): Promise<T> {
  // `ceil` keeps the attempt count of fractional budgets unchanged (2.5 -> 3 attempts).
  // The `>= 1` guard also catches NaN; previously such inputs skipped `fn`
  // entirely and threw `undefined`.
  const requested = Math.ceil(input.retries);
  const maxAttempts = requested >= 1 ? requested : 1;

  for (let attempt = 1; ; attempt += 1) {
    try {
      return await input.fn();
    } catch (error) {
      if (attempt >= maxAttempts) {
        throw error;
      }
      const delayMs = input.initial_delay_ms * 2 ** (attempt - 1);
      await new Promise((resolveSleep) => setTimeout(resolveSleep, delayMs));
    }
  }
}
973
+
974
/**
 * Builds the delta between the on-disk project and the previously recorded
 * sync state, pushes it through `input.push_delta` in batches, and reports the
 * resulting state.
 *
 * Flow of one pass:
 *  1. `buildRemoteSyncDeltaFromState` is run against `input.scan_root_path`
 *     (falling back to `input.project_root_path`).
 *  2. If the delta has no upserts and no deletions, a refreshed state is handed
 *     to `input.persist_state` (when provided) and returned with `changed: false`.
 *  3. Otherwise the delta is split with `splitRemoteSyncDeltaIntoBatches` and
 *     each batch is pushed via `retryWithBackoff`. After every successful batch
 *     the workspace id / index version returned by the push are adopted, the
 *     per-file progress map is updated, `input.on_batch_processed` is invoked
 *     and the intermediate state is persisted.
 *
 * If the first pass throws, no full upsert was requested, a previous state was
 * supplied and the error is classified as a stale-base error
 * (`input.stale_base_error`, defaulting to `isStaleBaseIndexError`), a second
 * pass is run with no previous state and a forced full upsert. Any other error
 * is rethrown. The reported `latency_ms` is measured from the start of this
 * call, so it also covers a failed first pass.
 *
 * Defaults: `retries` = 3, `initial_delay_ms` = 500, `force_full_upsert` = false.
 */
export async function runRemoteDeltaSync(input: RunRemoteDeltaSyncInput): Promise<RunRemoteDeltaSyncResult> {
  const startedAtMs = Date.now();
  const attemptBudget = input.retries ?? 3;
  const firstDelayMs = input.initial_delay_ms ?? 500;
  const looksStale = input.stale_base_error ?? isStaleBaseIndexError;

  /** Performs a single scan-and-push pass against the given prior state. */
  const syncOnce = async (
    priorState: RemoteSyncStateFile | undefined,
    fullUpsert: boolean
  ): Promise<RunRemoteDeltaSyncResult> => {
    const built = await buildRemoteSyncDeltaFromState({
      project_root_path: input.scan_root_path ?? input.project_root_path,
      previous_state: priorState,
      force_full_upsert: fullUpsert
    });
    const pendingDelta = built.delta;

    // Both values start from the caller / prior state and are replaced by
    // whatever the remote reports after each pushed batch.
    let workspaceId = input.workspace_id ?? priorState?.workspace_id;
    let baseIndexVersion = priorState?.last_index_version;

    const hasChanges = pendingDelta.upsert_files.length !== 0 || pendingDelta.deleted_paths.length !== 0;
    if (!hasChanges) {
      // Nothing to push: only refresh the file map and the timestamp.
      const unchangedState: RemoteSyncStateFile = {
        mode: REMOTE_SYNC_STATE_MODE,
        workspace_id: workspaceId,
        last_index_version: baseIndexVersion,
        files: built.next_files,
        updated_at: nowIso()
      };
      await input.persist_state?.(unchangedState);
      return {
        state: unchangedState,
        changed: false,
        workspace_id: unchangedState.workspace_id,
        index_version: unchangedState.last_index_version,
        applied_delta: {
          upsert_files: 0,
          deleted_paths: 0
        },
        stats: {
          batches_total: 0,
          bytes_total: 0,
          latency_ms: Date.now() - startedAtMs
        }
      };
    }

    const batches = splitRemoteSyncDeltaIntoBatches({
      project_root_path: input.project_root_path,
      workspace_id: workspaceId,
      base_index_version: baseIndexVersion,
      delta: pendingDelta,
      max_body_bytes: input.max_body_bytes
    });

    // Per-file entries confirmed so far; seeded from the prior state and
    // updated only after a batch has been pushed successfully.
    const syncedFiles: Record<string, RemoteSyncStateEntry> = { ...(priorState?.files ?? {}) };
    let upsertsApplied = 0;
    let deletionsApplied = 0;
    let bytesPushed = 0;

    /** Captures the current progress as a state file stamped with the current time. */
    const snapshotState = (): RemoteSyncStateFile => ({
      mode: REMOTE_SYNC_STATE_MODE,
      workspace_id: workspaceId,
      last_index_version: baseIndexVersion,
      files: syncedFiles,
      updated_at: nowIso()
    });

    let lastPersisted: RemoteSyncStateFile | undefined;
    let batchIndex = 0;

    for (const batch of batches) {
      const batchStartedAtMs = Date.now();
      const pushed = await retryWithBackoff({
        retries: attemptBudget,
        initial_delay_ms: firstDelayMs,
        fn: async () =>
          input.push_delta({
            workspace_id: workspaceId,
            project_root_path: input.project_root_path,
            // Only send a base version when one is known.
            ...(baseIndexVersion ? { base_index_version: baseIndexVersion } : {}),
            upsert_files: batch.upsert_files,
            deleted_paths: batch.deleted_paths
          })
      });

      workspaceId = pushed.workspace_id ?? workspaceId;
      baseIndexVersion = pushed.index_version ?? baseIndexVersion;

      for (const uploaded of batch.upsert_files) {
        const stateEntry = built.upsert_state_entries[uploaded.path];
        if (stateEntry) {
          syncedFiles[uploaded.path] = stateEntry;
        }
      }
      for (const removedPath of batch.deleted_paths) {
        delete syncedFiles[removedPath];
      }

      upsertsApplied += batch.upsert_files.length;
      deletionsApplied += batch.deleted_paths.length;
      bytesPushed += batch.approx_bytes;

      await input.on_batch_processed?.({
        batch_index: batchIndex,
        batch_count: batches.length,
        approx_bytes: batch.approx_bytes,
        upsert_files: batch.upsert_files.length,
        deleted_paths: batch.deleted_paths.length,
        latency_ms: Date.now() - batchStartedAtMs
      });

      // Persist after every batch so a later failure keeps the progress made so far.
      lastPersisted = snapshotState();
      await input.persist_state?.(lastPersisted);

      batchIndex += 1;
    }

    // Falls back to a fresh snapshot when the splitter produced no batches.
    const finalState = lastPersisted ?? snapshotState();

    return {
      state: finalState,
      changed: true,
      workspace_id: workspaceId,
      index_version: baseIndexVersion,
      applied_delta: {
        upsert_files: upsertsApplied,
        deleted_paths: deletionsApplied
      },
      stats: {
        batches_total: batches.length,
        bytes_total: bytesPushed,
        latency_ms: Date.now() - startedAtMs
      }
    };
  };

  const fullUpsertRequested = input.force_full_upsert ?? false;
  try {
    return await syncOnce(input.previous_state, fullUpsertRequested);
  } catch (error) {
    // Fall back to a full upsert only for an incremental run that failed
    // because its base index is stale.
    const mayFallBack = !fullUpsertRequested && input.previous_state && looksStale(error);
    if (!mayFallBack) {
      throw error;
    }
    return syncOnce(undefined, true);
  }
}