@okrlinkhub/agent-factory 0.2.14 → 0.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/README.md +1 -4
  2. package/dist/client/index.d.ts +1 -1
  3. package/dist/client/index.d.ts.map +1 -1
  4. package/dist/client/index.js +0 -3
  5. package/dist/client/index.js.map +1 -1
  6. package/dist/component/_generated/component.d.ts +0 -34
  7. package/dist/component/_generated/component.d.ts.map +1 -1
  8. package/dist/component/lib.d.ts +1 -1
  9. package/dist/component/lib.d.ts.map +1 -1
  10. package/dist/component/lib.js +1 -1
  11. package/dist/component/lib.js.map +1 -1
  12. package/dist/component/providers/fly.d.ts +14 -0
  13. package/dist/component/providers/fly.d.ts.map +1 -1
  14. package/dist/component/providers/fly.js +35 -5
  15. package/dist/component/providers/fly.js.map +1 -1
  16. package/dist/component/queue.d.ts +5 -20
  17. package/dist/component/queue.d.ts.map +1 -1
  18. package/dist/component/queue.js +41 -107
  19. package/dist/component/queue.js.map +1 -1
  20. package/dist/component/scheduler.d.ts.map +1 -1
  21. package/dist/component/scheduler.js +127 -81
  22. package/dist/component/scheduler.js.map +1 -1
  23. package/dist/component/schema.d.ts +5 -13
  24. package/dist/component/schema.d.ts.map +1 -1
  25. package/dist/component/schema.js +0 -4
  26. package/dist/component/schema.js.map +1 -1
  27. package/package.json +1 -1
  28. package/src/client/index.ts +0 -3
  29. package/src/component/_generated/component.ts +0 -42
  30. package/src/component/lib.test.ts +348 -88
  31. package/src/component/lib.ts +0 -1
  32. package/src/component/providers/fly.ts +50 -5
  33. package/src/component/queue.ts +52 -135
  34. package/src/component/scheduler.ts +211 -96
  35. package/src/component/schema.ts +0 -4
@@ -44,6 +44,20 @@ type ReconcileWorkerPoolArgs = {
44
44
  providerConfig?: typeof DEFAULT_CONFIG.provider;
45
45
  };
46
46
 
47
+ type SchedulerWorkerRow = {
48
+ workerId: string;
49
+ status: "active" | "stopped";
50
+ load: number;
51
+ heartbeatAt: number;
52
+ lastClaimAt: number | null;
53
+ scheduledShutdownAt: number | null;
54
+ stoppedAt: number | null;
55
+ lastSnapshotId: string | null;
56
+ machineId: string | null;
57
+ appName: string | null;
58
+ region: string | null;
59
+ };
60
+
47
61
  export const reconcileWorkerPool = action({
48
62
  args: reconcileWorkerPoolArgs,
49
63
  returns: reconcileWorkerPoolReturns,
@@ -126,6 +140,7 @@ async function runReconcileWorkerPool(
126
140
  lastClaimAt: number | null;
127
141
  scheduledShutdownAt: number | null;
128
142
  stoppedAt: number | null;
143
+ lastSnapshotId: string | null;
129
144
  machineId: string | null;
130
145
  appName: string | null;
131
146
  region: string | null;
@@ -172,11 +187,11 @@ async function runReconcileWorkerPool(
172
187
  }
173
188
  const workspaceId = args.workspaceId ?? "default";
174
189
  const provider = resolveProvider(providerConfig.kind, flyApiToken);
175
-
190
+ const isScopedWorker = (worker: SchedulerWorkerRow) =>
191
+ worker.appName === null || worker.appName === providerConfig.appName;
192
+ const scopedWorkerRows = () => workerRows.filter(isScopedWorker);
176
193
  const localWorkersWithMachine = workerRows.filter(
177
- (worker) =>
178
- worker.machineId &&
179
- (worker.appName === null || worker.appName === providerConfig.appName),
194
+ (worker) => isScopedWorker(worker) && worker.machineId,
180
195
  );
181
196
  const liveMachineIds = new Set<string>();
182
197
  const liveMachineImages = new Set<string>();
@@ -205,18 +220,9 @@ async function runReconcileWorkerPool(
205
220
  (worker) => worker.machineId && !liveMachineIds.has(worker.machineId),
206
221
  );
207
222
  for (const worker of staleWorkers) {
208
- await ctx.runMutation(internal.queue.upsertWorkerState, {
209
- workerId: worker.workerId,
210
- provider: providerConfig.kind,
211
- status: "stopped",
212
- load: 0,
213
- nowMs,
214
- scheduledShutdownAt: worker.scheduledShutdownAt ?? undefined,
215
- stoppedAt: nowMs,
216
- machineId: worker.machineId ?? undefined,
217
- appName: providerConfig.appName,
218
- region: providerConfig.region,
219
- });
223
+ if (worker.status !== "stopped") {
224
+ await transitionWorkerToStopped(ctx, worker, providerConfig, nowMs, false);
225
+ }
220
226
  }
221
227
  if (staleWorkers.length > 0) {
222
228
  workerRows = await ctx.runQuery((internal.queue as any).listWorkersForScheduler, {});
@@ -224,7 +230,7 @@ async function runReconcileWorkerPool(
224
230
  }
225
231
 
226
232
  let spawned = 0;
227
- let terminated = staleWorkers.length;
233
+ let terminated = staleWorkers.filter((worker) => worker.status !== "stopped").length;
228
234
 
229
235
  const dedicatedVolumeMode =
230
236
  providerConfig.volumeName.trim().length > 0 && providerConfig.volumePath.trim().length > 0;
@@ -237,7 +243,7 @@ async function runReconcileWorkerPool(
237
243
  `[scheduler] dedicated volume mode enabled for ${providerConfig.volumeName}; clamping desired workers to 1`,
238
244
  );
239
245
  }
240
- const activeWorkers = workerRows.filter(
246
+ const activeWorkers = scopedWorkerRows().filter(
241
247
  (worker) => worker.status === "active" && worker.heartbeatAt > staleHeartbeatCutoff,
242
248
  ).length;
243
249
 
@@ -273,46 +279,49 @@ async function runReconcileWorkerPool(
273
279
  appName: providerConfig.appName,
274
280
  region: created.region,
275
281
  });
282
+ await scheduleIdleShutdownWatch(ctx, providerConfig, nowMs + scaling.idleTimeoutMs, nowMs);
276
283
  spawned += 1;
277
284
  }
278
285
  }
279
286
 
280
- const dueIdleTimeout = workerRows
281
- .filter(
282
- (worker) =>
283
- worker.status === "active" &&
284
- worker.load === 0 &&
285
- worker.scheduledShutdownAt !== null &&
286
- worker.scheduledShutdownAt <= nowMs,
287
- )
288
- .sort((a, b) => (a.scheduledShutdownAt ?? 0) - (b.scheduledShutdownAt ?? 0));
287
+ const dueIdleTimeout = getDueIdleWorkers(scopedWorkerRows(), nowMs);
289
288
  for (const worker of dueIdleTimeout) {
290
- const machineId = worker.machineId;
291
- const machineIsLive = machineId ? liveMachineIds.has(machineId) : false;
292
- const terminatedNow = await drainAndTerminateWorker({
293
- provider,
294
- appName: providerConfig.appName,
295
- machineId,
296
- machineIsLive,
297
- workerId: worker.workerId,
298
- });
299
- if (!terminatedNow) {
300
- // Keep worker active so the next reconcile can retry termination.
289
+ const machineIsLive = worker.machineId ? liveMachineIds.has(worker.machineId) : false;
290
+ await transitionWorkerToStopped(
291
+ ctx,
292
+ worker,
293
+ providerConfig,
294
+ nowMs,
295
+ machineIsLive && requiresFinalSnapshot(worker),
296
+ );
297
+ terminated += 1;
298
+ }
299
+ if (dueIdleTimeout.length > 0) {
300
+ workerRows = await ctx.runQuery((internal.queue as any).listWorkersForScheduler, {});
301
+ }
302
+
303
+ let pendingFinalization = 0;
304
+ const stoppedWorkersAwaitingTeardown = getStoppedWorkersAwaitingTeardown(
305
+ scopedWorkerRows(),
306
+ nowMs,
307
+ );
308
+ for (const worker of stoppedWorkersAwaitingTeardown) {
309
+ if (!hasFinalSnapshotReady(worker)) {
310
+ pendingFinalization += 1;
301
311
  continue;
302
312
  }
303
- await ctx.runMutation(internal.queue.upsertWorkerState, {
304
- workerId: worker.workerId,
305
- provider: providerConfig.kind,
306
- status: "stopped",
307
- load: 0,
308
- nowMs,
309
- scheduledShutdownAt: worker.scheduledShutdownAt ?? undefined,
310
- stoppedAt: nowMs,
311
- machineId: machineId ?? undefined,
312
- appName: providerConfig.appName,
313
- region: providerConfig.region,
313
+ const finalized = await finalizeStoppedWorkerTeardown({
314
+ provider,
315
+ providerConfig,
316
+ worker,
317
+ liveMachineIds,
314
318
  });
315
- terminated += 1;
319
+ if (!finalized) {
320
+ pendingFinalization += 1;
321
+ }
322
+ }
323
+ if (pendingFinalization > 0) {
324
+ await scheduleIdleShutdownRetry(ctx, providerConfig);
316
325
  }
317
326
 
318
327
  await ctx.runMutation((internal.queue as any).expireOldDataSnapshots, {
@@ -361,25 +370,21 @@ async function runEnforceIdleShutdowns(
361
370
  );
362
371
  }
363
372
  const provider = resolveProvider(providerConfig.kind, flyApiToken);
364
- const workerRows: Array<{
365
- workerId: string;
366
- status: "active" | "stopped";
367
- load: number;
368
- scheduledShutdownAt: number | null;
369
- machineId: string | null;
370
- }> = await ctx.runQuery((internal.queue as any).listWorkersForScheduler, {});
371
-
372
- const dueIdleTimeout = workerRows
373
- .filter(
374
- (worker) =>
375
- worker.status === "active" &&
376
- worker.load === 0 &&
377
- worker.scheduledShutdownAt !== null &&
378
- worker.scheduledShutdownAt <= nowMs,
379
- )
380
- .sort((a, b) => (a.scheduledShutdownAt ?? 0) - (b.scheduledShutdownAt ?? 0));
373
+ let workerRows: Array<SchedulerWorkerRow> = await ctx.runQuery(
374
+ (internal.queue as any).listWorkersForScheduler,
375
+ {},
376
+ );
377
+ const scopedWorkers = () =>
378
+ workerRows.filter(
379
+ (worker) => worker.appName === null || worker.appName === providerConfig.appName,
380
+ );
381
+ const dueIdleTimeout = getDueIdleWorkers(scopedWorkers(), nowMs);
382
+ const stoppedWorkersAwaitingTeardown = getStoppedWorkersAwaitingTeardown(
383
+ scopedWorkers(),
384
+ nowMs,
385
+ );
381
386
 
382
- if (dueIdleTimeout.length === 0) {
387
+ if (dueIdleTimeout.length === 0 && stoppedWorkersAwaitingTeardown.length === 0) {
383
388
  return {
384
389
  checked: 0,
385
390
  stopped: 0,
@@ -391,51 +396,131 @@ async function runEnforceIdleShutdowns(
391
396
  const providerWorkers = await provider.listWorkers(providerConfig.appName);
392
397
  const liveMachineIds = new Set(providerWorkers.map((worker) => worker.machineId));
393
398
 
394
- let stopped = 0;
395
- let pending = 0;
396
399
  for (const worker of dueIdleTimeout) {
397
- const machineId = worker.machineId;
398
- const machineIsLive = machineId ? liveMachineIds.has(machineId) : false;
399
- const terminatedNow = await drainAndTerminateWorker({
400
- provider,
401
- appName: providerConfig.appName,
402
- machineId,
403
- machineIsLive,
404
- workerId: worker.workerId,
405
- });
406
- if (!terminatedNow) {
400
+ const machineIsLive = worker.machineId ? liveMachineIds.has(worker.machineId) : false;
401
+ await transitionWorkerToStopped(
402
+ ctx,
403
+ worker,
404
+ providerConfig,
405
+ nowMs,
406
+ machineIsLive && requiresFinalSnapshot(worker),
407
+ );
408
+ }
409
+ if (dueIdleTimeout.length > 0) {
410
+ workerRows = await ctx.runQuery((internal.queue as any).listWorkersForScheduler, {});
411
+ }
412
+
413
+ const stopped = dueIdleTimeout.length;
414
+ let pending = 0;
415
+ for (const worker of getStoppedWorkersAwaitingTeardown(scopedWorkers(), nowMs)) {
416
+ if (!hasFinalSnapshotReady(worker)) {
407
417
  pending += 1;
408
418
  continue;
409
419
  }
410
- await ctx.runMutation(internal.queue.upsertWorkerState, {
411
- workerId: worker.workerId,
412
- provider: providerConfig.kind,
413
- status: "stopped",
414
- load: 0,
415
- nowMs,
416
- scheduledShutdownAt: worker.scheduledShutdownAt ?? undefined,
417
- stoppedAt: nowMs,
418
- machineId: machineId ?? undefined,
419
- appName: providerConfig.appName,
420
- region: providerConfig.region,
420
+ const finalized = await finalizeStoppedWorkerTeardown({
421
+ provider,
422
+ providerConfig,
423
+ worker,
424
+ liveMachineIds,
421
425
  });
422
- stopped += 1;
426
+ if (!finalized) {
427
+ pending += 1;
428
+ }
423
429
  }
424
430
 
425
431
  if (pending > 0) {
426
- await ctx.scheduler.runAfter(60_000, (internal.scheduler as any).enforceIdleShutdowns, {
427
- providerConfig,
428
- });
432
+ await scheduleIdleShutdownRetry(ctx, providerConfig);
429
433
  }
430
434
 
431
435
  return {
432
- checked: dueIdleTimeout.length,
436
+ checked: dueIdleTimeout.length + stoppedWorkersAwaitingTeardown.length,
433
437
  stopped,
434
438
  pending,
435
439
  nextCheckScheduled: pending > 0,
436
440
  };
437
441
  }
438
442
 
443
+ function getDueIdleWorkers(workerRows: Array<SchedulerWorkerRow>, nowMs: number) {
444
+ return workerRows
445
+ .filter(
446
+ (worker) =>
447
+ worker.status === "active" &&
448
+ worker.load === 0 &&
449
+ worker.scheduledShutdownAt !== null &&
450
+ worker.scheduledShutdownAt <= nowMs,
451
+ )
452
+ .sort((a, b) => (a.scheduledShutdownAt ?? 0) - (b.scheduledShutdownAt ?? 0));
453
+ }
454
+
455
+ function getStoppedWorkersAwaitingTeardown(workerRows: Array<SchedulerWorkerRow>, nowMs: number) {
456
+ return workerRows
457
+ .filter(
458
+ (worker) =>
459
+ worker.status === "stopped" &&
460
+ worker.scheduledShutdownAt !== null &&
461
+ worker.scheduledShutdownAt <= nowMs,
462
+ )
463
+ .sort((a, b) => (a.stoppedAt ?? a.scheduledShutdownAt ?? 0) - (b.stoppedAt ?? b.scheduledShutdownAt ?? 0));
464
+ }
465
+
466
+ function requiresFinalSnapshot(worker: SchedulerWorkerRow) {
467
+ return worker.lastClaimAt !== null;
468
+ }
469
+
470
+ function hasFinalSnapshotReady(worker: SchedulerWorkerRow) {
471
+ return !requiresFinalSnapshot(worker) || worker.lastSnapshotId !== null;
472
+ }
473
+
474
+ async function transitionWorkerToStopped(
475
+ ctx: any,
476
+ worker: SchedulerWorkerRow,
477
+ providerConfig: typeof DEFAULT_CONFIG.provider,
478
+ nowMs: number,
479
+ clearLastSnapshotId: boolean,
480
+ ) {
481
+ await ctx.runMutation(internal.queue.upsertWorkerState, {
482
+ workerId: worker.workerId,
483
+ provider: providerConfig.kind,
484
+ status: "stopped",
485
+ load: 0,
486
+ nowMs,
487
+ scheduledShutdownAt: worker.scheduledShutdownAt ?? undefined,
488
+ stoppedAt: worker.stoppedAt ?? nowMs,
489
+ machineId: worker.machineId ?? undefined,
490
+ appName: providerConfig.appName,
491
+ region: worker.region ?? providerConfig.region,
492
+ clearLastSnapshotId,
493
+ });
494
+ }
495
+
496
+ async function finalizeStoppedWorkerTeardown(input: {
497
+ provider: WorkerProvider;
498
+ providerConfig: typeof DEFAULT_CONFIG.provider;
499
+ worker: SchedulerWorkerRow;
500
+ liveMachineIds: Set<string>;
501
+ }) {
502
+ const machineId = input.worker.machineId;
503
+ const machineIsLive = machineId ? input.liveMachineIds.has(machineId) : false;
504
+ const terminatedNow = await drainAndTerminateWorker({
505
+ provider: input.provider,
506
+ appName: input.providerConfig.appName,
507
+ machineId,
508
+ machineIsLive,
509
+ workerId: input.worker.workerId,
510
+ });
511
+ if (!terminatedNow) {
512
+ return false;
513
+ }
514
+ await input.provider.cleanupWorkerStorage({
515
+ appName: input.providerConfig.appName,
516
+ workerId: input.worker.workerId,
517
+ machineId,
518
+ region: input.worker.region ?? input.providerConfig.region,
519
+ volumeName: input.providerConfig.volumeName,
520
+ });
521
+ return true;
522
+ }
523
+
439
524
  async function drainAndTerminateWorker(input: {
440
525
  provider: WorkerProvider;
441
526
  appName: string;
@@ -467,6 +552,36 @@ async function drainAndTerminateWorker(input: {
467
552
  }
468
553
  }
469
554
 
555
+ async function scheduleIdleShutdownWatch(
556
+ ctx: {
557
+ scheduler: {
558
+ runAfter: (delayMs: number, fn: unknown, args: { providerConfig: typeof DEFAULT_CONFIG.provider }) => Promise<unknown>;
559
+ };
560
+ },
561
+ providerConfig: typeof DEFAULT_CONFIG.provider,
562
+ scheduledShutdownAt: number,
563
+ nowMs: number,
564
+ ) {
565
+ const delayMs = Math.max(0, scheduledShutdownAt - nowMs) + 1_000;
566
+ await ctx.scheduler.runAfter(delayMs, (internal.scheduler as any).enforceIdleShutdowns, {
567
+ providerConfig,
568
+ });
569
+ }
570
+
571
+ async function scheduleIdleShutdownRetry(
572
+ ctx: {
573
+ scheduler: {
574
+ runAfter: (delayMs: number, fn: unknown, args: { providerConfig: typeof DEFAULT_CONFIG.provider }) => Promise<unknown>;
575
+ };
576
+ },
577
+ providerConfig: typeof DEFAULT_CONFIG.provider,
578
+ delayMs = 60_000,
579
+ ) {
580
+ await ctx.scheduler.runAfter(delayMs, (internal.scheduler as any).enforceIdleShutdowns, {
581
+ providerConfig,
582
+ });
583
+ }
584
+
470
585
  function resolveProvider(kind: string, flyApiToken: string): WorkerProvider {
471
586
  switch (kind) {
472
587
  case "fly":
@@ -4,11 +4,7 @@ import { v } from "convex/values";
4
4
  export default defineSchema({
5
5
  agentProfiles: defineTable({
6
6
  agentKey: v.string(),
7
- providerUserId: v.optional(v.string()),
8
7
  version: v.string(),
9
- soulMd: v.optional(v.string()),
10
- clientMd: v.optional(v.string()),
11
- skills: v.optional(v.array(v.string())),
12
8
  secretsRef: v.array(v.string()),
13
9
  bridgeConfig: v.optional(
14
10
  v.object({