@desplega.ai/agent-swarm 1.76.3 → 1.77.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -272,6 +272,66 @@ async function fetchResolvedEnv(
272
272
  return { env, credentialSelections, resolvedProvider };
273
273
  }
274
274
 
275
+ /**
276
+ * Keys we permit `applyResolvedEnvToProcessEnv` to mutate live.
277
+ *
278
+ * Anything not in this list is considered unsafe to overwrite post-boot:
279
+ *
280
+ * - **Boot-time identity / connectivity** (AGENT_ID, API_KEY, MCP_BASE_URL,
281
+ * AGENT_ROLE, MANAGED_*): mutating these mid-flight effectively makes the
282
+ * worker a different agent talking to a different API. Reboot, don't reload.
283
+ * - **Credential pool members** (CLAUDE_CODE_OAUTH_TOKEN, ANTHROPIC_API_KEY,
284
+ * OPENAI_API_KEY, etc.): `resolveCredentialPools` picks one randomly *per
285
+ * task* from a comma-separated pool. Persisting the picked value into
286
+ * process.env freezes the rotation. Re-resolution happens per spawn anyway,
287
+ * so we deliberately leave these alone.
288
+ * - **Coordinated values with paired state** (HARNESS_PROVIDER): swapping
289
+ * the env without also swapping the adapter and rebuilding the system
290
+ * prompt produces an inconsistent worker. Handled by its own reconcile
291
+ * path that updates state.harnessProvider + adapter atomically.
292
+ * - **Process-runtime / OS-level** (PATH, HOME, NODE_OPTIONS, HOSTNAME, …):
293
+ * never overwrite. Some of these are read once by libraries at boot.
294
+ * - **Values memoized at boot** (TEMPLATE_ID, AGENT_NAME): the cached
295
+ * in-process value wins anyway — overwriting just creates confusion.
296
+ *
297
+ * For values that affect runner-loop behavior (like MAX_CONCURRENT_TASKS),
298
+ * prefer mutating `RunnerState` directly — no round-trip through process.env.
299
+ */
300
+ const RELOADABLE_ENV_KEYS: ReadonlySet<string> = new Set([
301
+ "MODEL_OVERRIDE",
302
+ "AGENT_FS_SHARED_ORG_ID",
303
+ ]);
304
+
305
+ /**
306
+ * Apply a fresh resolved env to `process.env` for keys safe to mutate live.
307
+ * Returns the list of keys that actually changed (useful for logging).
308
+ */
309
+ function applyResolvedEnvToProcessEnv(freshEnv: Record<string, string | undefined>): string[] {
310
+ const changed: string[] = [];
311
+ for (const key of RELOADABLE_ENV_KEYS) {
312
+ const next = freshEnv[key];
313
+ if (next !== undefined && next !== process.env[key]) {
314
+ process.env[key] = next;
315
+ changed.push(key);
316
+ }
317
+ }
318
+ return changed;
319
+ }
320
+
321
+ /** Compute effective max concurrent tasks from env > template default > role default. */
322
+ function resolveMaxConcurrent(
323
+ env: Record<string, string | undefined>,
324
+ templateMax: number | undefined,
325
+ defaultMaxTasks: number,
326
+ ): number {
327
+ const raw = env.MAX_CONCURRENT_TASKS;
328
+ if (raw) {
329
+ const parsed = parseInt(raw, 10);
330
+ if (Number.isFinite(parsed) && parsed > 0) return parsed;
331
+ }
332
+ return templateMax ?? defaultMaxTasks;
333
+ }
334
+
275
335
  /** Tools that produce noise — skip auto-progress for these */
276
336
  const SKIP_PROGRESS_TOOLS = new Set(["ToolSearch", "TodoRead", "TodoWrite"]);
277
337
 
@@ -355,13 +415,19 @@ export function humanizeToolName(name: string): string {
355
415
  export function toolCallToProgress(toolName: string, args: unknown): string | null {
356
416
  if (SKIP_PROGRESS_TOOLS.has(toolName)) return null;
357
417
 
418
+ const a = args as Record<string, unknown>;
419
+ const maybeMcpServer = typeof a?.server === "string" ? a.server : undefined;
420
+ const maybeMcpTool = typeof a?.tool === "string" ? a.tool : undefined;
421
+ const effectiveToolName =
422
+ maybeMcpServer && maybeMcpTool ? `mcp__${maybeMcpServer}__${maybeMcpTool}` : toolName;
423
+ if (SKIP_PROGRESS_TOOLS.has(effectiveToolName)) return null;
424
+
358
425
  // Normalize: pi-mono uses lowercase ("read"), Claude uses PascalCase ("Read")
359
426
  const normalized =
360
- toolName.startsWith("mcp__") || toolName.includes("_")
361
- ? toolName
362
- : toolName.charAt(0).toUpperCase() + toolName.slice(1);
427
+ effectiveToolName.startsWith("mcp__") || effectiveToolName.includes("_")
428
+ ? effectiveToolName
429
+ : effectiveToolName.charAt(0).toUpperCase() + effectiveToolName.slice(1);
363
430
 
364
- const a = args as Record<string, unknown>;
365
431
  const shortPath = (p: unknown) => {
366
432
  if (typeof p !== "string") return "";
367
433
  // Show last 2 path segments for readability
@@ -390,8 +456,8 @@ export function toolCallToProgress(toolName: string, args: unknown): string | nu
390
456
  return `⚙️ Running /${a.skill}`;
391
457
  default: {
392
458
  // MCP tools: mcp__server__tool
393
- if (toolName.startsWith("mcp__")) {
394
- const parts = toolName.split("__");
459
+ if (effectiveToolName.startsWith("mcp__")) {
460
+ const parts = effectiveToolName.split("__");
395
461
  if (parts.length >= 3) {
396
462
  const server = parts[1];
397
463
  const tool = parts.slice(2).join("__");
@@ -405,8 +471,18 @@ export function toolCallToProgress(toolName: string, args: unknown): string | nu
405
471
  // Other MCP servers: "🔌 server: Humanized tool"
406
472
  return `🔌 ${server}: ${humanizeToolName(tool)}`;
407
473
  }
408
- return `🔌 ${toolName}`;
474
+ return `🔌 ${effectiveToolName}`;
475
+ }
476
+
477
+ // Pi-mono exposes tools from the built-in swarm MCP endpoint as bare
478
+ // names ("store-progress", "send-task", ...), not as mcp__ names.
479
+ // Treat those names as agent-swarm tools so activity stays readable.
480
+ if (toolName.includes("-")) {
481
+ const label = SWARM_TOOL_LABELS[toolName];
482
+ if (label === null) return null;
483
+ if (label) return label;
409
484
  }
485
+
410
486
  return `🔧 ${toolName}`;
411
487
  }
412
488
  }
@@ -962,7 +1038,6 @@ export interface RunnerOptions {
962
1038
  systemPromptFile?: string;
963
1039
  logsDir?: string;
964
1040
  additionalArgs?: string[];
965
- aiLoop?: boolean; // Use AI-based loop (old behavior)
966
1041
  }
967
1042
 
968
1043
  /** Running task state for parallel execution */
@@ -2099,68 +2174,6 @@ async function spawnProviderProcess(
2099
2174
  return runningTask;
2100
2175
  }
2101
2176
 
2102
- /** Run a single provider iteration (blocking) - used for AI-loop mode */
2103
- async function runProviderIteration(
2104
- adapter: ReturnType<typeof createProviderAdapter>,
2105
- opts: {
2106
- prompt: string;
2107
- logFile: string;
2108
- systemPrompt?: string;
2109
- additionalArgs?: string[];
2110
- role: string;
2111
- apiUrl: string;
2112
- apiKey: string;
2113
- agentId: string;
2114
- taskId?: string;
2115
- cwd?: string;
2116
- },
2117
- ): Promise<ProviderResult> {
2118
- const { env: freshEnv } = await fetchResolvedEnv(opts.apiUrl, opts.apiKey, opts.agentId);
2119
- const model = (freshEnv.MODEL_OVERRIDE as string) || "";
2120
-
2121
- const config: ProviderSessionConfig = {
2122
- prompt: opts.prompt,
2123
- systemPrompt: opts.systemPrompt || "",
2124
- model,
2125
- role: opts.role,
2126
- agentId: opts.agentId,
2127
- taskId: opts.taskId || crypto.randomUUID(),
2128
- apiUrl: opts.apiUrl,
2129
- apiKey: opts.apiKey,
2130
- cwd: opts.cwd || process.cwd(),
2131
- logFile: opts.logFile,
2132
- additionalArgs: opts.additionalArgs,
2133
- env: freshEnv as Record<string, string>,
2134
- };
2135
-
2136
- const session = await adapter.createSession(config);
2137
-
2138
- let lastAiLoopProgressTime = 0;
2139
- session.onEvent((event) => {
2140
- if (event.type === "raw_log") prettyPrintLine(event.content, opts.role);
2141
- if (event.type === "raw_stderr") prettyPrintStderr(event.content, opts.role);
2142
- if (event.type === "session_init" && opts.taskId) {
2143
- saveProviderSessionId(
2144
- opts.apiUrl,
2145
- opts.apiKey,
2146
- opts.taskId,
2147
- event.sessionId,
2148
- event.provider,
2149
- event.providerMeta,
2150
- ).catch((err) => console.warn(`[runner] Failed to save session ID: ${err}`));
2151
- }
2152
- if (event.type === "progress" && opts.taskId) {
2153
- const now = Date.now();
2154
- if (now - lastAiLoopProgressTime >= PROGRESS_THROTTLE_MS) {
2155
- lastAiLoopProgressTime = now;
2156
- updateProgressViaAPI(opts.apiUrl, opts.apiKey, opts.taskId, event.message).catch(() => {});
2157
- }
2158
- }
2159
- });
2160
-
2161
- return session.waitForCompletion();
2162
- }
2163
-
2164
2177
  /** Check for completed processes and remove them from active tasks */
2165
2178
  async function checkCompletedProcesses(
2166
2179
  state: RunnerState,
@@ -2541,86 +2554,142 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2541
2554
  );
2542
2555
  console.log(`[${role}] Total system prompt length: ${resolvedSystemPrompt.length} chars`);
2543
2556
 
2544
- const isAiLoop = opts.aiLoop || process.env.AI_LOOP === "true";
2545
-
2546
2557
  // Constants for polling
2547
2558
  const PollIntervalMs = 2000; // 2 seconds between polls
2548
2559
  const PollTimeoutMs = 60000; // 1 minute timeout before retrying
2549
2560
 
2550
2561
  let iteration = 0;
2551
2562
 
2552
- if (!isAiLoop) {
2553
- // Fetch template early (before registration) so defaults can be applied
2554
- const templateId = process.env.TEMPLATE_ID;
2555
- const registryUrl = process.env.TEMPLATE_REGISTRY_URL || "https://templates.agent-swarm.dev";
2556
- let cachedTemplate: TemplateResponse | null = null;
2557
-
2558
- if (templateId) {
2559
- try {
2560
- cachedTemplate = await fetchTemplate(templateId, registryUrl, "/workspace/.template-cache");
2561
- if (cachedTemplate) {
2562
- console.log(`[${role}] Fetched template: ${templateId}`);
2563
+ // Fetch template early (before registration) so defaults can be applied
2564
+ const templateId = process.env.TEMPLATE_ID;
2565
+ const registryUrl = process.env.TEMPLATE_REGISTRY_URL || "https://templates.agent-swarm.dev";
2566
+ let cachedTemplate: TemplateResponse | null = null;
2563
2567
 
2564
- // Apply agentDefaults as fallbacks (env/config takes precedence)
2565
- const defaults = cachedTemplate.config.agentDefaults;
2566
- if (config.role === "worker" && defaults.role) {
2567
- role = defaults.role;
2568
- }
2569
- if (!capabilities?.length && defaults.capabilities?.length) {
2570
- capabilities = defaults.capabilities;
2571
- }
2568
+ if (templateId) {
2569
+ try {
2570
+ cachedTemplate = await fetchTemplate(templateId, registryUrl, "/workspace/.template-cache");
2571
+ if (cachedTemplate) {
2572
+ console.log(`[${role}] Fetched template: ${templateId}`);
2573
+
2574
+ // Apply agentDefaults as fallbacks (env/config takes precedence)
2575
+ const defaults = cachedTemplate.config.agentDefaults;
2576
+ if (config.role === "worker" && defaults.role) {
2577
+ role = defaults.role;
2578
+ }
2579
+ if (!capabilities?.length && defaults.capabilities?.length) {
2580
+ capabilities = defaults.capabilities;
2572
2581
  }
2582
+ }
2583
+ } catch (err) {
2584
+ console.warn(`[${role}] Failed to fetch template ${templateId}: ${err}`);
2585
+ }
2586
+ }
2587
+
2588
+ // Runner-level polling mode with parallel execution support
2589
+ const isLeadFromConfig = config.role === "lead";
2590
+ const isLead = isLeadFromConfig || (cachedTemplate?.config.agentDefaults?.isLead ?? false);
2591
+ const defaultMaxTasks = isLead ? 2 : 1;
2592
+ const templateMaxTasks = cachedTemplate?.config.agentDefaults?.maxTasks;
2593
+ const maxConcurrent = resolveMaxConcurrent(process.env, templateMaxTasks, defaultMaxTasks);
2594
+ console.log(`[${role}] Mode: runner-level polling`);
2595
+ console.log(`[${role}] Max concurrent tasks: ${maxConcurrent}`);
2596
+
2597
+ // Initialize runner state for parallel execution
2598
+ const state: RunnerState = {
2599
+ activeTasks: new Map(),
2600
+ maxConcurrent,
2601
+ harnessProvider: bootProvider,
2602
+ };
2603
+
2604
+ // Track tasks already signaled for cancellation to avoid repeated SIGTERM
2605
+ const cancelledSignaled = new Set<string>();
2606
+
2607
+ // Migration 055 — cache the harness_provider value used when we last
2608
+ // built a `cred_status` snapshot. Re-runs the post-task check only when
2609
+ // the resolved provider changes. Section 4 of the swarm_config-overrides-
2610
+ // HARNESS_PROVIDER work makes this dynamic: state.harnessProvider is
2611
+ // reconciled below from `swarm_config`, so an operator's change reaches
2612
+ // here without a worker restart.
2613
+ let cachedCredHarnessProvider: string | null = null;
2614
+
2615
+ // Throttle for live HARNESS_PROVIDER reconciliation. Each reconciliation
2616
+ // calls `fetchResolvedEnv` which also re-resolves credential pools — we
2617
+ // don't want that on every 2s poll. 10s gives operator changes a near-
2618
+ // immediate effect from a UX perspective without hammering the API.
2619
+ let lastHarnessReconcileAt = 0;
2620
+ const HARNESS_RECONCILE_INTERVAL_MS = 10_000;
2621
+
2622
+ // Create API config for ping/close
2623
+ const apiConfig: ApiConfig = { apiUrl, apiKey, agentId };
2624
+
2625
+ // Setup graceful shutdown handlers with API config and runner state access
2626
+ setupShutdownHandlers(role, apiConfig, () => state);
2627
+
2628
+ // Register agent before starting
2629
+ const agentName =
2630
+ process.env.AGENT_NAME ||
2631
+ cachedTemplate?.config.displayName ||
2632
+ `${role}-${agentId.slice(0, 8)}`;
2633
+
2634
+ /**
2635
+ * Reconcile RunnerState + process.env against a freshly resolved swarm
2636
+ * config snapshot. Single source of truth for live config drift; used
2637
+ * both during the credential-wait (so operator flips reach the predicate
2638
+ * mid-loop) and from the post-boot periodic reconciler.
2639
+ *
2640
+ * Returns whether anything agent-visible (provider, maxConcurrent)
2641
+ * changed — callers use this to decide whether to re-register.
2642
+ */
2643
+ const applySwarmConfigDrift = async (
2644
+ freshEnv: Record<string, string | undefined>,
2645
+ resolvedProvider: ProviderName,
2646
+ ): Promise<{ agentVisibleChanged: boolean }> => {
2647
+ let agentVisibleChanged = false;
2648
+
2649
+ // (1) Harness provider — swap adapter + rebuild prompt atomically.
2650
+ if (resolvedProvider !== state.harnessProvider) {
2651
+ const previous = state.harnessProvider;
2652
+ console.log(`[${role}] [harness] Reconciling adapter: ${previous} → ${resolvedProvider}`);
2653
+ try {
2654
+ adapter = createProviderAdapter(resolvedProvider);
2655
+ state.harnessProvider = resolvedProvider;
2656
+ basePrompt = await buildSystemPrompt();
2657
+ resolvedSystemPrompt = additionalSystemPrompt
2658
+ ? `${basePrompt}\n\n${additionalSystemPrompt}`
2659
+ : basePrompt;
2660
+ cachedCredHarnessProvider = null;
2661
+ agentVisibleChanged = true;
2662
+ console.log(
2663
+ `[${role}] [harness] Swapped to ${resolvedProvider} (basePrompt rebuilt: ${basePrompt.length} chars)`,
2664
+ );
2573
2665
  } catch (err) {
2574
- console.warn(`[${role}] Failed to fetch template ${templateId}: ${err}`);
2666
+ console.warn(
2667
+ `[${role}] [harness] Failed to swap to ${resolvedProvider} (staying on ${previous}): ${err}`,
2668
+ );
2575
2669
  }
2576
2670
  }
2577
2671
 
2578
- // Runner-level polling mode with parallel execution support
2579
- const isLeadFromConfig = config.role === "lead";
2580
- const isLead = isLeadFromConfig || (cachedTemplate?.config.agentDefaults?.isLead ?? false);
2581
- const defaultMaxTasks = isLead ? 2 : 1;
2582
- const maxConcurrent = process.env.MAX_CONCURRENT_TASKS
2583
- ? parseInt(process.env.MAX_CONCURRENT_TASKS, 10)
2584
- : (cachedTemplate?.config.agentDefaults?.maxTasks ?? defaultMaxTasks);
2585
- console.log(`[${role}] Mode: runner-level polling (use --ai-loop for AI-based polling)`);
2586
- console.log(`[${role}] Max concurrent tasks: ${maxConcurrent}`);
2587
-
2588
- // Initialize runner state for parallel execution
2589
- const state: RunnerState = {
2590
- activeTasks: new Map(),
2591
- maxConcurrent,
2592
- harnessProvider: bootProvider,
2593
- };
2672
+ // (2) Max concurrency — operator can tune from the dashboard live.
2673
+ // Note: shrinking below activeTasks.size won't kill in-flight tasks; new
2674
+ // spawns are simply gated until in-flight drain back under the new cap.
2675
+ const nextMax = resolveMaxConcurrent(freshEnv, templateMaxTasks, defaultMaxTasks);
2676
+ if (nextMax !== state.maxConcurrent) {
2677
+ console.log(`[${role}] [config] maxConcurrent: ${state.maxConcurrent} → ${nextMax}`);
2678
+ state.maxConcurrent = nextMax;
2679
+ agentVisibleChanged = true;
2680
+ }
2681
+
2682
+ // (3) Apply the small allowlist of safe-to-mutate env keys to process.env.
2683
+ const changedKeys = applyResolvedEnvToProcessEnv(freshEnv);
2684
+ if (changedKeys.length > 0) {
2685
+ console.log(`[${role}] [env-reload] Updated process.env: ${changedKeys.join(", ")}`);
2686
+ }
2687
+
2688
+ return { agentVisibleChanged };
2689
+ };
2594
2690
 
2595
- // Track tasks already signaled for cancellation to avoid repeated SIGTERM
2596
- const cancelledSignaled = new Set<string>();
2597
-
2598
- // Migration 055 — cache the harness_provider value used when we last
2599
- // built a `cred_status` snapshot. Re-runs the post-task check only when
2600
- // the resolved provider changes. Section 4 of the swarm_config-overrides-
2601
- // HARNESS_PROVIDER work makes this dynamic: state.harnessProvider is
2602
- // reconciled below from `swarm_config`, so an operator's change reaches
2603
- // here without a worker restart.
2604
- let cachedCredHarnessProvider: string | null = null;
2605
-
2606
- // Throttle for live HARNESS_PROVIDER reconciliation. Each reconciliation
2607
- // calls `fetchResolvedEnv` which also re-resolves credential pools — we
2608
- // don't want that on every 2s poll. 10s gives operator changes a near-
2609
- // immediate effect from a UX perspective without hammering the API.
2610
- let lastHarnessReconcileAt = 0;
2611
- const HARNESS_RECONCILE_INTERVAL_MS = 10_000;
2612
-
2613
- // Create API config for ping/close
2614
- const apiConfig: ApiConfig = { apiUrl, apiKey, agentId };
2615
-
2616
- // Setup graceful shutdown handlers with API config and runner state access
2617
- setupShutdownHandlers(role, apiConfig, () => state);
2618
-
2619
- // Register agent before starting
2620
- const agentName =
2621
- process.env.AGENT_NAME ||
2622
- cachedTemplate?.config.displayName ||
2623
- `${role}-${agentId.slice(0, 8)}`;
2691
+ /** Push the current live state back to the API so the dashboard reflects it. */
2692
+ const reregisterAgent = async () => {
2624
2693
  try {
2625
2694
  await registerAgent({
2626
2695
  apiUrl,
@@ -2630,1055 +2699,988 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2630
2699
  role,
2631
2700
  isLead,
2632
2701
  capabilities,
2633
- maxTasks: maxConcurrent,
2634
- harnessProvider: bootProvider,
2702
+ maxTasks: state.maxConcurrent,
2703
+ harnessProvider: state.harnessProvider,
2635
2704
  });
2636
- console.log(`[${role}] Registered as "${agentName}" (ID: ${agentId})`);
2637
- } catch (error) {
2638
- console.error(`[${role}] Failed to register: ${error}`);
2639
- process.exit(1);
2640
- }
2641
-
2642
- // Block until harness credentials are present in env. This loop replaces
2643
- // the old bash-level fail-fast in `docker-entrypoint.sh` — the worker is
2644
- // already registered (visible to the dashboard) and self-heals once
2645
- // creds appear in `swarm_config`. See plans/2026-05-06-worker-credential-safe-loop.md.
2646
- //
2647
- // CRED_CHECK_DISABLE=1 opts out entirely: the worker trusts the operator
2648
- // and starts polling immediately, with a NULL `cred_status` row that the
2649
- // dashboard surfaces as "unreported."
2650
- const harnessProvider = bootProvider;
2651
- cachedCredHarnessProvider = harnessProvider;
2652
- if (isCredCheckDisabled(process.env)) {
2653
- console.log(`[${role}] CRED_CHECK_DISABLE=1, skipping credential checks`);
2654
- } else {
2655
- try {
2656
- await awaitCredentials({
2657
- provider: harnessProvider,
2658
- refreshEnv: async () => {
2659
- const { env } = await fetchResolvedEnv(apiUrl, apiKey, agentId);
2660
- return env;
2661
- },
2662
- onTick: (status) => {
2663
- // Best-effort status report — the dispatcher uses it to route
2664
- // around blocked agents. Failures are non-fatal (the wait loop
2665
- // already swallows onTick exceptions). We do NOT include
2666
- // `cred_status` here — the live test runs once the worker is
2667
- // ready (below), and intermediate ticks are presence-only.
2668
- fetch(`${apiUrl}/api/agents/${encodeURIComponent(agentId)}/credential-status`, {
2669
- method: "PUT",
2670
- headers: {
2671
- Authorization: `Bearer ${apiKey}`,
2672
- "X-Agent-ID": agentId,
2673
- "Content-Type": "application/json",
2674
- },
2675
- body: JSON.stringify({ ready: status.ready, missing: status.missing }),
2676
- }).catch(() => {
2677
- // Swallowed — Phase 2 wait loop logs every tick anyway.
2678
- });
2679
- },
2680
- });
2681
- } catch (err) {
2682
- if (err instanceof BootMaxWaitExceededError) {
2683
- console.error(`[${role}] ${err.message}`);
2684
- process.exit(EX_CONFIG);
2685
- }
2686
- throw err;
2687
- }
2688
-
2689
- // Migration 055: build the full snapshot (presence + live test) once
2690
- // creds are ready and POST it to the agent row. Status endpoint reads
2691
- // this instead of running predicates server-side.
2692
- try {
2693
- const snapshot = await buildCredStatusReport(harnessProvider, process.env, {}, "boot");
2694
- await reportCredStatus(apiUrl, apiKey, agentId, snapshot);
2695
- } catch (err) {
2696
- // Non-fatal — worker proceeds even if reporting fails.
2697
- console.warn(`[${role}] cred_status boot report failed (non-fatal): ${err}`);
2698
- }
2705
+ } catch (err) {
2706
+ console.warn(`[${role}] [config] Re-register failed (non-fatal): ${err}`);
2699
2707
  }
2708
+ };
2709
+ try {
2710
+ await registerAgent({
2711
+ apiUrl,
2712
+ apiKey,
2713
+ agentId,
2714
+ name: agentName,
2715
+ role,
2716
+ isLead,
2717
+ capabilities,
2718
+ maxTasks: maxConcurrent,
2719
+ harnessProvider: bootProvider,
2720
+ });
2721
+ console.log(`[${role}] Registered as "${agentName}" (ID: ${agentId})`);
2722
+ } catch (error) {
2723
+ console.error(`[${role}] Failed to register: ${error}`);
2724
+ process.exit(1);
2725
+ }
2700
2726
 
2701
- // Clean up any stale active sessions from previous runs (crash recovery)
2702
- await cleanupActiveSessions(apiConfig);
2703
- console.log(`[${role}] Cleaned up stale active sessions`);
2704
-
2705
- // Fetch full agent profile to get soul/identity content
2727
+ // Block until harness credentials are present in env. This loop replaces
2728
+ // the old bash-level fail-fast in `docker-entrypoint.sh` — the worker is
2729
+ // already registered (visible to the dashboard) and self-heals once
2730
+ // creds appear in `swarm_config`. See plans/2026-05-06-worker-credential-safe-loop.md.
2731
+ //
2732
+ // CRED_CHECK_DISABLE=1 opts out entirely: the worker trusts the operator
2733
+ // and starts polling immediately, with a NULL `cred_status` row that the
2734
+ // dashboard surfaces as "unreported."
2735
+ cachedCredHarnessProvider = state.harnessProvider;
2736
+ if (isCredCheckDisabled(process.env)) {
2737
+ console.log(`[${role}] CRED_CHECK_DISABLE=1, skipping credential checks`);
2738
+ } else {
2706
2739
  try {
2707
- const resp = await fetch(`${apiUrl}/me`, {
2708
- headers: {
2709
- Authorization: `Bearer ${apiKey}`,
2710
- "X-Agent-ID": agentId,
2711
- },
2712
- });
2713
- if (resp.ok) {
2714
- const profile = (await resp.json()) as {
2715
- soulMd?: string;
2716
- identityMd?: string;
2717
- claudeMd?: string;
2718
- setupScript?: string;
2719
- toolsMd?: string;
2720
- heartbeatMd?: string;
2721
- name?: string;
2722
- description?: string;
2723
- };
2724
- agentSoulMd = profile.soulMd;
2725
- agentIdentityMd = profile.identityMd;
2726
- agentSetupScript = profile.setupScript;
2727
- agentToolsMd = profile.toolsMd;
2728
- agentClaudeMd = profile.claudeMd;
2729
- agentHeartbeatMd = profile.heartbeatMd;
2730
- agentProfileName = profile.name;
2731
- agentDescription = profile.description;
2732
-
2733
- // Generate default templates if missing (runner registers via POST /api/agents
2734
- // which doesn't generate templates like join-swarm does)
2735
- if (
2736
- !agentSoulMd ||
2737
- !agentIdentityMd ||
2738
- !agentToolsMd ||
2739
- !agentClaudeMd ||
2740
- !agentHeartbeatMd
2741
- ) {
2742
- // Use already-fetched template (from pre-registration step)
2743
- if (cachedTemplate) {
2744
- const ctx = {
2745
- agent: {
2746
- name: agentProfileName || agentName,
2747
- role: role,
2748
- description: agentDescription || "",
2749
- capabilities: (capabilities || []).join(", "),
2750
- },
2751
- };
2752
- if (!agentSoulMd) agentSoulMd = interpolate(cachedTemplate.files.soulMd, ctx).result;
2753
- if (!agentIdentityMd)
2754
- agentIdentityMd = interpolate(cachedTemplate.files.identityMd, ctx).result;
2755
- if (!agentToolsMd) agentToolsMd = interpolate(cachedTemplate.files.toolsMd, ctx).result;
2756
- if (!agentClaudeMd)
2757
- agentClaudeMd = interpolate(cachedTemplate.files.claudeMd, ctx).result;
2758
- if (!agentSetupScript)
2759
- agentSetupScript = interpolate(cachedTemplate.files.setupScript, ctx).result;
2760
- if (!agentHeartbeatMd)
2761
- agentHeartbeatMd = interpolate(cachedTemplate.files.heartbeatMd, ctx).result;
2762
- console.log(`[${role}] Applied template: ${templateId}`);
2763
- }
2764
-
2765
- // Fallback to generic defaults for any still-missing fields
2766
- const agentInfo = {
2767
- name: agentProfileName || agentName,
2768
- role: role,
2769
- description: agentDescription,
2770
- capabilities: config.capabilities,
2771
- };
2772
- if (!agentSoulMd) agentSoulMd = generateDefaultSoulMd(agentInfo);
2773
- if (!agentIdentityMd) agentIdentityMd = generateDefaultIdentityMd(agentInfo);
2774
- if (!agentToolsMd) agentToolsMd = generateDefaultToolsMd(agentInfo);
2775
- if (!agentClaudeMd) agentClaudeMd = generateDefaultClaudeMd(agentInfo);
2776
-
2777
- // Push generated templates to server
2778
- try {
2779
- const profileUpdate: Record<string, string> = {};
2780
- if (!profile.soulMd) profileUpdate.soulMd = agentSoulMd;
2781
- if (!profile.identityMd) profileUpdate.identityMd = agentIdentityMd;
2782
- if (!profile.toolsMd) profileUpdate.toolsMd = agentToolsMd;
2783
- if (!profile.claudeMd && agentClaudeMd) profileUpdate.claudeMd = agentClaudeMd;
2784
- if (!profile.setupScript && agentSetupScript)
2785
- profileUpdate.setupScript = agentSetupScript;
2786
- if (!profile.heartbeatMd && agentHeartbeatMd)
2787
- profileUpdate.heartbeatMd = agentHeartbeatMd;
2788
-
2789
- await fetch(`${apiUrl}/api/agents/${agentId}/profile`, {
2790
- method: "PUT",
2791
- headers: {
2792
- Authorization: `Bearer ${apiKey}`,
2793
- "X-Agent-ID": agentId,
2794
- "Content-Type": "application/json",
2795
- },
2796
- body: JSON.stringify(profileUpdate),
2797
- });
2798
- console.log(`[${role}] Generated and saved default identity templates`);
2799
- } catch {
2800
- console.warn(`[${role}] Could not save generated templates to server`);
2740
+ await awaitCredentials({
2741
+ provider: state.harnessProvider,
2742
+ // Re-read each tick so an operator's HARNESS_PROVIDER flip during
2743
+ // the wait pivots the credential predicate (and onwards).
2744
+ getProvider: () => state.harnessProvider,
2745
+ refreshEnv: async () => {
2746
+ const { env, resolvedProvider } = await fetchResolvedEnv(apiUrl, apiKey, agentId);
2747
+ // Apply drift inside the wait so adapter/prompt/state stay in
2748
+ // sync if the operator flips HARNESS_PROVIDER mid-loop. The
2749
+ // helper is idempotent when nothing changed.
2750
+ const { agentVisibleChanged } = await applySwarmConfigDrift(env, resolvedProvider);
2751
+ if (agentVisibleChanged) {
2752
+ // Fire-and-forget — dashboard reflects the live values, the
2753
+ // wait loop doesn't block on it.
2754
+ reregisterAgent().catch(() => {});
2801
2755
  }
2802
- }
2803
-
2804
- // Fetch installed skills for system prompt
2805
- try {
2806
- const skillsResp = await fetch(`${apiUrl}/api/agents/${agentId}/skills`, {
2756
+ return env;
2757
+ },
2758
+ onTick: (status) => {
2759
+ // Best-effort status report — the dispatcher uses it to route
2760
+ // around blocked agents. Failures are non-fatal (the wait loop
2761
+ // already swallows onTick exceptions). We do NOT include
2762
+ // `cred_status` here — the live test runs once the worker is
2763
+ // ready (below), and intermediate ticks are presence-only.
2764
+ fetch(`${apiUrl}/api/agents/${encodeURIComponent(agentId)}/credential-status`, {
2765
+ method: "PUT",
2807
2766
  headers: {
2808
2767
  Authorization: `Bearer ${apiKey}`,
2809
2768
  "X-Agent-ID": agentId,
2769
+ "Content-Type": "application/json",
2810
2770
  },
2771
+ body: JSON.stringify({ ready: status.ready, missing: status.missing }),
2772
+ }).catch(() => {
2773
+ // Swallowed — Phase 2 wait loop logs every tick anyway.
2811
2774
  });
2812
- if (skillsResp.ok) {
2813
- const skillsData = (await skillsResp.json()) as {
2814
- skills: {
2815
- name: string;
2816
- description: string;
2817
- isActive: boolean;
2818
- isEnabled: boolean;
2819
- }[];
2820
- };
2821
- agentSkillsSummary = skillsData.skills
2822
- .filter((s) => s.isActive && s.isEnabled)
2823
- .map((s) => ({ name: s.name, description: s.description }));
2824
- if (agentSkillsSummary.length > 0) {
2825
- console.log(`[${role}] Loaded ${agentSkillsSummary.length} skills for system prompt`);
2826
- }
2827
- }
2828
- } catch {
2829
- // Non-fatal — skills are optional
2775
+ },
2776
+ });
2777
+ } catch (err) {
2778
+ if (err instanceof BootMaxWaitExceededError) {
2779
+ console.error(`[${role}] ${err.message}`);
2780
+ process.exit(EX_CONFIG);
2781
+ }
2782
+ throw err;
2783
+ }
2784
+
2785
+ // Migration 055: build the full snapshot (presence + live test) once
2786
+ // creds are ready and POST it to the agent row. Status endpoint reads
2787
+ // this instead of running predicates server-side. Always uses the
2788
+ // *current* state.harnessProvider in case it flipped during the wait.
2789
+ try {
2790
+ const snapshot = await buildCredStatusReport(state.harnessProvider, process.env, {}, "boot");
2791
+ await reportCredStatus(apiUrl, apiKey, agentId, snapshot);
2792
+ } catch (err) {
2793
+ // Non-fatal — worker proceeds even if reporting fails.
2794
+ console.warn(`[${role}] cred_status boot report failed (non-fatal): ${err}`);
2795
+ }
2796
+ }
2797
+
2798
+ // Clean up any stale active sessions from previous runs (crash recovery)
2799
+ await cleanupActiveSessions(apiConfig);
2800
+ console.log(`[${role}] Cleaned up stale active sessions`);
2801
+
2802
+ // Fetch full agent profile to get soul/identity content
2803
+ try {
2804
+ const resp = await fetch(`${apiUrl}/me`, {
2805
+ headers: {
2806
+ Authorization: `Bearer ${apiKey}`,
2807
+ "X-Agent-ID": agentId,
2808
+ },
2809
+ });
2810
+ if (resp.ok) {
2811
+ const profile = (await resp.json()) as {
2812
+ soulMd?: string;
2813
+ identityMd?: string;
2814
+ claudeMd?: string;
2815
+ setupScript?: string;
2816
+ toolsMd?: string;
2817
+ heartbeatMd?: string;
2818
+ name?: string;
2819
+ description?: string;
2820
+ };
2821
+ agentSoulMd = profile.soulMd;
2822
+ agentIdentityMd = profile.identityMd;
2823
+ agentSetupScript = profile.setupScript;
2824
+ agentToolsMd = profile.toolsMd;
2825
+ agentClaudeMd = profile.claudeMd;
2826
+ agentHeartbeatMd = profile.heartbeatMd;
2827
+ agentProfileName = profile.name;
2828
+ agentDescription = profile.description;
2829
+
2830
+ // Generate default templates if missing (runner registers via POST /api/agents
2831
+ // which doesn't generate templates like join-swarm does)
2832
+ if (
2833
+ !agentSoulMd ||
2834
+ !agentIdentityMd ||
2835
+ !agentToolsMd ||
2836
+ !agentClaudeMd ||
2837
+ !agentHeartbeatMd
2838
+ ) {
2839
+ // Use already-fetched template (from pre-registration step)
2840
+ if (cachedTemplate) {
2841
+ const ctx = {
2842
+ agent: {
2843
+ name: agentProfileName || agentName,
2844
+ role: role,
2845
+ description: agentDescription || "",
2846
+ capabilities: (capabilities || []).join(", "),
2847
+ },
2848
+ };
2849
+ if (!agentSoulMd) agentSoulMd = interpolate(cachedTemplate.files.soulMd, ctx).result;
2850
+ if (!agentIdentityMd)
2851
+ agentIdentityMd = interpolate(cachedTemplate.files.identityMd, ctx).result;
2852
+ if (!agentToolsMd) agentToolsMd = interpolate(cachedTemplate.files.toolsMd, ctx).result;
2853
+ if (!agentClaudeMd)
2854
+ agentClaudeMd = interpolate(cachedTemplate.files.claudeMd, ctx).result;
2855
+ if (!agentSetupScript)
2856
+ agentSetupScript = interpolate(cachedTemplate.files.setupScript, ctx).result;
2857
+ if (!agentHeartbeatMd)
2858
+ agentHeartbeatMd = interpolate(cachedTemplate.files.heartbeatMd, ctx).result;
2859
+ console.log(`[${role}] Applied template: ${templateId}`);
2830
2860
  }
2831
2861
 
2832
- // Fetch installed MCP servers for system prompt
2862
+ // Fallback to generic defaults for any still-missing fields
2863
+ const agentInfo = {
2864
+ name: agentProfileName || agentName,
2865
+ role: role,
2866
+ description: agentDescription,
2867
+ capabilities: config.capabilities,
2868
+ };
2869
+ if (!agentSoulMd) agentSoulMd = generateDefaultSoulMd(agentInfo);
2870
+ if (!agentIdentityMd) agentIdentityMd = generateDefaultIdentityMd(agentInfo);
2871
+ if (!agentToolsMd) agentToolsMd = generateDefaultToolsMd(agentInfo);
2872
+ if (!agentClaudeMd) agentClaudeMd = generateDefaultClaudeMd(agentInfo);
2873
+
2874
+ // Push generated templates to server
2833
2875
  try {
2834
- const mcpServersResp = await fetch(`${apiUrl}/api/agents/${agentId}/mcp-servers`, {
2876
+ const profileUpdate: Record<string, string> = {};
2877
+ if (!profile.soulMd) profileUpdate.soulMd = agentSoulMd;
2878
+ if (!profile.identityMd) profileUpdate.identityMd = agentIdentityMd;
2879
+ if (!profile.toolsMd) profileUpdate.toolsMd = agentToolsMd;
2880
+ if (!profile.claudeMd && agentClaudeMd) profileUpdate.claudeMd = agentClaudeMd;
2881
+ if (!profile.setupScript && agentSetupScript)
2882
+ profileUpdate.setupScript = agentSetupScript;
2883
+ if (!profile.heartbeatMd && agentHeartbeatMd)
2884
+ profileUpdate.heartbeatMd = agentHeartbeatMd;
2885
+
2886
+ await fetch(`${apiUrl}/api/agents/${agentId}/profile`, {
2887
+ method: "PUT",
2835
2888
  headers: {
2836
2889
  Authorization: `Bearer ${apiKey}`,
2837
2890
  "X-Agent-ID": agentId,
2891
+ "Content-Type": "application/json",
2838
2892
  },
2893
+ body: JSON.stringify(profileUpdate),
2839
2894
  });
2840
- if (mcpServersResp.ok) {
2841
- const mcpServersData = (await mcpServersResp.json()) as {
2842
- servers: {
2843
- name: string;
2844
- transport: string;
2845
- description: string | null;
2846
- isActive: boolean;
2847
- isEnabled: boolean;
2848
- }[];
2849
- };
2850
- const activeMcpServers = mcpServersData.servers.filter(
2851
- (s) => s.isActive && s.isEnabled,
2852
- );
2853
- if (activeMcpServers.length > 0) {
2854
- agentMcpServersSummary = activeMcpServers
2855
- .map(
2856
- (s) => `- **${s.name}** (${s.transport}): ${s.description || "No description"}`,
2857
- )
2858
- .join("\n");
2859
- console.log(
2860
- `[${role}] Loaded ${activeMcpServers.length} MCP servers for system prompt`,
2861
- );
2862
- }
2863
- }
2895
+ console.log(`[${role}] Generated and saved default identity templates`);
2864
2896
  } catch {
2865
- // Non-fatal MCP servers are optional
2897
+ console.warn(`[${role}] Could not save generated templates to server`);
2866
2898
  }
2867
-
2868
- // Rebuild system prompt with identity
2869
- basePrompt = await buildSystemPrompt();
2870
- resolvedSystemPrompt = additionalSystemPrompt
2871
- ? `${basePrompt}\n\n${additionalSystemPrompt}`
2872
- : basePrompt;
2873
- console.log(
2874
- `[${role}] Loaded agent identity (soul: ${agentSoulMd ? "yes" : "no"}, identity: ${agentIdentityMd ? "yes" : "no"}, tools: ${agentToolsMd ? "yes" : "no"}, claude: ${agentClaudeMd ? "yes" : "no"})`,
2875
- );
2876
- console.log(`[${role}] Updated system prompt length: ${resolvedSystemPrompt.length} chars`);
2877
2899
  }
2878
- } catch {
2879
- console.warn(`[${role}] Could not fetch agent profile for identity — proceeding without`);
2880
- }
2881
2900
 
2882
- // Write SOUL.md and IDENTITY.md to workspace before spawning Claude
2883
- const SOUL_MD_PATH = "/workspace/SOUL.md";
2884
- const IDENTITY_MD_PATH = "/workspace/IDENTITY.md";
2885
-
2886
- if (agentSoulMd) {
2887
- try {
2888
- await Bun.write(SOUL_MD_PATH, agentSoulMd);
2889
- console.log(`[${role}] Wrote SOUL.md to workspace`);
2890
- } catch (err) {
2891
- console.warn(`[${role}] Could not write SOUL.md: ${(err as Error).message}`);
2892
- }
2893
- }
2894
- if (agentIdentityMd) {
2901
+ // Fetch installed skills for system prompt
2895
2902
  try {
2896
- await Bun.write(IDENTITY_MD_PATH, agentIdentityMd);
2897
- console.log(`[${role}] Wrote IDENTITY.md to workspace`);
2898
- } catch (err) {
2899
- console.warn(`[${role}] Could not write IDENTITY.md: ${(err as Error).message}`);
2903
+ const skillsResp = await fetch(`${apiUrl}/api/agents/${agentId}/skills`, {
2904
+ headers: {
2905
+ Authorization: `Bearer ${apiKey}`,
2906
+ "X-Agent-ID": agentId,
2907
+ },
2908
+ });
2909
+ if (skillsResp.ok) {
2910
+ const skillsData = (await skillsResp.json()) as {
2911
+ skills: {
2912
+ name: string;
2913
+ description: string;
2914
+ isActive: boolean;
2915
+ isEnabled: boolean;
2916
+ }[];
2917
+ };
2918
+ agentSkillsSummary = skillsData.skills
2919
+ .filter((s) => s.isActive && s.isEnabled)
2920
+ .map((s) => ({ name: s.name, description: s.description }));
2921
+ if (agentSkillsSummary.length > 0) {
2922
+ console.log(`[${role}] Loaded ${agentSkillsSummary.length} skills for system prompt`);
2923
+ }
2924
+ }
2925
+ } catch {
2926
+ // Non-fatal — skills are optional
2900
2927
  }
2901
- }
2902
2928
 
2903
- // Write setup script to workspace (agent can edit during session)
2904
- // Only create if it doesn't exist — the entrypoint already composed/prepended it at container start
2905
- if (agentSetupScript) {
2929
+ // Fetch installed MCP servers for system prompt
2906
2930
  try {
2907
- if (!(await Bun.file("/workspace/start-up.sh").exists())) {
2908
- await Bun.write("/workspace/start-up.sh", `#!/bin/bash\n${agentSetupScript}\n`);
2909
- console.log(`[${role}] Wrote start-up.sh to workspace`);
2931
+ const mcpServersResp = await fetch(`${apiUrl}/api/agents/${agentId}/mcp-servers`, {
2932
+ headers: {
2933
+ Authorization: `Bearer ${apiKey}`,
2934
+ "X-Agent-ID": agentId,
2935
+ },
2936
+ });
2937
+ if (mcpServersResp.ok) {
2938
+ const mcpServersData = (await mcpServersResp.json()) as {
2939
+ servers: {
2940
+ name: string;
2941
+ transport: string;
2942
+ description: string | null;
2943
+ isActive: boolean;
2944
+ isEnabled: boolean;
2945
+ }[];
2946
+ };
2947
+ const activeMcpServers = mcpServersData.servers.filter((s) => s.isActive && s.isEnabled);
2948
+ if (activeMcpServers.length > 0) {
2949
+ agentMcpServersSummary = activeMcpServers
2950
+ .map((s) => `- **${s.name}** (${s.transport}): ${s.description || "No description"}`)
2951
+ .join("\n");
2952
+ console.log(
2953
+ `[${role}] Loaded ${activeMcpServers.length} MCP servers for system prompt`,
2954
+ );
2955
+ }
2910
2956
  }
2911
- } catch (err) {
2912
- console.warn(`[${role}] Could not write start-up.sh: ${(err as Error).message}`);
2957
+ } catch {
2958
+ // Non-fatal — MCP servers are optional
2913
2959
  }
2914
- }
2915
2960
 
2916
- // Write TOOLS.md to workspace (agent can edit during session)
2917
- if (agentToolsMd) {
2918
- try {
2919
- await Bun.write("/workspace/TOOLS.md", agentToolsMd);
2920
- console.log(`[${role}] Wrote TOOLS.md to workspace`);
2921
- } catch (err) {
2922
- console.warn(`[${role}] Could not write TOOLS.md: ${(err as Error).message}`);
2923
- }
2961
+ // Rebuild system prompt with identity
2962
+ basePrompt = await buildSystemPrompt();
2963
+ resolvedSystemPrompt = additionalSystemPrompt
2964
+ ? `${basePrompt}\n\n${additionalSystemPrompt}`
2965
+ : basePrompt;
2966
+ console.log(
2967
+ `[${role}] Loaded agent identity (soul: ${agentSoulMd ? "yes" : "no"}, identity: ${agentIdentityMd ? "yes" : "no"}, tools: ${agentToolsMd ? "yes" : "no"}, claude: ${agentClaudeMd ? "yes" : "no"})`,
2968
+ );
2969
+ console.log(`[${role}] Updated system prompt length: ${resolvedSystemPrompt.length} chars`);
2924
2970
  }
2971
+ } catch {
2972
+ console.warn(`[${role}] Could not fetch agent profile for identity — proceeding without`);
2973
+ }
2925
2974
 
2926
- // Write HEARTBEAT.md to workspace (lead's periodic checklist)
2927
- if (agentHeartbeatMd) {
2928
- try {
2929
- await Bun.write("/workspace/HEARTBEAT.md", agentHeartbeatMd);
2930
- console.log(`[${role}] Wrote HEARTBEAT.md to workspace`);
2931
- } catch (err) {
2932
- console.warn(`[${role}] Could not write HEARTBEAT.md: ${(err as Error).message}`);
2933
- }
2934
- }
2975
+ // Write SOUL.md and IDENTITY.md to workspace before spawning Claude
2976
+ const SOUL_MD_PATH = "/workspace/SOUL.md";
2977
+ const IDENTITY_MD_PATH = "/workspace/IDENTITY.md";
2935
2978
 
2936
- // Write CLAUDE.md to workspace (agent-level instructions)
2937
- if (agentClaudeMd) {
2938
- try {
2939
- await Bun.write("/workspace/CLAUDE.md", agentClaudeMd);
2940
- console.log(`[${role}] Wrote CLAUDE.md to workspace`);
2941
- } catch (err) {
2942
- console.warn(`[${role}] Could not write CLAUDE.md: ${(err as Error).message}`);
2943
- }
2979
+ if (agentSoulMd) {
2980
+ try {
2981
+ await Bun.write(SOUL_MD_PATH, agentSoulMd);
2982
+ console.log(`[${role}] Wrote SOUL.md to workspace`);
2983
+ } catch (err) {
2984
+ console.warn(`[${role}] Could not write SOUL.md: ${(err as Error).message}`);
2944
2985
  }
2945
-
2946
- // ========== Sync skills to filesystem ==========
2986
+ }
2987
+ if (agentIdentityMd) {
2947
2988
  try {
2948
- console.log(`[${role}] Syncing skills to filesystem...`);
2949
- const syncHeaders: Record<string, string> = {
2950
- "Content-Type": "application/json",
2951
- "X-Agent-ID": agentId,
2952
- };
2953
- if (apiKey) syncHeaders.Authorization = `Bearer ${apiKey}`;
2954
- const syncRes = await fetch(`${swarmUrl}/api/skills/sync-filesystem`, {
2955
- method: "POST",
2956
- headers: syncHeaders,
2957
- });
2958
- if (syncRes.ok) {
2959
- const syncResult = (await syncRes.json()) as {
2960
- synced: number;
2961
- removed: number;
2962
- errors: string[];
2963
- };
2964
- console.log(
2965
- `[${role}] Skills synced: ${syncResult.synced} written, ${syncResult.removed} removed`,
2966
- );
2967
- if (syncResult.errors.length > 0) {
2968
- console.warn(`[${role}] Skill sync errors: ${syncResult.errors.join(", ")}`);
2969
- }
2970
- } else {
2971
- console.warn(`[${role}] Skill sync failed: HTTP ${syncRes.status}`);
2972
- }
2989
+ await Bun.write(IDENTITY_MD_PATH, agentIdentityMd);
2990
+ console.log(`[${role}] Wrote IDENTITY.md to workspace`);
2973
2991
  } catch (err) {
2974
- console.warn(`[${role}] Skill sync failed: ${(err as Error).message}`);
2992
+ console.warn(`[${role}] Could not write IDENTITY.md: ${(err as Error).message}`);
2975
2993
  }
2994
+ }
2976
2995
 
2977
- // ========== Resume paused tasks with PRIORITY ==========
2978
- // Check for paused tasks from previous shutdown and resume them before normal polling
2996
+ // Write setup script to workspace (agent can edit during session)
2997
+ // Only create if it doesn't exist — the entrypoint already composed/prepended it at container start
2998
+ if (agentSetupScript) {
2979
2999
  try {
2980
- console.log(`[${role}] Checking for paused tasks to resume...`);
2981
- const pausedTasks = await getPausedTasksFromAPI(apiConfig);
3000
+ if (!(await Bun.file("/workspace/start-up.sh").exists())) {
3001
+ await Bun.write("/workspace/start-up.sh", `#!/bin/bash\n${agentSetupScript}\n`);
3002
+ console.log(`[${role}] Wrote start-up.sh to workspace`);
3003
+ }
3004
+ } catch (err) {
3005
+ console.warn(`[${role}] Could not write start-up.sh: ${(err as Error).message}`);
3006
+ }
3007
+ }
2982
3008
 
2983
- if (pausedTasks.length > 0) {
2984
- console.log(`[${role}] Found ${pausedTasks.length} paused task(s) to resume`);
3009
+ // Write TOOLS.md to workspace (agent can edit during session)
3010
+ if (agentToolsMd) {
3011
+ try {
3012
+ await Bun.write("/workspace/TOOLS.md", agentToolsMd);
3013
+ console.log(`[${role}] Wrote TOOLS.md to workspace`);
3014
+ } catch (err) {
3015
+ console.warn(`[${role}] Could not write TOOLS.md: ${(err as Error).message}`);
3016
+ }
3017
+ }
2985
3018
 
2986
- for (const task of pausedTasks) {
2987
- // Defensive: skip tasks that already have completion data (zombie prevention)
2988
- if (task.finishedAt || task.output) {
2989
- console.warn(
2990
- `[${role}] Skipping zombie task ${task.id.slice(0, 8)} — already has completion data (finishedAt: ${!!task.finishedAt}, output: ${!!task.output})`,
2991
- );
2992
- continue;
2993
- }
3019
+ // Write HEARTBEAT.md to workspace (lead's periodic checklist)
3020
+ if (agentHeartbeatMd) {
3021
+ try {
3022
+ await Bun.write("/workspace/HEARTBEAT.md", agentHeartbeatMd);
3023
+ console.log(`[${role}] Wrote HEARTBEAT.md to workspace`);
3024
+ } catch (err) {
3025
+ console.warn(`[${role}] Could not write HEARTBEAT.md: ${(err as Error).message}`);
3026
+ }
3027
+ }
2994
3028
 
2995
- // Wait if at capacity (though unlikely on fresh startup)
2996
- while (state.activeTasks.size >= state.maxConcurrent) {
2997
- await checkCompletedProcesses(state, role, apiConfig);
2998
- await Bun.sleep(1000);
2999
- }
3029
+ // Write CLAUDE.md to workspace (agent-level instructions)
3030
+ if (agentClaudeMd) {
3031
+ try {
3032
+ await Bun.write("/workspace/CLAUDE.md", agentClaudeMd);
3033
+ console.log(`[${role}] Wrote CLAUDE.md to workspace`);
3034
+ } catch (err) {
3035
+ console.warn(`[${role}] Could not write CLAUDE.md: ${(err as Error).message}`);
3036
+ }
3037
+ }
3000
3038
 
3001
- console.log(
3002
- `[${role}] Resuming paused task ${task.id.slice(0, 8)}: "${task.task.slice(0, 50)}..."`,
3003
- );
3039
+ // ========== Sync skills to filesystem ==========
3040
+ try {
3041
+ console.log(`[${role}] Syncing skills to filesystem...`);
3042
+ const syncHeaders: Record<string, string> = {
3043
+ "Content-Type": "application/json",
3044
+ "X-Agent-ID": agentId,
3045
+ };
3046
+ if (apiKey) syncHeaders.Authorization = `Bearer ${apiKey}`;
3047
+ const syncRes = await fetch(`${swarmUrl}/api/skills/sync-filesystem`, {
3048
+ method: "POST",
3049
+ headers: syncHeaders,
3050
+ });
3051
+ if (syncRes.ok) {
3052
+ const syncResult = (await syncRes.json()) as {
3053
+ synced: number;
3054
+ removed: number;
3055
+ errors: string[];
3056
+ };
3057
+ console.log(
3058
+ `[${role}] Skills synced: ${syncResult.synced} written, ${syncResult.removed} removed`,
3059
+ );
3060
+ if (syncResult.errors.length > 0) {
3061
+ console.warn(`[${role}] Skill sync errors: ${syncResult.errors.join(", ")}`);
3062
+ }
3063
+ } else {
3064
+ console.warn(`[${role}] Skill sync failed: HTTP ${syncRes.status}`);
3065
+ }
3066
+ } catch (err) {
3067
+ console.warn(`[${role}] Skill sync failed: ${(err as Error).message}`);
3068
+ }
3004
3069
 
3005
- // Resume the task via API (marks as in_progress)
3006
- const resumed = await resumeTaskViaAPI(apiConfig, task.id);
3007
- if (!resumed) {
3008
- console.warn(
3009
- `[${role}] Failed to resume task ${task.id.slice(0, 8)} via API, skipping`,
3010
- );
3011
- continue;
3012
- }
3070
+ // ========== Resume paused tasks with PRIORITY ==========
3071
+ // Check for paused tasks from previous shutdown and resume them before normal polling
3072
+ try {
3073
+ console.log(`[${role}] Checking for paused tasks to resume...`);
3074
+ const pausedTasks = await getPausedTasksFromAPI(apiConfig);
3013
3075
 
3014
- // Build prompt with resume context + memory injection
3015
- let resumePrompt = await buildResumePrompt(task, adapter.formatCommand.bind(adapter), {
3016
- hasMcp: adapter.traits.hasMcp,
3017
- });
3076
+ if (pausedTasks.length > 0) {
3077
+ console.log(`[${role}] Found ${pausedTasks.length} paused task(s) to resume`);
3018
3078
 
3019
- // Inject relevant memories for resumed tasks
3020
- const resumeMemoryContext = await fetchRelevantMemories(
3021
- apiUrl,
3022
- apiKey,
3023
- agentId,
3024
- task.task,
3025
- task.id,
3079
+ for (const task of pausedTasks) {
3080
+ // Defensive: skip tasks that already have completion data (zombie prevention)
3081
+ if (task.finishedAt || task.output) {
3082
+ console.warn(
3083
+ `[${role}] Skipping zombie task ${task.id.slice(0, 8)} — already has completion data (finishedAt: ${!!task.finishedAt}, output: ${!!task.output})`,
3026
3084
  );
3027
- if (resumeMemoryContext) {
3028
- resumePrompt += resumeMemoryContext;
3029
- console.log(`[${role}] Injected relevant memories into resumed task prompt`);
3030
- }
3085
+ continue;
3086
+ }
3031
3087
 
3032
- // Resolve --resume: prefer own session ID, then parent's
3033
- let resumeAdditionalArgs = opts.additionalArgs || [];
3034
- if (task.claudeSessionId) {
3035
- resumeAdditionalArgs = [...resumeAdditionalArgs, "--resume", task.claudeSessionId];
3036
- console.log(
3037
- `[${role}] Resuming task's own session ${task.claudeSessionId.slice(0, 8)}`,
3038
- );
3039
- } else if (task.parentTaskId) {
3040
- const parentSessionId = await fetchProviderSessionId(apiUrl, apiKey, task.parentTaskId);
3041
- if (parentSessionId) {
3042
- resumeAdditionalArgs = [...resumeAdditionalArgs, "--resume", parentSessionId];
3043
- console.log(`[${role}] Resuming parent session ${parentSessionId.slice(0, 8)}`);
3044
- }
3045
- }
3088
+ // Wait if at capacity (though unlikely on fresh startup)
3089
+ while (state.activeTasks.size >= state.maxConcurrent) {
3090
+ await checkCompletedProcesses(state, role, apiConfig);
3091
+ await Bun.sleep(1000);
3092
+ }
3046
3093
 
3047
- // Spawn Claude process for resumed task
3048
- iteration++;
3049
- const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3050
- const logFile = `${logDir}/${timestamp}-resume-${task.id.slice(0, 8)}.jsonl`;
3051
-
3052
- console.log(`\n[${role}] === Resuming paused task (iteration ${iteration}) ===`);
3053
- console.log(`[${role}] Logging to: ${logFile}`);
3054
- console.log(`[${role}] Prompt: ${resumePrompt.slice(0, 100)}...`);
3055
-
3056
- const metadata = {
3057
- type: metadataType,
3058
- sessionId,
3059
- iteration,
3060
- timestamp: new Date().toISOString(),
3061
- prompt: resumePrompt,
3062
- trigger: "task_resumed",
3063
- resumedTaskId: task.id,
3064
- yolo: isYolo,
3065
- };
3066
- await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3067
-
3068
- // Resolve cwd for resumed task (mirrors normal task path: task.dir > vcsRepo clonePath)
3069
- let resumeCwd: string | undefined;
3070
- if (task.dir) {
3071
- try {
3072
- if (existsSync(task.dir) && statSync(task.dir).isDirectory()) {
3073
- resumeCwd = task.dir;
3074
- } else {
3075
- console.warn(
3076
- `[${role}] Resume task dir "${task.dir}" does not exist or is not a directory, falling back to default cwd`,
3077
- );
3078
- }
3079
- } catch {
3080
- console.warn(
3081
- `[${role}] Failed to check resume task dir "${task.dir}", falling back to default cwd`,
3082
- );
3083
- }
3084
- }
3094
+ console.log(
3095
+ `[${role}] Resuming paused task ${task.id.slice(0, 8)}: "${task.task.slice(0, 50)}..."`,
3096
+ );
3085
3097
 
3086
- if (!resumeCwd && task.vcsRepo && apiUrl) {
3087
- const repoConfig = await fetchRepoConfig(apiUrl, apiKey, task.vcsRepo);
3088
- const effectiveConfig = repoConfig ?? {
3089
- url: task.vcsRepo,
3090
- name: task.vcsRepo.split("/").pop() || task.vcsRepo,
3091
- clonePath: `/workspace/repos/${task.vcsRepo.split("/").pop() || task.vcsRepo}`,
3092
- defaultBranch: "main",
3093
- };
3094
- const repoContext = await ensureRepoForTask(effectiveConfig, role);
3095
- if (repoContext?.clonePath) {
3096
- resumeCwd = repoContext.clonePath;
3097
- }
3098
- }
3098
+ // Resume the task via API (marks as in_progress)
3099
+ const resumed = await resumeTaskViaAPI(apiConfig, task.id);
3100
+ if (!resumed) {
3101
+ console.warn(`[${role}] Failed to resume task ${task.id.slice(0, 8)} via API, skipping`);
3102
+ continue;
3103
+ }
3099
3104
 
3100
- // Per-task runner session ID so session logs are scoped to this task
3101
- const resumeRunnerSessionId = crypto.randomUUID();
3105
+ // Build prompt with resume context + memory injection
3106
+ let resumePrompt = await buildResumePrompt(task, adapter.formatCommand.bind(adapter), {
3107
+ hasMcp: adapter.traits.hasMcp,
3108
+ });
3102
3109
 
3103
- let runningTask: RunningTask;
3104
- try {
3105
- runningTask = await spawnProviderProcess(
3106
- adapter,
3107
- {
3108
- prompt: resumePrompt,
3109
- logFile,
3110
- systemPrompt: resolvedSystemPrompt,
3111
- additionalArgs: resumeAdditionalArgs,
3112
- role,
3113
- apiUrl,
3114
- apiKey,
3115
- agentId,
3116
- runnerSessionId: resumeRunnerSessionId,
3117
- iteration,
3118
- taskId: task.id,
3119
- model: (task as { model?: string }).model,
3120
- cwd: resumeCwd,
3121
- vcsRepo: task.vcsRepo,
3122
- },
3123
- logDir,
3124
- isYolo,
3125
- );
3126
- } catch (spawnErr) {
3127
- const errMsg = spawnErr instanceof Error ? spawnErr.message : String(spawnErr);
3128
- console.error(
3129
- `[${role}] Failed to spawn process for resumed task ${task.id.slice(0, 8)}: ${errMsg}`,
3130
- );
3131
- await ensureTaskFinished(
3132
- apiConfig,
3133
- role,
3134
- task.id,
3135
- 1,
3136
- `Spawn failed: ${errMsg}`,
3137
- undefined,
3138
- state.harnessProvider,
3139
- );
3140
- continue;
3141
- }
3110
+ // Inject relevant memories for resumed tasks
3111
+ const resumeMemoryContext = await fetchRelevantMemories(
3112
+ apiUrl,
3113
+ apiKey,
3114
+ agentId,
3115
+ task.task,
3116
+ task.id,
3117
+ );
3118
+ if (resumeMemoryContext) {
3119
+ resumePrompt += resumeMemoryContext;
3120
+ console.log(`[${role}] Injected relevant memories into resumed task prompt`);
3121
+ }
3142
3122
 
3143
- state.activeTasks.set(task.id, runningTask);
3144
- registerActiveSession(apiConfig, {
3145
- taskId: task.id,
3146
- triggerType: "task_resumed",
3147
- taskDescription: task.task?.slice(0, 200),
3148
- runnerSessionId: resumeRunnerSessionId,
3149
- });
3150
- console.log(
3151
- `[${role}] Resumed task ${task.id.slice(0, 8)} (${state.activeTasks.size}/${state.maxConcurrent} active)`,
3152
- );
3123
+ // Resolve --resume: prefer own session ID, then parent's
3124
+ let resumeAdditionalArgs = opts.additionalArgs || [];
3125
+ if (task.claudeSessionId) {
3126
+ resumeAdditionalArgs = [...resumeAdditionalArgs, "--resume", task.claudeSessionId];
3127
+ console.log(`[${role}] Resuming task's own session ${task.claudeSessionId.slice(0, 8)}`);
3128
+ } else if (task.parentTaskId) {
3129
+ const parentSessionId = await fetchProviderSessionId(apiUrl, apiKey, task.parentTaskId);
3130
+ if (parentSessionId) {
3131
+ resumeAdditionalArgs = [...resumeAdditionalArgs, "--resume", parentSessionId];
3132
+ console.log(`[${role}] Resuming parent session ${parentSessionId.slice(0, 8)}`);
3133
+ }
3153
3134
  }
3154
3135
 
3155
- console.log(`[${role}] All paused tasks resumed. Entering normal polling...`);
3156
- } else {
3157
- console.log(`[${role}] No paused tasks found. Entering normal polling...`);
3158
- }
3159
- } catch (error) {
3160
- console.error(`[${role}] Error checking/resuming paused tasks: ${error}`);
3161
- // Continue to normal polling even if resume fails
3162
- }
3163
- // ========== END: Resume paused tasks ==========
3136
+ // Spawn Claude process for resumed task
3137
+ iteration++;
3138
+ const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3139
+ const logFile = `${logDir}/${timestamp}-resume-${task.id.slice(0, 8)}.jsonl`;
3164
3140
 
3165
- // ========== Lead startup self-check ==========
3166
- if (isLead) {
3167
- console.log(`[${role}] Running startup heartbeat sweep...`);
3168
- const swept = await triggerHeartbeatSweep(apiConfig);
3169
- if (swept) {
3170
- console.log(`[${role}] Startup heartbeat sweep completed`);
3171
- } else {
3172
- console.warn(`[${role}] Startup heartbeat sweep failed (non-fatal)`);
3173
- }
3174
- }
3141
+ console.log(`\n[${role}] === Resuming paused task (iteration ${iteration}) ===`);
3142
+ console.log(`[${role}] Logging to: ${logFile}`);
3143
+ console.log(`[${role}] Prompt: ${resumePrompt.slice(0, 100)}...`);
3175
3144
 
3176
- // Phase 4 — exponential back-off state for `budget_refused` triggers.
3177
- // Resets to 0 on any non-refused outcome. Lives outside the loop so
3178
- // state persists across iterations.
3179
- let consecutiveBudgetRefusals = 0;
3180
-
3181
- // Track last finished task check for leads (to avoid re-processing)
3182
- while (true) {
3183
- // Ping server on each iteration to keep status updated
3184
- await pingServer(apiConfig, role);
3185
-
3186
- // Check for completed processes first and ensure tasks are marked as finished
3187
- await checkCompletedProcesses(state, role, apiConfig);
3188
-
3189
- // Live HARNESS_PROVIDER reconciliation. Re-fetches `swarm_config` (overlaid
3190
- // on env) and swaps the adapter if the resolved provider changed —
3191
- // typically because an operator PATCH'd /api/agents/:id/harness-provider
3192
- // (which writes a swarm_config row) or upserted a config row directly.
3193
- //
3194
- // Safety: in-flight sessions hold their own `ProviderSession` references
3195
- // and continue on the old adapter unaffected. New spawns (below) read
3196
- // the current `adapter` binding and pick up the swap. `basePrompt` is
3197
- // rebuilt because traits (and therefore prompt content) may differ across
3198
- // providers.
3199
- if (Date.now() - lastHarnessReconcileAt > HARNESS_RECONCILE_INTERVAL_MS) {
3200
- lastHarnessReconcileAt = Date.now();
3201
- try {
3202
- const { resolvedProvider } = await fetchResolvedEnv(apiUrl, apiKey, agentId);
3203
- if (resolvedProvider !== state.harnessProvider) {
3204
- const previous = state.harnessProvider;
3205
- console.log(
3206
- `[${role}] [harness] Reconciling adapter: ${previous} → ${resolvedProvider}`,
3207
- );
3208
- try {
3209
- adapter = createProviderAdapter(resolvedProvider);
3210
- state.harnessProvider = resolvedProvider;
3211
- basePrompt = await buildSystemPrompt();
3212
- resolvedSystemPrompt = additionalSystemPrompt
3213
- ? `${basePrompt}\n\n${additionalSystemPrompt}`
3214
- : basePrompt;
3215
- // Force a fresh cred_status report below for the new provider.
3216
- cachedCredHarnessProvider = null;
3217
- console.log(
3218
- `[${role}] [harness] Swapped to ${resolvedProvider} (basePrompt rebuilt: ${basePrompt.length} chars)`,
3219
- );
3220
- } catch (err) {
3145
+ const metadata = {
3146
+ type: metadataType,
3147
+ sessionId,
3148
+ iteration,
3149
+ timestamp: new Date().toISOString(),
3150
+ prompt: resumePrompt,
3151
+ trigger: "task_resumed",
3152
+ resumedTaskId: task.id,
3153
+ yolo: isYolo,
3154
+ };
3155
+ await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3156
+
3157
+ // Resolve cwd for resumed task (mirrors normal task path: task.dir > vcsRepo clonePath)
3158
+ let resumeCwd: string | undefined;
3159
+ if (task.dir) {
3160
+ try {
3161
+ if (existsSync(task.dir) && statSync(task.dir).isDirectory()) {
3162
+ resumeCwd = task.dir;
3163
+ } else {
3221
3164
  console.warn(
3222
- `[${role}] [harness] Failed to swap to ${resolvedProvider} (staying on ${previous}): ${err}`,
3165
+ `[${role}] Resume task dir "${task.dir}" does not exist or is not a directory, falling back to default cwd`,
3223
3166
  );
3224
3167
  }
3168
+ } catch {
3169
+ console.warn(
3170
+ `[${role}] Failed to check resume task dir "${task.dir}", falling back to default cwd`,
3171
+ );
3225
3172
  }
3226
- } catch (err) {
3227
- console.warn(`[${role}] [harness] Reconcile fetch failed (non-fatal): ${err}`);
3228
3173
  }
3229
- }
3230
3174
 
3231
- // Migration 055 — post-task credential refresh, cache-keyed on the
3232
- // *resolved* harness_provider. Re-runs the snapshot when the provider
3233
- // changes (boot, or after a live swap above) so the dashboard shows
3234
- // up-to-date credential status for the active adapter.
3235
- if (!isCredCheckDisabled(process.env)) {
3236
- const currentHarness = state.harnessProvider;
3237
- if (currentHarness !== cachedCredHarnessProvider) {
3238
- cachedCredHarnessProvider = currentHarness;
3239
- buildCredStatusReport(currentHarness, process.env, {}, "post_task")
3240
- .then((snap) => reportCredStatus(apiUrl, apiKey, agentId, snap))
3241
- .catch((err) =>
3242
- console.warn(`[${role}] cred_status post_task report failed (non-fatal): ${err}`),
3243
- );
3175
+ if (!resumeCwd && task.vcsRepo && apiUrl) {
3176
+ const repoConfig = await fetchRepoConfig(apiUrl, apiKey, task.vcsRepo);
3177
+ const effectiveConfig = repoConfig ?? {
3178
+ url: task.vcsRepo,
3179
+ name: task.vcsRepo.split("/").pop() || task.vcsRepo,
3180
+ clonePath: `/workspace/repos/${task.vcsRepo.split("/").pop() || task.vcsRepo}`,
3181
+ defaultBranch: "main",
3182
+ };
3183
+ const repoContext = await ensureRepoForTask(effectiveConfig, role);
3184
+ if (repoContext?.clonePath) {
3185
+ resumeCwd = repoContext.clonePath;
3186
+ }
3244
3187
  }
3245
- }
3246
3188
 
3247
- // Periodic VCS detection for running tasks (fire-and-forget, throttled per task)
3248
- const now = Date.now();
3249
- for (const [taskId, task] of state.activeTasks) {
3250
- if (vcsDetectedTasks.has(taskId)) continue;
3251
- const lastCheck = vcsCheckTimestamps.get(taskId) ?? 0;
3252
- if (now - lastCheck < VCS_CHECK_INTERVAL) continue;
3253
- if (!task.workingDir) continue;
3189
+ // Per-task runner session ID so session logs are scoped to this task
3190
+ const resumeRunnerSessionId = crypto.randomUUID();
3254
3191
 
3255
- vcsCheckTimestamps.set(taskId, now);
3256
- detectVcsForTask(apiUrl, apiKey, taskId, task.workingDir);
3257
- }
3258
-
3259
- // Check for cancelled tasks and signal their subprocesses
3260
- if (state.activeTasks.size > 0) {
3261
- for (const [taskId, task] of state.activeTasks) {
3262
- if (cancelledSignaled.has(taskId)) continue; // Already sent SIGTERM
3263
- try {
3264
- const cancelResp = await fetch(
3265
- `${apiUrl}/cancelled-tasks?taskId=${encodeURIComponent(taskId)}`,
3266
- {
3267
- headers: {
3268
- Authorization: `Bearer ${apiKey}`,
3269
- "X-Agent-ID": agentId,
3270
- },
3271
- },
3272
- );
3273
- if (cancelResp.ok) {
3274
- const cancelData = (await cancelResp.json()) as {
3275
- cancelled: Array<{ id: string }>;
3276
- };
3277
- if (cancelData.cancelled?.some((t) => t.id === taskId)) {
3278
- console.log(
3279
- `[${role}] Task ${taskId.slice(0, 8)} was cancelled sending SIGTERM to subprocess`,
3280
- );
3281
- task.session.abort().catch(() => {});
3282
- cancelledSignaled.add(taskId);
3283
- }
3284
- }
3285
- } catch {
3286
- // Non-blocking — cancellation check is best-effort
3287
- }
3192
+ let runningTask: RunningTask;
3193
+ try {
3194
+ runningTask = await spawnProviderProcess(
3195
+ adapter,
3196
+ {
3197
+ prompt: resumePrompt,
3198
+ logFile,
3199
+ systemPrompt: resolvedSystemPrompt,
3200
+ additionalArgs: resumeAdditionalArgs,
3201
+ role,
3202
+ apiUrl,
3203
+ apiKey,
3204
+ agentId,
3205
+ runnerSessionId: resumeRunnerSessionId,
3206
+ iteration,
3207
+ taskId: task.id,
3208
+ model: (task as { model?: string }).model,
3209
+ cwd: resumeCwd,
3210
+ vcsRepo: task.vcsRepo,
3211
+ },
3212
+ logDir,
3213
+ isYolo,
3214
+ );
3215
+ } catch (spawnErr) {
3216
+ const errMsg = spawnErr instanceof Error ? spawnErr.message : String(spawnErr);
3217
+ console.error(
3218
+ `[${role}] Failed to spawn process for resumed task ${task.id.slice(0, 8)}: ${errMsg}`,
3219
+ );
3220
+ await ensureTaskFinished(
3221
+ apiConfig,
3222
+ role,
3223
+ task.id,
3224
+ 1,
3225
+ `Spawn failed: ${errMsg}`,
3226
+ undefined,
3227
+ state.harnessProvider,
3228
+ );
3229
+ continue;
3288
3230
  }
3289
- }
3290
3231
 
3291
- // Only poll if we have capacity
3292
- if (state.activeTasks.size < state.maxConcurrent) {
3232
+ state.activeTasks.set(task.id, runningTask);
3233
+ registerActiveSession(apiConfig, {
3234
+ taskId: task.id,
3235
+ triggerType: "task_resumed",
3236
+ taskDescription: task.task?.slice(0, 200),
3237
+ runnerSessionId: resumeRunnerSessionId,
3238
+ });
3293
3239
  console.log(
3294
- `[${role}] Polling for triggers (${state.activeTasks.size}/${state.maxConcurrent} active)...`,
3240
+ `[${role}] Resumed task ${task.id.slice(0, 8)} (${state.activeTasks.size}/${state.maxConcurrent} active)`,
3295
3241
  );
3242
+ }
3296
3243
 
3297
- // Use shorter timeout if tasks are running (to check completion more often)
3298
- const effectiveTimeout = state.activeTasks.size > 0 ? 5000 : PollTimeoutMs;
3299
-
3300
- const trigger = await pollForTrigger({
3301
- apiUrl,
3302
- apiKey,
3303
- agentId,
3304
- pollInterval: PollIntervalMs,
3305
- pollTimeout: effectiveTimeout,
3306
- });
3244
+ console.log(`[${role}] All paused tasks resumed. Entering normal polling...`);
3245
+ } else {
3246
+ console.log(`[${role}] No paused tasks found. Entering normal polling...`);
3247
+ }
3248
+ } catch (error) {
3249
+ console.error(`[${role}] Error checking/resuming paused tasks: ${error}`);
3250
+ // Continue to normal polling even if resume fails
3251
+ }
3252
+ // ========== END: Resume paused tasks ==========
3253
+
3254
+ // ========== Lead startup self-check ==========
3255
+ if (isLead) {
3256
+ console.log(`[${role}] Running startup heartbeat sweep...`);
3257
+ const swept = await triggerHeartbeatSweep(apiConfig);
3258
+ if (swept) {
3259
+ console.log(`[${role}] Startup heartbeat sweep completed`);
3260
+ } else {
3261
+ console.warn(`[${role}] Startup heartbeat sweep failed (non-fatal)`);
3262
+ }
3263
+ }
3307
3264
 
3308
- if (trigger) {
3309
- // Phase 4 server refused to admit a claim because the agent or
3310
- // global budget is exhausted. Log a structured payload (scrubbed
3311
- // at egress per project convention) and back off exponentially.
3312
- // We deliberately `continue` BEFORE the empty-poll counter logic
3313
- // below — refusals are not empty polls.
3314
- if (trigger.type === "budget_refused") {
3315
- consecutiveBudgetRefusals++;
3316
- const backoffMs = computeBudgetBackoffMs(consecutiveBudgetRefusals, PollIntervalMs);
3317
- const refusalPayload = JSON.stringify({
3318
- event: "budget_refused",
3319
- cause: trigger.cause,
3320
- agentSpend: trigger.agentSpend,
3321
- agentBudget: trigger.agentBudget,
3322
- globalSpend: trigger.globalSpend,
3323
- globalBudget: trigger.globalBudget,
3324
- resetAt: trigger.resetAt,
3325
- consecutiveRefusals: consecutiveBudgetRefusals,
3326
- backoffMs,
3327
- });
3328
- console.log(
3329
- `[${role}] budget_refused — backing off ${backoffMs}ms: ${scrubSecrets(refusalPayload)}`,
3330
- );
3331
- await Bun.sleep(backoffMs);
3332
- continue;
3333
- }
3265
+ // Phase 4 — exponential back-off state for `budget_refused` triggers.
3266
+ // Resets to 0 on any non-refused outcome. Lives outside the loop so
3267
+ // state persists across iterations.
3268
+ let consecutiveBudgetRefusals = 0;
3334
3269
 
3335
- // Any other non-null trigger means we're being admitted normally
3336
- // reset the back-off so the next refusal starts at base interval.
3337
- consecutiveBudgetRefusals = 0;
3338
-
3339
- console.log(`[${role}] Trigger received: ${trigger.type}`);
3340
-
3341
- if (
3342
- trigger.taskId &&
3343
- (trigger.type === "task_assigned" || trigger.type === "task_offered")
3344
- ) {
3345
- ensure({
3346
- id: "worker_received",
3347
- flow: "task",
3348
- runId: trigger.taskId,
3349
- depIds: ["started"],
3350
- data: {
3351
- taskId: trigger.taskId,
3352
- agentId,
3353
- triggerType: trigger.type,
3354
- role,
3355
- },
3356
- // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
3357
- filter: ({}, ctx) => ctx.deps.length > 0,
3358
- conditions: [{ timeout_ms: 60_000 }], // 1 min: immediate after poll
3359
- });
3360
- }
3270
+ // Track last finished task check for leads (to avoid re-processing)
3271
+ while (true) {
3272
+ // Ping server on each iteration to keep status updated
3273
+ await pingServer(apiConfig, role);
3361
3274
 
3362
- // Build prompt based on trigger
3363
- let triggerPrompt = await buildPromptForTrigger(
3364
- trigger,
3365
- prompt,
3366
- adapter.formatCommand.bind(adapter),
3367
- { hasMcp: adapter.traits.hasMcp },
3368
- );
3275
+ // Check for completed processes first and ensure tasks are marked as finished
3276
+ await checkCompletedProcesses(state, role, apiConfig);
3369
3277
 
3370
- // Enrich prompt with relevant memories from past sessions
3371
- if (trigger.type === "task_assigned" || trigger.type === "task_offered") {
3372
- const task =
3373
- trigger.task && typeof trigger.task === "object" && "task" in trigger.task
3374
- ? (trigger.task as { task: string; id?: string })
3375
- : null;
3376
- if (task?.task) {
3377
- const memoryContext = await fetchRelevantMemories(
3378
- apiUrl,
3379
- apiKey,
3380
- agentId,
3381
- task.task,
3382
- task.id,
3383
- );
3384
- if (memoryContext) {
3385
- triggerPrompt += memoryContext;
3386
- console.log(`[${role}] Injected relevant memories into task prompt`);
3387
- }
3388
- }
3389
- }
3278
+ // Live HARNESS_PROVIDER reconciliation. Re-fetches `swarm_config` (overlaid
3279
+ // on env) and swaps the adapter if the resolved provider changed —
3280
+ // typically because an operator PATCH'd /api/agents/:id/harness-provider
3281
+ // (which writes a swarm_config row) or upserted a config row directly.
3282
+ //
3283
+ // Safety: in-flight sessions hold their own `ProviderSession` references
3284
+ // and continue on the old adapter unaffected. New spawns (below) read
3285
+ // the current `adapter` binding and pick up the swap. `basePrompt` is
3286
+ // rebuilt because traits (and therefore prompt content) may differ across
3287
+ // providers.
3288
+ if (Date.now() - lastHarnessReconcileAt > HARNESS_RECONCILE_INTERVAL_MS) {
3289
+ lastHarnessReconcileAt = Date.now();
3290
+ try {
3291
+ const { env: freshEnv, resolvedProvider } = await fetchResolvedEnv(apiUrl, apiKey, agentId);
3292
+ const { agentVisibleChanged } = await applySwarmConfigDrift(freshEnv, resolvedProvider);
3293
+ if (agentVisibleChanged) {
3294
+ // Re-register so the agents row + dashboard reflect the live
3295
+ // harness_provider / maxTasks. Idempotent: only writes columns
3296
+ // that actually changed (see src/http/agents.ts).
3297
+ await reregisterAgent();
3298
+ }
3299
+ } catch (err) {
3300
+ console.warn(`[${role}] [harness] Reconcile fetch failed (non-fatal): ${err}`);
3301
+ }
3302
+ }
3390
3303
 
3391
- // Resolve --resume for child tasks with parentTaskId
3392
- let effectiveAdditionalArgs = opts.additionalArgs || [];
3393
- const taskObj = trigger.task as { parentTaskId?: string } | undefined;
3394
- if (taskObj?.parentTaskId) {
3395
- const parentSessionId = await fetchProviderSessionId(
3396
- apiUrl,
3397
- apiKey,
3398
- taskObj.parentTaskId,
3399
- );
3400
- if (parentSessionId) {
3401
- effectiveAdditionalArgs = [...effectiveAdditionalArgs, "--resume", parentSessionId];
3402
- console.log(
3403
- `[${role}] Child task — resuming parent session ${parentSessionId.slice(0, 8)}`,
3404
- );
3405
- } else {
3406
- console.log(`[${role}] Child task — parent session ID not found, starting fresh`);
3407
- }
3408
- }
3304
+ // Migration 055 post-task credential refresh, cache-keyed on the
3305
+ // *resolved* harness_provider. Re-runs the snapshot when the provider
3306
+ // changes (boot, or after a live swap above) so the dashboard shows
3307
+ // up-to-date credential status for the active adapter.
3308
+ if (!isCredCheckDisabled(process.env)) {
3309
+ const currentHarness = state.harnessProvider;
3310
+ if (currentHarness !== cachedCredHarnessProvider) {
3311
+ cachedCredHarnessProvider = currentHarness;
3312
+ buildCredStatusReport(currentHarness, process.env, {}, "post_task")
3313
+ .then((snap) => reportCredStatus(apiUrl, apiKey, agentId, snap))
3314
+ .catch((err) =>
3315
+ console.warn(`[${role}] cred_status post_task report failed (non-fatal): ${err}`),
3316
+ );
3317
+ }
3318
+ }
3409
3319
 
3410
- // Extract model from task data for per-task model selection
3411
- const taskModel = (trigger.task as { model?: string } | undefined)?.model;
3320
+ // Periodic VCS detection for running tasks (fire-and-forget, throttled per task)
3321
+ const now = Date.now();
3322
+ for (const [taskId, task] of state.activeTasks) {
3323
+ if (vcsDetectedTasks.has(taskId)) continue;
3324
+ const lastCheck = vcsCheckTimestamps.get(taskId) ?? 0;
3325
+ if (now - lastCheck < VCS_CHECK_INTERVAL) continue;
3326
+ if (!task.workingDir) continue;
3412
3327
 
3413
- // Detect Slack context for conditional prompt sections
3414
- const taskSlackChannelId = (trigger.task as { slackChannelId?: string } | undefined)
3415
- ?.slackChannelId;
3416
- const taskSlackThreadTs = (trigger.task as { slackThreadTs?: string } | undefined)
3417
- ?.slackThreadTs;
3418
- currentTaskSlackContext = taskSlackChannelId
3419
- ? { channelId: taskSlackChannelId, threadTs: taskSlackThreadTs }
3420
- : undefined;
3328
+ vcsCheckTimestamps.set(taskId, now);
3329
+ detectVcsForTask(apiUrl, apiKey, taskId, task.workingDir);
3330
+ }
3421
3331
 
3422
- // Handle repo context for tasks with vcsRepo (GitHub/GitLab)
3423
- const taskVcsRepo = (trigger.task as { vcsRepo?: string } | undefined)?.vcsRepo;
3424
- if (taskVcsRepo && apiUrl) {
3425
- const repoConfig = await fetchRepoConfig(apiUrl, apiKey, taskVcsRepo);
3426
- // Fall back to convention-based config if repo is not registered
3427
- const effectiveConfig = repoConfig ?? {
3428
- url: taskVcsRepo,
3429
- name: taskVcsRepo.split("/").pop() || taskVcsRepo,
3430
- clonePath: `/workspace/repos/${taskVcsRepo.split("/").pop() || taskVcsRepo}`,
3431
- defaultBranch: "main",
3432
- };
3433
- const repoResult = await ensureRepoForTask(effectiveConfig, role);
3434
- currentRepoContext = {
3435
- ...repoResult,
3436
- guidelines: repoConfig?.guidelines ?? null,
3332
+ // Check for cancelled tasks and signal their subprocesses
3333
+ if (state.activeTasks.size > 0) {
3334
+ for (const [taskId, task] of state.activeTasks) {
3335
+ if (cancelledSignaled.has(taskId)) continue; // Already sent SIGTERM
3336
+ try {
3337
+ const cancelResp = await fetch(
3338
+ `${apiUrl}/cancelled-tasks?taskId=${encodeURIComponent(taskId)}`,
3339
+ {
3340
+ headers: {
3341
+ Authorization: `Bearer ${apiKey}`,
3342
+ "X-Agent-ID": agentId,
3343
+ },
3344
+ },
3345
+ );
3346
+ if (cancelResp.ok) {
3347
+ const cancelData = (await cancelResp.json()) as {
3348
+ cancelled: Array<{ id: string }>;
3437
3349
  };
3438
- } else {
3439
- currentRepoContext = undefined;
3440
- }
3441
-
3442
- // Resolve effective working directory (priority: task.dir > repoContext.clonePath > process.cwd())
3443
- const taskDir = (trigger.task as { dir?: string } | undefined)?.dir;
3444
- let effectiveCwd: string | undefined;
3445
-
3446
- if (taskDir) {
3447
- try {
3448
- if (existsSync(taskDir) && statSync(taskDir).isDirectory()) {
3449
- effectiveCwd = taskDir;
3450
- } else {
3451
- console.warn(
3452
- `[${role}] Task dir "${taskDir}" does not exist or is not a directory, falling back to default cwd`,
3453
- );
3454
- }
3455
- } catch {
3456
- console.warn(
3457
- `[${role}] Failed to check task dir "${taskDir}", falling back to default cwd`,
3350
+ if (cancelData.cancelled?.some((t) => t.id === taskId)) {
3351
+ console.log(
3352
+ `[${role}] Task ${taskId.slice(0, 8)} was cancelled — sending SIGTERM to subprocess`,
3458
3353
  );
3354
+ task.session.abort().catch(() => {});
3355
+ cancelledSignaled.add(taskId);
3459
3356
  }
3460
3357
  }
3358
+ } catch {
3359
+ // Non-blocking — cancellation check is best-effort
3360
+ }
3361
+ }
3362
+ }
3461
3363
 
3462
- if (!effectiveCwd && currentRepoContext?.clonePath) {
3463
- effectiveCwd = currentRepoContext.clonePath;
3464
- }
3465
-
3466
- // Annotate prompt with working directory context
3467
- if (effectiveCwd && effectiveCwd !== process.cwd()) {
3468
- triggerPrompt += `\n\n---\n**Working Directory**: You are starting in \`${effectiveCwd}\`. `;
3469
- if (taskDir) {
3470
- triggerPrompt += "This was explicitly set on the task.";
3471
- } else if (currentRepoContext?.clonePath) {
3472
- triggerPrompt += "This is the repository clone path for this task's VCS repo.";
3473
- }
3474
- triggerPrompt +=
3475
- " You can still access any path on the filesystem — this is just your starting directory.";
3476
- }
3364
+ // Only poll if we have capacity
3365
+ if (state.activeTasks.size < state.maxConcurrent) {
3366
+ console.log(
3367
+ `[${role}] Polling for triggers (${state.activeTasks.size}/${state.maxConcurrent} active)...`,
3368
+ );
3477
3369
 
3478
- // Warn in system prompt when task dir was specified but doesn't exist
3479
- let cwdWarning = "";
3480
- if (taskDir && !effectiveCwd) {
3481
- cwdWarning = `\n\nNote: The task requested working directory "${taskDir}" but it does not exist. Falling back to default directory.`;
3482
- }
3370
+ // Use shorter timeout if tasks are running (to check completion more often)
3371
+ const effectiveTimeout = state.activeTasks.size > 0 ? 5000 : PollTimeoutMs;
3483
3372
 
3484
- // Rebuild system prompt with per-task repo context
3485
- const taskBasePrompt = await buildSystemPrompt();
3486
- const taskSystemPrompt =
3487
- (additionalSystemPrompt
3488
- ? `${taskBasePrompt}\n\n${additionalSystemPrompt}`
3489
- : taskBasePrompt) + cwdWarning;
3490
-
3491
- iteration++;
3492
- const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3493
- const taskIdSlice = trigger.taskId?.slice(0, 8) || "notask";
3494
- const logFile = `${logDir}/${timestamp}-${taskIdSlice}.jsonl`;
3495
-
3496
- console.log(`\n[${role}] === Iteration ${iteration} ===`);
3497
- console.log(`[${role}] Logging to: ${logFile}`);
3498
- console.log(`[${role}] Prompt: ${triggerPrompt.slice(0, 100)}...`);
3499
- if (effectiveCwd) {
3500
- console.log(`[${role}] Working directory: ${effectiveCwd}`);
3501
- }
3373
+ const trigger = await pollForTrigger({
3374
+ apiUrl,
3375
+ apiKey,
3376
+ agentId,
3377
+ pollInterval: PollIntervalMs,
3378
+ pollTimeout: effectiveTimeout,
3379
+ });
3502
3380
 
3503
- const metadata = {
3504
- type: metadataType,
3505
- sessionId,
3506
- iteration,
3507
- timestamp: new Date().toISOString(),
3508
- prompt: triggerPrompt,
3509
- trigger: trigger.type,
3510
- yolo: isYolo,
3511
- };
3512
- await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3381
+ if (trigger) {
3382
+ // Phase 4 — server refused to admit a claim because the agent or
3383
+ // global budget is exhausted. Log a structured payload (scrubbed
3384
+ // at egress per project convention) and back off exponentially.
3385
+ // We deliberately `continue` BEFORE the empty-poll counter logic
3386
+ // below — refusals are not empty polls.
3387
+ if (trigger.type === "budget_refused") {
3388
+ consecutiveBudgetRefusals++;
3389
+ const backoffMs = computeBudgetBackoffMs(consecutiveBudgetRefusals, PollIntervalMs);
3390
+ const refusalPayload = JSON.stringify({
3391
+ event: "budget_refused",
3392
+ cause: trigger.cause,
3393
+ agentSpend: trigger.agentSpend,
3394
+ agentBudget: trigger.agentBudget,
3395
+ globalSpend: trigger.globalSpend,
3396
+ globalBudget: trigger.globalBudget,
3397
+ resetAt: trigger.resetAt,
3398
+ consecutiveRefusals: consecutiveBudgetRefusals,
3399
+ backoffMs,
3400
+ });
3401
+ console.log(
3402
+ `[${role}] budget_refused — backing off ${backoffMs}ms: ${scrubSecrets(refusalPayload)}`,
3403
+ );
3404
+ await Bun.sleep(backoffMs);
3405
+ continue;
3406
+ }
3513
3407
 
3514
- // Per-task runner session ID so session logs are scoped to this task
3515
- const taskRunnerSessionId = crypto.randomUUID();
3408
+ // Any other non-null trigger means we're being admitted normally
3409
+ // reset the back-off so the next refusal starts at base interval.
3410
+ consecutiveBudgetRefusals = 0;
3516
3411
 
3517
- // Spawn without blocking (await to set up session, but process runs async)
3518
- let runningTask: RunningTask;
3519
- try {
3520
- runningTask = await spawnProviderProcess(
3521
- adapter,
3522
- {
3523
- prompt: triggerPrompt,
3524
- logFile,
3525
- systemPrompt: taskSystemPrompt,
3526
- additionalArgs: effectiveAdditionalArgs,
3527
- role,
3528
- apiUrl,
3529
- apiKey,
3530
- agentId,
3531
- runnerSessionId: taskRunnerSessionId,
3532
- iteration,
3533
- taskId: trigger.taskId,
3534
- model: taskModel,
3535
- cwd: effectiveCwd,
3536
- vcsRepo: taskVcsRepo,
3537
- },
3538
- logDir,
3539
- isYolo,
3540
- );
3541
- } catch (spawnErr) {
3542
- const errMsg = spawnErr instanceof Error ? spawnErr.message : String(spawnErr);
3543
- console.error(
3544
- `[${role}] Failed to spawn process for task ${trigger.taskId?.slice(0, 8) || "unknown"}: ${errMsg}`,
3545
- );
3546
- if (trigger.taskId) {
3547
- await ensureTaskFinished(
3548
- apiConfig,
3549
- role,
3550
- trigger.taskId,
3551
- 1,
3552
- `Spawn failed: ${errMsg}`,
3553
- undefined,
3554
- state.harnessProvider,
3555
- );
3556
- }
3557
- continue;
3558
- }
3412
+ console.log(`[${role}] Trigger received: ${trigger.type}`);
3559
3413
 
3414
+ if (
3415
+ trigger.taskId &&
3416
+ (trigger.type === "task_assigned" || trigger.type === "task_offered")
3417
+ ) {
3560
3418
  ensure({
3561
- id: "worker_process_spawned",
3419
+ id: "worker_received",
3562
3420
  flow: "task",
3563
- runId: runningTask.taskId,
3564
- depIds: ["worker_received"],
3421
+ runId: trigger.taskId,
3422
+ depIds: ["started"],
3565
3423
  data: {
3566
- taskId: runningTask.taskId,
3424
+ taskId: trigger.taskId,
3567
3425
  agentId,
3426
+ triggerType: trigger.type,
3568
3427
  role,
3569
- model: taskModel,
3570
3428
  },
3571
3429
  // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
3572
3430
  filter: ({}, ctx) => ctx.deps.length > 0,
3573
- conditions: [{ timeout_ms: 60_000 }], // 1 min: process startup
3431
+ conditions: [{ timeout_ms: 60_000 }], // 1 min: immediate after poll
3574
3432
  });
3433
+ }
3575
3434
 
3576
- // Attach trigger metadata for logging
3577
- runningTask.triggerType = trigger.type;
3578
- runningTask.workingDir = effectiveCwd;
3579
-
3580
- // Attach deferred cursor updates for channel_activity triggers
3581
- if (trigger.type === "channel_activity" && trigger.cursorUpdates) {
3582
- runningTask.cursorUpdates = trigger.cursorUpdates as Array<{
3583
- channelId: string;
3584
- ts: string;
3585
- }>;
3586
- }
3587
-
3588
- state.activeTasks.set(runningTask.taskId, runningTask);
3435
+ // Build prompt based on trigger
3436
+ let triggerPrompt = await buildPromptForTrigger(
3437
+ trigger,
3438
+ prompt,
3439
+ adapter.formatCommand.bind(adapter),
3440
+ { hasMcp: adapter.traits.hasMcp },
3441
+ );
3589
3442
 
3590
- // Register active session for concurrency awareness
3591
- const taskDesc =
3443
+ // Enrich prompt with relevant memories from past sessions
3444
+ if (trigger.type === "task_assigned" || trigger.type === "task_offered") {
3445
+ const task =
3592
3446
  trigger.task && typeof trigger.task === "object" && "task" in trigger.task
3593
- ? String((trigger.task as { task: string }).task).slice(0, 200)
3594
- : undefined;
3595
- registerActiveSession(apiConfig, {
3596
- taskId: runningTask.taskId,
3597
- triggerType: trigger.type,
3598
- taskDescription: taskDesc,
3599
- runnerSessionId: taskRunnerSessionId,
3600
- });
3447
+ ? (trigger.task as { task: string; id?: string })
3448
+ : null;
3449
+ if (task?.task) {
3450
+ const memoryContext = await fetchRelevantMemories(
3451
+ apiUrl,
3452
+ apiKey,
3453
+ agentId,
3454
+ task.task,
3455
+ task.id,
3456
+ );
3457
+ if (memoryContext) {
3458
+ triggerPrompt += memoryContext;
3459
+ console.log(`[${role}] Injected relevant memories into task prompt`);
3460
+ }
3461
+ }
3462
+ }
3601
3463
 
3602
- console.log(
3603
- `[${role}] Started task ${runningTask.taskId.slice(0, 8)} (${state.activeTasks.size}/${state.maxConcurrent} active, trigger: ${trigger.type})`,
3464
+ // Resolve --resume for child tasks with parentTaskId
3465
+ let effectiveAdditionalArgs = opts.additionalArgs || [];
3466
+ const taskObj = trigger.task as { parentTaskId?: string } | undefined;
3467
+ if (taskObj?.parentTaskId) {
3468
+ const parentSessionId = await fetchProviderSessionId(
3469
+ apiUrl,
3470
+ apiKey,
3471
+ taskObj.parentTaskId,
3604
3472
  );
3473
+ if (parentSessionId) {
3474
+ effectiveAdditionalArgs = [...effectiveAdditionalArgs, "--resume", parentSessionId];
3475
+ console.log(
3476
+ `[${role}] Child task — resuming parent session ${parentSessionId.slice(0, 8)}`,
3477
+ );
3478
+ } else {
3479
+ console.log(`[${role}] Child task — parent session ID not found, starting fresh`);
3480
+ }
3605
3481
  }
3606
- } else {
3607
- console.log(
3608
- `[${role}] At capacity (${state.activeTasks.size}/${state.maxConcurrent}), waiting for completion...`,
3609
- );
3610
- await Bun.sleep(1000);
3611
- }
3612
- }
3613
- } else {
3614
- // Original AI-loop mode (existing behavior)
3615
- console.log(`[${role}] Mode: AI-based polling (legacy)`);
3616
3482
 
3617
- // Create API config for ping/close
3618
- const apiConfig: ApiConfig = { apiUrl, apiKey, agentId };
3619
-
3620
- // Setup graceful shutdown handlers with API config for close on exit
3621
- setupShutdownHandlers(role, apiConfig);
3483
+ // Extract model from task data for per-task model selection
3484
+ const taskModel = (trigger.task as { model?: string } | undefined)?.model;
3485
+
3486
+ // Detect Slack context for conditional prompt sections
3487
+ const taskSlackChannelId = (trigger.task as { slackChannelId?: string } | undefined)
3488
+ ?.slackChannelId;
3489
+ const taskSlackThreadTs = (trigger.task as { slackThreadTs?: string } | undefined)
3490
+ ?.slackThreadTs;
3491
+ currentTaskSlackContext = taskSlackChannelId
3492
+ ? { channelId: taskSlackChannelId, threadTs: taskSlackThreadTs }
3493
+ : undefined;
3494
+
3495
+ // Handle repo context for tasks with vcsRepo (GitHub/GitLab)
3496
+ const taskVcsRepo = (trigger.task as { vcsRepo?: string } | undefined)?.vcsRepo;
3497
+ if (taskVcsRepo && apiUrl) {
3498
+ const repoConfig = await fetchRepoConfig(apiUrl, apiKey, taskVcsRepo);
3499
+ // Fall back to convention-based config if repo is not registered
3500
+ const effectiveConfig = repoConfig ?? {
3501
+ url: taskVcsRepo,
3502
+ name: taskVcsRepo.split("/").pop() || taskVcsRepo,
3503
+ clonePath: `/workspace/repos/${taskVcsRepo.split("/").pop() || taskVcsRepo}`,
3504
+ defaultBranch: "main",
3505
+ };
3506
+ const repoResult = await ensureRepoForTask(effectiveConfig, role);
3507
+ currentRepoContext = {
3508
+ ...repoResult,
3509
+ guidelines: repoConfig?.guidelines ?? null,
3510
+ };
3511
+ } else {
3512
+ currentRepoContext = undefined;
3513
+ }
3622
3514
 
3623
- while (true) {
3624
- // Ping server on each iteration to keep status updated
3625
- await pingServer(apiConfig, role);
3515
+ // Resolve effective working directory (priority: task.dir > repoContext.clonePath > process.cwd())
3516
+ const taskDir = (trigger.task as { dir?: string } | undefined)?.dir;
3517
+ let effectiveCwd: string | undefined;
3626
3518
 
3627
- iteration++;
3628
- const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3629
- const logFile = `${logDir}/${timestamp}.jsonl`;
3519
+ if (taskDir) {
3520
+ try {
3521
+ if (existsSync(taskDir) && statSync(taskDir).isDirectory()) {
3522
+ effectiveCwd = taskDir;
3523
+ } else {
3524
+ console.warn(
3525
+ `[${role}] Task dir "${taskDir}" does not exist or is not a directory, falling back to default cwd`,
3526
+ );
3527
+ }
3528
+ } catch {
3529
+ console.warn(
3530
+ `[${role}] Failed to check task dir "${taskDir}", falling back to default cwd`,
3531
+ );
3532
+ }
3533
+ }
3630
3534
 
3631
- console.log(`\n[${role}] === Iteration ${iteration} ===`);
3632
- console.log(`[${role}] Logging to: ${logFile}`);
3535
+ if (!effectiveCwd && currentRepoContext?.clonePath) {
3536
+ effectiveCwd = currentRepoContext.clonePath;
3537
+ }
3633
3538
 
3634
- const metadata = {
3635
- type: metadataType,
3636
- sessionId,
3637
- iteration,
3638
- timestamp: new Date().toISOString(),
3639
- prompt,
3640
- yolo: isYolo,
3641
- };
3642
- await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3539
+ // Annotate prompt with working directory context
3540
+ if (effectiveCwd && effectiveCwd !== process.cwd()) {
3541
+ triggerPrompt += `\n\n---\n**Working Directory**: You are starting in \`${effectiveCwd}\`. `;
3542
+ if (taskDir) {
3543
+ triggerPrompt += "This was explicitly set on the task.";
3544
+ } else if (currentRepoContext?.clonePath) {
3545
+ triggerPrompt += "This is the repository clone path for this task's VCS repo.";
3546
+ }
3547
+ triggerPrompt +=
3548
+ " You can still access any path on the filesystem — this is just your starting directory.";
3549
+ }
3643
3550
 
3644
- const iterationResult = await runProviderIteration(adapter, {
3645
- prompt,
3646
- logFile,
3647
- systemPrompt: resolvedSystemPrompt,
3648
- additionalArgs: opts.additionalArgs,
3649
- role,
3650
- apiUrl,
3651
- apiKey,
3652
- agentId,
3653
- });
3551
+ // Warn in system prompt when task dir was specified but doesn't exist
3552
+ let cwdWarning = "";
3553
+ if (taskDir && !effectiveCwd) {
3554
+ cwdWarning = `\n\nNote: The task requested working directory "${taskDir}" but it does not exist. Falling back to default directory.`;
3555
+ }
3654
3556
 
3655
- if (iterationResult.exitCode !== 0) {
3656
- const failureReason =
3657
- iterationResult.failureReason || `Process exited with code ${iterationResult.exitCode}`;
3557
+ // Rebuild system prompt with per-task repo context
3558
+ const taskBasePrompt = await buildSystemPrompt();
3559
+ const taskSystemPrompt =
3560
+ (additionalSystemPrompt
3561
+ ? `${taskBasePrompt}\n\n${additionalSystemPrompt}`
3562
+ : taskBasePrompt) + cwdWarning;
3563
+
3564
+ iteration++;
3565
+ const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3566
+ const taskIdSlice = trigger.taskId?.slice(0, 8) || "notask";
3567
+ const logFile = `${logDir}/${timestamp}-${taskIdSlice}.jsonl`;
3568
+
3569
+ console.log(`\n[${role}] === Iteration ${iteration} ===`);
3570
+ console.log(`[${role}] Logging to: ${logFile}`);
3571
+ console.log(`[${role}] Prompt: ${triggerPrompt.slice(0, 100)}...`);
3572
+ if (effectiveCwd) {
3573
+ console.log(`[${role}] Working directory: ${effectiveCwd}`);
3574
+ }
3658
3575
 
3659
- const errorLog = {
3660
- timestamp: new Date().toISOString(),
3576
+ const metadata = {
3577
+ type: metadataType,
3578
+ sessionId,
3661
3579
  iteration,
3662
- exitCode: iterationResult.exitCode,
3663
- failureReason,
3664
- error: true,
3580
+ timestamp: new Date().toISOString(),
3581
+ prompt: triggerPrompt,
3582
+ trigger: trigger.type,
3583
+ yolo: isYolo,
3665
3584
  };
3585
+ await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3666
3586
 
3667
- const errorsFile = `${logDir}/errors.jsonl`;
3668
- const errorsFileRef = Bun.file(errorsFile);
3669
- const existingErrors = (await errorsFileRef.exists()) ? await errorsFileRef.text() : "";
3670
- await Bun.write(errorsFile, `${existingErrors}${JSON.stringify(errorLog)}\n`);
3587
+ // Per-task runner session ID so session logs are scoped to this task
3588
+ const taskRunnerSessionId = crypto.randomUUID();
3671
3589
 
3672
- if (!isYolo) {
3673
- console.error(`[${role}] ${failureReason}. Stopping.`);
3674
- console.error(`[${role}] Error logged to: ${errorsFile}`);
3675
- process.exit(iterationResult.exitCode);
3590
+ // Spawn without blocking (await to set up session, but process runs async)
3591
+ let runningTask: RunningTask;
3592
+ try {
3593
+ runningTask = await spawnProviderProcess(
3594
+ adapter,
3595
+ {
3596
+ prompt: triggerPrompt,
3597
+ logFile,
3598
+ systemPrompt: taskSystemPrompt,
3599
+ additionalArgs: effectiveAdditionalArgs,
3600
+ role,
3601
+ apiUrl,
3602
+ apiKey,
3603
+ agentId,
3604
+ runnerSessionId: taskRunnerSessionId,
3605
+ iteration,
3606
+ taskId: trigger.taskId,
3607
+ model: taskModel,
3608
+ cwd: effectiveCwd,
3609
+ vcsRepo: taskVcsRepo,
3610
+ },
3611
+ logDir,
3612
+ isYolo,
3613
+ );
3614
+ } catch (spawnErr) {
3615
+ const errMsg = spawnErr instanceof Error ? spawnErr.message : String(spawnErr);
3616
+ console.error(
3617
+ `[${role}] Failed to spawn process for task ${trigger.taskId?.slice(0, 8) || "unknown"}: ${errMsg}`,
3618
+ );
3619
+ if (trigger.taskId) {
3620
+ await ensureTaskFinished(
3621
+ apiConfig,
3622
+ role,
3623
+ trigger.taskId,
3624
+ 1,
3625
+ `Spawn failed: ${errMsg}`,
3626
+ undefined,
3627
+ state.harnessProvider,
3628
+ );
3629
+ }
3630
+ continue;
3676
3631
  }
3677
3632
 
3678
- console.warn(`[${role}] ${failureReason}. YOLO mode - continuing...`);
3679
- }
3633
+ ensure({
3634
+ id: "worker_process_spawned",
3635
+ flow: "task",
3636
+ runId: runningTask.taskId,
3637
+ depIds: ["worker_received"],
3638
+ data: {
3639
+ taskId: runningTask.taskId,
3640
+ agentId,
3641
+ role,
3642
+ model: taskModel,
3643
+ },
3644
+ // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
3645
+ filter: ({}, ctx) => ctx.deps.length > 0,
3646
+ conditions: [{ timeout_ms: 60_000 }], // 1 min: process startup
3647
+ });
3648
+
3649
+ // Attach trigger metadata for logging
3650
+ runningTask.triggerType = trigger.type;
3651
+ runningTask.workingDir = effectiveCwd;
3652
+
3653
+ // Attach deferred cursor updates for channel_activity triggers
3654
+ if (trigger.type === "channel_activity" && trigger.cursorUpdates) {
3655
+ runningTask.cursorUpdates = trigger.cursorUpdates as Array<{
3656
+ channelId: string;
3657
+ ts: string;
3658
+ }>;
3659
+ }
3680
3660
 
3681
- console.log(`[${role}] Iteration ${iteration} complete. Starting next iteration...`);
3661
+ state.activeTasks.set(runningTask.taskId, runningTask);
3662
+
3663
+ // Register active session for concurrency awareness
3664
+ const taskDesc =
3665
+ trigger.task && typeof trigger.task === "object" && "task" in trigger.task
3666
+ ? String((trigger.task as { task: string }).task).slice(0, 200)
3667
+ : undefined;
3668
+ registerActiveSession(apiConfig, {
3669
+ taskId: runningTask.taskId,
3670
+ triggerType: trigger.type,
3671
+ taskDescription: taskDesc,
3672
+ runnerSessionId: taskRunnerSessionId,
3673
+ });
3674
+
3675
+ console.log(
3676
+ `[${role}] Started task ${runningTask.taskId.slice(0, 8)} (${state.activeTasks.size}/${state.maxConcurrent} active, trigger: ${trigger.type})`,
3677
+ );
3678
+ }
3679
+ } else {
3680
+ console.log(
3681
+ `[${role}] At capacity (${state.activeTasks.size}/${state.maxConcurrent}), waiting for completion...`,
3682
+ );
3683
+ await Bun.sleep(1000);
3682
3684
  }
3683
3685
  }
3684
3686
  }