@desplega.ai/agent-swarm 1.76.3 → 1.77.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -272,6 +272,66 @@ async function fetchResolvedEnv(
272
272
  return { env, credentialSelections, resolvedProvider };
273
273
  }
274
274
 
275
/**
 * Allowlist of environment keys that `applyResolvedEnvToProcessEnv` is
 * allowed to overwrite in `process.env` while the worker is running.
 *
 * A key that is absent from this set is treated as unsafe to change after
 * boot, for one of the following reasons:
 *
 * - **Boot-time identity / connectivity** (AGENT_ID, API_KEY, MCP_BASE_URL,
 *   AGENT_ROLE, MANAGED_*): changing one of these mid-flight effectively
 *   turns the worker into a different agent talking to a different API.
 *   Reboot instead of reloading.
 * - **Credential pool members** (CLAUDE_CODE_OAUTH_TOKEN, ANTHROPIC_API_KEY,
 *   OPENAI_API_KEY, etc.): `resolveCredentialPools` picks one value at random
 *   *per task* from a comma-separated pool. Writing the picked value back
 *   into process.env would freeze that rotation. These are re-resolved on
 *   every spawn anyway, so they are intentionally left untouched.
 * - **Coordinated values with paired state** (HARNESS_PROVIDER): replacing
 *   the env value without also swapping the adapter and rebuilding the
 *   system prompt yields an inconsistent worker. A dedicated reconcile path
 *   updates state.harnessProvider and the adapter together.
 * - **Process-runtime / OS-level** (PATH, HOME, NODE_OPTIONS, HOSTNAME, …):
 *   never overwritten. Some of these are read only once, at boot, by
 *   libraries.
 * - **Values memoized at boot** (TEMPLATE_ID, AGENT_NAME): the value cached
 *   in-process wins regardless, so overwriting only causes confusion.
 *
 * Values that steer runner-loop behavior (such as MAX_CONCURRENT_TASKS)
 * should be applied by mutating `RunnerState` directly rather than by a
 * round-trip through process.env.
 */
const RELOADABLE_ENV_KEYS: ReadonlySet<string> = new Set<string>(
  ["MODEL_OVERRIDE", "AGENT_FS_SHARED_ORG_ID"],
);
304
+
305
/**
 * Copy live-reloadable values from a freshly resolved env into `process.env`.
 *
 * Only keys listed in `RELOADABLE_ENV_KEYS` are looked at. A key is written
 * when `freshEnv` holds a defined value for it that differs from the current
 * `process.env` entry; a key that is `undefined` in `freshEnv` is left as it
 * is (nothing is ever deleted from `process.env`).
 *
 * @param freshEnv - Newly resolved environment snapshot.
 * @returns The keys whose `process.env` value was overwritten, in allowlist
 *   iteration order (handy for logging).
 */
function applyResolvedEnvToProcessEnv(freshEnv: Record<string, string | undefined>): string[] {
  const updatedKeys: string[] = [];

  RELOADABLE_ENV_KEYS.forEach((name) => {
    const incoming = freshEnv[name];
    // Nothing resolved for this key: keep whatever process.env has.
    if (incoming === undefined) return;
    // Already up to date: do not report it as changed.
    if (incoming === process.env[name]) return;

    process.env[name] = incoming;
    updatedKeys.push(name);
  });

  return updatedKeys;
}
320
+
321
/**
 * Compute the effective max-concurrent-tasks cap.
 *
 * Precedence: `env.MAX_CONCURRENT_TASKS` > template default > role default.
 *
 * The env value is honored only when it is a plain positive decimal integer
 * (surrounding whitespace and a leading `+` are tolerated). Anything else
 * falls through to the defaults: empty, non-numeric, zero, negative, or a
 * partially numeric string such as "2x", "1.5" or "1e3". `parseInt` alone
 * would truncate those partial strings to a number, so a typo would silently
 * set a cap the operator never asked for.
 *
 * @param env - Environment snapshot to read `MAX_CONCURRENT_TASKS` from.
 * @param templateMax - Template-provided default, if any.
 * @param defaultMaxTasks - Role default used when neither of the above applies.
 * @returns The resolved concurrency cap.
 */
function resolveMaxConcurrent(
  env: Record<string, string | undefined>,
  templateMax: number | undefined,
  defaultMaxTasks: number,
): number {
  const raw = env.MAX_CONCURRENT_TASKS?.trim();
  // Require the whole string to be digits; parseInt would accept a numeric prefix.
  if (raw && /^\+?\d+$/.test(raw)) {
    const parsed = Number.parseInt(raw, 10);
    // isFinite guards against absurdly long digit strings overflowing to Infinity.
    if (Number.isFinite(parsed) && parsed > 0) return parsed;
  }
  return templateMax ?? defaultMaxTasks;
}
334
+
275
335
  /** Tools that produce noise — skip auto-progress for these */
276
336
  const SKIP_PROGRESS_TOOLS = new Set(["ToolSearch", "TodoRead", "TodoWrite"]);
277
337
 
@@ -962,7 +1022,6 @@ export interface RunnerOptions {
962
1022
  systemPromptFile?: string;
963
1023
  logsDir?: string;
964
1024
  additionalArgs?: string[];
965
- aiLoop?: boolean; // Use AI-based loop (old behavior)
966
1025
  }
967
1026
 
968
1027
  /** Running task state for parallel execution */
@@ -2099,68 +2158,6 @@ async function spawnProviderProcess(
2099
2158
  return runningTask;
2100
2159
  }
2101
2160
 
2102
/**
 * Drive one provider session from creation to completion (blocking for the
 * caller). This is the execution path used by AI-loop mode.
 *
 * Steps: fetch a freshly resolved env for the agent, build the session config
 * (model comes from `MODEL_OVERRIDE`, or an empty string when unset), create
 * the session on `adapter`, attach an event listener, then wait for the
 * session to finish.
 *
 * @param adapter - Provider adapter used to create the session.
 * @param opts - Prompt, log file, role and API coordinates for this run. When
 *   `taskId` is not provided, a random UUID is placed in the session config
 *   and both session-id persistence and progress reporting are skipped.
 * @returns Whatever `session.waitForCompletion()` resolves with.
 */
async function runProviderIteration(
  adapter: ReturnType<typeof createProviderAdapter>,
  opts: {
    prompt: string;
    logFile: string;
    systemPrompt?: string;
    additionalArgs?: string[];
    role: string;
    apiUrl: string;
    apiKey: string;
    agentId: string;
    taskId?: string;
    cwd?: string;
  },
): Promise<ProviderResult> {
  const { apiUrl, apiKey, agentId, role, taskId } = opts;

  const resolved = await fetchResolvedEnv(apiUrl, apiKey, agentId);
  const sessionEnv = resolved.env;

  const sessionConfig: ProviderSessionConfig = {
    prompt: opts.prompt,
    systemPrompt: opts.systemPrompt || "",
    model: (sessionEnv.MODEL_OVERRIDE as string) || "",
    role,
    agentId,
    taskId: taskId || crypto.randomUUID(),
    apiUrl,
    apiKey,
    cwd: opts.cwd || process.cwd(),
    logFile: opts.logFile,
    additionalArgs: opts.additionalArgs,
    env: sessionEnv as Record<string, string>,
  };

  const session = await adapter.createSession(sessionConfig);

  // Timestamp (ms) of the last progress update that was pushed to the API.
  let lastProgressSentAt = 0;

  session.onEvent((event) => {
    switch (event.type) {
      case "raw_log":
        prettyPrintLine(event.content, role);
        break;

      case "raw_stderr":
        prettyPrintStderr(event.content, role);
        break;

      case "session_init":
        // Persisting the provider session id only makes sense for a real task.
        if (taskId) {
          saveProviderSessionId(
            apiUrl,
            apiKey,
            taskId,
            event.sessionId,
            event.provider,
            event.providerMeta,
          ).catch((err) => console.warn(`[runner] Failed to save session ID: ${err}`));
        }
        break;

      case "progress":
        if (taskId) {
          const timestamp = Date.now();
          // Throttle: at most one progress update per PROGRESS_THROTTLE_MS window.
          if (timestamp - lastProgressSentAt >= PROGRESS_THROTTLE_MS) {
            lastProgressSentAt = timestamp;
            // Best-effort: a failed progress update must not disturb the session.
            updateProgressViaAPI(apiUrl, apiKey, taskId, event.message).catch(() => {});
          }
        }
        break;

      default:
        break;
    }
  });

  return session.waitForCompletion();
}
2163
-
2164
2161
  /** Check for completed processes and remove them from active tasks */
2165
2162
  async function checkCompletedProcesses(
2166
2163
  state: RunnerState,
@@ -2541,86 +2538,142 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2541
2538
  );
2542
2539
  console.log(`[${role}] Total system prompt length: ${resolvedSystemPrompt.length} chars`);
2543
2540
 
2544
- const isAiLoop = opts.aiLoop || process.env.AI_LOOP === "true";
2545
-
2546
2541
  // Constants for polling
2547
2542
  const PollIntervalMs = 2000; // 2 seconds between polls
2548
2543
  const PollTimeoutMs = 60000; // 1 minute timeout before retrying
2549
2544
 
2550
2545
  let iteration = 0;
2551
2546
 
2552
- if (!isAiLoop) {
2553
- // Fetch template early (before registration) so defaults can be applied
2554
- const templateId = process.env.TEMPLATE_ID;
2555
- const registryUrl = process.env.TEMPLATE_REGISTRY_URL || "https://templates.agent-swarm.dev";
2556
- let cachedTemplate: TemplateResponse | null = null;
2557
-
2558
- if (templateId) {
2559
- try {
2560
- cachedTemplate = await fetchTemplate(templateId, registryUrl, "/workspace/.template-cache");
2561
- if (cachedTemplate) {
2562
- console.log(`[${role}] Fetched template: ${templateId}`);
2547
+ // Fetch template early (before registration) so defaults can be applied
2548
+ const templateId = process.env.TEMPLATE_ID;
2549
+ const registryUrl = process.env.TEMPLATE_REGISTRY_URL || "https://templates.agent-swarm.dev";
2550
+ let cachedTemplate: TemplateResponse | null = null;
2563
2551
 
2564
- // Apply agentDefaults as fallbacks (env/config takes precedence)
2565
- const defaults = cachedTemplate.config.agentDefaults;
2566
- if (config.role === "worker" && defaults.role) {
2567
- role = defaults.role;
2568
- }
2569
- if (!capabilities?.length && defaults.capabilities?.length) {
2570
- capabilities = defaults.capabilities;
2571
- }
2552
+ if (templateId) {
2553
+ try {
2554
+ cachedTemplate = await fetchTemplate(templateId, registryUrl, "/workspace/.template-cache");
2555
+ if (cachedTemplate) {
2556
+ console.log(`[${role}] Fetched template: ${templateId}`);
2557
+
2558
+ // Apply agentDefaults as fallbacks (env/config takes precedence)
2559
+ const defaults = cachedTemplate.config.agentDefaults;
2560
+ if (config.role === "worker" && defaults.role) {
2561
+ role = defaults.role;
2562
+ }
2563
+ if (!capabilities?.length && defaults.capabilities?.length) {
2564
+ capabilities = defaults.capabilities;
2572
2565
  }
2566
+ }
2567
+ } catch (err) {
2568
+ console.warn(`[${role}] Failed to fetch template ${templateId}: ${err}`);
2569
+ }
2570
+ }
2571
+
2572
+ // Runner-level polling mode with parallel execution support
2573
+ const isLeadFromConfig = config.role === "lead";
2574
+ const isLead = isLeadFromConfig || (cachedTemplate?.config.agentDefaults?.isLead ?? false);
2575
+ const defaultMaxTasks = isLead ? 2 : 1;
2576
+ const templateMaxTasks = cachedTemplate?.config.agentDefaults?.maxTasks;
2577
+ const maxConcurrent = resolveMaxConcurrent(process.env, templateMaxTasks, defaultMaxTasks);
2578
+ console.log(`[${role}] Mode: runner-level polling`);
2579
+ console.log(`[${role}] Max concurrent tasks: ${maxConcurrent}`);
2580
+
2581
+ // Initialize runner state for parallel execution
2582
+ const state: RunnerState = {
2583
+ activeTasks: new Map(),
2584
+ maxConcurrent,
2585
+ harnessProvider: bootProvider,
2586
+ };
2587
+
2588
+ // Track tasks already signaled for cancellation to avoid repeated SIGTERM
2589
+ const cancelledSignaled = new Set<string>();
2590
+
2591
+ // Migration 055 — cache the harness_provider value used when we last
2592
+ // built a `cred_status` snapshot. Re-runs the post-task check only when
2593
+ // the resolved provider changes. Section 4 of the swarm_config-overrides-
2594
+ // HARNESS_PROVIDER work makes this dynamic: state.harnessProvider is
2595
+ // reconciled below from `swarm_config`, so an operator's change reaches
2596
+ // here without a worker restart.
2597
+ let cachedCredHarnessProvider: string | null = null;
2598
+
2599
+ // Throttle for live HARNESS_PROVIDER reconciliation. Each reconciliation
2600
+ // calls `fetchResolvedEnv` which also re-resolves credential pools — we
2601
+ // don't want that on every 2s poll. 10s gives operator changes a near-
2602
+ // immediate effect from a UX perspective without hammering the API.
2603
+ let lastHarnessReconcileAt = 0;
2604
+ const HARNESS_RECONCILE_INTERVAL_MS = 10_000;
2605
+
2606
+ // Create API config for ping/close
2607
+ const apiConfig: ApiConfig = { apiUrl, apiKey, agentId };
2608
+
2609
+ // Setup graceful shutdown handlers with API config and runner state access
2610
+ setupShutdownHandlers(role, apiConfig, () => state);
2611
+
2612
+ // Register agent before starting
2613
+ const agentName =
2614
+ process.env.AGENT_NAME ||
2615
+ cachedTemplate?.config.displayName ||
2616
+ `${role}-${agentId.slice(0, 8)}`;
2617
+
2618
+ /**
2619
+ * Reconcile RunnerState + process.env against a freshly resolved swarm
2620
+ * config snapshot. Single source of truth for live config drift; used
2621
+ * both during the credential-wait (so operator flips reach the predicate
2622
+ * mid-loop) and from the post-boot periodic reconciler.
2623
+ *
2624
+ * Returns whether anything agent-visible (provider, maxConcurrent)
2625
+ * changed — callers use this to decide whether to re-register.
2626
+ */
2627
+ const applySwarmConfigDrift = async (
2628
+ freshEnv: Record<string, string | undefined>,
2629
+ resolvedProvider: ProviderName,
2630
+ ): Promise<{ agentVisibleChanged: boolean }> => {
2631
+ let agentVisibleChanged = false;
2632
+
2633
+ // (1) Harness provider — swap adapter + rebuild prompt atomically.
2634
+ if (resolvedProvider !== state.harnessProvider) {
2635
+ const previous = state.harnessProvider;
2636
+ console.log(`[${role}] [harness] Reconciling adapter: ${previous} → ${resolvedProvider}`);
2637
+ try {
2638
+ adapter = createProviderAdapter(resolvedProvider);
2639
+ state.harnessProvider = resolvedProvider;
2640
+ basePrompt = await buildSystemPrompt();
2641
+ resolvedSystemPrompt = additionalSystemPrompt
2642
+ ? `${basePrompt}\n\n${additionalSystemPrompt}`
2643
+ : basePrompt;
2644
+ cachedCredHarnessProvider = null;
2645
+ agentVisibleChanged = true;
2646
+ console.log(
2647
+ `[${role}] [harness] Swapped to ${resolvedProvider} (basePrompt rebuilt: ${basePrompt.length} chars)`,
2648
+ );
2573
2649
  } catch (err) {
2574
- console.warn(`[${role}] Failed to fetch template ${templateId}: ${err}`);
2650
+ console.warn(
2651
+ `[${role}] [harness] Failed to swap to ${resolvedProvider} (staying on ${previous}): ${err}`,
2652
+ );
2575
2653
  }
2576
2654
  }
2577
2655
 
2578
- // Runner-level polling mode with parallel execution support
2579
- const isLeadFromConfig = config.role === "lead";
2580
- const isLead = isLeadFromConfig || (cachedTemplate?.config.agentDefaults?.isLead ?? false);
2581
- const defaultMaxTasks = isLead ? 2 : 1;
2582
- const maxConcurrent = process.env.MAX_CONCURRENT_TASKS
2583
- ? parseInt(process.env.MAX_CONCURRENT_TASKS, 10)
2584
- : (cachedTemplate?.config.agentDefaults?.maxTasks ?? defaultMaxTasks);
2585
- console.log(`[${role}] Mode: runner-level polling (use --ai-loop for AI-based polling)`);
2586
- console.log(`[${role}] Max concurrent tasks: ${maxConcurrent}`);
2587
-
2588
- // Initialize runner state for parallel execution
2589
- const state: RunnerState = {
2590
- activeTasks: new Map(),
2591
- maxConcurrent,
2592
- harnessProvider: bootProvider,
2593
- };
2656
+ // (2) Max concurrency operator can tune from the dashboard live.
2657
+ // Note: shrinking below activeTasks.size won't kill in-flight tasks; new
2658
+ // spawns are simply gated until in-flight drain back under the new cap.
2659
+ const nextMax = resolveMaxConcurrent(freshEnv, templateMaxTasks, defaultMaxTasks);
2660
+ if (nextMax !== state.maxConcurrent) {
2661
+ console.log(`[${role}] [config] maxConcurrent: ${state.maxConcurrent} → ${nextMax}`);
2662
+ state.maxConcurrent = nextMax;
2663
+ agentVisibleChanged = true;
2664
+ }
2665
+
2666
+ // (3) Apply the small allowlist of safe-to-mutate env keys to process.env.
2667
+ const changedKeys = applyResolvedEnvToProcessEnv(freshEnv);
2668
+ if (changedKeys.length > 0) {
2669
+ console.log(`[${role}] [env-reload] Updated process.env: ${changedKeys.join(", ")}`);
2670
+ }
2671
+
2672
+ return { agentVisibleChanged };
2673
+ };
2594
2674
 
2595
- // Track tasks already signaled for cancellation to avoid repeated SIGTERM
2596
- const cancelledSignaled = new Set<string>();
2597
-
2598
- // Migration 055 — cache the harness_provider value used when we last
2599
- // built a `cred_status` snapshot. Re-runs the post-task check only when
2600
- // the resolved provider changes. Section 4 of the swarm_config-overrides-
2601
- // HARNESS_PROVIDER work makes this dynamic: state.harnessProvider is
2602
- // reconciled below from `swarm_config`, so an operator's change reaches
2603
- // here without a worker restart.
2604
- let cachedCredHarnessProvider: string | null = null;
2605
-
2606
- // Throttle for live HARNESS_PROVIDER reconciliation. Each reconciliation
2607
- // calls `fetchResolvedEnv` which also re-resolves credential pools — we
2608
- // don't want that on every 2s poll. 10s gives operator changes a near-
2609
- // immediate effect from a UX perspective without hammering the API.
2610
- let lastHarnessReconcileAt = 0;
2611
- const HARNESS_RECONCILE_INTERVAL_MS = 10_000;
2612
-
2613
- // Create API config for ping/close
2614
- const apiConfig: ApiConfig = { apiUrl, apiKey, agentId };
2615
-
2616
- // Setup graceful shutdown handlers with API config and runner state access
2617
- setupShutdownHandlers(role, apiConfig, () => state);
2618
-
2619
- // Register agent before starting
2620
- const agentName =
2621
- process.env.AGENT_NAME ||
2622
- cachedTemplate?.config.displayName ||
2623
- `${role}-${agentId.slice(0, 8)}`;
2675
+ /** Push the current live state back to the API so the dashboard reflects it. */
2676
+ const reregisterAgent = async () => {
2624
2677
  try {
2625
2678
  await registerAgent({
2626
2679
  apiUrl,
@@ -2630,1055 +2683,988 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2630
2683
  role,
2631
2684
  isLead,
2632
2685
  capabilities,
2633
- maxTasks: maxConcurrent,
2634
- harnessProvider: bootProvider,
2686
+ maxTasks: state.maxConcurrent,
2687
+ harnessProvider: state.harnessProvider,
2635
2688
  });
2636
- console.log(`[${role}] Registered as "${agentName}" (ID: ${agentId})`);
2637
- } catch (error) {
2638
- console.error(`[${role}] Failed to register: ${error}`);
2639
- process.exit(1);
2640
- }
2641
-
2642
- // Block until harness credentials are present in env. This loop replaces
2643
- // the old bash-level fail-fast in `docker-entrypoint.sh` — the worker is
2644
- // already registered (visible to the dashboard) and self-heals once
2645
- // creds appear in `swarm_config`. See plans/2026-05-06-worker-credential-safe-loop.md.
2646
- //
2647
- // CRED_CHECK_DISABLE=1 opts out entirely: the worker trusts the operator
2648
- // and starts polling immediately, with a NULL `cred_status` row that the
2649
- // dashboard surfaces as "unreported."
2650
- const harnessProvider = bootProvider;
2651
- cachedCredHarnessProvider = harnessProvider;
2652
- if (isCredCheckDisabled(process.env)) {
2653
- console.log(`[${role}] CRED_CHECK_DISABLE=1, skipping credential checks`);
2654
- } else {
2655
- try {
2656
- await awaitCredentials({
2657
- provider: harnessProvider,
2658
- refreshEnv: async () => {
2659
- const { env } = await fetchResolvedEnv(apiUrl, apiKey, agentId);
2660
- return env;
2661
- },
2662
- onTick: (status) => {
2663
- // Best-effort status report — the dispatcher uses it to route
2664
- // around blocked agents. Failures are non-fatal (the wait loop
2665
- // already swallows onTick exceptions). We do NOT include
2666
- // `cred_status` here — the live test runs once the worker is
2667
- // ready (below), and intermediate ticks are presence-only.
2668
- fetch(`${apiUrl}/api/agents/${encodeURIComponent(agentId)}/credential-status`, {
2669
- method: "PUT",
2670
- headers: {
2671
- Authorization: `Bearer ${apiKey}`,
2672
- "X-Agent-ID": agentId,
2673
- "Content-Type": "application/json",
2674
- },
2675
- body: JSON.stringify({ ready: status.ready, missing: status.missing }),
2676
- }).catch(() => {
2677
- // Swallowed — Phase 2 wait loop logs every tick anyway.
2678
- });
2679
- },
2680
- });
2681
- } catch (err) {
2682
- if (err instanceof BootMaxWaitExceededError) {
2683
- console.error(`[${role}] ${err.message}`);
2684
- process.exit(EX_CONFIG);
2685
- }
2686
- throw err;
2687
- }
2688
-
2689
- // Migration 055: build the full snapshot (presence + live test) once
2690
- // creds are ready and POST it to the agent row. Status endpoint reads
2691
- // this instead of running predicates server-side.
2692
- try {
2693
- const snapshot = await buildCredStatusReport(harnessProvider, process.env, {}, "boot");
2694
- await reportCredStatus(apiUrl, apiKey, agentId, snapshot);
2695
- } catch (err) {
2696
- // Non-fatal — worker proceeds even if reporting fails.
2697
- console.warn(`[${role}] cred_status boot report failed (non-fatal): ${err}`);
2698
- }
2689
+ } catch (err) {
2690
+ console.warn(`[${role}] [config] Re-register failed (non-fatal): ${err}`);
2699
2691
  }
2692
+ };
2693
+ try {
2694
+ await registerAgent({
2695
+ apiUrl,
2696
+ apiKey,
2697
+ agentId,
2698
+ name: agentName,
2699
+ role,
2700
+ isLead,
2701
+ capabilities,
2702
+ maxTasks: maxConcurrent,
2703
+ harnessProvider: bootProvider,
2704
+ });
2705
+ console.log(`[${role}] Registered as "${agentName}" (ID: ${agentId})`);
2706
+ } catch (error) {
2707
+ console.error(`[${role}] Failed to register: ${error}`);
2708
+ process.exit(1);
2709
+ }
2700
2710
 
2701
- // Clean up any stale active sessions from previous runs (crash recovery)
2702
- await cleanupActiveSessions(apiConfig);
2703
- console.log(`[${role}] Cleaned up stale active sessions`);
2704
-
2705
- // Fetch full agent profile to get soul/identity content
2711
+ // Block until harness credentials are present in env. This loop replaces
2712
+ // the old bash-level fail-fast in `docker-entrypoint.sh` — the worker is
2713
+ // already registered (visible to the dashboard) and self-heals once
2714
+ // creds appear in `swarm_config`. See plans/2026-05-06-worker-credential-safe-loop.md.
2715
+ //
2716
+ // CRED_CHECK_DISABLE=1 opts out entirely: the worker trusts the operator
2717
+ // and starts polling immediately, with a NULL `cred_status` row that the
2718
+ // dashboard surfaces as "unreported."
2719
+ cachedCredHarnessProvider = state.harnessProvider;
2720
+ if (isCredCheckDisabled(process.env)) {
2721
+ console.log(`[${role}] CRED_CHECK_DISABLE=1, skipping credential checks`);
2722
+ } else {
2706
2723
  try {
2707
- const resp = await fetch(`${apiUrl}/me`, {
2708
- headers: {
2709
- Authorization: `Bearer ${apiKey}`,
2710
- "X-Agent-ID": agentId,
2711
- },
2712
- });
2713
- if (resp.ok) {
2714
- const profile = (await resp.json()) as {
2715
- soulMd?: string;
2716
- identityMd?: string;
2717
- claudeMd?: string;
2718
- setupScript?: string;
2719
- toolsMd?: string;
2720
- heartbeatMd?: string;
2721
- name?: string;
2722
- description?: string;
2723
- };
2724
- agentSoulMd = profile.soulMd;
2725
- agentIdentityMd = profile.identityMd;
2726
- agentSetupScript = profile.setupScript;
2727
- agentToolsMd = profile.toolsMd;
2728
- agentClaudeMd = profile.claudeMd;
2729
- agentHeartbeatMd = profile.heartbeatMd;
2730
- agentProfileName = profile.name;
2731
- agentDescription = profile.description;
2732
-
2733
- // Generate default templates if missing (runner registers via POST /api/agents
2734
- // which doesn't generate templates like join-swarm does)
2735
- if (
2736
- !agentSoulMd ||
2737
- !agentIdentityMd ||
2738
- !agentToolsMd ||
2739
- !agentClaudeMd ||
2740
- !agentHeartbeatMd
2741
- ) {
2742
- // Use already-fetched template (from pre-registration step)
2743
- if (cachedTemplate) {
2744
- const ctx = {
2745
- agent: {
2746
- name: agentProfileName || agentName,
2747
- role: role,
2748
- description: agentDescription || "",
2749
- capabilities: (capabilities || []).join(", "),
2750
- },
2751
- };
2752
- if (!agentSoulMd) agentSoulMd = interpolate(cachedTemplate.files.soulMd, ctx).result;
2753
- if (!agentIdentityMd)
2754
- agentIdentityMd = interpolate(cachedTemplate.files.identityMd, ctx).result;
2755
- if (!agentToolsMd) agentToolsMd = interpolate(cachedTemplate.files.toolsMd, ctx).result;
2756
- if (!agentClaudeMd)
2757
- agentClaudeMd = interpolate(cachedTemplate.files.claudeMd, ctx).result;
2758
- if (!agentSetupScript)
2759
- agentSetupScript = interpolate(cachedTemplate.files.setupScript, ctx).result;
2760
- if (!agentHeartbeatMd)
2761
- agentHeartbeatMd = interpolate(cachedTemplate.files.heartbeatMd, ctx).result;
2762
- console.log(`[${role}] Applied template: ${templateId}`);
2763
- }
2764
-
2765
- // Fallback to generic defaults for any still-missing fields
2766
- const agentInfo = {
2767
- name: agentProfileName || agentName,
2768
- role: role,
2769
- description: agentDescription,
2770
- capabilities: config.capabilities,
2771
- };
2772
- if (!agentSoulMd) agentSoulMd = generateDefaultSoulMd(agentInfo);
2773
- if (!agentIdentityMd) agentIdentityMd = generateDefaultIdentityMd(agentInfo);
2774
- if (!agentToolsMd) agentToolsMd = generateDefaultToolsMd(agentInfo);
2775
- if (!agentClaudeMd) agentClaudeMd = generateDefaultClaudeMd(agentInfo);
2776
-
2777
- // Push generated templates to server
2778
- try {
2779
- const profileUpdate: Record<string, string> = {};
2780
- if (!profile.soulMd) profileUpdate.soulMd = agentSoulMd;
2781
- if (!profile.identityMd) profileUpdate.identityMd = agentIdentityMd;
2782
- if (!profile.toolsMd) profileUpdate.toolsMd = agentToolsMd;
2783
- if (!profile.claudeMd && agentClaudeMd) profileUpdate.claudeMd = agentClaudeMd;
2784
- if (!profile.setupScript && agentSetupScript)
2785
- profileUpdate.setupScript = agentSetupScript;
2786
- if (!profile.heartbeatMd && agentHeartbeatMd)
2787
- profileUpdate.heartbeatMd = agentHeartbeatMd;
2788
-
2789
- await fetch(`${apiUrl}/api/agents/${agentId}/profile`, {
2790
- method: "PUT",
2791
- headers: {
2792
- Authorization: `Bearer ${apiKey}`,
2793
- "X-Agent-ID": agentId,
2794
- "Content-Type": "application/json",
2795
- },
2796
- body: JSON.stringify(profileUpdate),
2797
- });
2798
- console.log(`[${role}] Generated and saved default identity templates`);
2799
- } catch {
2800
- console.warn(`[${role}] Could not save generated templates to server`);
2724
+ await awaitCredentials({
2725
+ provider: state.harnessProvider,
2726
+ // Re-read each tick so an operator's HARNESS_PROVIDER flip during
2727
+ // the wait pivots the credential predicate (and onwards).
2728
+ getProvider: () => state.harnessProvider,
2729
+ refreshEnv: async () => {
2730
+ const { env, resolvedProvider } = await fetchResolvedEnv(apiUrl, apiKey, agentId);
2731
+ // Apply drift inside the wait so adapter/prompt/state stay in
2732
+ // sync if the operator flips HARNESS_PROVIDER mid-loop. The
2733
+ // helper is idempotent when nothing changed.
2734
+ const { agentVisibleChanged } = await applySwarmConfigDrift(env, resolvedProvider);
2735
+ if (agentVisibleChanged) {
2736
+ // Fire-and-forget — dashboard reflects the live values, the
2737
+ // wait loop doesn't block on it.
2738
+ reregisterAgent().catch(() => {});
2801
2739
  }
2802
- }
2803
-
2804
- // Fetch installed skills for system prompt
2805
- try {
2806
- const skillsResp = await fetch(`${apiUrl}/api/agents/${agentId}/skills`, {
2740
+ return env;
2741
+ },
2742
+ onTick: (status) => {
2743
+ // Best-effort status report — the dispatcher uses it to route
2744
+ // around blocked agents. Failures are non-fatal (the wait loop
2745
+ // already swallows onTick exceptions). We do NOT include
2746
+ // `cred_status` here — the live test runs once the worker is
2747
+ // ready (below), and intermediate ticks are presence-only.
2748
+ fetch(`${apiUrl}/api/agents/${encodeURIComponent(agentId)}/credential-status`, {
2749
+ method: "PUT",
2807
2750
  headers: {
2808
2751
  Authorization: `Bearer ${apiKey}`,
2809
2752
  "X-Agent-ID": agentId,
2753
+ "Content-Type": "application/json",
2810
2754
  },
2755
+ body: JSON.stringify({ ready: status.ready, missing: status.missing }),
2756
+ }).catch(() => {
2757
+ // Swallowed — Phase 2 wait loop logs every tick anyway.
2811
2758
  });
2812
- if (skillsResp.ok) {
2813
- const skillsData = (await skillsResp.json()) as {
2814
- skills: {
2815
- name: string;
2816
- description: string;
2817
- isActive: boolean;
2818
- isEnabled: boolean;
2819
- }[];
2820
- };
2821
- agentSkillsSummary = skillsData.skills
2822
- .filter((s) => s.isActive && s.isEnabled)
2823
- .map((s) => ({ name: s.name, description: s.description }));
2824
- if (agentSkillsSummary.length > 0) {
2825
- console.log(`[${role}] Loaded ${agentSkillsSummary.length} skills for system prompt`);
2826
- }
2827
- }
2828
- } catch {
2829
- // Non-fatal skills are optional
2759
+ },
2760
+ });
2761
+ } catch (err) {
2762
+ if (err instanceof BootMaxWaitExceededError) {
2763
+ console.error(`[${role}] ${err.message}`);
2764
+ process.exit(EX_CONFIG);
2765
+ }
2766
+ throw err;
2767
+ }
2768
+
2769
+ // Migration 055: build the full snapshot (presence + live test) once
2770
+ // creds are ready and POST it to the agent row. Status endpoint reads
2771
+ // this instead of running predicates server-side. Always uses the
2772
+ // *current* state.harnessProvider in case it flipped during the wait.
2773
+ try {
2774
+ const snapshot = await buildCredStatusReport(state.harnessProvider, process.env, {}, "boot");
2775
+ await reportCredStatus(apiUrl, apiKey, agentId, snapshot);
2776
+ } catch (err) {
2777
+ // Non-fatal — worker proceeds even if reporting fails.
2778
+ console.warn(`[${role}] cred_status boot report failed (non-fatal): ${err}`);
2779
+ }
2780
+ }
2781
+
2782
+ // Clean up any stale active sessions from previous runs (crash recovery)
2783
+ await cleanupActiveSessions(apiConfig);
2784
+ console.log(`[${role}] Cleaned up stale active sessions`);
2785
+
2786
+ // Fetch full agent profile to get soul/identity content
2787
+ try {
2788
+ const resp = await fetch(`${apiUrl}/me`, {
2789
+ headers: {
2790
+ Authorization: `Bearer ${apiKey}`,
2791
+ "X-Agent-ID": agentId,
2792
+ },
2793
+ });
2794
+ if (resp.ok) {
2795
+ const profile = (await resp.json()) as {
2796
+ soulMd?: string;
2797
+ identityMd?: string;
2798
+ claudeMd?: string;
2799
+ setupScript?: string;
2800
+ toolsMd?: string;
2801
+ heartbeatMd?: string;
2802
+ name?: string;
2803
+ description?: string;
2804
+ };
2805
+ agentSoulMd = profile.soulMd;
2806
+ agentIdentityMd = profile.identityMd;
2807
+ agentSetupScript = profile.setupScript;
2808
+ agentToolsMd = profile.toolsMd;
2809
+ agentClaudeMd = profile.claudeMd;
2810
+ agentHeartbeatMd = profile.heartbeatMd;
2811
+ agentProfileName = profile.name;
2812
+ agentDescription = profile.description;
2813
+
2814
+ // Generate default templates if missing (runner registers via POST /api/agents
2815
+ // which doesn't generate templates like join-swarm does)
2816
+ if (
2817
+ !agentSoulMd ||
2818
+ !agentIdentityMd ||
2819
+ !agentToolsMd ||
2820
+ !agentClaudeMd ||
2821
+ !agentHeartbeatMd
2822
+ ) {
2823
+ // Use already-fetched template (from pre-registration step)
2824
+ if (cachedTemplate) {
2825
+ const ctx = {
2826
+ agent: {
2827
+ name: agentProfileName || agentName,
2828
+ role: role,
2829
+ description: agentDescription || "",
2830
+ capabilities: (capabilities || []).join(", "),
2831
+ },
2832
+ };
2833
+ if (!agentSoulMd) agentSoulMd = interpolate(cachedTemplate.files.soulMd, ctx).result;
2834
+ if (!agentIdentityMd)
2835
+ agentIdentityMd = interpolate(cachedTemplate.files.identityMd, ctx).result;
2836
+ if (!agentToolsMd) agentToolsMd = interpolate(cachedTemplate.files.toolsMd, ctx).result;
2837
+ if (!agentClaudeMd)
2838
+ agentClaudeMd = interpolate(cachedTemplate.files.claudeMd, ctx).result;
2839
+ if (!agentSetupScript)
2840
+ agentSetupScript = interpolate(cachedTemplate.files.setupScript, ctx).result;
2841
+ if (!agentHeartbeatMd)
2842
+ agentHeartbeatMd = interpolate(cachedTemplate.files.heartbeatMd, ctx).result;
2843
+ console.log(`[${role}] Applied template: ${templateId}`);
2830
2844
  }
2831
2845
 
2832
- // Fetch installed MCP servers for system prompt
2846
+ // Fallback to generic defaults for any still-missing fields
2847
+ const agentInfo = {
2848
+ name: agentProfileName || agentName,
2849
+ role: role,
2850
+ description: agentDescription,
2851
+ capabilities: config.capabilities,
2852
+ };
2853
+ if (!agentSoulMd) agentSoulMd = generateDefaultSoulMd(agentInfo);
2854
+ if (!agentIdentityMd) agentIdentityMd = generateDefaultIdentityMd(agentInfo);
2855
+ if (!agentToolsMd) agentToolsMd = generateDefaultToolsMd(agentInfo);
2856
+ if (!agentClaudeMd) agentClaudeMd = generateDefaultClaudeMd(agentInfo);
2857
+
2858
+ // Push generated templates to server
2833
2859
  try {
2834
- const mcpServersResp = await fetch(`${apiUrl}/api/agents/${agentId}/mcp-servers`, {
2860
+ const profileUpdate: Record<string, string> = {};
2861
+ if (!profile.soulMd) profileUpdate.soulMd = agentSoulMd;
2862
+ if (!profile.identityMd) profileUpdate.identityMd = agentIdentityMd;
2863
+ if (!profile.toolsMd) profileUpdate.toolsMd = agentToolsMd;
2864
+ if (!profile.claudeMd && agentClaudeMd) profileUpdate.claudeMd = agentClaudeMd;
2865
+ if (!profile.setupScript && agentSetupScript)
2866
+ profileUpdate.setupScript = agentSetupScript;
2867
+ if (!profile.heartbeatMd && agentHeartbeatMd)
2868
+ profileUpdate.heartbeatMd = agentHeartbeatMd;
2869
+
2870
+ await fetch(`${apiUrl}/api/agents/${agentId}/profile`, {
2871
+ method: "PUT",
2835
2872
  headers: {
2836
2873
  Authorization: `Bearer ${apiKey}`,
2837
2874
  "X-Agent-ID": agentId,
2875
+ "Content-Type": "application/json",
2838
2876
  },
2877
+ body: JSON.stringify(profileUpdate),
2839
2878
  });
2840
- if (mcpServersResp.ok) {
2841
- const mcpServersData = (await mcpServersResp.json()) as {
2842
- servers: {
2843
- name: string;
2844
- transport: string;
2845
- description: string | null;
2846
- isActive: boolean;
2847
- isEnabled: boolean;
2848
- }[];
2849
- };
2850
- const activeMcpServers = mcpServersData.servers.filter(
2851
- (s) => s.isActive && s.isEnabled,
2852
- );
2853
- if (activeMcpServers.length > 0) {
2854
- agentMcpServersSummary = activeMcpServers
2855
- .map(
2856
- (s) => `- **${s.name}** (${s.transport}): ${s.description || "No description"}`,
2857
- )
2858
- .join("\n");
2859
- console.log(
2860
- `[${role}] Loaded ${activeMcpServers.length} MCP servers for system prompt`,
2861
- );
2862
- }
2863
- }
2879
+ console.log(`[${role}] Generated and saved default identity templates`);
2864
2880
  } catch {
2865
- // Non-fatal — MCP servers are optional
2881
+ console.warn(`[${role}] Could not save generated templates to server`);
2866
2882
  }
2867
-
2868
- // Rebuild system prompt with identity
2869
- basePrompt = await buildSystemPrompt();
2870
- resolvedSystemPrompt = additionalSystemPrompt
2871
- ? `${basePrompt}\n\n${additionalSystemPrompt}`
2872
- : basePrompt;
2873
- console.log(
2874
- `[${role}] Loaded agent identity (soul: ${agentSoulMd ? "yes" : "no"}, identity: ${agentIdentityMd ? "yes" : "no"}, tools: ${agentToolsMd ? "yes" : "no"}, claude: ${agentClaudeMd ? "yes" : "no"})`,
2875
- );
2876
- console.log(`[${role}] Updated system prompt length: ${resolvedSystemPrompt.length} chars`);
2877
2883
  }
2878
- } catch {
2879
- console.warn(`[${role}] Could not fetch agent profile for identity — proceeding without`);
2880
- }
2881
2884
 
2882
- // Write SOUL.md and IDENTITY.md to workspace before spawning Claude
2883
- const SOUL_MD_PATH = "/workspace/SOUL.md";
2884
- const IDENTITY_MD_PATH = "/workspace/IDENTITY.md";
2885
-
2886
- if (agentSoulMd) {
2887
- try {
2888
- await Bun.write(SOUL_MD_PATH, agentSoulMd);
2889
- console.log(`[${role}] Wrote SOUL.md to workspace`);
2890
- } catch (err) {
2891
- console.warn(`[${role}] Could not write SOUL.md: ${(err as Error).message}`);
2892
- }
2893
- }
2894
- if (agentIdentityMd) {
2885
+ // Fetch installed skills for system prompt
2895
2886
  try {
2896
- await Bun.write(IDENTITY_MD_PATH, agentIdentityMd);
2897
- console.log(`[${role}] Wrote IDENTITY.md to workspace`);
2898
- } catch (err) {
2899
- console.warn(`[${role}] Could not write IDENTITY.md: ${(err as Error).message}`);
2887
+ const skillsResp = await fetch(`${apiUrl}/api/agents/${agentId}/skills`, {
2888
+ headers: {
2889
+ Authorization: `Bearer ${apiKey}`,
2890
+ "X-Agent-ID": agentId,
2891
+ },
2892
+ });
2893
+ if (skillsResp.ok) {
2894
+ const skillsData = (await skillsResp.json()) as {
2895
+ skills: {
2896
+ name: string;
2897
+ description: string;
2898
+ isActive: boolean;
2899
+ isEnabled: boolean;
2900
+ }[];
2901
+ };
2902
+ agentSkillsSummary = skillsData.skills
2903
+ .filter((s) => s.isActive && s.isEnabled)
2904
+ .map((s) => ({ name: s.name, description: s.description }));
2905
+ if (agentSkillsSummary.length > 0) {
2906
+ console.log(`[${role}] Loaded ${agentSkillsSummary.length} skills for system prompt`);
2907
+ }
2908
+ }
2909
+ } catch {
2910
+ // Non-fatal — skills are optional
2900
2911
  }
2901
- }
2902
2912
 
2903
- // Write setup script to workspace (agent can edit during session)
2904
- // Only create if it doesn't exist — the entrypoint already composed/prepended it at container start
2905
- if (agentSetupScript) {
2913
+ // Fetch installed MCP servers for system prompt
2906
2914
  try {
2907
- if (!(await Bun.file("/workspace/start-up.sh").exists())) {
2908
- await Bun.write("/workspace/start-up.sh", `#!/bin/bash\n${agentSetupScript}\n`);
2909
- console.log(`[${role}] Wrote start-up.sh to workspace`);
2915
+ const mcpServersResp = await fetch(`${apiUrl}/api/agents/${agentId}/mcp-servers`, {
2916
+ headers: {
2917
+ Authorization: `Bearer ${apiKey}`,
2918
+ "X-Agent-ID": agentId,
2919
+ },
2920
+ });
2921
+ if (mcpServersResp.ok) {
2922
+ const mcpServersData = (await mcpServersResp.json()) as {
2923
+ servers: {
2924
+ name: string;
2925
+ transport: string;
2926
+ description: string | null;
2927
+ isActive: boolean;
2928
+ isEnabled: boolean;
2929
+ }[];
2930
+ };
2931
+ const activeMcpServers = mcpServersData.servers.filter((s) => s.isActive && s.isEnabled);
2932
+ if (activeMcpServers.length > 0) {
2933
+ agentMcpServersSummary = activeMcpServers
2934
+ .map((s) => `- **${s.name}** (${s.transport}): ${s.description || "No description"}`)
2935
+ .join("\n");
2936
+ console.log(
2937
+ `[${role}] Loaded ${activeMcpServers.length} MCP servers for system prompt`,
2938
+ );
2939
+ }
2910
2940
  }
2911
- } catch (err) {
2912
- console.warn(`[${role}] Could not write start-up.sh: ${(err as Error).message}`);
2941
+ } catch {
2942
+ // Non-fatal — MCP servers are optional
2913
2943
  }
2914
- }
2915
2944
 
2916
- // Write TOOLS.md to workspace (agent can edit during session)
2917
- if (agentToolsMd) {
2918
- try {
2919
- await Bun.write("/workspace/TOOLS.md", agentToolsMd);
2920
- console.log(`[${role}] Wrote TOOLS.md to workspace`);
2921
- } catch (err) {
2922
- console.warn(`[${role}] Could not write TOOLS.md: ${(err as Error).message}`);
2923
- }
2945
+ // Rebuild system prompt with identity
2946
+ basePrompt = await buildSystemPrompt();
2947
+ resolvedSystemPrompt = additionalSystemPrompt
2948
+ ? `${basePrompt}\n\n${additionalSystemPrompt}`
2949
+ : basePrompt;
2950
+ console.log(
2951
+ `[${role}] Loaded agent identity (soul: ${agentSoulMd ? "yes" : "no"}, identity: ${agentIdentityMd ? "yes" : "no"}, tools: ${agentToolsMd ? "yes" : "no"}, claude: ${agentClaudeMd ? "yes" : "no"})`,
2952
+ );
2953
+ console.log(`[${role}] Updated system prompt length: ${resolvedSystemPrompt.length} chars`);
2924
2954
  }
2955
+ } catch {
2956
+ console.warn(`[${role}] Could not fetch agent profile for identity — proceeding without`);
2957
+ }
2925
2958
 
2926
- // Write HEARTBEAT.md to workspace (lead's periodic checklist)
2927
- if (agentHeartbeatMd) {
2928
- try {
2929
- await Bun.write("/workspace/HEARTBEAT.md", agentHeartbeatMd);
2930
- console.log(`[${role}] Wrote HEARTBEAT.md to workspace`);
2931
- } catch (err) {
2932
- console.warn(`[${role}] Could not write HEARTBEAT.md: ${(err as Error).message}`);
2933
- }
2934
- }
2959
+ // Write SOUL.md and IDENTITY.md to workspace before spawning Claude
2960
+ const SOUL_MD_PATH = "/workspace/SOUL.md";
2961
+ const IDENTITY_MD_PATH = "/workspace/IDENTITY.md";
2935
2962
 
2936
- // Write CLAUDE.md to workspace (agent-level instructions)
2937
- if (agentClaudeMd) {
2938
- try {
2939
- await Bun.write("/workspace/CLAUDE.md", agentClaudeMd);
2940
- console.log(`[${role}] Wrote CLAUDE.md to workspace`);
2941
- } catch (err) {
2942
- console.warn(`[${role}] Could not write CLAUDE.md: ${(err as Error).message}`);
2943
- }
2963
+ if (agentSoulMd) {
2964
+ try {
2965
+ await Bun.write(SOUL_MD_PATH, agentSoulMd);
2966
+ console.log(`[${role}] Wrote SOUL.md to workspace`);
2967
+ } catch (err) {
2968
+ console.warn(`[${role}] Could not write SOUL.md: ${(err as Error).message}`);
2969
+ }
2970
+ }
2971
+ if (agentIdentityMd) {
2972
+ try {
2973
+ await Bun.write(IDENTITY_MD_PATH, agentIdentityMd);
2974
+ console.log(`[${role}] Wrote IDENTITY.md to workspace`);
2975
+ } catch (err) {
2976
+ console.warn(`[${role}] Could not write IDENTITY.md: ${(err as Error).message}`);
2944
2977
  }
2978
+ }
2945
2979
 
2946
- // ========== Sync skills to filesystem ==========
2980
+ // Write setup script to workspace (agent can edit during session)
2981
+ // Only create if it doesn't exist — the entrypoint already composed/prepended it at container start
2982
+ if (agentSetupScript) {
2947
2983
  try {
2948
- console.log(`[${role}] Syncing skills to filesystem...`);
2949
- const syncHeaders: Record<string, string> = {
2950
- "Content-Type": "application/json",
2951
- "X-Agent-ID": agentId,
2952
- };
2953
- if (apiKey) syncHeaders.Authorization = `Bearer ${apiKey}`;
2954
- const syncRes = await fetch(`${swarmUrl}/api/skills/sync-filesystem`, {
2955
- method: "POST",
2956
- headers: syncHeaders,
2957
- });
2958
- if (syncRes.ok) {
2959
- const syncResult = (await syncRes.json()) as {
2960
- synced: number;
2961
- removed: number;
2962
- errors: string[];
2963
- };
2964
- console.log(
2965
- `[${role}] Skills synced: ${syncResult.synced} written, ${syncResult.removed} removed`,
2966
- );
2967
- if (syncResult.errors.length > 0) {
2968
- console.warn(`[${role}] Skill sync errors: ${syncResult.errors.join(", ")}`);
2969
- }
2970
- } else {
2971
- console.warn(`[${role}] Skill sync failed: HTTP ${syncRes.status}`);
2984
+ if (!(await Bun.file("/workspace/start-up.sh").exists())) {
2985
+ await Bun.write("/workspace/start-up.sh", `#!/bin/bash\n${agentSetupScript}\n`);
2986
+ console.log(`[${role}] Wrote start-up.sh to workspace`);
2972
2987
  }
2973
2988
  } catch (err) {
2974
- console.warn(`[${role}] Skill sync failed: ${(err as Error).message}`);
2989
+ console.warn(`[${role}] Could not write start-up.sh: ${(err as Error).message}`);
2975
2990
  }
2991
+ }
2976
2992
 
2977
- // ========== Resume paused tasks with PRIORITY ==========
2978
- // Check for paused tasks from previous shutdown and resume them before normal polling
2993
+ // Write TOOLS.md to workspace (agent can edit during session)
2994
+ if (agentToolsMd) {
2979
2995
  try {
2980
- console.log(`[${role}] Checking for paused tasks to resume...`);
2981
- const pausedTasks = await getPausedTasksFromAPI(apiConfig);
2982
-
2983
- if (pausedTasks.length > 0) {
2984
- console.log(`[${role}] Found ${pausedTasks.length} paused task(s) to resume`);
2996
+ await Bun.write("/workspace/TOOLS.md", agentToolsMd);
2997
+ console.log(`[${role}] Wrote TOOLS.md to workspace`);
2998
+ } catch (err) {
2999
+ console.warn(`[${role}] Could not write TOOLS.md: ${(err as Error).message}`);
3000
+ }
3001
+ }
2985
3002
 
2986
- for (const task of pausedTasks) {
2987
- // Defensive: skip tasks that already have completion data (zombie prevention)
2988
- if (task.finishedAt || task.output) {
2989
- console.warn(
2990
- `[${role}] Skipping zombie task ${task.id.slice(0, 8)} — already has completion data (finishedAt: ${!!task.finishedAt}, output: ${!!task.output})`,
2991
- );
2992
- continue;
2993
- }
3003
+ // Write HEARTBEAT.md to workspace (lead's periodic checklist)
3004
+ if (agentHeartbeatMd) {
3005
+ try {
3006
+ await Bun.write("/workspace/HEARTBEAT.md", agentHeartbeatMd);
3007
+ console.log(`[${role}] Wrote HEARTBEAT.md to workspace`);
3008
+ } catch (err) {
3009
+ console.warn(`[${role}] Could not write HEARTBEAT.md: ${(err as Error).message}`);
3010
+ }
3011
+ }
2994
3012
 
2995
- // Wait if at capacity (though unlikely on fresh startup)
2996
- while (state.activeTasks.size >= state.maxConcurrent) {
2997
- await checkCompletedProcesses(state, role, apiConfig);
2998
- await Bun.sleep(1000);
2999
- }
3013
+ // Write CLAUDE.md to workspace (agent-level instructions)
3014
+ if (agentClaudeMd) {
3015
+ try {
3016
+ await Bun.write("/workspace/CLAUDE.md", agentClaudeMd);
3017
+ console.log(`[${role}] Wrote CLAUDE.md to workspace`);
3018
+ } catch (err) {
3019
+ console.warn(`[${role}] Could not write CLAUDE.md: ${(err as Error).message}`);
3020
+ }
3021
+ }
3000
3022
 
3001
- console.log(
3002
- `[${role}] Resuming paused task ${task.id.slice(0, 8)}: "${task.task.slice(0, 50)}..."`,
3003
- );
3023
+ // ========== Sync skills to filesystem ==========
3024
+ try {
3025
+ console.log(`[${role}] Syncing skills to filesystem...`);
3026
+ const syncHeaders: Record<string, string> = {
3027
+ "Content-Type": "application/json",
3028
+ "X-Agent-ID": agentId,
3029
+ };
3030
+ if (apiKey) syncHeaders.Authorization = `Bearer ${apiKey}`;
3031
+ const syncRes = await fetch(`${swarmUrl}/api/skills/sync-filesystem`, {
3032
+ method: "POST",
3033
+ headers: syncHeaders,
3034
+ });
3035
+ if (syncRes.ok) {
3036
+ const syncResult = (await syncRes.json()) as {
3037
+ synced: number;
3038
+ removed: number;
3039
+ errors: string[];
3040
+ };
3041
+ console.log(
3042
+ `[${role}] Skills synced: ${syncResult.synced} written, ${syncResult.removed} removed`,
3043
+ );
3044
+ if (syncResult.errors.length > 0) {
3045
+ console.warn(`[${role}] Skill sync errors: ${syncResult.errors.join(", ")}`);
3046
+ }
3047
+ } else {
3048
+ console.warn(`[${role}] Skill sync failed: HTTP ${syncRes.status}`);
3049
+ }
3050
+ } catch (err) {
3051
+ console.warn(`[${role}] Skill sync failed: ${(err as Error).message}`);
3052
+ }
3004
3053
 
3005
- // Resume the task via API (marks as in_progress)
3006
- const resumed = await resumeTaskViaAPI(apiConfig, task.id);
3007
- if (!resumed) {
3008
- console.warn(
3009
- `[${role}] Failed to resume task ${task.id.slice(0, 8)} via API, skipping`,
3010
- );
3011
- continue;
3012
- }
3054
+ // ========== Resume paused tasks with PRIORITY ==========
3055
+ // Check for paused tasks from previous shutdown and resume them before normal polling
3056
+ try {
3057
+ console.log(`[${role}] Checking for paused tasks to resume...`);
3058
+ const pausedTasks = await getPausedTasksFromAPI(apiConfig);
3013
3059
 
3014
- // Build prompt with resume context + memory injection
3015
- let resumePrompt = await buildResumePrompt(task, adapter.formatCommand.bind(adapter), {
3016
- hasMcp: adapter.traits.hasMcp,
3017
- });
3060
+ if (pausedTasks.length > 0) {
3061
+ console.log(`[${role}] Found ${pausedTasks.length} paused task(s) to resume`);
3018
3062
 
3019
- // Inject relevant memories for resumed tasks
3020
- const resumeMemoryContext = await fetchRelevantMemories(
3021
- apiUrl,
3022
- apiKey,
3023
- agentId,
3024
- task.task,
3025
- task.id,
3063
+ for (const task of pausedTasks) {
3064
+ // Defensive: skip tasks that already have completion data (zombie prevention)
3065
+ if (task.finishedAt || task.output) {
3066
+ console.warn(
3067
+ `[${role}] Skipping zombie task ${task.id.slice(0, 8)} — already has completion data (finishedAt: ${!!task.finishedAt}, output: ${!!task.output})`,
3026
3068
  );
3027
- if (resumeMemoryContext) {
3028
- resumePrompt += resumeMemoryContext;
3029
- console.log(`[${role}] Injected relevant memories into resumed task prompt`);
3030
- }
3069
+ continue;
3070
+ }
3031
3071
 
3032
- // Resolve --resume: prefer own session ID, then parent's
3033
- let resumeAdditionalArgs = opts.additionalArgs || [];
3034
- if (task.claudeSessionId) {
3035
- resumeAdditionalArgs = [...resumeAdditionalArgs, "--resume", task.claudeSessionId];
3036
- console.log(
3037
- `[${role}] Resuming task's own session ${task.claudeSessionId.slice(0, 8)}`,
3038
- );
3039
- } else if (task.parentTaskId) {
3040
- const parentSessionId = await fetchProviderSessionId(apiUrl, apiKey, task.parentTaskId);
3041
- if (parentSessionId) {
3042
- resumeAdditionalArgs = [...resumeAdditionalArgs, "--resume", parentSessionId];
3043
- console.log(`[${role}] Resuming parent session ${parentSessionId.slice(0, 8)}`);
3044
- }
3045
- }
3072
+ // Wait if at capacity (though unlikely on fresh startup)
3073
+ while (state.activeTasks.size >= state.maxConcurrent) {
3074
+ await checkCompletedProcesses(state, role, apiConfig);
3075
+ await Bun.sleep(1000);
3076
+ }
3046
3077
 
3047
- // Spawn Claude process for resumed task
3048
- iteration++;
3049
- const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3050
- const logFile = `${logDir}/${timestamp}-resume-${task.id.slice(0, 8)}.jsonl`;
3051
-
3052
- console.log(`\n[${role}] === Resuming paused task (iteration ${iteration}) ===`);
3053
- console.log(`[${role}] Logging to: ${logFile}`);
3054
- console.log(`[${role}] Prompt: ${resumePrompt.slice(0, 100)}...`);
3055
-
3056
- const metadata = {
3057
- type: metadataType,
3058
- sessionId,
3059
- iteration,
3060
- timestamp: new Date().toISOString(),
3061
- prompt: resumePrompt,
3062
- trigger: "task_resumed",
3063
- resumedTaskId: task.id,
3064
- yolo: isYolo,
3065
- };
3066
- await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3067
-
3068
- // Resolve cwd for resumed task (mirrors normal task path: task.dir > vcsRepo clonePath)
3069
- let resumeCwd: string | undefined;
3070
- if (task.dir) {
3071
- try {
3072
- if (existsSync(task.dir) && statSync(task.dir).isDirectory()) {
3073
- resumeCwd = task.dir;
3074
- } else {
3075
- console.warn(
3076
- `[${role}] Resume task dir "${task.dir}" does not exist or is not a directory, falling back to default cwd`,
3077
- );
3078
- }
3079
- } catch {
3080
- console.warn(
3081
- `[${role}] Failed to check resume task dir "${task.dir}", falling back to default cwd`,
3082
- );
3083
- }
3084
- }
3078
+ console.log(
3079
+ `[${role}] Resuming paused task ${task.id.slice(0, 8)}: "${task.task.slice(0, 50)}..."`,
3080
+ );
3085
3081
 
3086
- if (!resumeCwd && task.vcsRepo && apiUrl) {
3087
- const repoConfig = await fetchRepoConfig(apiUrl, apiKey, task.vcsRepo);
3088
- const effectiveConfig = repoConfig ?? {
3089
- url: task.vcsRepo,
3090
- name: task.vcsRepo.split("/").pop() || task.vcsRepo,
3091
- clonePath: `/workspace/repos/${task.vcsRepo.split("/").pop() || task.vcsRepo}`,
3092
- defaultBranch: "main",
3093
- };
3094
- const repoContext = await ensureRepoForTask(effectiveConfig, role);
3095
- if (repoContext?.clonePath) {
3096
- resumeCwd = repoContext.clonePath;
3097
- }
3098
- }
3082
+ // Resume the task via API (marks as in_progress)
3083
+ const resumed = await resumeTaskViaAPI(apiConfig, task.id);
3084
+ if (!resumed) {
3085
+ console.warn(`[${role}] Failed to resume task ${task.id.slice(0, 8)} via API, skipping`);
3086
+ continue;
3087
+ }
3099
3088
 
3100
- // Per-task runner session ID so session logs are scoped to this task
3101
- const resumeRunnerSessionId = crypto.randomUUID();
3089
+ // Build prompt with resume context + memory injection
3090
+ let resumePrompt = await buildResumePrompt(task, adapter.formatCommand.bind(adapter), {
3091
+ hasMcp: adapter.traits.hasMcp,
3092
+ });
3102
3093
 
3103
- let runningTask: RunningTask;
3104
- try {
3105
- runningTask = await spawnProviderProcess(
3106
- adapter,
3107
- {
3108
- prompt: resumePrompt,
3109
- logFile,
3110
- systemPrompt: resolvedSystemPrompt,
3111
- additionalArgs: resumeAdditionalArgs,
3112
- role,
3113
- apiUrl,
3114
- apiKey,
3115
- agentId,
3116
- runnerSessionId: resumeRunnerSessionId,
3117
- iteration,
3118
- taskId: task.id,
3119
- model: (task as { model?: string }).model,
3120
- cwd: resumeCwd,
3121
- vcsRepo: task.vcsRepo,
3122
- },
3123
- logDir,
3124
- isYolo,
3125
- );
3126
- } catch (spawnErr) {
3127
- const errMsg = spawnErr instanceof Error ? spawnErr.message : String(spawnErr);
3128
- console.error(
3129
- `[${role}] Failed to spawn process for resumed task ${task.id.slice(0, 8)}: ${errMsg}`,
3130
- );
3131
- await ensureTaskFinished(
3132
- apiConfig,
3133
- role,
3134
- task.id,
3135
- 1,
3136
- `Spawn failed: ${errMsg}`,
3137
- undefined,
3138
- state.harnessProvider,
3139
- );
3140
- continue;
3141
- }
3094
+ // Inject relevant memories for resumed tasks
3095
+ const resumeMemoryContext = await fetchRelevantMemories(
3096
+ apiUrl,
3097
+ apiKey,
3098
+ agentId,
3099
+ task.task,
3100
+ task.id,
3101
+ );
3102
+ if (resumeMemoryContext) {
3103
+ resumePrompt += resumeMemoryContext;
3104
+ console.log(`[${role}] Injected relevant memories into resumed task prompt`);
3105
+ }
3142
3106
 
3143
- state.activeTasks.set(task.id, runningTask);
3144
- registerActiveSession(apiConfig, {
3145
- taskId: task.id,
3146
- triggerType: "task_resumed",
3147
- taskDescription: task.task?.slice(0, 200),
3148
- runnerSessionId: resumeRunnerSessionId,
3149
- });
3150
- console.log(
3151
- `[${role}] Resumed task ${task.id.slice(0, 8)} (${state.activeTasks.size}/${state.maxConcurrent} active)`,
3152
- );
3107
+ // Resolve --resume: prefer own session ID, then parent's
3108
+ let resumeAdditionalArgs = opts.additionalArgs || [];
3109
+ if (task.claudeSessionId) {
3110
+ resumeAdditionalArgs = [...resumeAdditionalArgs, "--resume", task.claudeSessionId];
3111
+ console.log(`[${role}] Resuming task's own session ${task.claudeSessionId.slice(0, 8)}`);
3112
+ } else if (task.parentTaskId) {
3113
+ const parentSessionId = await fetchProviderSessionId(apiUrl, apiKey, task.parentTaskId);
3114
+ if (parentSessionId) {
3115
+ resumeAdditionalArgs = [...resumeAdditionalArgs, "--resume", parentSessionId];
3116
+ console.log(`[${role}] Resuming parent session ${parentSessionId.slice(0, 8)}`);
3117
+ }
3153
3118
  }
3154
3119
 
3155
- console.log(`[${role}] All paused tasks resumed. Entering normal polling...`);
3156
- } else {
3157
- console.log(`[${role}] No paused tasks found. Entering normal polling...`);
3158
- }
3159
- } catch (error) {
3160
- console.error(`[${role}] Error checking/resuming paused tasks: ${error}`);
3161
- // Continue to normal polling even if resume fails
3162
- }
3163
- // ========== END: Resume paused tasks ==========
3120
+ // Spawn Claude process for resumed task
3121
+ iteration++;
3122
+ const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3123
+ const logFile = `${logDir}/${timestamp}-resume-${task.id.slice(0, 8)}.jsonl`;
3164
3124
 
3165
- // ========== Lead startup self-check ==========
3166
- if (isLead) {
3167
- console.log(`[${role}] Running startup heartbeat sweep...`);
3168
- const swept = await triggerHeartbeatSweep(apiConfig);
3169
- if (swept) {
3170
- console.log(`[${role}] Startup heartbeat sweep completed`);
3171
- } else {
3172
- console.warn(`[${role}] Startup heartbeat sweep failed (non-fatal)`);
3173
- }
3174
- }
3125
+ console.log(`\n[${role}] === Resuming paused task (iteration ${iteration}) ===`);
3126
+ console.log(`[${role}] Logging to: ${logFile}`);
3127
+ console.log(`[${role}] Prompt: ${resumePrompt.slice(0, 100)}...`);
3175
3128
 
3176
- // Phase 4 — exponential back-off state for `budget_refused` triggers.
3177
- // Resets to 0 on any non-refused outcome. Lives outside the loop so
3178
- // state persists across iterations.
3179
- let consecutiveBudgetRefusals = 0;
3180
-
3181
- // Track last finished task check for leads (to avoid re-processing)
3182
- while (true) {
3183
- // Ping server on each iteration to keep status updated
3184
- await pingServer(apiConfig, role);
3185
-
3186
- // Check for completed processes first and ensure tasks are marked as finished
3187
- await checkCompletedProcesses(state, role, apiConfig);
3188
-
3189
- // Live HARNESS_PROVIDER reconciliation. Re-fetches `swarm_config` (overlaid
3190
- // on env) and swaps the adapter if the resolved provider changed —
3191
- // typically because an operator PATCH'd /api/agents/:id/harness-provider
3192
- // (which writes a swarm_config row) or upserted a config row directly.
3193
- //
3194
- // Safety: in-flight sessions hold their own `ProviderSession` references
3195
- // and continue on the old adapter unaffected. New spawns (below) read
3196
- // the current `adapter` binding and pick up the swap. `basePrompt` is
3197
- // rebuilt because traits (and therefore prompt content) may differ across
3198
- // providers.
3199
- if (Date.now() - lastHarnessReconcileAt > HARNESS_RECONCILE_INTERVAL_MS) {
3200
- lastHarnessReconcileAt = Date.now();
3201
- try {
3202
- const { resolvedProvider } = await fetchResolvedEnv(apiUrl, apiKey, agentId);
3203
- if (resolvedProvider !== state.harnessProvider) {
3204
- const previous = state.harnessProvider;
3205
- console.log(
3206
- `[${role}] [harness] Reconciling adapter: ${previous} → ${resolvedProvider}`,
3207
- );
3208
- try {
3209
- adapter = createProviderAdapter(resolvedProvider);
3210
- state.harnessProvider = resolvedProvider;
3211
- basePrompt = await buildSystemPrompt();
3212
- resolvedSystemPrompt = additionalSystemPrompt
3213
- ? `${basePrompt}\n\n${additionalSystemPrompt}`
3214
- : basePrompt;
3215
- // Force a fresh cred_status report below for the new provider.
3216
- cachedCredHarnessProvider = null;
3217
- console.log(
3218
- `[${role}] [harness] Swapped to ${resolvedProvider} (basePrompt rebuilt: ${basePrompt.length} chars)`,
3219
- );
3220
- } catch (err) {
3129
+ const metadata = {
3130
+ type: metadataType,
3131
+ sessionId,
3132
+ iteration,
3133
+ timestamp: new Date().toISOString(),
3134
+ prompt: resumePrompt,
3135
+ trigger: "task_resumed",
3136
+ resumedTaskId: task.id,
3137
+ yolo: isYolo,
3138
+ };
3139
+ await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3140
+
3141
+ // Resolve cwd for resumed task (mirrors normal task path: task.dir > vcsRepo clonePath)
3142
+ let resumeCwd: string | undefined;
3143
+ if (task.dir) {
3144
+ try {
3145
+ if (existsSync(task.dir) && statSync(task.dir).isDirectory()) {
3146
+ resumeCwd = task.dir;
3147
+ } else {
3221
3148
  console.warn(
3222
- `[${role}] [harness] Failed to swap to ${resolvedProvider} (staying on ${previous}): ${err}`,
3149
+ `[${role}] Resume task dir "${task.dir}" does not exist or is not a directory, falling back to default cwd`,
3223
3150
  );
3224
3151
  }
3152
+ } catch {
3153
+ console.warn(
3154
+ `[${role}] Failed to check resume task dir "${task.dir}", falling back to default cwd`,
3155
+ );
3225
3156
  }
3226
- } catch (err) {
3227
- console.warn(`[${role}] [harness] Reconcile fetch failed (non-fatal): ${err}`);
3228
3157
  }
3229
- }
3230
3158
 
3231
- // Migration 055 — post-task credential refresh, cache-keyed on the
3232
- // *resolved* harness_provider. Re-runs the snapshot when the provider
3233
- // changes (boot, or after a live swap above) so the dashboard shows
3234
- // up-to-date credential status for the active adapter.
3235
- if (!isCredCheckDisabled(process.env)) {
3236
- const currentHarness = state.harnessProvider;
3237
- if (currentHarness !== cachedCredHarnessProvider) {
3238
- cachedCredHarnessProvider = currentHarness;
3239
- buildCredStatusReport(currentHarness, process.env, {}, "post_task")
3240
- .then((snap) => reportCredStatus(apiUrl, apiKey, agentId, snap))
3241
- .catch((err) =>
3242
- console.warn(`[${role}] cred_status post_task report failed (non-fatal): ${err}`),
3243
- );
3159
+ if (!resumeCwd && task.vcsRepo && apiUrl) {
3160
+ const repoConfig = await fetchRepoConfig(apiUrl, apiKey, task.vcsRepo);
3161
+ const effectiveConfig = repoConfig ?? {
3162
+ url: task.vcsRepo,
3163
+ name: task.vcsRepo.split("/").pop() || task.vcsRepo,
3164
+ clonePath: `/workspace/repos/${task.vcsRepo.split("/").pop() || task.vcsRepo}`,
3165
+ defaultBranch: "main",
3166
+ };
3167
+ const repoContext = await ensureRepoForTask(effectiveConfig, role);
3168
+ if (repoContext?.clonePath) {
3169
+ resumeCwd = repoContext.clonePath;
3170
+ }
3244
3171
  }
3245
- }
3246
3172
 
3247
- // Periodic VCS detection for running tasks (fire-and-forget, throttled per task)
3248
- const now = Date.now();
3249
- for (const [taskId, task] of state.activeTasks) {
3250
- if (vcsDetectedTasks.has(taskId)) continue;
3251
- const lastCheck = vcsCheckTimestamps.get(taskId) ?? 0;
3252
- if (now - lastCheck < VCS_CHECK_INTERVAL) continue;
3253
- if (!task.workingDir) continue;
3173
+ // Per-task runner session ID so session logs are scoped to this task
3174
+ const resumeRunnerSessionId = crypto.randomUUID();
3254
3175
 
3255
- vcsCheckTimestamps.set(taskId, now);
3256
- detectVcsForTask(apiUrl, apiKey, taskId, task.workingDir);
3257
- }
3258
-
3259
- // Check for cancelled tasks and signal their subprocesses
3260
- if (state.activeTasks.size > 0) {
3261
- for (const [taskId, task] of state.activeTasks) {
3262
- if (cancelledSignaled.has(taskId)) continue; // Already sent SIGTERM
3263
- try {
3264
- const cancelResp = await fetch(
3265
- `${apiUrl}/cancelled-tasks?taskId=${encodeURIComponent(taskId)}`,
3266
- {
3267
- headers: {
3268
- Authorization: `Bearer ${apiKey}`,
3269
- "X-Agent-ID": agentId,
3270
- },
3271
- },
3272
- );
3273
- if (cancelResp.ok) {
3274
- const cancelData = (await cancelResp.json()) as {
3275
- cancelled: Array<{ id: string }>;
3276
- };
3277
- if (cancelData.cancelled?.some((t) => t.id === taskId)) {
3278
- console.log(
3279
- `[${role}] Task ${taskId.slice(0, 8)} was cancelled sending SIGTERM to subprocess`,
3280
- );
3281
- task.session.abort().catch(() => {});
3282
- cancelledSignaled.add(taskId);
3283
- }
3284
- }
3285
- } catch {
3286
- // Non-blocking — cancellation check is best-effort
3287
- }
3176
+ let runningTask: RunningTask;
3177
+ try {
3178
+ runningTask = await spawnProviderProcess(
3179
+ adapter,
3180
+ {
3181
+ prompt: resumePrompt,
3182
+ logFile,
3183
+ systemPrompt: resolvedSystemPrompt,
3184
+ additionalArgs: resumeAdditionalArgs,
3185
+ role,
3186
+ apiUrl,
3187
+ apiKey,
3188
+ agentId,
3189
+ runnerSessionId: resumeRunnerSessionId,
3190
+ iteration,
3191
+ taskId: task.id,
3192
+ model: (task as { model?: string }).model,
3193
+ cwd: resumeCwd,
3194
+ vcsRepo: task.vcsRepo,
3195
+ },
3196
+ logDir,
3197
+ isYolo,
3198
+ );
3199
+ } catch (spawnErr) {
3200
+ const errMsg = spawnErr instanceof Error ? spawnErr.message : String(spawnErr);
3201
+ console.error(
3202
+ `[${role}] Failed to spawn process for resumed task ${task.id.slice(0, 8)}: ${errMsg}`,
3203
+ );
3204
+ await ensureTaskFinished(
3205
+ apiConfig,
3206
+ role,
3207
+ task.id,
3208
+ 1,
3209
+ `Spawn failed: ${errMsg}`,
3210
+ undefined,
3211
+ state.harnessProvider,
3212
+ );
3213
+ continue;
3288
3214
  }
3289
- }
3290
3215
 
3291
- // Only poll if we have capacity
3292
- if (state.activeTasks.size < state.maxConcurrent) {
3216
+ state.activeTasks.set(task.id, runningTask);
3217
+ registerActiveSession(apiConfig, {
3218
+ taskId: task.id,
3219
+ triggerType: "task_resumed",
3220
+ taskDescription: task.task?.slice(0, 200),
3221
+ runnerSessionId: resumeRunnerSessionId,
3222
+ });
3293
3223
  console.log(
3294
- `[${role}] Polling for triggers (${state.activeTasks.size}/${state.maxConcurrent} active)...`,
3224
+ `[${role}] Resumed task ${task.id.slice(0, 8)} (${state.activeTasks.size}/${state.maxConcurrent} active)`,
3295
3225
  );
3226
+ }
3296
3227
 
3297
- // Use shorter timeout if tasks are running (to check completion more often)
3298
- const effectiveTimeout = state.activeTasks.size > 0 ? 5000 : PollTimeoutMs;
3299
-
3300
- const trigger = await pollForTrigger({
3301
- apiUrl,
3302
- apiKey,
3303
- agentId,
3304
- pollInterval: PollIntervalMs,
3305
- pollTimeout: effectiveTimeout,
3306
- });
3228
+ console.log(`[${role}] All paused tasks resumed. Entering normal polling...`);
3229
+ } else {
3230
+ console.log(`[${role}] No paused tasks found. Entering normal polling...`);
3231
+ }
3232
+ } catch (error) {
3233
+ console.error(`[${role}] Error checking/resuming paused tasks: ${error}`);
3234
+ // Continue to normal polling even if resume fails
3235
+ }
3236
+ // ========== END: Resume paused tasks ==========
3237
+
3238
+ // ========== Lead startup self-check ==========
3239
+ if (isLead) {
3240
+ console.log(`[${role}] Running startup heartbeat sweep...`);
3241
+ const swept = await triggerHeartbeatSweep(apiConfig);
3242
+ if (swept) {
3243
+ console.log(`[${role}] Startup heartbeat sweep completed`);
3244
+ } else {
3245
+ console.warn(`[${role}] Startup heartbeat sweep failed (non-fatal)`);
3246
+ }
3247
+ }
3307
3248
 
3308
- if (trigger) {
3309
- // Phase 4 — server refused to admit a claim because the agent or
3310
- // global budget is exhausted. Log a structured payload (scrubbed
3311
- // at egress per project convention) and back off exponentially.
3312
- // We deliberately `continue` BEFORE the empty-poll counter logic
3313
- // below — refusals are not empty polls.
3314
- if (trigger.type === "budget_refused") {
3315
- consecutiveBudgetRefusals++;
3316
- const backoffMs = computeBudgetBackoffMs(consecutiveBudgetRefusals, PollIntervalMs);
3317
- const refusalPayload = JSON.stringify({
3318
- event: "budget_refused",
3319
- cause: trigger.cause,
3320
- agentSpend: trigger.agentSpend,
3321
- agentBudget: trigger.agentBudget,
3322
- globalSpend: trigger.globalSpend,
3323
- globalBudget: trigger.globalBudget,
3324
- resetAt: trigger.resetAt,
3325
- consecutiveRefusals: consecutiveBudgetRefusals,
3326
- backoffMs,
3327
- });
3328
- console.log(
3329
- `[${role}] budget_refused — backing off ${backoffMs}ms: ${scrubSecrets(refusalPayload)}`,
3330
- );
3331
- await Bun.sleep(backoffMs);
3332
- continue;
3333
- }
3249
+ // Phase 4 — exponential back-off state for `budget_refused` triggers.
3250
+ // Resets to 0 on any non-refused outcome. Lives outside the loop so
3251
+ // state persists across iterations.
3252
+ let consecutiveBudgetRefusals = 0;
3334
3253
 
3335
- // Any other non-null trigger means we're being admitted normally
3336
- // reset the back-off so the next refusal starts at base interval.
3337
- consecutiveBudgetRefusals = 0;
3338
-
3339
- console.log(`[${role}] Trigger received: ${trigger.type}`);
3340
-
3341
- if (
3342
- trigger.taskId &&
3343
- (trigger.type === "task_assigned" || trigger.type === "task_offered")
3344
- ) {
3345
- ensure({
3346
- id: "worker_received",
3347
- flow: "task",
3348
- runId: trigger.taskId,
3349
- depIds: ["started"],
3350
- data: {
3351
- taskId: trigger.taskId,
3352
- agentId,
3353
- triggerType: trigger.type,
3354
- role,
3355
- },
3356
- // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
3357
- filter: ({}, ctx) => ctx.deps.length > 0,
3358
- conditions: [{ timeout_ms: 60_000 }], // 1 min: immediate after poll
3359
- });
3360
- }
3254
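+ // Main polling loop. NOTE(review): the earlier comment here, "Track last finished task check for leads (to avoid re-processing)", appears stale — no such tracking state is declared at this point.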
3255
+ while (true) {
3256
+ // Ping server on each iteration to keep status updated
3257
+ await pingServer(apiConfig, role);
3361
3258
 
3362
- // Build prompt based on trigger
3363
- let triggerPrompt = await buildPromptForTrigger(
3364
- trigger,
3365
- prompt,
3366
- adapter.formatCommand.bind(adapter),
3367
- { hasMcp: adapter.traits.hasMcp },
3368
- );
3259
+ // Check for completed processes first and ensure tasks are marked as finished
3260
+ await checkCompletedProcesses(state, role, apiConfig);
3369
3261
 
3370
- // Enrich prompt with relevant memories from past sessions
3371
- if (trigger.type === "task_assigned" || trigger.type === "task_offered") {
3372
- const task =
3373
- trigger.task && typeof trigger.task === "object" && "task" in trigger.task
3374
- ? (trigger.task as { task: string; id?: string })
3375
- : null;
3376
- if (task?.task) {
3377
- const memoryContext = await fetchRelevantMemories(
3378
- apiUrl,
3379
- apiKey,
3380
- agentId,
3381
- task.task,
3382
- task.id,
3383
- );
3384
- if (memoryContext) {
3385
- triggerPrompt += memoryContext;
3386
- console.log(`[${role}] Injected relevant memories into task prompt`);
3387
- }
3388
- }
3389
- }
3262
+ // Live HARNESS_PROVIDER reconciliation. Re-fetches `swarm_config` (overlaid
3263
+ // on env) and swaps the adapter if the resolved provider changed —
3264
+ // typically because an operator PATCH'd /api/agents/:id/harness-provider
3265
+ // (which writes a swarm_config row) or upserted a config row directly.
3266
+ //
3267
+ // Safety: in-flight sessions hold their own `ProviderSession` references
3268
+ // and continue on the old adapter unaffected. New spawns (below) read
3269
+ // the current `adapter` binding and pick up the swap. `basePrompt` is
3270
+ // rebuilt because traits (and therefore prompt content) may differ across
3271
+ // providers.
3272
+ if (Date.now() - lastHarnessReconcileAt > HARNESS_RECONCILE_INTERVAL_MS) {
3273
+ lastHarnessReconcileAt = Date.now();
3274
+ try {
3275
+ const { env: freshEnv, resolvedProvider } = await fetchResolvedEnv(apiUrl, apiKey, agentId);
3276
+ const { agentVisibleChanged } = await applySwarmConfigDrift(freshEnv, resolvedProvider);
3277
+ if (agentVisibleChanged) {
3278
+ // Re-register so the agents row + dashboard reflect the live
3279
+ // harness_provider / maxTasks. Idempotent: only writes columns
3280
+ // that actually changed (see src/http/agents.ts).
3281
+ await reregisterAgent();
3282
+ }
3283
+ } catch (err) {
3284
+ console.warn(`[${role}] [harness] Reconcile fetch failed (non-fatal): ${err}`);
3285
+ }
3286
+ }
3390
3287
 
3391
- // Resolve --resume for child tasks with parentTaskId
3392
- let effectiveAdditionalArgs = opts.additionalArgs || [];
3393
- const taskObj = trigger.task as { parentTaskId?: string } | undefined;
3394
- if (taskObj?.parentTaskId) {
3395
- const parentSessionId = await fetchProviderSessionId(
3396
- apiUrl,
3397
- apiKey,
3398
- taskObj.parentTaskId,
3399
- );
3400
- if (parentSessionId) {
3401
- effectiveAdditionalArgs = [...effectiveAdditionalArgs, "--resume", parentSessionId];
3402
- console.log(
3403
- `[${role}] Child task — resuming parent session ${parentSessionId.slice(0, 8)}`,
3404
- );
3405
- } else {
3406
- console.log(`[${role}] Child task — parent session ID not found, starting fresh`);
3407
- }
3408
- }
3288
+ // Migration 055 post-task credential refresh, cache-keyed on the
3289
+ // *resolved* harness_provider. Re-runs the snapshot when the provider
3290
+ // changes (boot, or after a live swap above) so the dashboard shows
3291
+ // up-to-date credential status for the active adapter.
3292
+ if (!isCredCheckDisabled(process.env)) {
3293
+ const currentHarness = state.harnessProvider;
3294
+ if (currentHarness !== cachedCredHarnessProvider) {
3295
+ cachedCredHarnessProvider = currentHarness;
3296
+ buildCredStatusReport(currentHarness, process.env, {}, "post_task")
3297
+ .then((snap) => reportCredStatus(apiUrl, apiKey, agentId, snap))
3298
+ .catch((err) =>
3299
+ console.warn(`[${role}] cred_status post_task report failed (non-fatal): ${err}`),
3300
+ );
3301
+ }
3302
+ }
3409
3303
 
3410
- // Extract model from task data for per-task model selection
3411
- const taskModel = (trigger.task as { model?: string } | undefined)?.model;
3304
+ // Periodic VCS detection for running tasks (fire-and-forget, throttled per task)
3305
+ const now = Date.now();
3306
+ for (const [taskId, task] of state.activeTasks) {
3307
+ if (vcsDetectedTasks.has(taskId)) continue;
3308
+ const lastCheck = vcsCheckTimestamps.get(taskId) ?? 0;
3309
+ if (now - lastCheck < VCS_CHECK_INTERVAL) continue;
3310
+ if (!task.workingDir) continue;
3412
3311
 
3413
- // Detect Slack context for conditional prompt sections
3414
- const taskSlackChannelId = (trigger.task as { slackChannelId?: string } | undefined)
3415
- ?.slackChannelId;
3416
- const taskSlackThreadTs = (trigger.task as { slackThreadTs?: string } | undefined)
3417
- ?.slackThreadTs;
3418
- currentTaskSlackContext = taskSlackChannelId
3419
- ? { channelId: taskSlackChannelId, threadTs: taskSlackThreadTs }
3420
- : undefined;
3312
+ vcsCheckTimestamps.set(taskId, now);
3313
+ detectVcsForTask(apiUrl, apiKey, taskId, task.workingDir);
3314
+ }
3421
3315
 
3422
- // Handle repo context for tasks with vcsRepo (GitHub/GitLab)
3423
- const taskVcsRepo = (trigger.task as { vcsRepo?: string } | undefined)?.vcsRepo;
3424
- if (taskVcsRepo && apiUrl) {
3425
- const repoConfig = await fetchRepoConfig(apiUrl, apiKey, taskVcsRepo);
3426
- // Fall back to convention-based config if repo is not registered
3427
- const effectiveConfig = repoConfig ?? {
3428
- url: taskVcsRepo,
3429
- name: taskVcsRepo.split("/").pop() || taskVcsRepo,
3430
- clonePath: `/workspace/repos/${taskVcsRepo.split("/").pop() || taskVcsRepo}`,
3431
- defaultBranch: "main",
3432
- };
3433
- const repoResult = await ensureRepoForTask(effectiveConfig, role);
3434
- currentRepoContext = {
3435
- ...repoResult,
3436
- guidelines: repoConfig?.guidelines ?? null,
3316
+ // Check for cancelled tasks and signal their subprocesses
3317
+ if (state.activeTasks.size > 0) {
3318
+ for (const [taskId, task] of state.activeTasks) {
3319
+ if (cancelledSignaled.has(taskId)) continue; // Already sent SIGTERM
3320
+ try {
3321
+ const cancelResp = await fetch(
3322
+ `${apiUrl}/cancelled-tasks?taskId=${encodeURIComponent(taskId)}`,
3323
+ {
3324
+ headers: {
3325
+ Authorization: `Bearer ${apiKey}`,
3326
+ "X-Agent-ID": agentId,
3327
+ },
3328
+ },
3329
+ );
3330
+ if (cancelResp.ok) {
3331
+ const cancelData = (await cancelResp.json()) as {
3332
+ cancelled: Array<{ id: string }>;
3437
3333
  };
3438
- } else {
3439
- currentRepoContext = undefined;
3440
- }
3441
-
3442
- // Resolve effective working directory (priority: task.dir > repoContext.clonePath > process.cwd())
3443
- const taskDir = (trigger.task as { dir?: string } | undefined)?.dir;
3444
- let effectiveCwd: string | undefined;
3445
-
3446
- if (taskDir) {
3447
- try {
3448
- if (existsSync(taskDir) && statSync(taskDir).isDirectory()) {
3449
- effectiveCwd = taskDir;
3450
- } else {
3451
- console.warn(
3452
- `[${role}] Task dir "${taskDir}" does not exist or is not a directory, falling back to default cwd`,
3453
- );
3454
- }
3455
- } catch {
3456
- console.warn(
3457
- `[${role}] Failed to check task dir "${taskDir}", falling back to default cwd`,
3334
+ if (cancelData.cancelled?.some((t) => t.id === taskId)) {
3335
+ console.log(
3336
+ `[${role}] Task ${taskId.slice(0, 8)} was cancelled — sending SIGTERM to subprocess`,
3458
3337
  );
3338
+ task.session.abort().catch(() => {});
3339
+ cancelledSignaled.add(taskId);
3459
3340
  }
3460
3341
  }
3342
+ } catch {
3343
+ // Non-blocking — cancellation check is best-effort
3344
+ }
3345
+ }
3346
+ }
3461
3347
 
3462
- if (!effectiveCwd && currentRepoContext?.clonePath) {
3463
- effectiveCwd = currentRepoContext.clonePath;
3464
- }
3465
-
3466
- // Annotate prompt with working directory context
3467
- if (effectiveCwd && effectiveCwd !== process.cwd()) {
3468
- triggerPrompt += `\n\n---\n**Working Directory**: You are starting in \`${effectiveCwd}\`. `;
3469
- if (taskDir) {
3470
- triggerPrompt += "This was explicitly set on the task.";
3471
- } else if (currentRepoContext?.clonePath) {
3472
- triggerPrompt += "This is the repository clone path for this task's VCS repo.";
3473
- }
3474
- triggerPrompt +=
3475
- " You can still access any path on the filesystem — this is just your starting directory.";
3476
- }
3348
+ // Only poll if we have capacity
3349
+ if (state.activeTasks.size < state.maxConcurrent) {
3350
+ console.log(
3351
+ `[${role}] Polling for triggers (${state.activeTasks.size}/${state.maxConcurrent} active)...`,
3352
+ );
3477
3353
 
3478
- // Warn in system prompt when task dir was specified but doesn't exist
3479
- let cwdWarning = "";
3480
- if (taskDir && !effectiveCwd) {
3481
- cwdWarning = `\n\nNote: The task requested working directory "${taskDir}" but it does not exist. Falling back to default directory.`;
3482
- }
3354
+ // Use shorter timeout if tasks are running (to check completion more often)
3355
+ const effectiveTimeout = state.activeTasks.size > 0 ? 5000 : PollTimeoutMs;
3483
3356
 
3484
- // Rebuild system prompt with per-task repo context
3485
- const taskBasePrompt = await buildSystemPrompt();
3486
- const taskSystemPrompt =
3487
- (additionalSystemPrompt
3488
- ? `${taskBasePrompt}\n\n${additionalSystemPrompt}`
3489
- : taskBasePrompt) + cwdWarning;
3490
-
3491
- iteration++;
3492
- const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3493
- const taskIdSlice = trigger.taskId?.slice(0, 8) || "notask";
3494
- const logFile = `${logDir}/${timestamp}-${taskIdSlice}.jsonl`;
3495
-
3496
- console.log(`\n[${role}] === Iteration ${iteration} ===`);
3497
- console.log(`[${role}] Logging to: ${logFile}`);
3498
- console.log(`[${role}] Prompt: ${triggerPrompt.slice(0, 100)}...`);
3499
- if (effectiveCwd) {
3500
- console.log(`[${role}] Working directory: ${effectiveCwd}`);
3501
- }
3357
+ const trigger = await pollForTrigger({
3358
+ apiUrl,
3359
+ apiKey,
3360
+ agentId,
3361
+ pollInterval: PollIntervalMs,
3362
+ pollTimeout: effectiveTimeout,
3363
+ });
3502
3364
 
3503
- const metadata = {
3504
- type: metadataType,
3505
- sessionId,
3506
- iteration,
3507
- timestamp: new Date().toISOString(),
3508
- prompt: triggerPrompt,
3509
- trigger: trigger.type,
3510
- yolo: isYolo,
3511
- };
3512
- await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3365
+ if (trigger) {
3366
+ // Phase 4 — server refused to admit a claim because the agent or
3367
+ // global budget is exhausted. Log a structured payload (scrubbed
3368
+ // at egress per project convention) and back off exponentially.
3369
+ // We deliberately `continue` BEFORE the empty-poll counter logic
3370
+ // below — refusals are not empty polls.
3371
+ if (trigger.type === "budget_refused") {
3372
+ consecutiveBudgetRefusals++;
3373
+ const backoffMs = computeBudgetBackoffMs(consecutiveBudgetRefusals, PollIntervalMs);
3374
+ const refusalPayload = JSON.stringify({
3375
+ event: "budget_refused",
3376
+ cause: trigger.cause,
3377
+ agentSpend: trigger.agentSpend,
3378
+ agentBudget: trigger.agentBudget,
3379
+ globalSpend: trigger.globalSpend,
3380
+ globalBudget: trigger.globalBudget,
3381
+ resetAt: trigger.resetAt,
3382
+ consecutiveRefusals: consecutiveBudgetRefusals,
3383
+ backoffMs,
3384
+ });
3385
+ console.log(
3386
+ `[${role}] budget_refused — backing off ${backoffMs}ms: ${scrubSecrets(refusalPayload)}`,
3387
+ );
3388
+ await Bun.sleep(backoffMs);
3389
+ continue;
3390
+ }
3513
3391
 
3514
- // Per-task runner session ID so session logs are scoped to this task
3515
- const taskRunnerSessionId = crypto.randomUUID();
3392
+ // Any other non-null trigger means we're being admitted normally —
3393
+ // reset the back-off so the next refusal starts at base interval.
3394
+ consecutiveBudgetRefusals = 0;
3516
3395
 
3517
- // Spawn without blocking (await to set up session, but process runs async)
3518
- let runningTask: RunningTask;
3519
- try {
3520
- runningTask = await spawnProviderProcess(
3521
- adapter,
3522
- {
3523
- prompt: triggerPrompt,
3524
- logFile,
3525
- systemPrompt: taskSystemPrompt,
3526
- additionalArgs: effectiveAdditionalArgs,
3527
- role,
3528
- apiUrl,
3529
- apiKey,
3530
- agentId,
3531
- runnerSessionId: taskRunnerSessionId,
3532
- iteration,
3533
- taskId: trigger.taskId,
3534
- model: taskModel,
3535
- cwd: effectiveCwd,
3536
- vcsRepo: taskVcsRepo,
3537
- },
3538
- logDir,
3539
- isYolo,
3540
- );
3541
- } catch (spawnErr) {
3542
- const errMsg = spawnErr instanceof Error ? spawnErr.message : String(spawnErr);
3543
- console.error(
3544
- `[${role}] Failed to spawn process for task ${trigger.taskId?.slice(0, 8) || "unknown"}: ${errMsg}`,
3545
- );
3546
- if (trigger.taskId) {
3547
- await ensureTaskFinished(
3548
- apiConfig,
3549
- role,
3550
- trigger.taskId,
3551
- 1,
3552
- `Spawn failed: ${errMsg}`,
3553
- undefined,
3554
- state.harnessProvider,
3555
- );
3556
- }
3557
- continue;
3558
- }
3396
+ console.log(`[${role}] Trigger received: ${trigger.type}`);
3559
3397
 
3398
+ if (
3399
+ trigger.taskId &&
3400
+ (trigger.type === "task_assigned" || trigger.type === "task_offered")
3401
+ ) {
3560
3402
  ensure({
3561
- id: "worker_process_spawned",
3403
+ id: "worker_received",
3562
3404
  flow: "task",
3563
- runId: runningTask.taskId,
3564
- depIds: ["worker_received"],
3405
+ runId: trigger.taskId,
3406
+ depIds: ["started"],
3565
3407
  data: {
3566
- taskId: runningTask.taskId,
3408
+ taskId: trigger.taskId,
3567
3409
  agentId,
3410
+ triggerType: trigger.type,
3568
3411
  role,
3569
- model: taskModel,
3570
3412
  },
3571
3413
  // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
3572
3414
  filter: ({}, ctx) => ctx.deps.length > 0,
3573
- conditions: [{ timeout_ms: 60_000 }], // 1 min: process startup
3415
+ conditions: [{ timeout_ms: 60_000 }], // 1 min: immediate after poll
3574
3416
  });
3417
+ }
3575
3418
 
3576
- // Attach trigger metadata for logging
3577
- runningTask.triggerType = trigger.type;
3578
- runningTask.workingDir = effectiveCwd;
3579
-
3580
- // Attach deferred cursor updates for channel_activity triggers
3581
- if (trigger.type === "channel_activity" && trigger.cursorUpdates) {
3582
- runningTask.cursorUpdates = trigger.cursorUpdates as Array<{
3583
- channelId: string;
3584
- ts: string;
3585
- }>;
3586
- }
3587
-
3588
- state.activeTasks.set(runningTask.taskId, runningTask);
3419
+ // Build prompt based on trigger
3420
+ let triggerPrompt = await buildPromptForTrigger(
3421
+ trigger,
3422
+ prompt,
3423
+ adapter.formatCommand.bind(adapter),
3424
+ { hasMcp: adapter.traits.hasMcp },
3425
+ );
3589
3426
 
3590
- // Register active session for concurrency awareness
3591
- const taskDesc =
3427
+ // Enrich prompt with relevant memories from past sessions
3428
+ if (trigger.type === "task_assigned" || trigger.type === "task_offered") {
3429
+ const task =
3592
3430
  trigger.task && typeof trigger.task === "object" && "task" in trigger.task
3593
- ? String((trigger.task as { task: string }).task).slice(0, 200)
3594
- : undefined;
3595
- registerActiveSession(apiConfig, {
3596
- taskId: runningTask.taskId,
3597
- triggerType: trigger.type,
3598
- taskDescription: taskDesc,
3599
- runnerSessionId: taskRunnerSessionId,
3600
- });
3431
+ ? (trigger.task as { task: string; id?: string })
3432
+ : null;
3433
+ if (task?.task) {
3434
+ const memoryContext = await fetchRelevantMemories(
3435
+ apiUrl,
3436
+ apiKey,
3437
+ agentId,
3438
+ task.task,
3439
+ task.id,
3440
+ );
3441
+ if (memoryContext) {
3442
+ triggerPrompt += memoryContext;
3443
+ console.log(`[${role}] Injected relevant memories into task prompt`);
3444
+ }
3445
+ }
3446
+ }
3601
3447
 
3602
- console.log(
3603
- `[${role}] Started task ${runningTask.taskId.slice(0, 8)} (${state.activeTasks.size}/${state.maxConcurrent} active, trigger: ${trigger.type})`,
3448
+ // Resolve --resume for child tasks with parentTaskId
3449
+ let effectiveAdditionalArgs = opts.additionalArgs || [];
3450
+ const taskObj = trigger.task as { parentTaskId?: string } | undefined;
3451
+ if (taskObj?.parentTaskId) {
3452
+ const parentSessionId = await fetchProviderSessionId(
3453
+ apiUrl,
3454
+ apiKey,
3455
+ taskObj.parentTaskId,
3604
3456
  );
3457
+ if (parentSessionId) {
3458
+ effectiveAdditionalArgs = [...effectiveAdditionalArgs, "--resume", parentSessionId];
3459
+ console.log(
3460
+ `[${role}] Child task — resuming parent session ${parentSessionId.slice(0, 8)}`,
3461
+ );
3462
+ } else {
3463
+ console.log(`[${role}] Child task — parent session ID not found, starting fresh`);
3464
+ }
3605
3465
  }
3606
- } else {
3607
- console.log(
3608
- `[${role}] At capacity (${state.activeTasks.size}/${state.maxConcurrent}), waiting for completion...`,
3609
- );
3610
- await Bun.sleep(1000);
3611
- }
3612
- }
3613
- } else {
3614
- // Original AI-loop mode (existing behavior)
3615
- console.log(`[${role}] Mode: AI-based polling (legacy)`);
3616
-
3617
- // Create API config for ping/close
3618
- const apiConfig: ApiConfig = { apiUrl, apiKey, agentId };
3619
3466
 
3620
- // Setup graceful shutdown handlers with API config for close on exit
3621
- setupShutdownHandlers(role, apiConfig);
3467
+ // Extract model from task data for per-task model selection
3468
+ const taskModel = (trigger.task as { model?: string } | undefined)?.model;
3469
+
3470
+ // Detect Slack context for conditional prompt sections
3471
+ const taskSlackChannelId = (trigger.task as { slackChannelId?: string } | undefined)
3472
+ ?.slackChannelId;
3473
+ const taskSlackThreadTs = (trigger.task as { slackThreadTs?: string } | undefined)
3474
+ ?.slackThreadTs;
3475
+ currentTaskSlackContext = taskSlackChannelId
3476
+ ? { channelId: taskSlackChannelId, threadTs: taskSlackThreadTs }
3477
+ : undefined;
3478
+
3479
+ // Handle repo context for tasks with vcsRepo (GitHub/GitLab)
3480
+ const taskVcsRepo = (trigger.task as { vcsRepo?: string } | undefined)?.vcsRepo;
3481
+ if (taskVcsRepo && apiUrl) {
3482
+ const repoConfig = await fetchRepoConfig(apiUrl, apiKey, taskVcsRepo);
3483
+ // Fall back to convention-based config if repo is not registered
3484
+ const effectiveConfig = repoConfig ?? {
3485
+ url: taskVcsRepo,
3486
+ name: taskVcsRepo.split("/").pop() || taskVcsRepo,
3487
+ clonePath: `/workspace/repos/${taskVcsRepo.split("/").pop() || taskVcsRepo}`,
3488
+ defaultBranch: "main",
3489
+ };
3490
+ const repoResult = await ensureRepoForTask(effectiveConfig, role);
3491
+ currentRepoContext = {
3492
+ ...repoResult,
3493
+ guidelines: repoConfig?.guidelines ?? null,
3494
+ };
3495
+ } else {
3496
+ currentRepoContext = undefined;
3497
+ }
3622
3498
 
3623
- while (true) {
3624
- // Ping server on each iteration to keep status updated
3625
- await pingServer(apiConfig, role);
3499
+ // Resolve effective working directory (priority: task.dir > repoContext.clonePath > process.cwd())
3500
+ const taskDir = (trigger.task as { dir?: string } | undefined)?.dir;
3501
+ let effectiveCwd: string | undefined;
3626
3502
 
3627
- iteration++;
3628
- const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3629
- const logFile = `${logDir}/${timestamp}.jsonl`;
3503
+ if (taskDir) {
3504
+ try {
3505
+ if (existsSync(taskDir) && statSync(taskDir).isDirectory()) {
3506
+ effectiveCwd = taskDir;
3507
+ } else {
3508
+ console.warn(
3509
+ `[${role}] Task dir "${taskDir}" does not exist or is not a directory, falling back to default cwd`,
3510
+ );
3511
+ }
3512
+ } catch {
3513
+ console.warn(
3514
+ `[${role}] Failed to check task dir "${taskDir}", falling back to default cwd`,
3515
+ );
3516
+ }
3517
+ }
3630
3518
 
3631
- console.log(`\n[${role}] === Iteration ${iteration} ===`);
3632
- console.log(`[${role}] Logging to: ${logFile}`);
3519
+ if (!effectiveCwd && currentRepoContext?.clonePath) {
3520
+ effectiveCwd = currentRepoContext.clonePath;
3521
+ }
3633
3522
 
3634
- const metadata = {
3635
- type: metadataType,
3636
- sessionId,
3637
- iteration,
3638
- timestamp: new Date().toISOString(),
3639
- prompt,
3640
- yolo: isYolo,
3641
- };
3642
- await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3523
+ // Annotate prompt with working directory context
3524
+ if (effectiveCwd && effectiveCwd !== process.cwd()) {
3525
+ triggerPrompt += `\n\n---\n**Working Directory**: You are starting in \`${effectiveCwd}\`. `;
3526
+ if (taskDir) {
3527
+ triggerPrompt += "This was explicitly set on the task.";
3528
+ } else if (currentRepoContext?.clonePath) {
3529
+ triggerPrompt += "This is the repository clone path for this task's VCS repo.";
3530
+ }
3531
+ triggerPrompt +=
3532
+ " You can still access any path on the filesystem — this is just your starting directory.";
3533
+ }
3643
3534
 
3644
- const iterationResult = await runProviderIteration(adapter, {
3645
- prompt,
3646
- logFile,
3647
- systemPrompt: resolvedSystemPrompt,
3648
- additionalArgs: opts.additionalArgs,
3649
- role,
3650
- apiUrl,
3651
- apiKey,
3652
- agentId,
3653
- });
3535
+ // Warn in system prompt when task dir was specified but doesn't exist
3536
+ let cwdWarning = "";
3537
+ if (taskDir && !effectiveCwd) {
3538
+ cwdWarning = `\n\nNote: The task requested working directory "${taskDir}" but it does not exist. Falling back to default directory.`;
3539
+ }
3654
3540
 
3655
- if (iterationResult.exitCode !== 0) {
3656
- const failureReason =
3657
- iterationResult.failureReason || `Process exited with code ${iterationResult.exitCode}`;
3541
+ // Rebuild system prompt with per-task repo context
3542
+ const taskBasePrompt = await buildSystemPrompt();
3543
+ const taskSystemPrompt =
3544
+ (additionalSystemPrompt
3545
+ ? `${taskBasePrompt}\n\n${additionalSystemPrompt}`
3546
+ : taskBasePrompt) + cwdWarning;
3547
+
3548
+ iteration++;
3549
+ const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
3550
+ const taskIdSlice = trigger.taskId?.slice(0, 8) || "notask";
3551
+ const logFile = `${logDir}/${timestamp}-${taskIdSlice}.jsonl`;
3552
+
3553
+ console.log(`\n[${role}] === Iteration ${iteration} ===`);
3554
+ console.log(`[${role}] Logging to: ${logFile}`);
3555
+ console.log(`[${role}] Prompt: ${triggerPrompt.slice(0, 100)}...`);
3556
+ if (effectiveCwd) {
3557
+ console.log(`[${role}] Working directory: ${effectiveCwd}`);
3558
+ }
3658
3559
 
3659
- const errorLog = {
3660
- timestamp: new Date().toISOString(),
3560
+ const metadata = {
3561
+ type: metadataType,
3562
+ sessionId,
3661
3563
  iteration,
3662
- exitCode: iterationResult.exitCode,
3663
- failureReason,
3664
- error: true,
3564
+ timestamp: new Date().toISOString(),
3565
+ prompt: triggerPrompt,
3566
+ trigger: trigger.type,
3567
+ yolo: isYolo,
3665
3568
  };
3569
+ await Bun.write(logFile, `${JSON.stringify(metadata)}\n`);
3570
+
3571
+ // Per-task runner session ID so session logs are scoped to this task
3572
+ const taskRunnerSessionId = crypto.randomUUID();
3573
+
3574
+ // Spawn without blocking (await to set up session, but process runs async)
3575
+ let runningTask: RunningTask;
3576
+ try {
3577
+ runningTask = await spawnProviderProcess(
3578
+ adapter,
3579
+ {
3580
+ prompt: triggerPrompt,
3581
+ logFile,
3582
+ systemPrompt: taskSystemPrompt,
3583
+ additionalArgs: effectiveAdditionalArgs,
3584
+ role,
3585
+ apiUrl,
3586
+ apiKey,
3587
+ agentId,
3588
+ runnerSessionId: taskRunnerSessionId,
3589
+ iteration,
3590
+ taskId: trigger.taskId,
3591
+ model: taskModel,
3592
+ cwd: effectiveCwd,
3593
+ vcsRepo: taskVcsRepo,
3594
+ },
3595
+ logDir,
3596
+ isYolo,
3597
+ );
3598
+ } catch (spawnErr) {
3599
+ const errMsg = spawnErr instanceof Error ? spawnErr.message : String(spawnErr);
3600
+ console.error(
3601
+ `[${role}] Failed to spawn process for task ${trigger.taskId?.slice(0, 8) || "unknown"}: ${errMsg}`,
3602
+ );
3603
+ if (trigger.taskId) {
3604
+ await ensureTaskFinished(
3605
+ apiConfig,
3606
+ role,
3607
+ trigger.taskId,
3608
+ 1,
3609
+ `Spawn failed: ${errMsg}`,
3610
+ undefined,
3611
+ state.harnessProvider,
3612
+ );
3613
+ }
3614
+ continue;
3615
+ }
3616
+
3617
+ ensure({
3618
+ id: "worker_process_spawned",
3619
+ flow: "task",
3620
+ runId: runningTask.taskId,
3621
+ depIds: ["worker_received"],
3622
+ data: {
3623
+ taskId: runningTask.taskId,
3624
+ agentId,
3625
+ role,
3626
+ model: taskModel,
3627
+ },
3628
+ // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
3629
+ filter: ({}, ctx) => ctx.deps.length > 0,
3630
+ conditions: [{ timeout_ms: 60_000 }], // 1 min: process startup
3631
+ });
3666
3632
 
3667
- const errorsFile = `${logDir}/errors.jsonl`;
3668
- const errorsFileRef = Bun.file(errorsFile);
3669
- const existingErrors = (await errorsFileRef.exists()) ? await errorsFileRef.text() : "";
3670
- await Bun.write(errorsFile, `${existingErrors}${JSON.stringify(errorLog)}\n`);
3633
+ // Attach trigger metadata for logging
3634
+ runningTask.triggerType = trigger.type;
3635
+ runningTask.workingDir = effectiveCwd;
3671
3636
 
3672
- if (!isYolo) {
3673
- console.error(`[${role}] ${failureReason}. Stopping.`);
3674
- console.error(`[${role}] Error logged to: ${errorsFile}`);
3675
- process.exit(iterationResult.exitCode);
3637
+ // Attach deferred cursor updates for channel_activity triggers
3638
+ if (trigger.type === "channel_activity" && trigger.cursorUpdates) {
3639
+ runningTask.cursorUpdates = trigger.cursorUpdates as Array<{
3640
+ channelId: string;
3641
+ ts: string;
3642
+ }>;
3676
3643
  }
3677
3644
 
3678
- console.warn(`[${role}] ${failureReason}. YOLO mode - continuing...`);
3679
- }
3645
+ state.activeTasks.set(runningTask.taskId, runningTask);
3680
3646
 
3681
- console.log(`[${role}] Iteration ${iteration} complete. Starting next iteration...`);
3647
+ // Register active session for concurrency awareness
3648
+ const taskDesc =
3649
+ trigger.task && typeof trigger.task === "object" && "task" in trigger.task
3650
+ ? String((trigger.task as { task: string }).task).slice(0, 200)
3651
+ : undefined;
3652
+ registerActiveSession(apiConfig, {
3653
+ taskId: runningTask.taskId,
3654
+ triggerType: trigger.type,
3655
+ taskDescription: taskDesc,
3656
+ runnerSessionId: taskRunnerSessionId,
3657
+ });
3658
+
3659
+ console.log(
3660
+ `[${role}] Started task ${runningTask.taskId.slice(0, 8)} (${state.activeTasks.size}/${state.maxConcurrent} active, trigger: ${trigger.type})`,
3661
+ );
3662
+ }
3663
+ } else {
3664
+ console.log(
3665
+ `[${role}] At capacity (${state.activeTasks.size}/${state.maxConcurrent}), waiting for completion...`,
3666
+ );
3667
+ await Bun.sleep(1000);
3682
3668
  }
3683
3669
  }
3684
3670
  }