npm - @desplega.ai/agent-swarm - Versions diffs - 1.86.0 → 1.87.0 - Mend

@desplega.ai/agent-swarm 1.86.0 → 1.87.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

package/openapi.json +72 -1
package/package.json +3 -1
package/src/be/db-queries/tracker.ts +21 -0
package/src/be/db.ts +235 -14
package/src/be/migrations/079_task_followup_config.sql +1 -0
package/src/be/modelsdev-cache.json +77663 -74073
package/src/cli.tsx +26 -0
package/src/commands/context-preamble.ts +272 -0
package/src/commands/e2b.ts +728 -0
package/src/commands/resume-session.ts +35 -78
package/src/commands/runner.ts +125 -13
package/src/e2b/dispatch.ts +429 -0
package/src/e2b/env.ts +206 -0
package/src/heartbeat/heartbeat.ts +145 -30
package/src/heartbeat/templates.ts +11 -7
package/src/http/session-data.ts +8 -1
package/src/http/tasks.ts +152 -3
package/src/jira/sync.ts +4 -4
package/src/linear/sync.ts +6 -5
package/src/providers/claude-adapter.ts +10 -76
package/src/providers/claude-managed-adapter.ts +61 -75
package/src/providers/codex-adapter.ts +15 -18
package/src/providers/codex-oauth/auth-json.ts +18 -1
package/src/providers/codex-oauth/flow.ts +24 -1
package/src/providers/types.ts +6 -0
package/src/tasks/worker-follow-up.ts +162 -2
package/src/telemetry.ts +11 -1
package/src/tests/claude-adapter.test.ts +5 -27
package/src/tests/claude-managed-adapter.test.ts +38 -52
package/src/tests/codex-adapter.test.ts +6 -31
package/src/tests/codex-oauth.test.ts +149 -3
package/src/tests/codex-pool.test.ts +14 -3
package/src/tests/e2b-dispatch.test.ts +330 -0
package/src/tests/heartbeat-supersede-resume.test.ts +285 -0
package/src/tests/heartbeat.test.ts +26 -16
package/src/tests/prompt-template-remaining.test.ts +4 -0
package/src/tests/resume-session.test.ts +42 -50
package/src/tests/structured-output.test.ts +69 -0
package/src/tests/task-completion-idempotency.test.ts +185 -2
package/src/tests/task-supersede-resume.test.ts +722 -0
package/src/tests/telemetry-init.test.ts +69 -0
package/src/tests/vcs-tracking.test.ts +39 -0
package/src/tools/send-task.ts +12 -1
package/src/tools/store-progress.ts +2 -2
package/src/tools/templates.ts +14 -2
package/src/types.ts +46 -1
package/src/workflows/executors/agent-task.ts +3 -0

package/src/providers/claude-adapter.ts CHANGED

@@ -423,10 +423,6 @@ class ClaudeSession implements ProviderSession {
       this.config.prompt,
     ];
-    if (this.config.resumeSessionId) {
-      cmd.push("--resume", this.config.resumeSessionId);
-    }
     if (this.config.additionalArgs?.length) {
       cmd.push(...this.config.additionalArgs);
     }
@@ -728,78 +724,7 @@ class ClaudeSession implements ProviderSession {
   }
   async waitForCompletion(): Promise<ProviderResult> {
-    const result = await this.completionPromise;
-    // Stale session retry: if process failed because session not found and we used --resume,
-    // strip --resume and retry with a fresh session
-    if (result.exitCode !== 0 && this.errorTracker.isSessionNotFound()) {
-      const hasResume =
-        !!this.config.resumeSessionId || (this.config.additionalArgs || []).includes("--resume");
-      if (hasResume) {
-        console.log(
-          `\x1b[33m[${this.config.role}] Session resume failed for task ${this.config.taskId.slice(0, 8)} — retrying without --resume\x1b[0m`,
-        );
-        const freshArgs = (this.config.additionalArgs || []).filter((arg, idx, arr) => {
-          if (arg === "--resume") return false;
-          if (idx > 0 && arr[idx - 1] === "--resume") return false;
-          return true;
-        });
-        const logDir = this.config.logFile.substring(0, this.config.logFile.lastIndexOf("/"));
-        const retryTimestamp = new Date().toISOString().replace(/[:.]/g, "-");
-        const retryLogFile = `${logDir}/${retryTimestamp}-retry-${this.config.taskId.slice(0, 8)}.jsonl`;
-        const retryConfig: ProviderSessionConfig = {
-          ...this.config,
-          additionalArgs: freshArgs,
-          logFile: retryLogFile,
-          resumeSessionId: undefined,
-        };
-        // Write new task file for retry
-        const taskFilePath = await writeTaskFile(this.taskFilePid, {
-          taskId: this.config.taskId,
-          agentId: this.config.agentId,
-          startedAt: new Date().toISOString(),
-        });
-        // Re-stage the system prompt for the retry — the original was unlinked
-        // when the first session finished. Same soft-fail semantics: null
-        // falls back to the inline --append-system-prompt argv.
-        let retrySystemPromptFile: string | null = null;
-        if (retryConfig.systemPrompt) {
-          const candidate = getSystemPromptFilePath(retryConfig.taskId);
-          try {
-            await writeFile(candidate, retryConfig.systemPrompt);
-            retrySystemPromptFile = candidate;
-          } catch (err) {
-            console.warn(
-              `\x1b[33m[claude]\x1b[0m Failed to stage retry system prompt to ${candidate} (${err}); falling back to --append-system-prompt argv.`,
-            );
-          }
-        }
-        const retrySession = new ClaudeSession(
-          retryConfig,
-          this.model,
-          taskFilePath,
-          this.taskFilePid,
-          null,
-          this.claudeBinaryArgv,
-          retrySystemPromptFile,
-        );
-        // Forward events from retry to our listeners
-        for (const listener of this.listeners) {
-          retrySession.onEvent(listener);
-        }
-        return retrySession.waitForCompletion();
-      }
-    }
-    return result;
+    return this.completionPromise;
   }
   async abort(): Promise<void> {
@@ -812,6 +737,15 @@ export class ClaudeAdapter implements ProviderAdapter {
   readonly traits = { hasMcp: true, hasLocalEnvironment: true };
   async createSession(config: ProviderSessionConfig): Promise<ProviderSession> {
+    // Native resume is deprecated. Follow-up continuity is delivered via the
+    // context preamble (see src/commands/context-preamble.ts). Any stray
+    // resumeSessionId is logged and ignored — we always spawn a fresh session.
+    if (config.resumeSessionId) {
+      console.warn(
+        "[claude-adapter] resumeSessionId ignored — native resume is disabled by deprecation plan",
+      );
+    }
     const model = config.model || "opus";
     const credType = validateClaudeCredentials(config.env || process.env);

package/src/providers/claude-managed-adapter.ts CHANGED

@@ -808,85 +808,71 @@ export class ClaudeManagedAdapter implements ProviderAdapter {
   }
   async createSession(config: ProviderSessionConfig): Promise<ProviderSession> {
-    let sessionId: string;
-    let userMessageContent: BetaManagedAgentsTextBlock[] | null;
+    // Native resume is deprecated. Follow-up continuity is delivered via the
+    // context preamble (see src/commands/context-preamble.ts). Any stray
+    // resumeSessionId is logged and ignored — we always create a fresh session.
+    if (config.resumeSessionId) {
+      console.warn(
+        "[claude-managed-adapter] resumeSessionId ignored — native resume is disabled by deprecation plan",
+      );
+    }
     const seenEventIds = new Set<string>();
-    if (config.resumeSessionId) {
-      // Resume path: skip `sessions.create`. Pre-fetch event history via
-      // `events.list` so the SSE loop can skip duplicates that the live
-      // stream replays. NO new `user.message` is sent (the agent already
-      // has one in flight).
-      sessionId = config.resumeSessionId;
-      userMessageContent = null;
-      try {
-        const list = await Promise.resolve(this.client.beta.sessions.events.list(sessionId));
-        for await (const evt of list) {
-          if ("id" in evt && evt.id) {
-            seenEventIds.add(evt.id);
-          }
-        }
-      } catch {
-        // If history fetch fails, fall through with an empty `seenEventIds`
-        // — the worst case is that the listener sees a few duplicate events
-        // (which the runner-side dedup handles).
-      }
-    } else {
-      // Fresh session. Compose the cache-control-annotated user message and
-      // open the managed session against the pre-existing agent + env.
-      userMessageContent = composeManagedUserMessage(config);
-      // Phase 4: derive `resources` from `config.vcsRepo` (which the runner
-      // copies from `task.vcsRepo` at the spawn site, see
-      // src/commands/runner.ts:3296). The SDK contract is
-      // `BetaManagedAgentsGitHubRepositoryResourceParams`:
-      //   { type: 'github_repository', url, authorization_token, checkout?: { type: 'branch', name } }
-      // We default `branch` to "main" since `ProviderSessionConfig` only
-      // carries the repo identifier as a string.
-      //
-      // GitHub auth: prefer the operator-side `MANAGED_GITHUB_VAULT_ID`
-      // (passed via `vault_ids` on the session — see runbook §"Claude Managed
-      // Agents — GitHub access"). If a literal PAT is supplied via
-      // `MANAGED_GITHUB_TOKEN`, use that instead. Without either, the SDK's
-      // required `authorization_token` field gets an empty string and the
-      // operator sees an authentication error from Anthropic — which is
-      // strictly better than silently dropping `resources`.
-      const createParams: Record<string, unknown> = {
-        agent: this.agentId,
-        environment_id: this.environmentId,
-        title: `Task ${config.taskId}`,
-        metadata: {
-          swarmAgentId: config.agentId,
-          swarmTaskId: config.taskId,
+    // Fresh session. Compose the cache-control-annotated user message and
+    // open the managed session against the pre-existing agent + env.
+    const userMessageContent: BetaManagedAgentsTextBlock[] | null =
+      composeManagedUserMessage(config);
+    // Phase 4: derive `resources` from `config.vcsRepo` (which the runner
+    // copies from `task.vcsRepo` at the spawn site, see
+    // src/commands/runner.ts:3296). The SDK contract is
+    // `BetaManagedAgentsGitHubRepositoryResourceParams`:
+    //   { type: 'github_repository', url, authorization_token, checkout?: { type: 'branch', name } }
+    // We default `branch` to "main" since `ProviderSessionConfig` only
+    // carries the repo identifier as a string.
+    //
+    // GitHub auth: prefer the operator-side `MANAGED_GITHUB_VAULT_ID`
+    // (passed via `vault_ids` on the session — see runbook §"Claude Managed
+    // Agents — GitHub access"). If a literal PAT is supplied via
+    // `MANAGED_GITHUB_TOKEN`, use that instead. Without either, the SDK's
+    // required `authorization_token` field gets an empty string and the
+    // operator sees an authentication error from Anthropic — which is
+    // strictly better than silently dropping `resources`.
+    const createParams: Record<string, unknown> = {
+      agent: this.agentId,
+      environment_id: this.environmentId,
+      title: `Task ${config.taskId}`,
+      metadata: {
+        swarmAgentId: config.agentId,
+        swarmTaskId: config.taskId,
+      },
+    };
+    if (config.vcsRepo) {
+      const repoUrl = normalizeRepoUrl(config.vcsRepo);
+      const branch = "main"; // ProviderSessionConfig doesn't carry per-task branch info today.
+      const githubToken = process.env.MANAGED_GITHUB_TOKEN ?? "";
+      createParams.resources = [
+        {
+          type: "github_repository",
+          url: repoUrl,
+          authorization_token: githubToken,
+          checkout: { type: "branch", name: branch },
         },
-      };
-      if (config.vcsRepo) {
-        const repoUrl = normalizeRepoUrl(config.vcsRepo);
-        const branch = "main"; // ProviderSessionConfig doesn't carry per-task branch info today.
-        const githubToken = process.env.MANAGED_GITHUB_TOKEN ?? "";
-        createParams.resources = [
-          {
-            type: "github_repository",
-            url: repoUrl,
-            authorization_token: githubToken,
-            checkout: { type: "branch", name: branch },
-          },
-        ];
-      }
-      // Multiple vaults can be linked to a single session — `vault_ids` is an
-      // array. The MCP vault holds the static-bearer credential for our
-      // `/mcp` endpoint (provisioned by `claude-managed-setup`); the GitHub
-      // vault holds the credential used by the `github_repository` resource.
-      // Either or both may be unset.
-      const vaultIds = [
-        process.env.MANAGED_MCP_VAULT_ID,
-        process.env.MANAGED_GITHUB_VAULT_ID,
-      ].filter((v): v is string => !!v && v.length > 0);
-      if (vaultIds.length > 0) {
-        createParams.vault_ids = Array.from(new Set(vaultIds));
-      }
-      const created = await Promise.resolve(this.client.beta.sessions.create(createParams));
-      sessionId = created.id;
+      ];
+    }
+    // Multiple vaults can be linked to a single session — `vault_ids` is an
+    // array. The MCP vault holds the static-bearer credential for our
+    // `/mcp` endpoint (provisioned by `claude-managed-setup`); the GitHub
+    // vault holds the credential used by the `github_repository` resource.
+    // Either or both may be unset.
+    const vaultIds = [process.env.MANAGED_MCP_VAULT_ID, process.env.MANAGED_GITHUB_VAULT_ID].filter(
+      (v): v is string => !!v && v.length > 0,
+    );
+    if (vaultIds.length > 0) {
+      createParams.vault_ids = Array.from(new Set(vaultIds));
     }
+    const created = await Promise.resolve(this.client.beta.sessions.create(createParams));
+    const sessionId = created.id;
     return new ClaudeManagedSession(
       this.client,

package/src/providers/codex-adapter.ts CHANGED

@@ -6,7 +6,8 @@
  *
  *   Phase 1 — factory wiring + skeleton classes.
  *   Phase 2 — event stream normalization, CostData, AbortController, log file,
- *             AGENTS.md system-prompt injection, canResume via resumeThread.
+ *             AGENTS.md system-prompt injection. (Native resume was removed in
+ *             the 2026-05-28 deprecate-native-resume plan — see context-preamble.ts.)
  *   Phase 3 — per-session MCP config builder + model catalogue wiring. The
  *             baseline Codex config (`~/.codex/config.toml`) is written at
  *             Docker image build time (deferred to Phase 6). For local dev
@@ -1280,9 +1281,15 @@ export async function createInProcessCodexSession(
       model: resolvedModel,
     };
-    const thread = config.resumeSessionId
-      ? codex.resumeThread(config.resumeSessionId, threadOptions)
-      : codex.startThread(threadOptions);
+    // Native resume is deprecated. Follow-up continuity is delivered via the
+    // context preamble (see src/commands/context-preamble.ts). Any stray
+    // resumeSessionId is logged and ignored — we always start a fresh thread.
+    if (config.resumeSessionId) {
+      console.warn(
+        "[codex-adapter] resumeSessionId ignored — native resume is disabled by deprecation plan",
+      );
+    }
+    const thread = codex.startThread(threadOptions);
     return new CodexSession(
       thread,
@@ -1615,20 +1622,10 @@ export class CodexAdapter implements ProviderAdapter {
     return new CodexSubprocessSession(config, this.skillsDir);
   }
-  async canResume(sessionId: string): Promise<boolean> {
-    if (!sessionId || typeof sessionId !== "string") {
-      return false;
-    }
-    try {
-      const codex = new Codex();
-      // `resumeThread` is synchronous in 0.118.x and returns a Thread handle.
-      // The runner only calls canResume when deciding whether to resume a
-      // task, so we accept the (cheap) handshake cost.
-      codex.resumeThread(sessionId);
-      return true;
-    } catch {
-      return false;
-    }
+  async canResume(_sessionId: string): Promise<boolean> {
+    // Native resume is deprecated; runner no longer threads resumeSessionId
+    // to adapters. Follow-up continuity flows via the context preamble.
+    return false;
   }
   formatCommand(commandName: string): string {

package/src/providers/codex-oauth/auth-json.ts CHANGED

@@ -22,14 +22,31 @@
  * purposes and doesn't validate it as a separate JWT.
  */
+import { extractChatgptUserId } from "./flow.js";
 import type { CodexAuthJson, CodexOAuthCredentials } from "./types.js";
 export function authJsonToCredentialSelection(auth: CodexAuthJson, slot = 0, total = 1) {
+  // Prefer the per-grant `chatgpt_user_id` so two slots authenticated against
+  // the same ChatGPT Team workspace get distinct suffixes. Fall back to
+  // account_id when the JWT lacks the claim — preserves boot for any
+  // unexpected token shape, at the cost of re-introducing the slot-collision
+  // bug for that specific slot only. The warn is a deliberate canary.
+  const userId = extractChatgptUserId(auth.tokens.access_token);
+  const suffixSource = userId ?? auth.tokens.account_id;
+  if (!userId) {
+    console.warn(
+      "[codex-oauth] No chatgpt_user_id in JWT — falling back to account_id for keySuffix derivation. " +
+        "If two slots share an account, their suffixes will collide.",
+    );
+  }
   return {
+    // `selected` satisfies the CredentialSelection interface but is never read
+    // for CODEX_OAUTH: creds are materialised to ~/.codex/auth.json (not env-injected),
+    // and all tracking flows through `keySuffix` + `index` (never `selected`).
     selected: auth.tokens.account_id,
     index: slot,
     total,
-    keySuffix: auth.tokens.account_id.slice(-5),
+    keySuffix: suffixSource.slice(-5),
     keyType: "CODEX_OAUTH",
   };
 }

package/src/providers/codex-oauth/flow.ts CHANGED

@@ -85,7 +85,12 @@ export function decodeJwt(token: string): Record<string, unknown> | null {
     const parts = token.split(".");
     if (parts.length !== 3) return null;
     const payload = parts[1] ?? "";
-    const decoded = atob(payload);
+    // Normalize base64url → standard base64 before decoding.
+    // JWTs use base64url (RFC 7515): '-' replaces '+', '_' replaces '/', padding stripped.
+    // atob() only accepts standard base64; passing raw base64url throws on '-' or '_'.
+    const b64 = payload.replace(/-/g, "+").replace(/_/g, "/");
+    const padded = b64 + "=".repeat((4 - (b64.length % 4)) % 4);
+    const decoded = atob(padded);
     return JSON.parse(decoded);
   } catch {
     return null;
@@ -172,6 +177,24 @@ export function getAccountId(accessToken: string): string | null {
   return typeof accountId === "string" && accountId.length > 0 ? accountId : null;
 }
+/**
+ * Extract `chatgpt_user_id` from the JWT's OpenAI-auth claim namespace. This
+ * is per-OAuth-grant (unique per user-on-account), distinct from
+ * `chatgpt_account_id` which is shared across all users in a ChatGPT Team
+ * workspace. Used to give pooled credentials a slot-unique `keySuffix` even
+ * when multiple pool slots authenticate against the same Team account.
+ *
+ * Returns null if the JWT cannot be decoded, the namespace is absent, or the
+ * claim is missing/empty. Callers MUST handle null by falling back to a
+ * different identifier (typically `chatgpt_account_id`) — do not throw.
+ */
+export function extractChatgptUserId(accessToken: string): string | null {
+  const payload = decodeJwt(accessToken);
+  const auth = payload?.[JWT_CLAIM_PATH] as Record<string, unknown> | undefined;
+  const userId = auth?.chatgpt_user_id;
+  return typeof userId === "string" && userId.length > 0 ? userId : null;
+}
 export async function createAuthorizationFlow(
   originator = "agent-swarm",
 ): Promise<{ verifier: string; state: string; url: string }> {

package/src/providers/types.ts CHANGED

@@ -89,6 +89,12 @@ export interface ProviderSessionConfig {
   apiKey: string;
   cwd: string;
   vcsRepo?: string;
+  /**
+   * @deprecated Never set by the runner — native session resume was removed in
+   * the 2026-05-28 plan. Adapters log + ignore any stray value. Follow-up
+   * continuity flows through the context preamble; see
+   * `src/commands/context-preamble.ts` and `src/commands/resume-session.ts`.
+   */
   resumeSessionId?: string;
   iteration?: number;
   logFile: string;

package/src/tasks/worker-follow-up.ts CHANGED

@@ -1,9 +1,27 @@
-import { createTaskExtended, getAgentById, getLeadAgent, getTaskAttachments } from "../be/db";
+import {
+  createTaskExtended,
+  getActiveTaskCount,
+  getAgentById,
+  getLeadAgent,
+  getTaskAttachments,
+  getTaskById,
+} from "../be/db";
+import { repointTrackerSyncBySwarmId } from "../be/db-queries/tracker";
 import { resolveTemplate } from "../prompts/resolver";
-import type { AgentTask, TaskAttachment } from "../types";
+import type { AgentTask, ResumeReason, TaskAttachment } from "../types";
 // Side-effect import: registers task lifecycle templates in the in-memory registry.
 import "../tools/templates";
+/**
+ * Liveness window (seconds) for considering a worker "online" enough to
+ * pre-assign a resume task. Defaults to 30s; override via env. The worker
+ * heartbeats `lastActivityAt` on its agent row at least once per
+ * provider tool-call / poll tick, so 30s comfortably covers a healthy worker.
+ */
+export const WORKER_LIVENESS_WINDOW_SECONDS = Number(
+  process.env.WORKER_LIVENESS_WINDOW_SECONDS || "30",
+);
 function attachmentPointer(a: TaskAttachment): string {
   switch (a.kind) {
     case "url":
@@ -36,6 +54,7 @@ export function createWorkerTaskFollowUp(args: {
   const { task, status, output, failureReason } = args;
   if (task.workflowRunId) return null;
+  if (task.followUpConfig?.disabled === true) return null;
   const taskAgent = getAgentById(task.agentId ?? "");
   if (!taskAgent || taskAgent.isLead) return null;
@@ -45,6 +64,16 @@ export function createWorkerTaskFollowUp(args: {
   const agentName = taskAgent.name || task.agentId?.slice(0, 8) || "Unknown";
   const taskDesc = task.task.slice(0, 200);
+  const creatorAgent = task.creatorAgentId
+    ? `${task.creatorAgentId}${task.creatorAgentId === leadAgent.id ? " (you)" : ""}`
+    : "<none>";
+  const instructions =
+    status === "completed"
+      ? (task.followUpConfig?.onCompleted ?? "")
+      : (task.followUpConfig?.onFailed ?? "");
+  const followUpInstructions = instructions
+    ? `\nAdditional instructions from the task creator:\n${instructions}\n`
+    : "";
   let followUpDescription: string;
   if (status === "completed") {
@@ -55,7 +84,9 @@ export function createWorkerTaskFollowUp(args: {
     const completedResult = resolveTemplate("task.worker.completed", {
       agent_name: agentName,
       task_desc: taskDesc,
+      creator_agent: creatorAgent,
       output_summary: outputSummary,
+      follow_up_instructions: followUpInstructions,
       task_id: task.id,
     });
     followUpDescription = completedResult.text;
@@ -64,7 +95,9 @@ export function createWorkerTaskFollowUp(args: {
     const failedResult = resolveTemplate("task.worker.failed", {
       agent_name: agentName,
       task_desc: taskDesc,
+      creator_agent: creatorAgent,
       failure_reason: reason,
+      follow_up_instructions: followUpInstructions,
       task_id: task.id,
     });
     followUpDescription = failedResult.text;
@@ -80,3 +113,130 @@ export function createWorkerTaskFollowUp(args: {
     slackUserId: task.slackUserId,
   });
 }
+/** Result of `createResumeFollowUp`. */
+export type CreateResumeFollowUpResult =
+  | { kind: "created"; task: AgentTask }
+  | { kind: "workflow-skip"; stepId: string }
+  | { kind: "skipped"; reason: "parent_not_found" | "lead_not_found" };
+/**
+ * Create a "resume" follow-up task for a parent that is being superseded
+ * (graceful shutdown, context-limit pressure, manual operator action).
+ *
+ * Workflow carve-out: if the parent is a workflow step (`workflowRunStepId`
+ * is set), no follow-up is created. Returns `{ kind: 'workflow-skip', stepId }`
+ * so the caller can `failTask(parent.id, 'superseded_workflow_task')` and let
+ * the workflow engine's retry/failure policy take over.
+ *
+ * Field inheritance is transitive via `createTaskExtended`'s `parentTaskId`
+ * lookup (`dir`, `vcsRepo`/`vcsProvider`/etc., `outputSchema`, Slack/AgentMail
+ * context, `requestedByUserId`, `contextKey`, `followUpConfig`). This was chosen
+ * over re-listing fields here so there is a single source of truth.
+ *
+ * `model` is intentionally NOT inherited: a resume task is routinely claimed by
+ * a different worker (and thus a different harness/provider) than the parent, so
+ * carrying the parent's concrete provider-specific model would break the child
+ * at session-init. The resume task runs on the assignee agent's own model. See
+ * the `model` carve-out comment in `createTaskExtended` (`src/be/db.ts`).
+ *
+ * Routing: the parent's assigned worker (`parent.agentId`) is preferred if
+ * its `lastActivityAt` is within `WORKER_LIVENESS_WINDOW_SECONDS` AND it has
+ * remaining capacity (`getActiveTaskCount < agent.maxTasks`). Otherwise the
+ * resume task goes to the unassigned pool for any worker to pick up.
+ */
+export function createResumeFollowUp(args: {
+  parentId: string;
+  reason: ResumeReason;
+}): CreateResumeFollowUpResult {
+  const parent = getTaskById(args.parentId);
+  if (!parent) return { kind: "skipped", reason: "parent_not_found" };
+  // Workflow carve-out — let the engine's retry policy handle recovery.
+  if (parent.workflowRunStepId) {
+    return { kind: "workflow-skip", stepId: parent.workflowRunStepId };
+  }
+  // Routing decision — same DB process so the read-then-create window is
+  // small. Acceptable for v1 per the plan (the unassigned-pool fallback
+  // covers the race anyway).
+  //
+  // For `graceful_shutdown` specifically, force the unassigned-pool path:
+  // the parent worker is exiting and will call `closeAgent` (→ offline)
+  // moments after the supersede loop. At the moment of this check it
+  // still looks fresh + has capacity (the parent just terminal-
+  // transitioned), so the liveness branch would assign the resume task to
+  // a dying worker — leaving it orphaned in `pending` once the worker
+  // closes. Pool routing lets any live worker claim it.
+  //
+  // Other reasons keep the liveness-aware routing:
+  //   - `crash_recovery`: parent worker is presumed dead → `lastActivityAt`
+  //     is stale or `status === "offline"`, so the existing check already
+  //     rejects it naturally.
+  //   - `context_limits` / `manual_supersede`: the worker is alive and
+  //     can keep handling the resume task on a fresh session.
+  let preferredAgentId: string | undefined;
+  if (parent.agentId && args.reason !== "graceful_shutdown") {
+    const candidate = getAgentById(parent.agentId);
+    if (candidate && candidate.status !== "offline") {
+      const lastActivity = candidate.lastActivityAt ? Date.parse(candidate.lastActivityAt) : 0;
+      const fresh =
+        Number.isFinite(lastActivity) &&
+        Date.now() - lastActivity < WORKER_LIVENESS_WINDOW_SECONDS * 1000;
+      const activeCount = getActiveTaskCount(candidate.id);
+      const hasCap = activeCount < (candidate.maxTasks ?? 1);
+      if (fresh && hasCap) {
+        preferredAgentId = candidate.id;
+      }
+    }
+  }
+  const parentDesc = parent.task.slice(0, 200);
+  const followUpDescription = [
+    "Resume interrupted task.",
+    "",
+    `Parent task: ${parentDesc}`,
+    "",
+    `Reason: ${args.reason}`,
+    "",
+    "The full prior context (description, recent tool calls, artifacts) is",
+    "prepended to this prompt at dispatch time via the resume context preamble.",
+    "Do NOT redo work already completed — extend it.",
+  ].join("\n");
+  const priority = Math.min(100, (parent.priority ?? 50) + 10);
+  const tags = ["auto-resume", `reason:${args.reason}`];
+  // Identity-shaped fields (dir, VCS provider/repo/number/url/etc.,
+  // outputSchema, slack channel/thread/user, agentmail, mention, contextKey,
+  // requestedByUserId, followUpConfig) are auto-inherited from the parent by
+  // `createTaskExtended`'s parentTaskId block (see src/be/db.ts). `model` is
+  // deliberately excluded there so the resume task resolves to the claiming
+  // agent's own provider/model — never the parent's concrete model string.
+  // We only override what's SPECIFIC to the resume task here.
+  const created = createTaskExtended(followUpDescription, {
+    agentId: preferredAgentId,
+    creatorAgentId: parent.creatorAgentId,
+    source: "system",
+    taskType: "resume",
+    tags,
+    priority,
+    parentTaskId: parent.id,
+  });
+  // Repoint Linear / Jira `tracker_sync` rows from the (now terminal) parent
+  // to the resume child. Without this, outbound completion posts for the
+  // resume task can't find their tracker_sync row, and subsequent inbound
+  // webhooks load the terminal parent and create duplicate tasks.
+  //
+  // Safe to call when no tracker_sync rows exist for this parent (no-op).
+  // Covers all providers (Linear AND Jira) and entity types in one call.
+  const repointed = repointTrackerSyncBySwarmId(parent.id, created.id);
+  if (repointed > 0) {
+    console.log(
+      `[ResumeFollowUp] Repointed ${repointed} tracker_sync row(s) from ${parent.id.slice(0, 8)} → ${created.id.slice(0, 8)}`,
+    );
+  }
+  return { kind: "created", task: created };
+}

package/src/telemetry.ts CHANGED

@@ -153,6 +153,16 @@ function isCloudDeployment(): boolean {
   return raw === "true" || raw === "1";
 }
+function getTelemetryEnvironment(): string {
+  const explicit = process.env.DESPLEGA_TELEMETRY_ENV?.trim();
+  if (explicit) return explicit;
+  // Do not default from NODE_ENV: shipped Bun/npm installs can report
+  // "development" even when the operator did not choose a telemetry cohort.
+  if (process.env.NODE_ENV === "test") return "test";
+  return "production";
+}
 /** Fire-and-forget telemetry event. Never throws, never blocks. */
 export function track(options: TrackOptions): void {
   if (!isEnabled() || !installationId) return;
@@ -177,7 +187,7 @@ export function track(options: TrackOptions): void {
       metadata: {
         transport: "https",
         schema_version: 1,
-        environment: process.env.NODE_ENV ?? "production",
+        environment: getTelemetryEnvironment(),
         is_cloud: isCloudDeployment(),
         ...getOrgIdentity(),
         ...options.metadata,