@desplega.ai/agent-swarm 1.86.0 → 1.87.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/openapi.json +72 -1
  2. package/package.json +3 -1
  3. package/src/be/db-queries/tracker.ts +21 -0
  4. package/src/be/db.ts +235 -14
  5. package/src/be/migrations/079_task_followup_config.sql +1 -0
  6. package/src/be/modelsdev-cache.json +77663 -74073
  7. package/src/cli.tsx +26 -0
  8. package/src/commands/context-preamble.ts +272 -0
  9. package/src/commands/e2b.ts +728 -0
  10. package/src/commands/resume-session.ts +35 -78
  11. package/src/commands/runner.ts +125 -13
  12. package/src/e2b/dispatch.ts +429 -0
  13. package/src/e2b/env.ts +206 -0
  14. package/src/heartbeat/heartbeat.ts +145 -30
  15. package/src/heartbeat/templates.ts +11 -7
  16. package/src/http/session-data.ts +8 -1
  17. package/src/http/tasks.ts +152 -3
  18. package/src/jira/sync.ts +4 -4
  19. package/src/linear/sync.ts +6 -5
  20. package/src/providers/claude-adapter.ts +10 -76
  21. package/src/providers/claude-managed-adapter.ts +61 -75
  22. package/src/providers/codex-adapter.ts +15 -18
  23. package/src/providers/codex-oauth/auth-json.ts +18 -1
  24. package/src/providers/codex-oauth/flow.ts +24 -1
  25. package/src/providers/types.ts +6 -0
  26. package/src/tasks/worker-follow-up.ts +162 -2
  27. package/src/telemetry.ts +11 -1
  28. package/src/tests/claude-adapter.test.ts +5 -27
  29. package/src/tests/claude-managed-adapter.test.ts +38 -52
  30. package/src/tests/codex-adapter.test.ts +6 -31
  31. package/src/tests/codex-oauth.test.ts +149 -3
  32. package/src/tests/codex-pool.test.ts +14 -3
  33. package/src/tests/e2b-dispatch.test.ts +330 -0
  34. package/src/tests/heartbeat-supersede-resume.test.ts +285 -0
  35. package/src/tests/heartbeat.test.ts +26 -16
  36. package/src/tests/prompt-template-remaining.test.ts +4 -0
  37. package/src/tests/resume-session.test.ts +42 -50
  38. package/src/tests/structured-output.test.ts +69 -0
  39. package/src/tests/task-completion-idempotency.test.ts +185 -2
  40. package/src/tests/task-supersede-resume.test.ts +722 -0
  41. package/src/tests/telemetry-init.test.ts +69 -0
  42. package/src/tests/vcs-tracking.test.ts +39 -0
  43. package/src/tools/send-task.ts +12 -1
  44. package/src/tools/store-progress.ts +2 -2
  45. package/src/tools/templates.ts +14 -2
  46. package/src/types.ts +46 -1
  47. package/src/workflows/executors/agent-task.ts +3 -0
@@ -423,10 +423,6 @@ class ClaudeSession implements ProviderSession {
423
423
  this.config.prompt,
424
424
  ];
425
425
 
426
- if (this.config.resumeSessionId) {
427
- cmd.push("--resume", this.config.resumeSessionId);
428
- }
429
-
430
426
  if (this.config.additionalArgs?.length) {
431
427
  cmd.push(...this.config.additionalArgs);
432
428
  }
@@ -728,78 +724,7 @@ class ClaudeSession implements ProviderSession {
728
724
  }
729
725
 
730
726
  async waitForCompletion(): Promise<ProviderResult> {
731
- const result = await this.completionPromise;
732
-
733
- // Stale session retry: if process failed because session not found and we used --resume,
734
- // strip --resume and retry with a fresh session
735
- if (result.exitCode !== 0 && this.errorTracker.isSessionNotFound()) {
736
- const hasResume =
737
- !!this.config.resumeSessionId || (this.config.additionalArgs || []).includes("--resume");
738
- if (hasResume) {
739
- console.log(
740
- `\x1b[33m[${this.config.role}] Session resume failed for task ${this.config.taskId.slice(0, 8)} — retrying without --resume\x1b[0m`,
741
- );
742
-
743
- const freshArgs = (this.config.additionalArgs || []).filter((arg, idx, arr) => {
744
- if (arg === "--resume") return false;
745
- if (idx > 0 && arr[idx - 1] === "--resume") return false;
746
- return true;
747
- });
748
-
749
- const logDir = this.config.logFile.substring(0, this.config.logFile.lastIndexOf("/"));
750
- const retryTimestamp = new Date().toISOString().replace(/[:.]/g, "-");
751
- const retryLogFile = `${logDir}/${retryTimestamp}-retry-${this.config.taskId.slice(0, 8)}.jsonl`;
752
-
753
- const retryConfig: ProviderSessionConfig = {
754
- ...this.config,
755
- additionalArgs: freshArgs,
756
- logFile: retryLogFile,
757
- resumeSessionId: undefined,
758
- };
759
-
760
- // Write new task file for retry
761
- const taskFilePath = await writeTaskFile(this.taskFilePid, {
762
- taskId: this.config.taskId,
763
- agentId: this.config.agentId,
764
- startedAt: new Date().toISOString(),
765
- });
766
-
767
- // Re-stage the system prompt for the retry — the original was unlinked
768
- // when the first session finished. Same soft-fail semantics: null
769
- // falls back to the inline --append-system-prompt argv.
770
- let retrySystemPromptFile: string | null = null;
771
- if (retryConfig.systemPrompt) {
772
- const candidate = getSystemPromptFilePath(retryConfig.taskId);
773
- try {
774
- await writeFile(candidate, retryConfig.systemPrompt);
775
- retrySystemPromptFile = candidate;
776
- } catch (err) {
777
- console.warn(
778
- `\x1b[33m[claude]\x1b[0m Failed to stage retry system prompt to ${candidate} (${err}); falling back to --append-system-prompt argv.`,
779
- );
780
- }
781
- }
782
-
783
- const retrySession = new ClaudeSession(
784
- retryConfig,
785
- this.model,
786
- taskFilePath,
787
- this.taskFilePid,
788
- null,
789
- this.claudeBinaryArgv,
790
- retrySystemPromptFile,
791
- );
792
-
793
- // Forward events from retry to our listeners
794
- for (const listener of this.listeners) {
795
- retrySession.onEvent(listener);
796
- }
797
-
798
- return retrySession.waitForCompletion();
799
- }
800
- }
801
-
802
- return result;
727
+ return this.completionPromise;
803
728
  }
804
729
 
805
730
  async abort(): Promise<void> {
@@ -812,6 +737,15 @@ export class ClaudeAdapter implements ProviderAdapter {
812
737
  readonly traits = { hasMcp: true, hasLocalEnvironment: true };
813
738
 
814
739
  async createSession(config: ProviderSessionConfig): Promise<ProviderSession> {
740
+ // Native resume is deprecated. Follow-up continuity is delivered via the
741
+ // context preamble (see src/commands/context-preamble.ts). Any stray
742
+ // resumeSessionId is logged and ignored — we always spawn a fresh session.
743
+ if (config.resumeSessionId) {
744
+ console.warn(
745
+ "[claude-adapter] resumeSessionId ignored — native resume is disabled by deprecation plan",
746
+ );
747
+ }
748
+
815
749
  const model = config.model || "opus";
816
750
 
817
751
  const credType = validateClaudeCredentials(config.env || process.env);
@@ -808,85 +808,71 @@ export class ClaudeManagedAdapter implements ProviderAdapter {
808
808
  }
809
809
 
810
810
  async createSession(config: ProviderSessionConfig): Promise<ProviderSession> {
811
- let sessionId: string;
812
- let userMessageContent: BetaManagedAgentsTextBlock[] | null;
811
+ // Native resume is deprecated. Follow-up continuity is delivered via the
812
+ // context preamble (see src/commands/context-preamble.ts). Any stray
813
+ // resumeSessionId is logged and ignored — we always create a fresh session.
814
+ if (config.resumeSessionId) {
815
+ console.warn(
816
+ "[claude-managed-adapter] resumeSessionId ignored — native resume is disabled by deprecation plan",
817
+ );
818
+ }
819
+
813
820
  const seenEventIds = new Set<string>();
814
821
 
815
- if (config.resumeSessionId) {
816
- // Resume path: skip `sessions.create`. Pre-fetch event history via
817
- // `events.list` so the SSE loop can skip duplicates that the live
818
- // stream replays. NO new `user.message` is sent (the agent already
819
- // has one in flight).
820
- sessionId = config.resumeSessionId;
821
- userMessageContent = null;
822
- try {
823
- const list = await Promise.resolve(this.client.beta.sessions.events.list(sessionId));
824
- for await (const evt of list) {
825
- if ("id" in evt && evt.id) {
826
- seenEventIds.add(evt.id);
827
- }
828
- }
829
- } catch {
830
- // If history fetch fails, fall through with an empty `seenEventIds`
831
- // the worst case is that the listener sees a few duplicate events
832
- // (which the runner-side dedup handles).
833
- }
834
- } else {
835
- // Fresh session. Compose the cache-control-annotated user message and
836
- // open the managed session against the pre-existing agent + env.
837
- userMessageContent = composeManagedUserMessage(config);
838
- // Phase 4: derive `resources` from `config.vcsRepo` (which the runner
839
- // copies from `task.vcsRepo` at the spawn site, see
840
- // src/commands/runner.ts:3296). The SDK contract is
841
- // `BetaManagedAgentsGitHubRepositoryResourceParams`:
842
- // { type: 'github_repository', url, authorization_token, checkout?: { type: 'branch', name } }
843
- // We default `branch` to "main" since `ProviderSessionConfig` only
844
- // carries the repo identifier as a string.
845
- //
846
- // GitHub auth: prefer the operator-side `MANAGED_GITHUB_VAULT_ID`
847
- // (passed via `vault_ids` on the session — see runbook §"Claude Managed
848
- // Agents — GitHub access"). If a literal PAT is supplied via
849
- // `MANAGED_GITHUB_TOKEN`, use that instead. Without either, the SDK's
850
- // required `authorization_token` field gets an empty string and the
851
- // operator sees an authentication error from Anthropic — which is
852
- // strictly better than silently dropping `resources`.
853
- const createParams: Record<string, unknown> = {
854
- agent: this.agentId,
855
- environment_id: this.environmentId,
856
- title: `Task ${config.taskId}`,
857
- metadata: {
858
- swarmAgentId: config.agentId,
859
- swarmTaskId: config.taskId,
822
+ // Fresh session. Compose the cache-control-annotated user message and
823
+ // open the managed session against the pre-existing agent + env.
824
+ const userMessageContent: BetaManagedAgentsTextBlock[] | null =
825
+ composeManagedUserMessage(config);
826
+ // Phase 4: derive `resources` from `config.vcsRepo` (which the runner
827
+ // copies from `task.vcsRepo` at the spawn site, see
828
+ // src/commands/runner.ts:3296). The SDK contract is
829
+ // `BetaManagedAgentsGitHubRepositoryResourceParams`:
830
+ // { type: 'github_repository', url, authorization_token, checkout?: { type: 'branch', name } }
831
+ // We default `branch` to "main" since `ProviderSessionConfig` only
832
+ // carries the repo identifier as a string.
833
+ //
834
+ // GitHub auth: prefer the operator-side `MANAGED_GITHUB_VAULT_ID`
835
+ // (passed via `vault_ids` on the session — see runbook §"Claude Managed
836
+ // Agents — GitHub access"). If a literal PAT is supplied via
837
+ // `MANAGED_GITHUB_TOKEN`, use that instead. Without either, the SDK's
838
+ // required `authorization_token` field gets an empty string and the
839
+ // operator sees an authentication error from Anthropic — which is
840
+ // strictly better than silently dropping `resources`.
841
+ const createParams: Record<string, unknown> = {
842
+ agent: this.agentId,
843
+ environment_id: this.environmentId,
844
+ title: `Task ${config.taskId}`,
845
+ metadata: {
846
+ swarmAgentId: config.agentId,
847
+ swarmTaskId: config.taskId,
848
+ },
849
+ };
850
+ if (config.vcsRepo) {
851
+ const repoUrl = normalizeRepoUrl(config.vcsRepo);
852
+ const branch = "main"; // ProviderSessionConfig doesn't carry per-task branch info today.
853
+ const githubToken = process.env.MANAGED_GITHUB_TOKEN ?? "";
854
+ createParams.resources = [
855
+ {
856
+ type: "github_repository",
857
+ url: repoUrl,
858
+ authorization_token: githubToken,
859
+ checkout: { type: "branch", name: branch },
860
860
  },
861
- };
862
- if (config.vcsRepo) {
863
- const repoUrl = normalizeRepoUrl(config.vcsRepo);
864
- const branch = "main"; // ProviderSessionConfig doesn't carry per-task branch info today.
865
- const githubToken = process.env.MANAGED_GITHUB_TOKEN ?? "";
866
- createParams.resources = [
867
- {
868
- type: "github_repository",
869
- url: repoUrl,
870
- authorization_token: githubToken,
871
- checkout: { type: "branch", name: branch },
872
- },
873
- ];
874
- }
875
- // Multiple vaults can be linked to a single session — `vault_ids` is an
876
- // array. The MCP vault holds the static-bearer credential for our
877
- // `/mcp` endpoint (provisioned by `claude-managed-setup`); the GitHub
878
- // vault holds the credential used by the `github_repository` resource.
879
- // Either or both may be unset.
880
- const vaultIds = [
881
- process.env.MANAGED_MCP_VAULT_ID,
882
- process.env.MANAGED_GITHUB_VAULT_ID,
883
- ].filter((v): v is string => !!v && v.length > 0);
884
- if (vaultIds.length > 0) {
885
- createParams.vault_ids = Array.from(new Set(vaultIds));
886
- }
887
- const created = await Promise.resolve(this.client.beta.sessions.create(createParams));
888
- sessionId = created.id;
861
+ ];
862
+ }
863
+ // Multiple vaults can be linked to a single session — `vault_ids` is an
864
+ // array. The MCP vault holds the static-bearer credential for our
865
+ // `/mcp` endpoint (provisioned by `claude-managed-setup`); the GitHub
866
+ // vault holds the credential used by the `github_repository` resource.
867
+ // Either or both may be unset.
868
+ const vaultIds = [process.env.MANAGED_MCP_VAULT_ID, process.env.MANAGED_GITHUB_VAULT_ID].filter(
869
+ (v): v is string => !!v && v.length > 0,
870
+ );
871
+ if (vaultIds.length > 0) {
872
+ createParams.vault_ids = Array.from(new Set(vaultIds));
889
873
  }
874
+ const created = await Promise.resolve(this.client.beta.sessions.create(createParams));
875
+ const sessionId = created.id;
890
876
 
891
877
  return new ClaudeManagedSession(
892
878
  this.client,
@@ -6,7 +6,8 @@
6
6
  *
7
7
  * Phase 1 — factory wiring + skeleton classes.
8
8
  * Phase 2 — event stream normalization, CostData, AbortController, log file,
9
- * AGENTS.md system-prompt injection, canResume via resumeThread.
9
+ * AGENTS.md system-prompt injection. (Native resume was removed in
10
+ * the 2026-05-28 deprecate-native-resume plan — see context-preamble.ts.)
10
11
  * Phase 3 — per-session MCP config builder + model catalogue wiring. The
11
12
  * baseline Codex config (`~/.codex/config.toml`) is written at
12
13
  * Docker image build time (deferred to Phase 6). For local dev
@@ -1280,9 +1281,15 @@ export async function createInProcessCodexSession(
1280
1281
  model: resolvedModel,
1281
1282
  };
1282
1283
 
1283
- const thread = config.resumeSessionId
1284
- ? codex.resumeThread(config.resumeSessionId, threadOptions)
1285
- : codex.startThread(threadOptions);
1284
+ // Native resume is deprecated. Follow-up continuity is delivered via the
1285
+ // context preamble (see src/commands/context-preamble.ts). Any stray
1286
+ // resumeSessionId is logged and ignored — we always start a fresh thread.
1287
+ if (config.resumeSessionId) {
1288
+ console.warn(
1289
+ "[codex-adapter] resumeSessionId ignored — native resume is disabled by deprecation plan",
1290
+ );
1291
+ }
1292
+ const thread = codex.startThread(threadOptions);
1286
1293
 
1287
1294
  return new CodexSession(
1288
1295
  thread,
@@ -1615,20 +1622,10 @@ export class CodexAdapter implements ProviderAdapter {
1615
1622
  return new CodexSubprocessSession(config, this.skillsDir);
1616
1623
  }
1617
1624
 
1618
- async canResume(sessionId: string): Promise<boolean> {
1619
- if (!sessionId || typeof sessionId !== "string") {
1620
- return false;
1621
- }
1622
- try {
1623
- const codex = new Codex();
1624
- // `resumeThread` is synchronous in 0.118.x and returns a Thread handle.
1625
- // The runner only calls canResume when deciding whether to resume a
1626
- // task, so we accept the (cheap) handshake cost.
1627
- codex.resumeThread(sessionId);
1628
- return true;
1629
- } catch {
1630
- return false;
1631
- }
1625
+ async canResume(_sessionId: string): Promise<boolean> {
1626
+ // Native resume is deprecated; runner no longer threads resumeSessionId
1627
+ // to adapters. Follow-up continuity flows via the context preamble.
1628
+ return false;
1632
1629
  }
1633
1630
 
1634
1631
  formatCommand(commandName: string): string {
@@ -22,14 +22,31 @@
22
22
  * purposes and doesn't validate it as a separate JWT.
23
23
  */
24
24
 
25
+ import { extractChatgptUserId } from "./flow.js";
25
26
  import type { CodexAuthJson, CodexOAuthCredentials } from "./types.js";
26
27
 
27
28
  export function authJsonToCredentialSelection(auth: CodexAuthJson, slot = 0, total = 1) {
29
+ // Prefer the per-grant `chatgpt_user_id` so two slots authenticated against
30
+ // the same ChatGPT Team workspace get distinct suffixes. Fall back to
31
+ // account_id when the JWT lacks the claim — preserves boot for any
32
+ // unexpected token shape, at the cost of re-introducing the slot-collision
33
+ // bug for that specific slot only. The warn is a deliberate canary.
34
+ const userId = extractChatgptUserId(auth.tokens.access_token);
35
+ const suffixSource = userId ?? auth.tokens.account_id;
36
+ if (!userId) {
37
+ console.warn(
38
+ "[codex-oauth] No chatgpt_user_id in JWT — falling back to account_id for keySuffix derivation. " +
39
+ "If two slots share an account, their suffixes will collide.",
40
+ );
41
+ }
28
42
  return {
43
+ // `selected` satisfies the CredentialSelection interface but is never read
44
+ // for CODEX_OAUTH: creds are materialised to ~/.codex/auth.json (not env-injected),
45
+ // and all tracking flows through `keySuffix` + `index` (never `selected`).
29
46
  selected: auth.tokens.account_id,
30
47
  index: slot,
31
48
  total,
32
- keySuffix: auth.tokens.account_id.slice(-5),
49
+ keySuffix: suffixSource.slice(-5),
33
50
  keyType: "CODEX_OAUTH",
34
51
  };
35
52
  }
@@ -85,7 +85,12 @@ export function decodeJwt(token: string): Record<string, unknown> | null {
85
85
  const parts = token.split(".");
86
86
  if (parts.length !== 3) return null;
87
87
  const payload = parts[1] ?? "";
88
- const decoded = atob(payload);
88
+ // Normalize base64url → standard base64 before decoding.
89
+ // JWTs use base64url (RFC 7515): '-' replaces '+', '_' replaces '/', padding stripped.
90
+ // atob() only accepts standard base64; passing raw base64url throws on '-' or '_'.
91
+ const b64 = payload.replace(/-/g, "+").replace(/_/g, "/");
92
+ const padded = b64 + "=".repeat((4 - (b64.length % 4)) % 4);
93
+ const decoded = atob(padded);
89
94
  return JSON.parse(decoded);
90
95
  } catch {
91
96
  return null;
@@ -172,6 +177,24 @@ export function getAccountId(accessToken: string): string | null {
172
177
  return typeof accountId === "string" && accountId.length > 0 ? accountId : null;
173
178
  }
174
179
 
180
+ /**
181
+ * Extract `chatgpt_user_id` from the JWT's OpenAI-auth claim namespace. This
182
+ * is per-OAuth-grant (unique per user-on-account), distinct from
183
+ * `chatgpt_account_id` which is shared across all users in a ChatGPT Team
184
+ * workspace. Used to give pooled credentials a slot-unique `keySuffix` even
185
+ * when multiple pool slots authenticate against the same Team account.
186
+ *
187
+ * Returns null if the JWT cannot be decoded, the namespace is absent, or the
188
+ * claim is missing/empty. Callers MUST handle null by falling back to a
189
+ * different identifier (typically `chatgpt_account_id`) — do not throw.
190
+ */
191
+ export function extractChatgptUserId(accessToken: string): string | null {
192
+ const payload = decodeJwt(accessToken);
193
+ const auth = payload?.[JWT_CLAIM_PATH] as Record<string, unknown> | undefined;
194
+ const userId = auth?.chatgpt_user_id;
195
+ return typeof userId === "string" && userId.length > 0 ? userId : null;
196
+ }
197
+
175
198
  export async function createAuthorizationFlow(
176
199
  originator = "agent-swarm",
177
200
  ): Promise<{ verifier: string; state: string; url: string }> {
@@ -89,6 +89,12 @@ export interface ProviderSessionConfig {
89
89
  apiKey: string;
90
90
  cwd: string;
91
91
  vcsRepo?: string;
92
+ /**
93
+ * @deprecated Never set by the runner — native session resume was removed in
94
+ * the 2026-05-28 plan. Adapters log + ignore any stray value. Follow-up
95
+ * continuity flows through the context preamble; see
96
+ * `src/commands/context-preamble.ts` and `src/commands/resume-session.ts`.
97
+ */
92
98
  resumeSessionId?: string;
93
99
  iteration?: number;
94
100
  logFile: string;
@@ -1,9 +1,27 @@
1
- import { createTaskExtended, getAgentById, getLeadAgent, getTaskAttachments } from "../be/db";
1
+ import {
2
+ createTaskExtended,
3
+ getActiveTaskCount,
4
+ getAgentById,
5
+ getLeadAgent,
6
+ getTaskAttachments,
7
+ getTaskById,
8
+ } from "../be/db";
9
+ import { repointTrackerSyncBySwarmId } from "../be/db-queries/tracker";
2
10
  import { resolveTemplate } from "../prompts/resolver";
3
- import type { AgentTask, TaskAttachment } from "../types";
11
+ import type { AgentTask, ResumeReason, TaskAttachment } from "../types";
4
12
  // Side-effect import: registers task lifecycle templates in the in-memory registry.
5
13
  import "../tools/templates";
6
14
 
15
+ /**
16
+ * Liveness window (seconds) for considering a worker "online" enough to
17
+ * pre-assign a resume task. Defaults to 30s; override via env. The worker
18
+ * heartbeats `lastActivityAt` on its agent row at least once per
19
+ * provider tool-call / poll tick, so 30s comfortably covers a healthy worker.
20
+ */
21
+ export const WORKER_LIVENESS_WINDOW_SECONDS = Number(
22
+ process.env.WORKER_LIVENESS_WINDOW_SECONDS || "30",
23
+ );
24
+
7
25
  function attachmentPointer(a: TaskAttachment): string {
8
26
  switch (a.kind) {
9
27
  case "url":
@@ -36,6 +54,7 @@ export function createWorkerTaskFollowUp(args: {
36
54
  const { task, status, output, failureReason } = args;
37
55
 
38
56
  if (task.workflowRunId) return null;
57
+ if (task.followUpConfig?.disabled === true) return null;
39
58
 
40
59
  const taskAgent = getAgentById(task.agentId ?? "");
41
60
  if (!taskAgent || taskAgent.isLead) return null;
@@ -45,6 +64,16 @@ export function createWorkerTaskFollowUp(args: {
45
64
 
46
65
  const agentName = taskAgent.name || task.agentId?.slice(0, 8) || "Unknown";
47
66
  const taskDesc = task.task.slice(0, 200);
67
+ const creatorAgent = task.creatorAgentId
68
+ ? `${task.creatorAgentId}${task.creatorAgentId === leadAgent.id ? " (you)" : ""}`
69
+ : "<none>";
70
+ const instructions =
71
+ status === "completed"
72
+ ? (task.followUpConfig?.onCompleted ?? "")
73
+ : (task.followUpConfig?.onFailed ?? "");
74
+ const followUpInstructions = instructions
75
+ ? `\nAdditional instructions from the task creator:\n${instructions}\n`
76
+ : "";
48
77
 
49
78
  let followUpDescription: string;
50
79
  if (status === "completed") {
@@ -55,7 +84,9 @@ export function createWorkerTaskFollowUp(args: {
55
84
  const completedResult = resolveTemplate("task.worker.completed", {
56
85
  agent_name: agentName,
57
86
  task_desc: taskDesc,
87
+ creator_agent: creatorAgent,
58
88
  output_summary: outputSummary,
89
+ follow_up_instructions: followUpInstructions,
59
90
  task_id: task.id,
60
91
  });
61
92
  followUpDescription = completedResult.text;
@@ -64,7 +95,9 @@ export function createWorkerTaskFollowUp(args: {
64
95
  const failedResult = resolveTemplate("task.worker.failed", {
65
96
  agent_name: agentName,
66
97
  task_desc: taskDesc,
98
+ creator_agent: creatorAgent,
67
99
  failure_reason: reason,
100
+ follow_up_instructions: followUpInstructions,
68
101
  task_id: task.id,
69
102
  });
70
103
  followUpDescription = failedResult.text;
@@ -80,3 +113,130 @@ export function createWorkerTaskFollowUp(args: {
80
113
  slackUserId: task.slackUserId,
81
114
  });
82
115
  }
116
+
117
+ /** Result of `createResumeFollowUp`. */
118
+ export type CreateResumeFollowUpResult =
119
+ | { kind: "created"; task: AgentTask }
120
+ | { kind: "workflow-skip"; stepId: string }
121
+ | { kind: "skipped"; reason: "parent_not_found" | "lead_not_found" };
122
+
123
+ /**
124
+ * Create a "resume" follow-up task for a parent that is being superseded
125
+ * (graceful shutdown, context-limit pressure, manual operator action).
126
+ *
127
+ * Workflow carve-out: if the parent is a workflow step (`workflowRunStepId`
128
+ * is set), no follow-up is created. Returns `{ kind: 'workflow-skip', stepId }`
129
+ * so the caller can `failTask(parent.id, 'superseded_workflow_task')` and let
130
+ * the workflow engine's retry/failure policy take over.
131
+ *
132
+ * Field inheritance is transitive via `createTaskExtended`'s `parentTaskId`
133
+ * lookup (`dir`, `vcsRepo`/`vcsProvider`/etc., `outputSchema`, Slack/AgentMail
134
+ * context, `requestedByUserId`, `contextKey`, `followUpConfig`). This was chosen
135
+ * over re-listing fields here so there is a single source of truth.
136
+ *
137
+ * `model` is intentionally NOT inherited: a resume task is routinely claimed by
138
+ * a different worker (and thus a different harness/provider) than the parent, so
139
+ * carrying the parent's concrete provider-specific model would break the child
140
+ * at session-init. The resume task runs on the assignee agent's own model. See
141
+ * the `model` carve-out comment in `createTaskExtended` (`src/be/db.ts`).
142
+ *
143
+ * Routing: the parent's assigned worker (`parent.agentId`) is preferred if
144
+ * its `lastActivityAt` is within `WORKER_LIVENESS_WINDOW_SECONDS` AND it has
145
+ * remaining capacity (`getActiveTaskCount < agent.maxTasks`). Otherwise the
146
+ * resume task goes to the unassigned pool for any worker to pick up.
147
+ */
148
+ export function createResumeFollowUp(args: {
149
+ parentId: string;
150
+ reason: ResumeReason;
151
+ }): CreateResumeFollowUpResult {
152
+ const parent = getTaskById(args.parentId);
153
+ if (!parent) return { kind: "skipped", reason: "parent_not_found" };
154
+
155
+ // Workflow carve-out — let the engine's retry policy handle recovery.
156
+ if (parent.workflowRunStepId) {
157
+ return { kind: "workflow-skip", stepId: parent.workflowRunStepId };
158
+ }
159
+
160
+ // Routing decision — same DB process so the read-then-create window is
161
+ // small. Acceptable for v1 per the plan (the unassigned-pool fallback
162
+ // covers the race anyway).
163
+ //
164
+ // For `graceful_shutdown` specifically, force the unassigned-pool path:
165
+ // the parent worker is exiting and will call `closeAgent` (→ offline)
166
+ // moments after the supersede loop. At the moment of this check it
167
+ // still looks fresh + has capacity (the parent just terminal-
168
+ // transitioned), so the liveness branch would assign the resume task to
169
+ // a dying worker — leaving it orphaned in `pending` once the worker
170
+ // closes. Pool routing lets any live worker claim it.
171
+ //
172
+ // Other reasons keep the liveness-aware routing:
173
+ // - `crash_recovery`: parent worker is presumed dead → `lastActivityAt`
174
+ // is stale or `status === "offline"`, so the existing check already
175
+ // rejects it naturally.
176
+ // - `context_limits` / `manual_supersede`: the worker is alive and
177
+ // can keep handling the resume task on a fresh session.
178
+ let preferredAgentId: string | undefined;
179
+ if (parent.agentId && args.reason !== "graceful_shutdown") {
180
+ const candidate = getAgentById(parent.agentId);
181
+ if (candidate && candidate.status !== "offline") {
182
+ const lastActivity = candidate.lastActivityAt ? Date.parse(candidate.lastActivityAt) : 0;
183
+ const fresh =
184
+ Number.isFinite(lastActivity) &&
185
+ Date.now() - lastActivity < WORKER_LIVENESS_WINDOW_SECONDS * 1000;
186
+ const activeCount = getActiveTaskCount(candidate.id);
187
+ const hasCap = activeCount < (candidate.maxTasks ?? 1);
188
+ if (fresh && hasCap) {
189
+ preferredAgentId = candidate.id;
190
+ }
191
+ }
192
+ }
193
+
194
+ const parentDesc = parent.task.slice(0, 200);
195
+ const followUpDescription = [
196
+ "Resume interrupted task.",
197
+ "",
198
+ `Parent task: ${parentDesc}`,
199
+ "",
200
+ `Reason: ${args.reason}`,
201
+ "",
202
+ "The full prior context (description, recent tool calls, artifacts) is",
203
+ "prepended to this prompt at dispatch time via the resume context preamble.",
204
+ "Do NOT redo work already completed — extend it.",
205
+ ].join("\n");
206
+
207
+ const priority = Math.min(100, (parent.priority ?? 50) + 10);
208
+ const tags = ["auto-resume", `reason:${args.reason}`];
209
+
210
+ // Identity-shaped fields (dir, VCS provider/repo/number/url/etc.,
211
+ // outputSchema, slack channel/thread/user, agentmail, mention, contextKey,
212
+ // requestedByUserId, followUpConfig) are auto-inherited from the parent by
213
+ // `createTaskExtended`'s parentTaskId block (see src/be/db.ts). `model` is
214
+ // deliberately excluded there so the resume task resolves to the claiming
215
+ // agent's own provider/model — never the parent's concrete model string.
216
+ // We only override what's SPECIFIC to the resume task here.
217
+ const created = createTaskExtended(followUpDescription, {
218
+ agentId: preferredAgentId,
219
+ creatorAgentId: parent.creatorAgentId,
220
+ source: "system",
221
+ taskType: "resume",
222
+ tags,
223
+ priority,
224
+ parentTaskId: parent.id,
225
+ });
226
+
227
+ // Repoint Linear / Jira `tracker_sync` rows from the (now terminal) parent
228
+ // to the resume child. Without this, outbound completion posts for the
229
+ // resume task can't find their tracker_sync row, and subsequent inbound
230
+ // webhooks load the terminal parent and create duplicate tasks.
231
+ //
232
+ // Safe to call when no tracker_sync rows exist for this parent (no-op).
233
+ // Covers all providers (Linear AND Jira) and entity types in one call.
234
+ const repointed = repointTrackerSyncBySwarmId(parent.id, created.id);
235
+ if (repointed > 0) {
236
+ console.log(
237
+ `[ResumeFollowUp] Repointed ${repointed} tracker_sync row(s) from ${parent.id.slice(0, 8)} → ${created.id.slice(0, 8)}`,
238
+ );
239
+ }
240
+
241
+ return { kind: "created", task: created };
242
+ }
package/src/telemetry.ts CHANGED
@@ -153,6 +153,16 @@ function isCloudDeployment(): boolean {
153
153
  return raw === "true" || raw === "1";
154
154
  }
155
155
 
156
+ function getTelemetryEnvironment(): string {
157
+ const explicit = process.env.DESPLEGA_TELEMETRY_ENV?.trim();
158
+ if (explicit) return explicit;
159
+
160
+ // Do not default from NODE_ENV: shipped Bun/npm installs can report
161
+ // "development" even when the operator did not choose a telemetry cohort.
162
+ if (process.env.NODE_ENV === "test") return "test";
163
+ return "production";
164
+ }
165
+
156
166
  /** Fire-and-forget telemetry event. Never throws, never blocks. */
157
167
  export function track(options: TrackOptions): void {
158
168
  if (!isEnabled() || !installationId) return;
@@ -177,7 +187,7 @@ export function track(options: TrackOptions): void {
177
187
  metadata: {
178
188
  transport: "https",
179
189
  schema_version: 1,
180
- environment: process.env.NODE_ENV ?? "production",
190
+ environment: getTelemetryEnvironment(),
181
191
  is_cloud: isCloudDeployment(),
182
192
  ...getOrgIdentity(),
183
193
  ...options.metadata,