@desplega.ai/agent-swarm 1.100.1 → 1.100.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/openapi.json +1 -1
- package/package.json +1 -1
- package/src/be/db.ts +131 -4
- package/src/be/memory/raters/retrieval.ts +6 -3
- package/src/be/migrations/097_memory_retrieval_grouping.sql +10 -0
- package/src/github/handlers.ts +84 -7
- package/src/github/templates.ts +6 -2
- package/src/heartbeat/heartbeat.ts +191 -5
- package/src/providers/claude-adapter.ts +41 -4
- package/src/slack/assistant.ts +28 -0
- package/src/slack/channel-join.ts +38 -3
- package/src/slack/handlers.ts +4 -1
- package/src/tasks/worker-follow-up.ts +181 -20
- package/src/tests/claude-adapter-binary.test.ts +74 -0
- package/src/tests/github-handlers-inline-comments.test.ts +308 -0
- package/src/tests/heartbeat-reroute-decision.test.ts +570 -0
- package/src/tests/heartbeat-supersede-resume.test.ts +137 -0
- package/src/tests/heartbeat.test.ts +4 -2
- package/src/tests/memory-rater-implicit-citation.test.ts +31 -0
- package/src/tests/prompt-template-remaining.test.ts +2 -1
- package/src/tests/slack-assistant-comention-production.test.ts +319 -0
- package/src/tests/slack-assistant-comention.test.ts +139 -0
- package/src/tests/slack-channel-join.test.ts +150 -16
- package/src/tools/send-task.ts +51 -1
- package/src/tools/templates.ts +61 -0
package/src/providers/claude-adapter.ts
CHANGED
|
@@ -144,15 +144,46 @@ export function resolveClaudeBridgeEnabled(
|
|
|
144
144
|
return parseClaudeBridgeEnabled(candidate);
|
|
145
145
|
}
|
|
146
146
|
|
|
147
|
-
|
|
147
|
+
/**
|
|
148
|
+
* Resolve the claude binary argv, gating claude-bridge on an OAuth token.
|
|
149
|
+
*
|
|
150
|
+
* claude-bridge exists to keep subscription/OAuth billing correct by driving
|
|
151
|
+
* the real interactive Claude TUI in tmux. It authenticates the child claude
|
|
152
|
+
* from `CLAUDE_CODE_OAUTH_TOKEN` only — it deliberately strips `ANTHROPIC_*`
|
|
153
|
+
* from the launched process — so it cannot run on an Anthropic API key. And
|
|
154
|
+
* API-key billing is identical headless vs interactive, so there's no reason to
|
|
155
|
+
* pay the bridge's complexity/footguns when only an API key is available.
|
|
156
|
+
*
|
|
157
|
+
* Therefore: only route through claude-bridge when an OAuth token is present.
|
|
158
|
+
* If the bridge is requested (`SWARM_USE_CLAUDE_BRIDGE`) but no OAuth token is
|
|
159
|
+
* set, fall back to stock `claude`, which Claude Code authenticates fine from
|
|
160
|
+
* the API key. `bridgeRequestedWithoutOAuth` lets the caller log why.
|
|
161
|
+
*
|
|
162
|
+
* Exported for unit testing.
|
|
163
|
+
*/
|
|
164
|
+
export function resolveClaudeBinaryArgv(
|
|
148
165
|
resolvedEnv: Record<string, string | undefined>,
|
|
149
166
|
fallbackEnv: Record<string, string | undefined> = process.env,
|
|
150
|
-
): {
|
|
151
|
-
|
|
167
|
+
): {
|
|
168
|
+
raw: string;
|
|
169
|
+
argv: string[];
|
|
170
|
+
useClaudeBridge: boolean;
|
|
171
|
+
bridgeRequestedWithoutOAuth: boolean;
|
|
172
|
+
} {
|
|
173
|
+
const bridgeRequested = resolveClaudeBridgeEnabled(resolvedEnv, fallbackEnv);
|
|
174
|
+
const hasOAuthToken = Boolean(
|
|
175
|
+
(resolvedEnv.CLAUDE_CODE_OAUTH_TOKEN ?? fallbackEnv.CLAUDE_CODE_OAUTH_TOKEN)?.trim(),
|
|
176
|
+
);
|
|
177
|
+
const useClaudeBridge = bridgeRequested && hasOAuthToken;
|
|
152
178
|
const raw = useClaudeBridge
|
|
153
179
|
? CLAUDE_BRIDGE_BINARY
|
|
154
180
|
: resolveClaudeBinary(resolvedEnv, fallbackEnv);
|
|
155
|
-
return {
|
|
181
|
+
return {
|
|
182
|
+
raw,
|
|
183
|
+
argv: parseClaudeBinary(raw),
|
|
184
|
+
useClaudeBridge,
|
|
185
|
+
bridgeRequestedWithoutOAuth: bridgeRequested && !hasOAuthToken,
|
|
186
|
+
};
|
|
156
187
|
}
|
|
157
188
|
|
|
158
189
|
function isLegacyClaudeBridgeCompatBinary(raw: string): boolean {
|
|
@@ -898,7 +929,13 @@ export class ClaudeAdapter implements ProviderAdapter {
|
|
|
898
929
|
raw: claudeBinaryRaw,
|
|
899
930
|
argv: claudeBinaryArgv,
|
|
900
931
|
useClaudeBridge,
|
|
932
|
+
bridgeRequestedWithoutOAuth,
|
|
901
933
|
} = resolveClaudeBinaryArgv(sourceEnv);
|
|
934
|
+
if (bridgeRequestedWithoutOAuth) {
|
|
935
|
+
console.warn(
|
|
936
|
+
`\x1b[33m[claude]\x1b[0m SWARM_USE_CLAUDE_BRIDGE is set but no CLAUDE_CODE_OAUTH_TOKEN is present — falling back to stock 'claude'. claude-bridge requires a subscription/OAuth token (it forwards only the OAuth token to claude and strips ANTHROPIC_*); API-key billing is identical headless vs interactive, so the bridge isn't needed.`,
|
|
937
|
+
);
|
|
938
|
+
}
|
|
902
939
|
const isLegacyBridgeCompat = isLegacyClaudeBridgeCompatBinary(claudeBinaryRaw);
|
|
903
940
|
const effectiveClaudeBinaryArgv = useClaudeBridge
|
|
904
941
|
? withClaudeBridgeAuthArgs(claudeBinaryArgv, sourceEnv)
|
package/src/slack/assistant.ts
CHANGED
|
@@ -5,12 +5,17 @@ import { slackContextKey } from "../tasks/context-key";
|
|
|
5
5
|
import { createTaskWithSiblingAwareness } from "../tasks/sibling-awareness";
|
|
6
6
|
import { resolveSlackUserId } from "./enrich";
|
|
7
7
|
import { wasEventSeen } from "./event-dedup";
|
|
8
|
+
import { hasOtherUserMention } from "./router";
|
|
8
9
|
import { bufferThreadMessage } from "./thread-buffer";
|
|
9
10
|
// Side-effect import: registers all Slack event templates in the in-memory registry
|
|
10
11
|
import "./templates";
|
|
11
12
|
|
|
12
13
|
const additiveSlack = process.env.ADDITIVE_SLACK === "true";
|
|
13
14
|
|
|
15
|
+
// Cache the bot's own Slack user ID so we can suppress messages that @-mention
|
|
16
|
+
// a different agent (e.g. Devin) rather than our bot.
|
|
17
|
+
let cachedBotUserId: string | null = null;
|
|
18
|
+
|
|
14
19
|
export function createAssistant(): Assistant {
|
|
15
20
|
return new Assistant({
|
|
16
21
|
threadStarted: async ({ say, setSuggestedPrompts, saveThreadContext }) => {
|
|
@@ -72,6 +77,29 @@ export function createAssistant(): Assistant {
|
|
|
72
77
|
const messageText = (msg.text as string) || "";
|
|
73
78
|
const userId = (msg.user as string) || "";
|
|
74
79
|
|
|
80
|
+
// Resolve the bot's own Slack user ID (cached after first call) so we can
|
|
81
|
+
// check whether this message is actually addressed to us.
|
|
82
|
+
if (!cachedBotUserId) {
|
|
83
|
+
try {
|
|
84
|
+
const authResult = await client.auth.test();
|
|
85
|
+
cachedBotUserId = (authResult.user_id as string) ?? null;
|
|
86
|
+
} catch (e) {
|
|
87
|
+
console.warn("[Slack] assistant: auth.test() failed — skipping bot-mention check", e);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// If the message @-mentions someone OTHER than our bot and does NOT mention
|
|
92
|
+
// our bot, it is addressed to a different agent/user — do not spawn a task.
|
|
93
|
+
if (cachedBotUserId) {
|
|
94
|
+
const botMentioned = messageText.includes(`<@${cachedBotUserId}>`);
|
|
95
|
+
if (!botMentioned && hasOtherUserMention(messageText, cachedBotUserId)) {
|
|
96
|
+
console.log(
|
|
97
|
+
`[Slack] assistant: skipping message in ${channelId}/${threadTs} — mentions another user, not us`,
|
|
98
|
+
);
|
|
99
|
+
return;
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
75
103
|
// Resolve canonical user identity via the shared cascade. On no-email,
|
|
76
104
|
// the cascade records the user in the kv unmapped tracker; this handler
|
|
77
105
|
// proceeds without a `requestedByUserId`.
|
package/src/slack/channel-join.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import type { WebClient } from "@slack/web-api";
|
|
2
2
|
|
|
3
|
+
const logger = console;
|
|
4
|
+
|
|
3
5
|
// @slack/web-api platform errors set message to "An API error occurred: <code>"
|
|
4
6
|
// and store the raw Slack API code at error.data.error.
|
|
5
7
|
function slackCode(error: unknown): string | undefined {
|
|
@@ -8,12 +10,38 @@ function slackCode(error: unknown): string | undefined {
|
|
|
8
10
|
return typeof d?.error === "string" ? d.error : undefined;
|
|
9
11
|
}
|
|
10
12
|
|
|
13
|
+
/**
|
|
14
|
+
* Returns true if the channel has any external (non-host-org) members.
|
|
15
|
+
* Uses Slack's documented flags: is_ext_shared (accepted Connect) and
|
|
16
|
+
* is_pending_ext_shared (invite sent, not yet accepted). These two booleans
|
|
17
|
+
* are the authoritative org-boundary signal per Slack's API docs.
|
|
18
|
+
*/
|
|
19
|
+
async function isKnownExternalChannel(client: WebClient, channelId: string): Promise<boolean> {
|
|
20
|
+
try {
|
|
21
|
+
const resp = await client.conversations.info({ channel: channelId });
|
|
22
|
+
const ch = (resp.channel ?? {}) as {
|
|
23
|
+
is_ext_shared?: boolean;
|
|
24
|
+
is_pending_ext_shared?: boolean;
|
|
25
|
+
};
|
|
26
|
+
return ch.is_ext_shared === true || ch.is_pending_ext_shared === true;
|
|
27
|
+
} catch (error) {
|
|
28
|
+
logger.warn(
|
|
29
|
+
`[Slack] conversations.info failed for ${channelId}; attempting join fallback:`,
|
|
30
|
+
error,
|
|
31
|
+
);
|
|
32
|
+
return false;
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
|
|
11
36
|
/**
|
|
12
37
|
* Wraps a Slack API call with automatic channel join for public channels.
|
|
13
38
|
*
|
|
14
|
-
* On not_in_channel:
|
|
15
|
-
*
|
|
16
|
-
*
|
|
39
|
+
* On not_in_channel: checks conversations.info first — if is_ext_shared or
|
|
40
|
+
* is_pending_ext_shared is true the channel has external members; throws a
|
|
41
|
+
* human-invite error instead of self-joining. Internal channels (including
|
|
42
|
+
* Enterprise Grid org-shared channels) proceed normally.
|
|
43
|
+
* On private channel (method_not_supported_for_channel_type): throws a
|
|
44
|
+
* descriptive error telling the caller to /invite the bot.
|
|
17
45
|
*/
|
|
18
46
|
export async function withAutoJoin<T>(
|
|
19
47
|
client: WebClient,
|
|
@@ -25,6 +53,13 @@ export async function withAutoJoin<T>(
|
|
|
25
53
|
} catch (error) {
|
|
26
54
|
if (slackCode(error) !== "not_in_channel") throw error;
|
|
27
55
|
|
|
56
|
+
// Only block when Slack positively identifies an external channel.
|
|
57
|
+
if (await isKnownExternalChannel(client, channelId)) {
|
|
58
|
+
throw new Error(
|
|
59
|
+
`Cannot auto-join external channel ${channelId} — invite the bot with /invite @<bot-name> first.`,
|
|
60
|
+
);
|
|
61
|
+
}
|
|
62
|
+
|
|
28
63
|
try {
|
|
29
64
|
await client.conversations.join({ channel: channelId });
|
|
30
65
|
} catch (joinError) {
|
package/src/slack/handlers.ts
CHANGED
|
@@ -490,8 +490,11 @@ export function registerMessageHandler(app: App): void {
|
|
|
490
490
|
// Detect assistant thread context — file_share messages in DM assistant threads
|
|
491
491
|
// bypass the assistant handler and land here instead. Treat them as implicit mentions
|
|
492
492
|
// so they route to the lead agent rather than being silently dropped.
|
|
493
|
+
// Guard: suppress implicit mention when the message @-mentions someone else but NOT us —
|
|
494
|
+
// those messages are addressed to a different agent/user (e.g. Devin) and must not spawn.
|
|
493
495
|
const isAssistantThread = !!msg.assistant_thread;
|
|
494
|
-
const isImplicitMention =
|
|
496
|
+
const isImplicitMention =
|
|
497
|
+
isAssistantThread && !botMentioned && !hasOtherUserMention(effectiveText, botUserId);
|
|
495
498
|
|
|
496
499
|
// ADDITIVE_SLACK: Check for !now command in threads
|
|
497
500
|
const additiveSlack = process.env.ADDITIVE_SLACK === "true";
|
package/src/tasks/worker-follow-up.ts
CHANGED
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
getLeadAgent,
|
|
7
7
|
getTaskAttachments,
|
|
8
8
|
getTaskById,
|
|
9
|
+
hasNonTerminalRerouteDecisionChild,
|
|
9
10
|
} from "../be/db";
|
|
10
11
|
import { repointTrackerSyncBySwarmId } from "../be/db-queries/tracker";
|
|
11
12
|
import { resolveTemplate } from "../prompts/resolver";
|
|
@@ -23,8 +24,33 @@ export const WORKER_LIVENESS_WINDOW_SECONDS = Number(
|
|
|
23
24
|
process.env.WORKER_LIVENESS_WINDOW_SECONDS || "30",
|
|
24
25
|
);
|
|
25
26
|
|
|
27
|
+
/**
|
|
28
|
+
* Rollback switch (DES-523) for the same-agent crash-recovery pin. ON by
|
|
29
|
+
* default: `crash_recovery` resumes pin back to their original agent regardless
|
|
30
|
+
* of `lastActivityAt` freshness. Set `HEARTBEAT_PIN_CRASH_RESUME=0` to restore
|
|
31
|
+
* the pre-DES-523 behavior verbatim — `crash_recovery` then requires the 30s
|
|
32
|
+
* `fresh` window like every other reason, so at the ~5-min detection mark it
|
|
33
|
+
* falls back to the unassigned pool. A reversible kill-switch for this
|
|
34
|
+
* production crash-path change (no code revert needed if the pin misbehaves).
|
|
35
|
+
*/
|
|
36
|
+
export const HEARTBEAT_PIN_CRASH_RESUME = process.env.HEARTBEAT_PIN_CRASH_RESUME !== "0";
|
|
37
|
+
|
|
26
38
|
export const RESUME_GENERATION_TAG_PREFIX = "resume-generation:";
|
|
27
39
|
|
|
40
|
+
/**
|
|
41
|
+
* Tag set ONLY on a genuine same-agent `crash_recovery` pin (i.e. when the
|
|
42
|
+
* resume is actually assigned back to the original agent). The heartbeat reaper
|
|
43
|
+
* (`getStalePinnedResumes`) scopes its sweep to this tag so it cannot mistake a
|
|
44
|
+
* *pooled* resume that `autoAssignPoolTasks` later flips to `pending` — which
|
|
45
|
+
* keeps its original `createdAt` and would otherwise look identical to a stale
|
|
46
|
+
* pin — for an unreclaimed crash pin, and so it never escalates a
|
|
47
|
+
* `context_limits` / `manual_supersede` pin under a `crash_recovery` label.
|
|
48
|
+
*
|
|
49
|
+
* The literal is duplicated in `getStalePinnedResumes` (src/be/db.ts) rather
|
|
50
|
+
* than imported, to avoid a worker-follow-up ↔ db import cycle — keep them in sync.
|
|
51
|
+
*/
|
|
52
|
+
export const CRASH_RECOVERY_PIN_TAG = "crash-recovery-pin";
|
|
53
|
+
|
|
28
54
|
export function getResumeGeneration(task: Pick<AgentTask, "tags">): number {
|
|
29
55
|
const tag = task.tags.find((value) => value.startsWith(RESUME_GENERATION_TAG_PREFIX));
|
|
30
56
|
if (!tag) return 0;
|
|
@@ -166,10 +192,25 @@ export type CreateResumeFollowUpResult =
|
|
|
166
192
|
* at session-init. The resume task runs on the assignee agent's own model. See
|
|
167
193
|
* the `model` carve-out comment in `createTaskExtended` (`src/be/db.ts`).
|
|
168
194
|
*
|
|
169
|
-
* Routing: the parent's assigned worker (`parent.agentId`) is preferred
|
|
170
|
-
*
|
|
171
|
-
*
|
|
172
|
-
*
|
|
195
|
+
* Routing: the parent's assigned worker (`parent.agentId`) is preferred when
|
|
196
|
+
* the agent row still exists, is not `offline`, and has remaining capacity
|
|
197
|
+
* (`getActiveTaskCount < agent.maxTasks`). For `crash_recovery` the pin holds
|
|
198
|
+
* regardless of `lastActivityAt` freshness — the agent ID is stable across a
|
|
199
|
+
* restart and the crashed row survives intact, so a stale `lastActivityAt` at
|
|
200
|
+
* the ~5-min crash-detection mark means "restarting", not "gone". Pinning keeps
|
|
201
|
+
* the resume off the role-blind unassigned pool so no wrong-specialization
|
|
202
|
+
* worker can grab it (DES-523). For `context_limits` / `manual_supersede` the
|
|
203
|
+
* worker is alive, so `lastActivityAt` freshness is still required. The resume
|
|
204
|
+
* falls back to the unassigned pool only when the agent is genuinely gone
|
|
205
|
+
* (graceful close → `offline`) or its row is absent.
|
|
206
|
+
*
|
|
207
|
+
* Gone-agent / never-reclaimed case: a pin whose agent never returns is NOT
|
|
208
|
+
* re-pooled — the heartbeat's stale-resume reaper (`escalateUnreclaimedResumes`
|
|
209
|
+
* in `src/heartbeat/heartbeat.ts`) escalates it to a Lead re-delegation decision
|
|
210
|
+
* once `HEARTBEAT_RESUME_PIN_GRACE_MIN` lapses.
|
|
211
|
+
*
|
|
212
|
+
* The pin itself is gated by `HEARTBEAT_PIN_CRASH_RESUME` (default on); set it to
|
|
213
|
+
* `0` to restore the pre-DES-523 pool-fallback behavior.
|
|
173
214
|
*/
|
|
174
215
|
export function createResumeFollowUp(args: {
|
|
175
216
|
parentId: string;
|
|
@@ -183,24 +224,34 @@ export function createResumeFollowUp(args: {
|
|
|
183
224
|
return { kind: "workflow-skip", stepId: parent.workflowRunStepId };
|
|
184
225
|
}
|
|
185
226
|
|
|
186
|
-
// Routing decision — same DB process so the read-then-create window is
|
|
187
|
-
// small. Acceptable for v1 per the plan (the unassigned-pool fallback
|
|
188
|
-
// covers the race anyway).
|
|
227
|
+
// Routing decision — same DB process so the read-then-create window is small.
|
|
189
228
|
//
|
|
190
|
-
// For `graceful_shutdown
|
|
191
|
-
//
|
|
192
|
-
//
|
|
193
|
-
//
|
|
194
|
-
//
|
|
195
|
-
//
|
|
196
|
-
//
|
|
229
|
+
// For `graceful_shutdown`, force the unassigned-pool path: the parent worker
|
|
230
|
+
// is exiting and will call `closeAgent` (→ offline) moments after the
|
|
231
|
+
// supersede loop. At this check it still looks fresh + has capacity (it just
|
|
232
|
+
// terminal-transitioned), so the liveness branch would pin the resume to a
|
|
233
|
+
// dying worker — orphaning it in `pending` once the worker closes. Pool
|
|
234
|
+
// routing lets any live worker claim it.
|
|
235
|
+
//
|
|
236
|
+
// For `crash_recovery`, deliberately PIN to the same (stable-ID) agent even
|
|
237
|
+
// when `lastActivityAt` is stale. This REVERSES the prior "let staleness pool
|
|
238
|
+
// it" behavior: crash detection only fires after STALL_THRESHOLD_NO_SESSION_MIN
|
|
239
|
+
// (~5 min), by which point a healthy-but-restarting worker is always >30s
|
|
240
|
+
// stale, so the old `fresh` gate dumped every crash resume into the role-blind
|
|
241
|
+
// pool where a wrong-specialization worker could grab it (DES-523). The agent
|
|
242
|
+
// ID is stable across restart and the crashed row survives intact, so here
|
|
243
|
+
// "stale" means "restarting", not "gone". We KEEP the `offline` guard — only a
|
|
244
|
+
// graceful close sets `offline`, i.e. genuinely gone → pool — and the capacity
|
|
245
|
+
// guard. An unreclaimed pin is escalated to a Lead decision by the heartbeat
|
|
246
|
+
// reaper, never silently re-pooled.
|
|
247
|
+
//
|
|
248
|
+
// Brittleness note: this relies on a hard crash NEVER marking the agent
|
|
249
|
+
// `offline` (only `POST /close` does). If future code offlines stale agents
|
|
250
|
+
// before remediation, this re-opens the pool path for `crash_recovery` —
|
|
251
|
+
// revisit the gate then.
|
|
197
252
|
//
|
|
198
|
-
// Other reasons keep the liveness-aware routing:
|
|
199
|
-
// - `crash_recovery`: parent worker is presumed dead → `lastActivityAt`
|
|
200
|
-
// is stale or `status === "offline"`, so the existing check already
|
|
201
|
-
// rejects it naturally.
|
|
202
253
|
// - `context_limits` / `manual_supersede`: the worker is alive and
|
|
203
|
-
//
|
|
254
|
+
// responsive, so keep requiring `fresh`.
|
|
204
255
|
let preferredAgentId: string | undefined;
|
|
205
256
|
if (parent.agentId && args.reason !== "graceful_shutdown") {
|
|
206
257
|
const candidate = getAgentById(parent.agentId);
|
|
@@ -211,8 +262,17 @@ export function createResumeFollowUp(args: {
|
|
|
211
262
|
Date.now() - lastActivity < WORKER_LIVENESS_WINDOW_SECONDS * 1000;
|
|
212
263
|
const activeCount = getActiveTaskCount(candidate.id);
|
|
213
264
|
const hasCap = activeCount < (candidate.maxTasks ?? 1);
|
|
214
|
-
|
|
265
|
+
const isCrashRecovery = args.reason === "crash_recovery" && HEARTBEAT_PIN_CRASH_RESUME;
|
|
266
|
+
// crash_recovery pins regardless of `fresh` (unless the rollback switch is
|
|
267
|
+
// off); other reasons still require it.
|
|
268
|
+
if (hasCap && (isCrashRecovery || fresh)) {
|
|
215
269
|
preferredAgentId = candidate.id;
|
|
270
|
+
} else if (isCrashRecovery && !hasCap) {
|
|
271
|
+
// The only reason a crash_recovery pin is skipped here is capacity —
|
|
272
|
+
// surface the pool fallback instead of letting it happen silently.
|
|
273
|
+
console.warn(
|
|
274
|
+
`[Heartbeat] crash_recovery resume for task ${parent.id.slice(0, 8)} NOT pinned: agent ${candidate.id.slice(0, 8)} at capacity (${activeCount}/${candidate.maxTasks ?? 1}); falling back to unassigned pool`,
|
|
275
|
+
);
|
|
216
276
|
}
|
|
217
277
|
}
|
|
218
278
|
}
|
|
@@ -236,6 +296,14 @@ export function createResumeFollowUp(args: {
|
|
|
236
296
|
`reason:${args.reason}`,
|
|
237
297
|
`${RESUME_GENERATION_TAG_PREFIX}${getNextResumeGeneration(parent)}`,
|
|
238
298
|
];
|
|
299
|
+
// Mark a GENUINE same-agent crash pin (crash_recovery that actually pinned to
|
|
300
|
+
// the original agent) so the heartbeat reaper can scope to these only. A
|
|
301
|
+
// pooled resume — including a crash_recovery resume that fell to the pool at
|
|
302
|
+
// capacity — never gets this tag, so it can't be mistaken for a stale pin
|
|
303
|
+
// after autoAssignPoolTasks flips it to `pending`.
|
|
304
|
+
if (args.reason === "crash_recovery" && preferredAgentId !== undefined) {
|
|
305
|
+
tags.push(CRASH_RECOVERY_PIN_TAG);
|
|
306
|
+
}
|
|
239
307
|
|
|
240
308
|
// Identity-shaped fields (dir, VCS provider/repo/number/url/etc.,
|
|
241
309
|
// outputSchema, slack channel/thread/user, agentmail, mention, contextKey,
|
|
@@ -270,3 +338,96 @@ export function createResumeFollowUp(args: {
|
|
|
270
338
|
|
|
271
339
|
return { kind: "created", task: created };
|
|
272
340
|
}
|
|
341
|
+
|
|
342
|
+
/** Result of `createRerouteDecisionTask`. */
|
|
343
|
+
export type CreateRerouteDecisionResult =
|
|
344
|
+
| { kind: "created"; task: AgentTask }
|
|
345
|
+
| { kind: "skipped"; reason: "lead_not_found" | "duplicate_exists" };
|
|
346
|
+
|
|
347
|
+
/**
|
|
348
|
+
* Hand the Lead a re-delegation DECISION task for a crash-recovery resume that
|
|
349
|
+
* was pinned to its original agent but never reclaimed within the grace window
|
|
350
|
+
* (DES-523). The Lead receives context — the crashed agent's identity + the
|
|
351
|
+
* original work — and must re-dispatch via `send-task` with an explicit
|
|
352
|
+
* `agentId`; it does NOT execute the work itself, and the work is never
|
|
353
|
+
* re-pooled. Mirrors `createWorkerTaskFollowUp`'s Lead-owned-follow-up shape.
|
|
354
|
+
*
|
|
355
|
+
* Invoked by the heartbeat reaper (`escalateUnreclaimedResumes`), NOT at crash
|
|
356
|
+
* time: "gone" can't be distinguished from "restarting" at detection time, so
|
|
357
|
+
* the Lead path is only reached after a pin has demonstrably failed to be
|
|
358
|
+
* reclaimed.
|
|
359
|
+
*
|
|
360
|
+
* Discriminator: `taskType: "reroute-decision"` (NOT "follow-up") so it is
|
|
361
|
+
* distinguishable from ordinary completion follow-ups for dedup and so the
|
|
362
|
+
* `send-task` Slack re-delegation guard (which only fires for `taskType ===
|
|
363
|
+
* "follow-up"`) never blocks the Lead's re-dispatch.
|
|
364
|
+
*
|
|
365
|
+
* Idempotent: skips when a non-terminal reroute-decision child already exists
|
|
366
|
+
* for the original. No lead → no-op (fail-safe), mirroring
|
|
367
|
+
* `createWorkerTaskFollowUp`.
|
|
368
|
+
*
|
|
369
|
+
* @param staleResume the failed pinned resume (R1). The generation budget for
|
|
370
|
+
* the Lead's re-dispatch is derived from it (`gen(R1)+1`), NOT from the root
|
|
371
|
+
* `original` (which carries no resume-generation tag and would reset to 1
|
|
372
|
+
* every escalation cycle, defeating MAX_RESUME_GENERATIONS via the Lead path).
|
|
373
|
+
* @param maxGenerations passed in (rather than imported from heartbeat.ts) to
|
|
374
|
+
* avoid a circular import — heartbeat.ts already imports this module.
|
|
375
|
+
*/
|
|
376
|
+
export function createRerouteDecisionTask(args: {
|
|
377
|
+
original: AgentTask;
|
|
378
|
+
staleResume: AgentTask;
|
|
379
|
+
reason: ResumeReason;
|
|
380
|
+
maxGenerations: number;
|
|
381
|
+
}): CreateRerouteDecisionResult {
|
|
382
|
+
const { original, staleResume, reason, maxGenerations } = args;
|
|
383
|
+
|
|
384
|
+
const leadAgent = getLeadAgent();
|
|
385
|
+
if (!leadAgent) return { kind: "skipped", reason: "lead_not_found" };
|
|
386
|
+
|
|
387
|
+
// Idempotency: a prior sweep may already have escalated this original.
|
|
388
|
+
if (hasNonTerminalRerouteDecisionChild(original.id)) {
|
|
389
|
+
return { kind: "skipped", reason: "duplicate_exists" };
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
const crashedAgent = original.agentId ? getAgentById(original.agentId) : null;
|
|
393
|
+
const agentName = crashedAgent?.name || original.agentId?.slice(0, 8) || "unknown";
|
|
394
|
+
const identitySlice = crashedAgent?.identityMd
|
|
395
|
+
? `${crashedAgent.identityMd.slice(0, 500)}${crashedAgent.identityMd.length > 500 ? "..." : ""}`
|
|
396
|
+
: "(no identity recorded)";
|
|
397
|
+
const attachmentsBlock = formatAttachmentsBlock(getTaskAttachments(original.id));
|
|
398
|
+
|
|
399
|
+
const decision = resolveTemplate("task.reroute.decision", {
|
|
400
|
+
original_agent_name: agentName,
|
|
401
|
+
original_agent_identity: identitySlice,
|
|
402
|
+
original_task_id: original.id,
|
|
403
|
+
reason,
|
|
404
|
+
task_desc: original.task.slice(0, 200),
|
|
405
|
+
// Derive from the FAILED PIN (staleResume), not `original` (the root with no
|
|
406
|
+
// generation tag) — otherwise every escalation resets to gen 1 and the
|
|
407
|
+
// MAX_RESUME_GENERATIONS cap is never reached on the Lead path.
|
|
408
|
+
generation_next: getNextResumeGeneration(staleResume),
|
|
409
|
+
max_generations: maxGenerations,
|
|
410
|
+
artifacts_block: attachmentsBlock,
|
|
411
|
+
});
|
|
412
|
+
|
|
413
|
+
// Lead-owned `pending` decision task (createTaskExtended derives `pending`
|
|
414
|
+
// from a set agentId). Slack/VCS/etc. context is inherited from the original
|
|
415
|
+
// via parentTaskId. taskType is the distinct "reroute-decision" marker.
|
|
416
|
+
const created = createTaskExtended(decision.text, {
|
|
417
|
+
agentId: leadAgent.id,
|
|
418
|
+
creatorAgentId: original.creatorAgentId,
|
|
419
|
+
source: "system",
|
|
420
|
+
taskType: "reroute-decision",
|
|
421
|
+
tags: ["reroute-decision"],
|
|
422
|
+
priority: Math.min(100, (original.priority ?? 50) + 10),
|
|
423
|
+
parentTaskId: original.id,
|
|
424
|
+
// Inherit Slack/VCS context from the original, but NOT its outputSchema: this
|
|
425
|
+
// is a control-plane task the Lead completes by re-delegating via send-task,
|
|
426
|
+
// not by producing the original work's structured output. Inheriting it would
|
|
427
|
+
// make store-progress reject the Lead's completion and strand the decision
|
|
428
|
+
// (blocking further escalation via the duplicate-decision guard) — DES-523.
|
|
429
|
+
inheritParentOutputSchema: false,
|
|
430
|
+
});
|
|
431
|
+
|
|
432
|
+
return { kind: "created", task: created };
|
|
433
|
+
}
|
package/src/tests/claude-adapter-binary.test.ts
CHANGED
|
@@ -34,6 +34,7 @@ import {
|
|
|
34
34
|
parseClaudeBridgeEnabled,
|
|
35
35
|
preseedClaudeTrustDialog,
|
|
36
36
|
resolveClaudeBinary,
|
|
37
|
+
resolveClaudeBinaryArgv,
|
|
37
38
|
resolveClaudeBridgeEnabled,
|
|
38
39
|
} from "../providers/claude-adapter";
|
|
39
40
|
import type { ProviderSessionConfig } from "../providers/types";
|
|
@@ -217,6 +218,59 @@ describe("SWARM_USE_CLAUDE_BRIDGE boolean parsing", () => {
|
|
|
217
218
|
});
|
|
218
219
|
});
|
|
219
220
|
|
|
221
|
+
describe("resolveClaudeBinaryArgv — claude-bridge requires an OAuth token", () => {
|
|
222
|
+
test("bridge requested + OAuth token present → routes to claude-bridge", () => {
|
|
223
|
+
const r = resolveClaudeBinaryArgv(
|
|
224
|
+
{ SWARM_USE_CLAUDE_BRIDGE: "true", CLAUDE_CODE_OAUTH_TOKEN: "sk-ant-oat01-x" },
|
|
225
|
+
{},
|
|
226
|
+
);
|
|
227
|
+
expect(r.useClaudeBridge).toBe(true);
|
|
228
|
+
expect(r.argv).toEqual(["claude-bridge"]);
|
|
229
|
+
expect(r.bridgeRequestedWithoutOAuth).toBe(false);
|
|
230
|
+
});
|
|
231
|
+
|
|
232
|
+
test("bridge requested + no OAuth (only API key) → falls back to stock claude", () => {
|
|
233
|
+
const r = resolveClaudeBinaryArgv(
|
|
234
|
+
{ SWARM_USE_CLAUDE_BRIDGE: "true", ANTHROPIC_API_KEY: "sk-ant-api" },
|
|
235
|
+
{},
|
|
236
|
+
);
|
|
237
|
+
expect(r.useClaudeBridge).toBe(false);
|
|
238
|
+
expect(r.argv).toEqual(["claude"]);
|
|
239
|
+
expect(r.bridgeRequestedWithoutOAuth).toBe(true);
|
|
240
|
+
});
|
|
241
|
+
|
|
242
|
+
test("bridge requested + no creds at all → stock claude, flag set", () => {
|
|
243
|
+
const r = resolveClaudeBinaryArgv({ SWARM_USE_CLAUDE_BRIDGE: "1" }, {});
|
|
244
|
+
expect(r.useClaudeBridge).toBe(false);
|
|
245
|
+
expect(r.bridgeRequestedWithoutOAuth).toBe(true);
|
|
246
|
+
});
|
|
247
|
+
|
|
248
|
+
test("OAuth token from fallbackEnv (container env) also enables the bridge", () => {
|
|
249
|
+
const r = resolveClaudeBinaryArgv(
|
|
250
|
+
{ SWARM_USE_CLAUDE_BRIDGE: "true" },
|
|
251
|
+
{ CLAUDE_CODE_OAUTH_TOKEN: "sk-ant-oat01-fallback" },
|
|
252
|
+
);
|
|
253
|
+
expect(r.useClaudeBridge).toBe(true);
|
|
254
|
+
expect(r.bridgeRequestedWithoutOAuth).toBe(false);
|
|
255
|
+
});
|
|
256
|
+
|
|
257
|
+
test("whitespace-only OAuth token does not count as present", () => {
|
|
258
|
+
const r = resolveClaudeBinaryArgv(
|
|
259
|
+
{ SWARM_USE_CLAUDE_BRIDGE: "true", CLAUDE_CODE_OAUTH_TOKEN: " " },
|
|
260
|
+
{},
|
|
261
|
+
);
|
|
262
|
+
expect(r.useClaudeBridge).toBe(false);
|
|
263
|
+
expect(r.bridgeRequestedWithoutOAuth).toBe(true);
|
|
264
|
+
});
|
|
265
|
+
|
|
266
|
+
test("bridge not requested → never flagged, stock claude", () => {
|
|
267
|
+
const r = resolveClaudeBinaryArgv({ CLAUDE_CODE_OAUTH_TOKEN: "sk-ant-oat01-x" }, {});
|
|
268
|
+
expect(r.useClaudeBridge).toBe(false);
|
|
269
|
+
expect(r.bridgeRequestedWithoutOAuth).toBe(false);
|
|
270
|
+
expect(r.argv).toEqual(["claude"]);
|
|
271
|
+
});
|
|
272
|
+
});
|
|
273
|
+
|
|
220
274
|
describe("preseedClaudeTrustDialog", () => {
|
|
221
275
|
let homeDir: string;
|
|
222
276
|
|
|
@@ -581,6 +635,26 @@ describe("CLAUDE_BINARY env override", () => {
|
|
|
581
635
|
|
|
582
636
|
expect(spawnedArgs[0][0]).toBe("claude");
|
|
583
637
|
});
|
|
638
|
+
|
|
639
|
+
test("SWARM_USE_CLAUDE_BRIDGE=true without OAuth token falls back to stock claude", async () => {
|
|
640
|
+
const origApiKey = process.env.ANTHROPIC_API_KEY;
|
|
641
|
+
delete process.env.CLAUDE_CODE_OAUTH_TOKEN;
|
|
642
|
+
process.env.ANTHROPIC_API_KEY = "sk-ant-test";
|
|
643
|
+
process.env.SWARM_USE_CLAUDE_BRIDGE = "true";
|
|
644
|
+
try {
|
|
645
|
+
const adapter = new ClaudeAdapter();
|
|
646
|
+
await adapter.createSession(makeConfig());
|
|
647
|
+
// No OAuth token → bridge is skipped, stock claude is used (Claude Code
|
|
648
|
+
// authenticates fine from ANTHROPIC_API_KEY; the bridge can't).
|
|
649
|
+
expect(spawnedArgs[0][0]).toBe("claude");
|
|
650
|
+
} finally {
|
|
651
|
+
if (origApiKey === undefined) {
|
|
652
|
+
delete process.env.ANTHROPIC_API_KEY;
|
|
653
|
+
} else {
|
|
654
|
+
process.env.ANTHROPIC_API_KEY = origApiKey;
|
|
655
|
+
}
|
|
656
|
+
}
|
|
657
|
+
});
|
|
584
658
|
});
|
|
585
659
|
|
|
586
660
|
describe("Claude Bridge tmux fail-fast gate", () => {
|