claude-tempo 0.26.0-beta.2 → 0.26.0-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +11 -6
- package/dist/activities/outbox.js +155 -44
- package/dist/activities/resolve.js +2 -4
- package/dist/adapters/base.d.ts +95 -5
- package/dist/adapters/base.js +463 -184
- package/dist/adapters/claude-code/adapter.d.ts +22 -8
- package/dist/adapters/claude-code/adapter.js +63 -16
- package/dist/adapters/copilot/adapter.js +7 -5
- package/dist/adapters/terminal-error.d.ts +27 -0
- package/dist/adapters/terminal-error.js +39 -0
- package/dist/cli/commands.d.ts +0 -10
- package/dist/cli/commands.js +19 -506
- package/dist/cli/config-command.js +10 -2
- package/dist/cli/daemon-command.d.ts +47 -0
- package/dist/cli/daemon-command.js +356 -0
- package/dist/cli/daemon.d.ts +52 -0
- package/dist/cli/daemon.js +148 -2
- package/dist/cli/help-text.d.ts +1 -0
- package/dist/cli/help-text.js +142 -0
- package/dist/cli/upgrade-command.d.ts +5 -0
- package/dist/cli/upgrade-command.js +240 -0
- package/dist/cli.js +105 -47
- package/dist/client/index.js +5 -7
- package/dist/daemon.d.ts +11 -11
- package/dist/daemon.js +47 -75
- package/dist/scripts/run-shard.js +121 -0
- package/dist/scripts/verify-daemon-isolation-guard.js +128 -0
- package/dist/tools/broadcast.js +2 -2
- package/dist/tools/ensemble.js +2 -0
- package/dist/tui/commands.d.ts +36 -0
- package/dist/tui/commands.js +71 -6
- package/dist/tui/components/PlayerDetailView.js +7 -2
- package/dist/tui/store.js +32 -6
- package/dist/types.d.ts +19 -1
- package/dist/utils/search-attributes.d.ts +76 -0
- package/dist/utils/search-attributes.js +86 -0
- package/dist/utils/validation.d.ts +14 -0
- package/dist/utils/validation.js +15 -1
- package/dist/workflows/attachment-math.d.ts +56 -0
- package/dist/workflows/attachment-math.js +47 -0
- package/dist/workflows/session.js +92 -27
- package/dist/workflows/signals.d.ts +1 -0
- package/dist/workflows/signals.js +16 -1
- package/package.json +10 -4
- package/workflow-bundle.js +167 -29
package/CLAUDE.md
CHANGED
|
@@ -20,12 +20,15 @@ src/
|
|
|
20
20
|
├── cli.ts # CLI entry point (claude-tempo command)
|
|
21
21
|
├── daemon.ts # Daemon entry point — runs Temporal workers as a detached background process
|
|
22
22
|
├── cli/
|
|
23
|
-
│ ├── commands.ts # CLI command implementations (up, start, conduct, status, stop,
|
|
24
|
-
│ ├── config-command.ts # config subcommand (interactive + set/show)
|
|
25
|
-
│ ├── daemon.ts # Daemon management utilities (start, stop, status,
|
|
23
|
+
│ ├── commands.ts # CLI command implementations (up, start, conduct, status, stop, …)
|
|
24
|
+
│ ├── config-command.ts # config subcommand (interactive + set/show) — crash-proof for show/set
|
|
25
|
+
│ ├── daemon.ts # Daemon management utilities (start, stop, status, heartbeat, isDaemonRunning)
|
|
26
|
+
│ ├── daemon-command.ts # daemon subcommand handler — crash-proof, no Temporal deps
|
|
27
|
+
│ ├── help-text.ts # help output — crash-proof, no Temporal deps
|
|
26
28
|
│ ├── mcp.ts # MCP server registration helpers (init, global vs project)
|
|
27
29
|
│ ├── output.ts # Shared CLI output formatting helpers
|
|
28
|
-
│
|
|
30
|
+
│ ├── preflight.ts # Environment preflight checks
|
|
31
|
+
│ └── upgrade-command.ts # upgrade subcommand — crash-proof; dynamic-imports Temporal only for active-session warning
|
|
29
32
|
├── adapters/
|
|
30
33
|
│ ├── README.md # Adapter contract documentation
|
|
31
34
|
│ ├── index.ts # Adapter registry bootstrap + barrel exports
|
|
@@ -44,6 +47,7 @@ src/
|
|
|
44
47
|
│ ├── session.ts # claude-session workflow
|
|
45
48
|
│ ├── scheduler.ts # durable scheduler workflow (one per ensemble)
|
|
46
49
|
│ ├── maestro.ts # Maestro workflows — per-ensemble hub and global hub
|
|
50
|
+
│ ├── attachment-math.ts # Pure CAN-boundary lease-extension helper (no Temporal imports)
|
|
47
51
|
│ ├── maestro-signals.ts / scheduler-signals.ts / signals.ts # Signal/query/update type defs
|
|
48
52
|
│ └── index.ts # Workflow re-exports for worker bundle
|
|
49
53
|
├── activities/
|
|
@@ -73,7 +77,7 @@ src/
|
|
|
73
77
|
│ ├── components/ # Ink components — see docs/tui.md for inventory
|
|
74
78
|
│ └── utils/ # format, platform, theme, fullscreen, history
|
|
75
79
|
├── utils/
|
|
76
|
-
│ ├── validation.ts / worktree.ts / safe-path.ts / duration.ts
|
|
80
|
+
│ ├── validation.ts / worktree.ts / safe-path.ts / duration.ts / search-attributes.ts
|
|
77
81
|
├── types.ts # Shared type definitions
|
|
78
82
|
├── git-info.ts # Git repository detection helper
|
|
79
83
|
└── config.ts # Env var handling
|
|
@@ -86,7 +90,7 @@ touching `src/tui/`.
|
|
|
86
90
|
|
|
87
91
|
```bash
|
|
88
92
|
npm install
|
|
89
|
-
npm run build # compiles TS
|
|
93
|
+
npm run build # compiles TS, scripts/*.ts → dist/scripts/, and pre-bundles workflow code into workflow-bundle.js
|
|
90
94
|
npm test
|
|
91
95
|
```
|
|
92
96
|
|
|
@@ -113,6 +117,7 @@ daemon worker notes, `npx ts-node` dev runner).
|
|
|
113
117
|
- **Part**: A player's description of what it's working on
|
|
114
118
|
- **Outbox**: Outbound requests (cue, report, recruit, restart, detach, destroy, …) go through the session's workflow outbox instead of directly signaling other workflows. The dispatch loop processes entries via activities, decoupling tools from cross-workflow signaling.
|
|
115
119
|
- **Attachment phase** (v0.26): Seven phases tracked on the session workflow — `booting → attached → processing | awaiting → draining → detached → gone`. The phase is authoritative for lifecycle truth: adapters drive it via `claimAttachment` / `adapterExited` / `forceDetach` / `destroy`, and the workflow publishes it on the `ClaudeTempoAttachmentState` search attribute. Replaced the v0.25 `ClaudeTempoStatus` heuristic (removed in v0.26). See [docs/concepts.md](docs/concepts.md) for the phase table and [docs/ops/v0.26-migration.md](docs/ops/v0.26-migration.md) for the upgrade path.
|
|
120
|
+
- **Adapter heartbeat observability** (#249): After `claimAttachment`, the base adapter logs `first heartbeat scheduled in Xms` then `heartbeat#1 delivered` on the first tick. Every 10 ticks it emits `heartbeats-delivered=N / phase-ticks=N` breadcrumbs. Any silent guard trip in `tickHeartbeat` / `tickPhaseWatcher` now emits a structured `guard tripped: {stopped, reconnecting, …}` log instead of silently orphaning the timer. The phase-watcher emits `WARNING: heartbeat staleness` when `lastHeartbeatAt` falls more than 2× `heartbeatMs` behind `now`. Grep `[claude-tempo:adapter]` to confirm loop health without parsing Temporal history.
|
|
116
121
|
- **Per-host task queues**: `host` param on `recruit`/`restart`/`migrate` routes to `claude-tempo-{hostname}` task queue. See [docs/concepts.md](docs/concepts.md) for cross-machine recruiting details.
|
|
117
122
|
- **Wire protocol**: All signal/query/update names are documented in [`docs/WIRE-PROTOCOL.md`](docs/WIRE-PROTOCOL.md) and are stable — renaming or removing any is a breaking change. **Process**: update `docs/WIRE-PROTOCOL.md` in the same commit as any new signal, query, or update.
|
|
118
123
|
- **Daemon**: Standalone background process (`src/daemon.ts`) that runs all Temporal workers. Auto-started by any `claude-tempo` command. PID at `~/.claude-tempo/daemon.pid`; logs at `~/.claude-tempo/daemon.log`.
|
|
@@ -51,6 +51,87 @@ const adapters_1 = require("../adapters");
|
|
|
51
51
|
const hard_terminate_1 = require("./hard-terminate");
|
|
52
52
|
const signals_1 = require("../workflows/signals");
|
|
53
53
|
const log = (...args) => console.error('[claude-tempo:outbox]', ...args);
|
|
54
|
+
/**
|
|
55
|
+
* Classify a Temporal client error raised by `handle.query` / `handle.signal`
|
|
56
|
+
* / `handle.executeUpdate` as retryable (transient) vs permanent (#140).
|
|
57
|
+
*
|
|
58
|
+
* ## Contract
|
|
59
|
+
* - Returns `true` → caller should **re-throw the underlying Error** so the
|
|
60
|
+
* activity's retry policy can back off and retry (per-worker config).
|
|
61
|
+
* - Returns `false` → caller should wrap in `ApplicationFailure.nonRetryable`
|
|
62
|
+
* so the outbox surfaces a permanent failure and stops retrying.
|
|
63
|
+
*
|
|
64
|
+
* ## Safety posture
|
|
65
|
+
* **Conservative default: unknown → non-retryable.** Over-classifying as
|
|
66
|
+
* retryable causes infinite retry loops on genuinely permanent errors. The
|
|
67
|
+
* activity will fail fast on unknown cases; a follow-up PR can whitelist more
|
|
68
|
+
* transient signatures if we see false-permanent rates in the wild.
|
|
69
|
+
*
|
|
70
|
+
* ## Why name/message sniffing, not `instanceof`
|
|
71
|
+
* Matches the established pattern in `src/adapters/terminal-error.ts`
|
|
72
|
+
* `isTerminalWorkflowError`: the Temporal Node SDK surfaces slightly different
|
|
73
|
+
* error shapes between `@temporalio/client`, the gRPC layer, and
|
|
74
|
+
* `WorkflowUpdateFailedError` wrappers. Sniffing on name + message is resilient
|
|
75
|
+
* across those shapes. Activity-side classification is kept separate here so
|
|
76
|
+
* `src/activities/` has no adapter-module dependency.
|
|
77
|
+
*/
|
|
78
|
+
function isRetryableTemporalError(err) {
|
|
79
|
+
// ApplicationFailure instances have already been classified by the thrower
|
|
80
|
+
// (nonRetryable=true/false). The calling code paths in this module only ask
|
|
81
|
+
// about non-ApplicationFailure errors, but this guard makes the helper safe
|
|
82
|
+
// to call unconditionally.
|
|
83
|
+
if (err instanceof activity_1.ApplicationFailure)
|
|
84
|
+
return false;
|
|
85
|
+
const e = err;
|
|
86
|
+
const name = e?.name ?? '';
|
|
87
|
+
const msg = e?.message ?? '';
|
|
88
|
+
// ── Permanent: workflow is genuinely gone or validator rejected the op. ──
|
|
89
|
+
if (name.includes('WorkflowNotFound') ||
|
|
90
|
+
name.includes('WorkflowExecutionAlreadyCompleted') ||
|
|
91
|
+
// Update rejected by the workflow-side validator (e.g. `WorkflowGone`
|
|
92
|
+
// thrown from `claimAttachment`'s validator on a destroyed session).
|
|
93
|
+
// A retry won't make the validator change its mind.
|
|
94
|
+
name.includes('WorkflowUpdateFailed') ||
|
|
95
|
+
msg.includes('WorkflowGone') ||
|
|
96
|
+
msg.includes('workflow execution already completed'))
|
|
97
|
+
return false;
|
|
98
|
+
// ── Transient: RPC / network / temporary SDK unavailability. ──
|
|
99
|
+
if (name.includes('TransportError') ||
|
|
100
|
+
name.includes('TimeoutError') ||
|
|
101
|
+
msg.includes('DEADLINE_EXCEEDED') ||
|
|
102
|
+
msg.includes('UNAVAILABLE') ||
|
|
103
|
+
msg.includes('RESOURCE_EXHAUSTED') ||
|
|
104
|
+
msg.includes('CANCELLED') ||
|
|
105
|
+
/\bECONNRESET\b/.test(msg) ||
|
|
106
|
+
/\bECONNREFUSED\b/.test(msg) ||
|
|
107
|
+
/\bETIMEDOUT\b/.test(msg) ||
|
|
108
|
+
/\bENOTFOUND\b/.test(msg) ||
|
|
109
|
+
/\bEAI_AGAIN\b/.test(msg))
|
|
110
|
+
return true;
|
|
111
|
+
// Unknown shape — stay permanent (see "Safety posture" above).
|
|
112
|
+
return false;
|
|
113
|
+
}
|
|
114
|
+
/**
 * Shared catch-all tail for the outbox activities in this module. Takes the
 * value a `catch` clause received and always throws — it never returns.
 *
 * Outcome by case:
 * - `err` is already an `ApplicationFailure` (the thrower classified it, e.g.
 *   the explicit "No active session found" throws): re-thrown untouched.
 * - {@link isRetryableTemporalError} says it is transient: the original value
 *   is re-thrown (wrapped in `Error` first if it is not one) so the activity
 *   retry policy handles it.
 * - Anything else: wrapped in `ApplicationFailure.nonRetryable`, with the
 *   message `"<contextPrefix>: <original message>"`.
 *
 * @param err - The caught value; may be a non-Error throwable.
 * @param contextPrefix - Caller-supplied prefix for the permanent-failure
 *   message (e.g. `Detach failed for "alice"`).
 */
function classifyAndRethrow(err, contextPrefix) {
    // Typed failures pass straight through — they carry their own retryability.
    if (err instanceof activity_1.ApplicationFailure) {
        throw err;
    }
    const isError = err instanceof Error;
    if (!isRetryableTemporalError(err)) {
        // Permanent (or unknown) — surface as non-retryable with caller context.
        const detail = isError ? err.message : String(err);
        throw activity_1.ApplicationFailure.nonRetryable(`${contextPrefix}: ${detail}`);
    }
    // Transient — hand the original back to the retry policy. Non-Error
    // throwables (extremely rare) are normalised into Error form first.
    if (isError) {
        throw err;
    }
    throw new Error(String(err));
}
|
|
54
135
|
/**
|
|
55
136
|
* Create outbox delivery activities bound to a Temporal client and config.
|
|
56
137
|
* The returned object is registered with the worker as activities.
|
|
@@ -59,12 +140,20 @@ function createOutboxActivities(client, config) {
|
|
|
59
140
|
return {
|
|
60
141
|
async deliverCue(input) {
|
|
61
142
|
const { ensemble, fromPlayerId, targetPlayerId, message } = input;
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
143
|
+
try {
|
|
144
|
+
const handle = await (0, resolve_1.resolveSession)(client, ensemble, targetPlayerId);
|
|
145
|
+
if (!handle) {
|
|
146
|
+
throw activity_1.ApplicationFailure.nonRetryable(`No active session found for "${targetPlayerId}"`);
|
|
147
|
+
}
|
|
148
|
+
await handle.signal('receiveMessage', { from: fromPlayerId, text: message });
|
|
149
|
+
return { success: true };
|
|
150
|
+
}
|
|
151
|
+
catch (err) {
|
|
152
|
+
// #236: transient RPC errors (e.g. DEADLINE_EXCEEDED on the signal call)
|
|
153
|
+
// retry per the activity policy; WorkflowNotFound / validator rejections
|
|
154
|
+
// stay permanent. Unknown errors default to non-retryable.
|
|
155
|
+
classifyAndRethrow(err, `Cue failed for "${targetPlayerId}"`);
|
|
65
156
|
}
|
|
66
|
-
await handle.signal('receiveMessage', { from: fromPlayerId, text: message });
|
|
67
|
-
return { success: true };
|
|
68
157
|
},
|
|
69
158
|
async deliverReport(input) {
|
|
70
159
|
const { ensemble, fromPlayerId, text, reportType } = input;
|
|
@@ -76,37 +165,44 @@ function createOutboxActivities(client, config) {
|
|
|
76
165
|
return { success: true };
|
|
77
166
|
}
|
|
78
167
|
catch (err) {
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
168
|
+
// #236: describe() / signal() hitting a transient RPC error now retries;
|
|
169
|
+
// WorkflowNotFound (conductor gone) stays permanent as before.
|
|
170
|
+
classifyAndRethrow(err, 'Failed to deliver report to conductor');
|
|
82
171
|
}
|
|
83
172
|
},
|
|
84
173
|
async terminateSession(input) {
|
|
85
174
|
const { ensemble, targetPlayerId, terminatedBy } = input;
|
|
86
|
-
const handle = await (0, resolve_1.resolveSession)(client, ensemble, targetPlayerId);
|
|
87
|
-
if (!handle) {
|
|
88
|
-
throw activity_1.ApplicationFailure.nonRetryable(`No active session found for "${targetPlayerId}"`);
|
|
89
|
-
}
|
|
90
|
-
// PR-C commit 4: use the V2 `destroy` update — explicit operator termination
|
|
91
|
-
// per §2.5 (abandon in-flight, phase=gone, COMPLETE). The former
|
|
92
|
-
// `updateMetadata({ status: 'terminated' })` signal path was retired.
|
|
93
|
-
await handle.executeUpdate('destroy', {
|
|
94
|
-
args: [{ reason: 'stop via tool', terminatedBy }],
|
|
95
|
-
});
|
|
96
|
-
// Notify conductor about the termination (best effort)
|
|
97
175
|
try {
|
|
98
|
-
const
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
176
|
+
const handle = await (0, resolve_1.resolveSession)(client, ensemble, targetPlayerId);
|
|
177
|
+
if (!handle) {
|
|
178
|
+
throw activity_1.ApplicationFailure.nonRetryable(`No active session found for "${targetPlayerId}"`);
|
|
179
|
+
}
|
|
180
|
+
// PR-C commit 4: use the V2 `destroy` update — explicit operator termination
|
|
181
|
+
// per §2.5 (abandon in-flight, phase=gone, COMPLETE). The former
|
|
182
|
+
// `updateMetadata({ status: 'terminated' })` signal path was retired.
|
|
183
|
+
await handle.executeUpdate('destroy', {
|
|
184
|
+
args: [{ reason: 'stop via tool', terminatedBy }],
|
|
104
185
|
});
|
|
186
|
+
// Notify conductor about the termination (best effort)
|
|
187
|
+
try {
|
|
188
|
+
const conductorId = (0, config_1.conductorWorkflowId)(ensemble);
|
|
189
|
+
const conductorHandle = client.workflow.getHandle(conductorId);
|
|
190
|
+
await conductorHandle.signal('receiveMessage', {
|
|
191
|
+
from: 'system',
|
|
192
|
+
text: `Session "${targetPlayerId}" was terminated by ${terminatedBy}.`,
|
|
193
|
+
responseRequested: false,
|
|
194
|
+
});
|
|
195
|
+
}
|
|
196
|
+
catch {
|
|
197
|
+
// Conductor may not exist — that's fine
|
|
198
|
+
}
|
|
199
|
+
return { success: true };
|
|
105
200
|
}
|
|
106
|
-
catch {
|
|
107
|
-
//
|
|
201
|
+
catch (err) {
|
|
202
|
+
// #236: transient RPC on the destroy update now retries; validator rejection
|
|
203
|
+
// (WorkflowGone, AttachmentMismatch) stays permanent.
|
|
204
|
+
classifyAndRethrow(err, `Terminate failed for "${targetPlayerId}"`);
|
|
108
205
|
}
|
|
109
|
-
return { success: true };
|
|
110
206
|
},
|
|
111
207
|
async startRecruitedSession(input) {
|
|
112
208
|
const { ensemble, targetName, workDir, isConductor, initialMessage, fromPlayerId, agent, systemPrompt, taskQueue, agentDefinition, agentDefinitionDescription, held } = input;
|
|
@@ -177,7 +273,11 @@ function createOutboxActivities(client, config) {
|
|
|
177
273
|
return { success: true, sessionId };
|
|
178
274
|
}
|
|
179
275
|
catch (err) {
|
|
180
|
-
|
|
276
|
+
// #236: transient RPC during workflow.start (e.g. temporal server flap)
|
|
277
|
+
// now retries; WorkflowNotFound / validation / auth failures stay permanent.
|
|
278
|
+
// Note: this activity's pre-#236 catch was missing the ApplicationFailure
|
|
279
|
+
// passthrough guard — `classifyAndRethrow` restores it for free.
|
|
280
|
+
classifyAndRethrow(err, `Failed to start recruited session "${targetName}"`);
|
|
181
281
|
}
|
|
182
282
|
},
|
|
183
283
|
async spawnProcess(input) {
|
|
@@ -267,7 +367,14 @@ function createOutboxActivities(client, config) {
|
|
|
267
367
|
return { success: true };
|
|
268
368
|
}
|
|
269
369
|
catch (err) {
|
|
270
|
-
|
|
370
|
+
// #236: spawnProcess throws predominantly OS-side errors (ENOENT/EACCES
|
|
371
|
+
// on the claude binary, EAGAIN on process-table overflow). The classifier
|
|
372
|
+
// is tuned for Temporal RPC; OS errors don't match its transient
|
|
373
|
+
// signatures, so they still flow through as non-retryable — byte-for-byte
|
|
374
|
+
// behavior preservation. The upside of going through the helper: if a
|
|
375
|
+
// future OS error surfaces a transient shape we add to the classifier,
|
|
376
|
+
// spawnProcess benefits automatically.
|
|
377
|
+
classifyAndRethrow(err, `Failed to spawn process for "${targetName}"`);
|
|
271
378
|
}
|
|
272
379
|
},
|
|
273
380
|
async releasePlayer(input) {
|
|
@@ -288,9 +395,9 @@ function createOutboxActivities(client, config) {
|
|
|
288
395
|
return { success: true };
|
|
289
396
|
}
|
|
290
397
|
catch (err) {
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
398
|
+
// #236: transient RPC on outboxLocked query / releaseHeld signal now
|
|
399
|
+
// retries; WorkflowNotFound / not-held validation stay permanent.
|
|
400
|
+
classifyAndRethrow(err, `Release failed for "${targetPlayerId}"`);
|
|
294
401
|
}
|
|
295
402
|
},
|
|
296
403
|
/**
|
|
@@ -318,9 +425,10 @@ function createOutboxActivities(client, config) {
|
|
|
318
425
|
return { success: true };
|
|
319
426
|
}
|
|
320
427
|
catch (err) {
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
428
|
+
// #140: re-throw transient RPC/network errors so the activity retry
|
|
429
|
+
// policy handles them; permanent cases (validator rejection, workflow
|
|
430
|
+
// gone, unknown) become `ApplicationFailure.nonRetryable`.
|
|
431
|
+
classifyAndRethrow(err, `Detach failed for "${targetPlayerId}"`);
|
|
324
432
|
}
|
|
325
433
|
},
|
|
326
434
|
/**
|
|
@@ -359,9 +467,10 @@ function createOutboxActivities(client, config) {
|
|
|
359
467
|
return { success: true };
|
|
360
468
|
}
|
|
361
469
|
catch (err) {
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
470
|
+
// #140: transient errors (network, RPC timeout) become retryable;
|
|
471
|
+
// permanent cases (WorkflowNotFound, validator rejection) stay
|
|
472
|
+
// non-retryable. Unknown errors default to non-retryable.
|
|
473
|
+
classifyAndRethrow(err, `Destroy failed for "${targetPlayerId}"`);
|
|
365
474
|
}
|
|
366
475
|
},
|
|
367
476
|
/**
|
|
@@ -393,7 +502,7 @@ function createOutboxActivities(client, config) {
|
|
|
393
502
|
try {
|
|
394
503
|
await handle.signal(signals_1.requestDetachSignal, {
|
|
395
504
|
reason: 'restart',
|
|
396
|
-
deadlineMs:
|
|
505
|
+
deadlineMs: validation_1.DEFAULT_RESTART_DETACH_DEADLINE_MS,
|
|
397
506
|
});
|
|
398
507
|
}
|
|
399
508
|
catch {
|
|
@@ -433,7 +542,7 @@ function createOutboxActivities(client, config) {
|
|
|
433
542
|
host: targetHost,
|
|
434
543
|
adapterId,
|
|
435
544
|
adapterClass,
|
|
436
|
-
leaseMs:
|
|
545
|
+
leaseMs: validation_1.DEFAULT_RESTART_LEASE_MS,
|
|
437
546
|
}],
|
|
438
547
|
});
|
|
439
548
|
// Step 5 — optional context replay.
|
|
@@ -503,9 +612,11 @@ function createOutboxActivities(client, config) {
|
|
|
503
612
|
return { success: true };
|
|
504
613
|
}
|
|
505
614
|
catch (err) {
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
615
|
+
// #140: the §8.2 restart algorithm fires many RPCs; any of them may
|
|
616
|
+
// hit a transient network/RPC error. Those get retried. Validator
|
|
617
|
+
// rejections (e.g. claim race), workflow-gone, and unknown errors
|
|
618
|
+
// stay permanent to avoid wedging the outbox on a dead target.
|
|
619
|
+
classifyAndRethrow(err, `Restart failed for "${targetPlayerId}"`);
|
|
509
620
|
}
|
|
510
621
|
},
|
|
511
622
|
/**
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.resolveSession = resolveSession;
|
|
4
4
|
exports.scanEnsembleSessions = scanEnsembleSessions;
|
|
5
|
+
const search_attributes_1 = require("../utils/search-attributes");
|
|
5
6
|
/** Shared query for listing running session workflows. */
|
|
6
7
|
const SESSION_LIST_QUERY = `WorkflowType = "claudeSessionWorkflow" AND ExecutionStatus = "Running"`;
|
|
7
8
|
/**
|
|
@@ -43,10 +44,7 @@ async function scanEnsembleSessions(client, ensemble) {
|
|
|
43
44
|
const part = await handle.query('getPart');
|
|
44
45
|
// Attachment phase lives in the `ClaudeTempoAttachmentState` search
|
|
45
46
|
// attribute (written by the workflow on every phase transition).
|
|
46
|
-
const
|
|
47
|
-
const phase = Array.isArray(phaseArr) && phaseArr.length > 0
|
|
48
|
-
? phaseArr[0]
|
|
49
|
-
: undefined;
|
|
47
|
+
const phase = (0, search_attributes_1.getAttachmentPhase)(workflow);
|
|
50
48
|
sessions.push({
|
|
51
49
|
workflowId: workflow.workflowId,
|
|
52
50
|
playerId: metadata.playerId,
|
package/dist/adapters/base.d.ts
CHANGED
|
@@ -65,6 +65,27 @@ export declare abstract class BaseAttachment {
|
|
|
65
65
|
private stopped;
|
|
66
66
|
private terminalFired;
|
|
67
67
|
private knownPhase;
|
|
68
|
+
/**
|
|
69
|
+
* `true` once a heartbeat has successfully landed on the current attachment (or rebind).
|
|
70
|
+
* Cleared on `startV2Lifecycle`, reconnect-loop success, and CAN rebind so each freshly
|
|
71
|
+
* live attachment emits its own `heartbeat#1 delivered` diagnostic. Added in #249 to
|
|
72
|
+
* distinguish "claim OK but heartbeat loop died" from "adapter just hasn't ticked yet."
|
|
73
|
+
*/
|
|
74
|
+
private firstHeartbeatLogged;
|
|
75
|
+
/**
|
|
76
|
+
* Monotonic heartbeat counter for the current attachment cycle. Reset on
|
|
77
|
+
* claim/reconnect/CAN-rebind. Emitted periodically (every {@link HEARTBEAT_SUMMARY_EVERY}
|
|
78
|
+
* ticks) so a long-running session leaves breadcrumbs in the log proving the loop is
|
|
79
|
+
* alive — operators can `grep 'heartbeats-delivered='` to confirm health without
|
|
80
|
+
* parsing Temporal history. Added in #249.
|
|
81
|
+
*/
|
|
82
|
+
private heartbeatsSent;
|
|
83
|
+
/**
|
|
84
|
+
* Mirror of {@link heartbeatsSent} for the phase-watcher loop. Same emission cadence,
|
|
85
|
+
* same rationale — the watcher is the only self-heal surface when the heartbeat loop
|
|
86
|
+
* dies silently, so a summary log line proves it's still live too.
|
|
87
|
+
*/
|
|
88
|
+
private phaseTicksDone;
|
|
68
89
|
private readonly phaseChangeListeners;
|
|
69
90
|
private readonly leaseRevokedListeners;
|
|
70
91
|
private readonly terminalListeners;
|
|
@@ -133,17 +154,68 @@ export declare abstract class BaseAttachment {
|
|
|
133
154
|
*/
|
|
134
155
|
protected stopV2Lifecycle(reason?: DetachReason, graceful?: boolean): Promise<void>;
|
|
135
156
|
private scheduleHeartbeat;
|
|
157
|
+
/**
|
|
158
|
+
* Emit a loud diagnostic when a tick early-returns via one of its guard paths (#249).
|
|
159
|
+
* Pre-#249 these returns were silent — the only observable effect was "heartbeats stop
|
|
160
|
+
* arriving." Now operators can grep `adapter.*guard tripped` to confirm or rule out
|
|
161
|
+
* tick-orphan as a failure mode without needing workflow history.
|
|
162
|
+
*
|
|
163
|
+
* `terminalFired=true` / `stopped=true` guards are load-bearing on the terminal path
|
|
164
|
+
* (don't want to re-enter terminal) so they're expected during teardown; we still log
|
|
165
|
+
* them but at the same level — operators can correlate timestamps against the preceding
|
|
166
|
+
* `terminal (...) — stopping delivery poll permanently` line.
|
|
167
|
+
*/
|
|
168
|
+
private logGuardTrip;
|
|
169
|
+
/**
|
|
170
|
+
* Single tick of the heartbeat loop. Try/finally scaffolding (#249) guarantees
|
|
171
|
+
* reschedule in every path except genuinely terminal state (`stopped`,
|
|
172
|
+
* `terminalFired`) or when the reconnect loop has taken ownership of scheduling
|
|
173
|
+
* (`reconnecting`). Pre-#249 the three early-return paths at the top + the
|
|
174
|
+
* handled-terminal-error path silently orphaned the timer forever; a transient
|
|
175
|
+
* `reconnecting=true` window or a null-handle race was enough to kill the loop
|
|
176
|
+
* with no log and no teardown.
|
|
177
|
+
*
|
|
178
|
+
* Handled terminals (CAN rebind, destroy) still short-circuit via `return` —
|
|
179
|
+
* the `finally` block re-checks `reconnecting` / `terminalFired` before
|
|
180
|
+
* rescheduling, so the reconnect/terminal machinery keeps ownership of
|
|
181
|
+
* whatever comes next.
|
|
182
|
+
*/
|
|
136
183
|
private tickHeartbeat;
|
|
137
184
|
private schedulePhaseWatcher;
|
|
185
|
+
/**
|
|
186
|
+
* Single tick of the phase-watcher loop. Same orphan-resistance scaffolding as
|
|
187
|
+
* {@link tickHeartbeat} (#249): try/finally reschedule, unconditional unless
|
|
188
|
+
* `stopped` / `terminalFired` / `reconnecting`. When the heartbeat loop dies
|
|
189
|
+
* silently, the watcher is the only remaining self-heal surface — losing it
|
|
190
|
+
* too meant the adapter had no path back to a healthy state short of process
|
|
191
|
+
* restart.
|
|
192
|
+
*/
|
|
138
193
|
private tickPhaseWatcher;
|
|
139
194
|
/**
|
|
140
|
-
*
|
|
195
|
+
* Shared error-classification path for the heartbeat + phase-watcher ticks (#226).
|
|
196
|
+
*
|
|
197
|
+
* Returns `true` if the error was a terminal-class (handled inline: CAN rebind
|
|
198
|
+
* kicked off, or destroy fired). Returns `false` when the caller should treat
|
|
199
|
+
* the error as transient and continue its backoff.
|
|
200
|
+
*
|
|
201
|
+
* Always consults `fetchHistory` on any terminal-class error, because the
|
|
202
|
+
* Temporal SDK can't distinguish CAN-close from true-complete at the error
|
|
203
|
+
* level — see {@link isTerminalWorkflowError}. The history lookup is cheap
|
|
204
|
+
* (only runs on terminal, so at most once per adapter lifetime per terminal)
|
|
205
|
+
* and safer than re-querying by workflow id (which could race a fresh session
|
|
206
|
+
* reusing the id).
|
|
207
|
+
*/
|
|
208
|
+
private handleRunEndError;
|
|
209
|
+
/**
|
|
210
|
+
* Fetch the closed pinned run's history and return the runId of a CAN successor
|
|
211
|
+
* if present, else `null`. Scoped to the pinned (old) run via `this.pinnedHandle`,
|
|
212
|
+
* so it can't be fooled by a fresh session that happens to reuse the workflow id.
|
|
141
213
|
*
|
|
142
|
-
*
|
|
143
|
-
*
|
|
144
|
-
*
|
|
214
|
+
* Called only on the terminal path from {@link handleRunEndError}, so the cost
|
|
215
|
+
* of `fetchHistory` (a full event stream for the closed run) is paid at most
|
|
216
|
+
* once per terminal — not on every tick.
|
|
145
217
|
*/
|
|
146
|
-
private
|
|
218
|
+
private findCanSuccessorRunId;
|
|
147
219
|
private fireTerminal;
|
|
148
220
|
/**
|
|
149
221
|
* Opt-in reconnect policy. Default: return `false` — the base class behaves
|
|
@@ -194,6 +266,24 @@ export declare abstract class BaseAttachment {
|
|
|
194
266
|
* when the reason is potentially recoverable.
|
|
195
267
|
*/
|
|
196
268
|
private fireTerminalOrReconnect;
|
|
269
|
+
/**
|
|
270
|
+
* #226 CAN rebind. Transparently repoints `pinnedHandle` at the successor run,
|
|
271
|
+
* keeps the existing `attachmentId` / `leaseMs` (the workflow extended the lease
|
|
272
|
+
* by one heartbeat interval during the CAN transition per §2.3, so the lease is
|
|
273
|
+
* still live on the new run), notifies the subclass to restart its delivery
|
|
274
|
+
* loop, and resumes heartbeat + phase-watcher.
|
|
275
|
+
*
|
|
276
|
+
* Why this is safe without re-claiming:
|
|
277
|
+
* - The new run carries forward `currentAttachment` verbatim from the old run.
|
|
278
|
+
* - The adapter's `attachmentId` still matches, so the next `heartbeat` /
|
|
279
|
+
* `markDelivered` / `adapterExited` signal on the new pinned handle will be
|
|
280
|
+
* accepted unchanged by the workflow's handlers.
|
|
281
|
+
* - If the lease actually did expire before we got here (e.g. adapter was
|
|
282
|
+
* offline through multiple CAN cycles), the next phase-watcher tick on the
|
|
283
|
+
* new pinned handle will see `phase=detached` + no current attachment and
|
|
284
|
+
* fall through to the existing #201 reclaim path — belt-and-suspenders.
|
|
285
|
+
*/
|
|
286
|
+
private runCanRebind;
|
|
197
287
|
/**
|
|
198
288
|
* Budget-bounded reconnect loop.
|
|
199
289
|
*
|