claude-tempo 0.27.0 → 0.28.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +7 -1
- package/dist/adapters/base.d.ts +84 -0
- package/dist/adapters/base.js +345 -20
- package/dist/cli/daemon-command.d.ts +4 -0
- package/dist/cli/daemon-command.js +102 -1
- package/dist/client/core.d.ts +24 -0
- package/dist/client/core.js +1033 -0
- package/dist/client/ensure-conductor-spawned.d.ts +9 -6
- package/dist/client/ensure-conductor-spawned.js +7 -4
- package/dist/client/index.d.ts +32 -4
- package/dist/client/index.js +15 -1030
- package/dist/client/interface.d.ts +99 -19
- package/dist/client/subscribe.d.ts +108 -0
- package/dist/client/subscribe.js +598 -0
- package/dist/client/with-spawn.d.ts +27 -0
- package/dist/client/with-spawn.js +87 -0
- package/dist/config.d.ts +21 -0
- package/dist/config.js +12 -0
- package/dist/daemon.d.ts +19 -0
- package/dist/daemon.js +117 -1
- package/dist/http/aggregate.d.ts +161 -0
- package/dist/http/aggregate.js +466 -0
- package/dist/http/auth.d.ts +67 -0
- package/dist/http/auth.js +177 -0
- package/dist/http/cors.d.ts +42 -0
- package/dist/http/cors.js +111 -0
- package/dist/http/event-bus.d.ts +217 -0
- package/dist/http/event-bus.js +365 -0
- package/dist/http/event-id.d.ts +77 -0
- package/dist/http/event-id.js +117 -0
- package/dist/http/event-types.d.ts +280 -0
- package/dist/http/event-types.js +36 -0
- package/dist/http/index.d.ts +21 -0
- package/dist/http/index.js +61 -0
- package/dist/http/port-file.d.ts +22 -0
- package/dist/http/port-file.js +132 -0
- package/dist/http/responses.d.ts +27 -0
- package/dist/http/responses.js +40 -0
- package/dist/http/ring-buffer.d.ts +41 -0
- package/dist/http/ring-buffer.js +80 -0
- package/dist/http/server.d.ts +101 -0
- package/dist/http/server.js +368 -0
- package/dist/http/snapshot.d.ts +51 -0
- package/dist/http/snapshot.js +109 -0
- package/dist/http/sse-handler.d.ts +75 -0
- package/dist/http/sse-handler.js +276 -0
- package/dist/reconcile/orphans.d.ts +3 -6
- package/dist/reconcile/orphans.js +10 -4
- package/dist/server.js +21 -6
- package/dist/tools/destroy.js +25 -7
- package/dist/tools/restore.js +20 -4
- package/dist/tui/App.js +91 -86
- package/dist/tui/commands.d.ts +0 -16
- package/dist/tui/commands.js +0 -20
- package/dist/tui/index.js +9 -0
- package/dist/tui/sse-handler.d.ts +52 -0
- package/dist/tui/sse-handler.js +156 -0
- package/dist/tui/store.d.ts +54 -3
- package/dist/tui/store.js +87 -3
- package/dist/tui/utils/history.d.ts +0 -5
- package/dist/tui/utils/history.js +0 -17
- package/dist/tui/utils/platform.d.ts +0 -11
- package/dist/tui/utils/platform.js +0 -37
- package/dist/utils/hosts.d.ts +32 -8
- package/dist/utils/hosts.js +52 -21
- package/dist/workflows/maestro-signals.d.ts +21 -0
- package/dist/workflows/maestro-signals.js +19 -1
- package/dist/workflows/maestro.js +7 -0
- package/dist/workflows/session.js +41 -4
- package/package.json +4 -2
- package/workflow-bundle.js +68 -6
- package/dist/tools/detach.d.ts +0 -4
- package/dist/tools/detach.js +0 -45
- package/dist/tui/components/CommandOverlay.d.ts +0 -15
- package/dist/tui/components/CommandOverlay.js +0 -34
- package/dist/tui/components/ScheduleOverlay.d.ts +0 -13
- package/dist/tui/components/ScheduleOverlay.js +0 -113
package/CLAUDE.md
CHANGED
|
@@ -76,7 +76,6 @@ src/
|
|
|
76
76
|
│ └── helpers.ts # Zod/MCP tool registration wrapper
|
|
77
77
|
├── tui/
|
|
78
78
|
│ ├── App.tsx / store.ts / commands.ts # TUI root, state, slash commands
|
|
79
|
-
│ ├── client.ts # Backward-compat shim → src/client/
|
|
80
79
|
│ ├── components/ # Ink components — see docs/tui.md for inventory
|
|
81
80
|
│ └── utils/ # format, platform, theme, fullscreen, history
|
|
82
81
|
├── utils/
|
|
@@ -108,6 +107,13 @@ npm test
|
|
|
108
107
|
> surveys or migrations, always grep **both** `test/` and `tests/` or you will miss
|
|
109
108
|
> mocks and assertions that only live in one directory.
|
|
110
109
|
|
|
110
|
+
> **Test-only hooks live with the module they reset and follow the
|
|
111
|
+
> `__<verb><Noun>ForTests` naming convention** — see
|
|
112
|
+
> [docs/adr/0006-test-hooks-naming.md](docs/adr/0006-test-hooks-naming.md). The
|
|
113
|
+
> double-underscore prefix telegraphs "test escape hatch, do not call from
|
|
114
|
+
> production code"; the hook's doc-comment should restate that explicitly. Hooks
|
|
115
|
+
> are never surfaced through barrels or `TempoClient`.
|
|
116
|
+
|
|
111
117
|
See [docs/development.md](docs/development.md) for full setup (Temporal dev server command,
|
|
112
118
|
daemon worker notes, `npx ts-node` dev runner).
|
|
113
119
|
|
package/dist/adapters/base.d.ts
CHANGED
|
@@ -14,6 +14,35 @@
|
|
|
14
14
|
*/
|
|
15
15
|
import type { Client, WorkflowHandle } from '@temporalio/client';
|
|
16
16
|
import type { AdapterClass, AdapterDescriptor, AttachmentToken, AttachmentPhase, DetachReason } from '../types';
|
|
17
|
+
/** Snapshot of adapter state included in every telemetry frame. */
|
|
18
|
+
interface AdapterTelemetrySnapshot {
|
|
19
|
+
attachmentId: string | null;
|
|
20
|
+
workflowId: string | null;
|
|
21
|
+
runId: string | null;
|
|
22
|
+
heartbeatsSent: number;
|
|
23
|
+
phaseTicksDone: number;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Build the structured frame emitted by every lifecycle handler. Pure
|
|
27
|
+
* function — exposed for unit tests that don't want to spawn a child
|
|
28
|
+
* process.
|
|
29
|
+
*/
|
|
30
|
+
export declare function buildProcessTerminatingFrame(signal: string, errorMessage?: string, snapshot?: AdapterTelemetrySnapshot[]): string;
|
|
31
|
+
/**
|
|
32
|
+
* Install the process-lifecycle telemetry handlers. Idempotent. Skipped
|
|
33
|
+
* by default in test environments (see {@link shouldInstallLifecycleTelemetry}).
|
|
34
|
+
*
|
|
35
|
+
* Production callers (and the first `startV2Lifecycle()` call on any
|
|
36
|
+
* adapter) invoke without arguments. Unit tests pass `{ force: true }`
|
|
37
|
+
* to bypass the env gate.
|
|
38
|
+
*/
|
|
39
|
+
export declare function installProcessLifecycleTelemetry(opts?: {
|
|
40
|
+
force?: boolean;
|
|
41
|
+
}): void;
|
|
42
|
+
/** Test-only — uninstall handlers + reset state. */
|
|
43
|
+
export declare function _resetProcessLifecycleTelemetryForTest(): void;
|
|
44
|
+
/** Test-only — direct access to the live-adapter set. */
|
|
45
|
+
export declare function _liveAdaptersForTest(): ReadonlySet<BaseAttachment>;
|
|
17
46
|
/**
|
|
18
47
|
* Override bundle for the reconnect loop timing (#201). Production defaults are
|
|
19
48
|
* tuned for laptop-sleep cycles (15-min elapsed budget, 10s base, 60s cap). Tests
|
|
@@ -124,6 +153,13 @@ export declare abstract class BaseAttachment {
|
|
|
124
153
|
onPhaseChange(listener: (phase: AttachmentPhase) => void): () => void;
|
|
125
154
|
/** Subscribe to lease-revocation events (§9.3 split-brain resolution). */
|
|
126
155
|
onLeaseRevoked(listener: (reason: DetachReason) => void): () => void;
|
|
156
|
+
/**
|
|
157
|
+
* Hypothesis A telemetry — capture the adapter state included in
|
|
158
|
+
* process-lifecycle log frames. Public so the module-level
|
|
159
|
+
* `snapshotLiveAdapters()` helper can read private fields without an
|
|
160
|
+
* `any` cast; consumers other than the telemetry path should not call it.
|
|
161
|
+
*/
|
|
162
|
+
_captureTelemetrySnapshot(): AdapterTelemetrySnapshot;
|
|
127
163
|
/**
|
|
128
164
|
* Subscribe to terminal events — `WorkflowNotFound` (§9.4) and phase `gone`.
|
|
129
165
|
* Terminal fires at most once per instance. Subclasses stop delivery + exit.
|
|
@@ -216,7 +252,55 @@ export declare abstract class BaseAttachment {
|
|
|
216
252
|
* once per terminal — not on every tick.
|
|
217
253
|
*/
|
|
218
254
|
private findCanSuccessorRunId;
|
|
255
|
+
/**
|
|
256
|
+
* Fire the terminal hook — the adapter is going dark and won't recover.
|
|
257
|
+
*
|
|
258
|
+
* #258: emits a structured log line on every fire so the next post-CAN
|
|
259
|
+
* silence incident is unambiguous in logs. Pre-#258, a `fireTerminal`
|
|
260
|
+
* from an unexpected source (the root cause was a silent destroy from
|
|
261
|
+
* the reconnect-loop pre-check on a transient terminal-class error) was
|
|
262
|
+
* indistinguishable from process death in workflow history — both produced
|
|
263
|
+
* "no further heartbeats." The structured log includes:
|
|
264
|
+
*
|
|
265
|
+
* - `reason` — the existing DetachReason
|
|
266
|
+
* - `callsite` — the calling function or rationale (passed by every
|
|
267
|
+
* callsite so the source is grep-able without parsing stack traces)
|
|
268
|
+
* - `attachmentId` / `workflowId` / `runId` — for cross-referencing
|
|
269
|
+
* against workflow history when bisecting an incident
|
|
270
|
+
* - `heartbeatsSent` / `phaseTicksDone` — the existing #249 counters
|
|
271
|
+
* so an operator can correlate "loop alive at N heartbeats, then
|
|
272
|
+
* terminal fired at this callsite" without external context
|
|
273
|
+
*
|
|
274
|
+
* Idempotent — repeat calls (e.g. reconnect-exhausted re-fires after
|
|
275
|
+
* destroy) early-return without re-logging. The first fire wins.
|
|
276
|
+
*/
|
|
219
277
|
private fireTerminal;
|
|
278
|
+
/**
|
|
279
|
+
* #258 tiebreaker: confirm whether a workflow is genuinely terminal after
|
|
280
|
+
* the reconnect-loop pre-check threw a terminal-class error. Used to
|
|
281
|
+
* distinguish a real workflow-gone state from a transient gRPC /
|
|
282
|
+
* visibility-API blip that classified as terminal.
|
|
283
|
+
*
|
|
284
|
+
* Returns:
|
|
285
|
+
* - `{ kind: 'running', statusName }` — workflow is alive (any
|
|
286
|
+
* non-terminal status). Caller should treat the original error as
|
|
287
|
+
* transient and continue the reconnect loop.
|
|
288
|
+
* - `{ kind: 'terminal', statusName }` — workflow is in a terminal
|
|
289
|
+
* status (`COMPLETED` / `FAILED` / `CANCELLED` / `TERMINATED` /
|
|
290
|
+
* `CONTINUED_AS_NEW` / `TIMED_OUT`). Caller should fire destroy.
|
|
291
|
+
* - `{ kind: 'describe-threw' }` — `describe()` itself failed. Treat
|
|
292
|
+
* as terminal (fire destroy) — consistent with pre-#258 semantics
|
|
293
|
+
* when classification is ambiguous, and avoids spinning forever on
|
|
294
|
+
* a workflow we can't reach.
|
|
295
|
+
* - `{ kind: 'timed-out' }` — `describe()` exceeded
|
|
296
|
+
* {@link DESCRIBE_TIMEOUT_MS}. Treat as terminal (fire destroy) —
|
|
297
|
+
* same rationale: prefer clean shutdown to a hung loop.
|
|
298
|
+
*
|
|
299
|
+
* The unpinned handle follows any CAN chain to the latest run, so
|
|
300
|
+
* `desc.status.name === 'CONTINUED_AS_NEW'` here means the workflow
|
|
301
|
+
* id itself is closed (no successor) — genuinely terminal.
|
|
302
|
+
*/
|
|
303
|
+
private confirmWorkflowTerminal;
|
|
220
304
|
/**
|
|
221
305
|
* Opt-in reconnect policy. Default: return `false` — the base class behaves
|
|
222
306
|
* exactly as it did before #201 (fire terminal, tear down). Subclasses that
|
package/dist/adapters/base.js
CHANGED
|
@@ -1,9 +1,174 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.AdapterRegistry = exports.BaseAttachment = void 0;
|
|
4
|
+
exports.buildProcessTerminatingFrame = buildProcessTerminatingFrame;
|
|
5
|
+
exports.installProcessLifecycleTelemetry = installProcessLifecycleTelemetry;
|
|
6
|
+
exports._resetProcessLifecycleTelemetryForTest = _resetProcessLifecycleTelemetryForTest;
|
|
7
|
+
exports._liveAdaptersForTest = _liveAdaptersForTest;
|
|
4
8
|
const signals_1 = require("../workflows/signals");
|
|
5
9
|
const terminal_error_1 = require("./terminal-error");
|
|
6
10
|
const log = (...args) => console.error('[claude-tempo:adapter]', ...args);
|
|
11
|
+
// ── Hypothesis A telemetry (#258 follow-up) ─────────────────────────────
|
|
12
|
+
//
|
|
13
|
+
// The structured `terminal fire` log shipped in #258 made the next
|
|
14
|
+
// adapter-silence incident self-describing — but only for cases where
|
|
15
|
+
// `fireTerminal` actually fires. Hypothesis A (process death — crash, OOM,
|
|
16
|
+
// Windows sleep, terminal close, SIGKILL) wouldn't produce that log
|
|
17
|
+
// because the process never reached the code path. The handlers below
|
|
18
|
+
// close that gap: a future #258 recurrence with no `fireTerminal` log AND
|
|
19
|
+
// no `adapter-process-terminating` log narrows to a distinct hypothesis
|
|
20
|
+
// (likely SIGKILL / abrupt OS termination — file separately).
|
|
21
|
+
//
|
|
22
|
+
// Design tenets:
|
|
23
|
+
// - **Idempotent registration**: a module-level boolean ensures multiple
|
|
24
|
+
// adapter instances spawning in the same process never double-register
|
|
25
|
+
// handlers. Repeated `installProcessLifecycleTelemetry()` calls no-op.
|
|
26
|
+
// - **Additive only**: every `process.on(...)` call appends; nothing
|
|
27
|
+
// calls `removeAllListeners`. Coexists with the test-cleanup chain in
|
|
28
|
+
// `test/helpers.ts` (#312) and the daemon's own SIGTERM/SIGINT
|
|
29
|
+
// shutdown function.
|
|
30
|
+
// - **Synchronous logging on terminal signals**: process termination
|
|
31
|
+
// doesn't await async log flushes. `console.error` to stderr is
|
|
32
|
+
// synchronous for files and POSIX terminals (Windows consoles are asynchronous), which is usually enough.
|
|
33
|
+
// - **No behavior change on uncaughtException**: we register
|
|
34
|
+
// `uncaughtExceptionMonitor` (Node 13.7+) to telemeter without
|
|
35
|
+
// suppressing Node's default crash. No `uncaughtException` fallback
|
|
36
|
+
// is registered: the package's `engines` floor (Node 20+) guarantees
|
|
37
|
+
// the monitor event is available.
|
|
38
|
+
// - **Test gating**: mocha defines `it` globally (vitest with
|
|
39
|
+
// `globals: false` does not). Skip auto-install whenever the test
|
|
40
|
+
// framework signal is present so we don't fight the existing zombie
|
|
41
|
+
// reap in `test/helpers.ts`. The unit tests for these handlers spawn
|
|
42
|
+
// a dedicated child Node process where the gate doesn't fire.
|
|
43
|
+
/**
|
|
44
|
+
* Live adapters in this process. Populated by `startV2Lifecycle` after a
|
|
45
|
+
* successful claim; each adapter removes itself in `stopV2Lifecycle` and `fireTerminal`.
|
|
46
|
+
* Each lifecycle handler iterates this set to build the per-adapter
|
|
47
|
+
* snapshot in the structured log.
|
|
48
|
+
*/
|
|
49
|
+
const liveAdapters = new Set();
|
|
50
|
+
let processLifecycleTelemetryInstalled = false;
|
|
51
|
+
let processLifecycleHandlerRefs = [];
|
|
52
|
+
/**
|
|
53
|
+
* Should `installProcessLifecycleTelemetry()` actually wire up handlers?
|
|
54
|
+
*
|
|
55
|
+
* - Forced on by `CLAUDE_TEMPO_LIFECYCLE_TELEMETRY=1` (used by the
|
|
56
|
+
* child-process tests for these handlers — see
|
|
57
|
+
* `test/adapter-process-lifecycle-telemetry.test.ts`).
|
|
58
|
+
* - Forced off by `CLAUDE_TEMPO_LIFECYCLE_TELEMETRY=0`.
|
|
59
|
+
* - Off when running under mocha (detected via `globalThis.it` —
|
|
60
|
+
* mocha defines this; vitest with `globals: false` does not).
|
|
61
|
+
* - Off when `NODE_ENV === 'test'` — belt and suspenders.
|
|
62
|
+
* - Otherwise on.
|
|
63
|
+
*/
|
|
64
|
+
function shouldInstallLifecycleTelemetry(force) {
|
|
65
|
+
if (force)
|
|
66
|
+
return true;
|
|
67
|
+
const flag = process.env.CLAUDE_TEMPO_LIFECYCLE_TELEMETRY;
|
|
68
|
+
if (flag === '1' || flag === 'true')
|
|
69
|
+
return true;
|
|
70
|
+
if (flag === '0' || flag === 'false')
|
|
71
|
+
return false;
|
|
72
|
+
// Mocha exposes BDD globals (`it`, `describe`, …) on the global object;
|
|
73
|
+
// our vitest config opts out of globals so it doesn't trigger this gate.
|
|
74
|
+
if (typeof globalThis.it === 'function')
|
|
75
|
+
return false;
|
|
76
|
+
if (process.env.NODE_ENV === 'test')
|
|
77
|
+
return false;
|
|
78
|
+
return true;
|
|
79
|
+
}
|
|
80
|
+
function snapshotLiveAdapters() {
|
|
81
|
+
const out = [];
|
|
82
|
+
for (const adapter of liveAdapters) {
|
|
83
|
+
out.push(adapter._captureTelemetrySnapshot());
|
|
84
|
+
}
|
|
85
|
+
return out;
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Build the structured frame emitted by every lifecycle handler. Pure
|
|
89
|
+
* function — exposed for unit tests that don't want to spawn a child
|
|
90
|
+
* process.
|
|
91
|
+
*/
|
|
92
|
+
function buildProcessTerminatingFrame(signal, errorMessage, snapshot = snapshotLiveAdapters()) {
|
|
93
|
+
return JSON.stringify({
|
|
94
|
+
event: 'adapter-process-terminating',
|
|
95
|
+
signal,
|
|
96
|
+
...(errorMessage !== undefined ? { errorMessage } : {}),
|
|
97
|
+
adapterCount: snapshot.length,
|
|
98
|
+
adapters: snapshot,
|
|
99
|
+
});
|
|
100
|
+
}
|
|
101
|
+
function emitTerminatingLog(signal, errorMessage) {
|
|
102
|
+
// `console.error` writes to stderr synchronously for files and POSIX terminals (Windows consoles are asynchronous).
|
|
103
|
+
// The `[claude-tempo:adapter]` prefix matches the rest of the adapter
|
|
104
|
+
// logs so a single grep surfaces both the existing `terminal fire`
|
|
105
|
+
// line and these new lifecycle lines for the same incident.
|
|
106
|
+
log(`adapter-process-terminating: ${buildProcessTerminatingFrame(signal, errorMessage)}`);
|
|
107
|
+
}
|
|
108
|
+
/**
|
|
109
|
+
* Install the process-lifecycle telemetry handlers. Idempotent. Skipped
|
|
110
|
+
* by default in test environments (see {@link shouldInstallLifecycleTelemetry}).
|
|
111
|
+
*
|
|
112
|
+
* Production callers (and the first `startV2Lifecycle()` call on any
|
|
113
|
+
* adapter) invoke without arguments. Unit tests pass `{ force: true }`
|
|
114
|
+
* to bypass the env gate.
|
|
115
|
+
*/
|
|
116
|
+
function installProcessLifecycleTelemetry(opts = {}) {
|
|
117
|
+
if (processLifecycleTelemetryInstalled)
|
|
118
|
+
return;
|
|
119
|
+
if (!shouldInstallLifecycleTelemetry(opts.force === true))
|
|
120
|
+
return;
|
|
121
|
+
processLifecycleTelemetryInstalled = true;
|
|
122
|
+
const handlers = [];
|
|
123
|
+
// `exit` — synchronous, last chance. Don't do async work; just log.
|
|
124
|
+
const onExit = () => emitTerminatingLog('exit');
|
|
125
|
+
process.on('exit', onExit);
|
|
126
|
+
handlers.push({ event: 'exit', handler: onExit });
|
|
127
|
+
// `beforeExit` — event loop is empty but Node hasn't exited yet.
|
|
128
|
+
const onBeforeExit = () => emitTerminatingLog('beforeExit');
|
|
129
|
+
process.on('beforeExit', onBeforeExit);
|
|
130
|
+
handlers.push({ event: 'beforeExit', handler: onBeforeExit });
|
|
131
|
+
// SIGTERM — graceful termination request (kill, supervisord, systemd).
|
|
132
|
+
// Telemetry listener for SIGTERM. Node drops its default "terminate on
// signal" action as soon as any listener is registered for SIGTERM, so a
// log-only handler would leave the process ignoring SIGTERM whenever no
// other code handles it. After logging, if this is the sole listener we
// unregister it and re-raise the signal so the default termination still
// occurs; if other listeners exist they own the shutdown and we only log.
const onSigterm = () => {
    emitTerminatingLog('SIGTERM');
    if (process.listenerCount('SIGTERM') === 1) {
        process.off('SIGTERM', onSigterm);
        process.kill(process.pid, 'SIGTERM');
    }
};
// Prepended so the sole-listener check above runs before any `once`
// listener has fired and removed itself from the count.
process.prependListener('SIGTERM', onSigterm);
|
|
134
|
+
handlers.push({ event: 'SIGTERM', handler: onSigterm });
|
|
135
|
+
// SIGINT — Ctrl+C from a controlling terminal.
|
|
136
|
+
// Telemetry listener for SIGINT (Ctrl+C). Node drops its default
// "terminate on signal" action as soon as any listener is registered for
// SIGINT, so a log-only handler would make Ctrl+C a no-op whenever no other
// code handles it. After logging, if this is the sole listener we
// unregister it and re-raise the signal so the default termination still
// occurs; if other listeners exist they own the shutdown and we only log.
const onSigint = () => {
    emitTerminatingLog('SIGINT');
    if (process.listenerCount('SIGINT') === 1) {
        process.off('SIGINT', onSigint);
        process.kill(process.pid, 'SIGINT');
    }
};
// Prepended so the sole-listener check above runs before any `once`
// listener has fired and removed itself from the count.
process.prependListener('SIGINT', onSigint);
|
|
138
|
+
handlers.push({ event: 'SIGINT', handler: onSigint });
|
|
139
|
+
// `uncaughtExceptionMonitor` lets us telemeter without suppressing
|
|
140
|
+
// Node's default crash behavior. The default action runs unchanged:
|
|
141
|
+
// print stack, exit non-zero. The codebase's `engines` requirement
|
|
142
|
+
// (Node 20+) guarantees this event is available — Node 13.7+.
|
|
143
|
+
const onUncaughtMonitor = (err) => {
|
|
144
|
+
emitTerminatingLog('uncaughtException', err instanceof Error ? err.message : String(err));
|
|
145
|
+
};
|
|
146
|
+
process.on('uncaughtExceptionMonitor', onUncaughtMonitor);
|
|
147
|
+
handlers.push({ event: 'uncaughtExceptionMonitor', handler: onUncaughtMonitor });
|
|
148
|
+
// `unhandledRejection` — log only. Adding a listener prevents Node's
|
|
149
|
+
// default crash on unhandled promise rejections (Node 15+); that's
|
|
150
|
+
// intentional per the brief ("log + don't crash").
|
|
151
|
+
const onUnhandled = (reason) => {
|
|
152
|
+
const msg = reason instanceof Error ? reason.message : String(reason);
|
|
153
|
+
emitTerminatingLog('unhandledRejection', msg);
|
|
154
|
+
};
|
|
155
|
+
process.on('unhandledRejection', onUnhandled);
|
|
156
|
+
handlers.push({ event: 'unhandledRejection', handler: onUnhandled });
|
|
157
|
+
processLifecycleHandlerRefs = handlers;
|
|
158
|
+
}
|
|
159
|
+
/** Test-only — uninstall handlers + reset state. */
|
|
160
|
+
function _resetProcessLifecycleTelemetryForTest() {
|
|
161
|
+
for (const { event, handler } of processLifecycleHandlerRefs) {
|
|
162
|
+
process.off(event, handler);
|
|
163
|
+
}
|
|
164
|
+
processLifecycleHandlerRefs = [];
|
|
165
|
+
processLifecycleTelemetryInstalled = false;
|
|
166
|
+
liveAdapters.clear();
|
|
167
|
+
}
|
|
168
|
+
/** Test-only — direct access to the live-adapter set. */
|
|
169
|
+
function _liveAdaptersForTest() {
|
|
170
|
+
return liveAdapters;
|
|
171
|
+
}
|
|
7
172
|
/** Backoff tuning for the heartbeat + phase-watcher loops on transient errors. */
|
|
8
173
|
const LOOP_BACKOFF_FACTOR = 1.5;
|
|
9
174
|
const LOOP_BACKOFF_MAX_MS = 30_000;
|
|
@@ -26,6 +191,31 @@ const RECONNECT_TOTAL_BUDGET_MS = 15 * 60_000;
|
|
|
26
191
|
const RECONNECT_BASE_MS = 10_000;
|
|
27
192
|
const RECONNECT_MAX_MS = 60_000;
|
|
28
193
|
const RECONNECT_BACKOFF_FACTOR = 1.5;
|
|
194
|
+
/**
|
|
195
|
+
* #258: tiebreaker timeout for the `describe()` confirmation that gates
|
|
196
|
+
* `fireTerminal('destroy')` from the reconnect-loop pre-check. The Temporal
|
|
197
|
+
* SDK's per-call default is conservative (10s+); we'd rather conclude
|
|
198
|
+
* "describe is hung, treat as terminal" in 3s than freeze the reconnect
|
|
199
|
+
* loop on a slow visibility-API call.
|
|
200
|
+
*/
|
|
201
|
+
const DESCRIBE_TIMEOUT_MS = 3_000;
|
|
202
|
+
/**
|
|
203
|
+
* Workflow execution statuses that are unambiguously terminal — used by
|
|
204
|
+
* the #258 `describe()` tiebreaker to decide whether a transient
|
|
205
|
+
* pre-check error reflects a genuinely-gone workflow (fire destroy) or a
|
|
206
|
+
* transient blip (continue the loop). Anything not in this set, including
|
|
207
|
+
* `RUNNING`, `PAUSED`, `UNSPECIFIED`, and `UNKNOWN`, is treated as
|
|
208
|
+
* non-terminal — conservatively keeps the loop alive when classification
|
|
209
|
+
* is ambiguous.
|
|
210
|
+
*/
|
|
211
|
+
const TERMINAL_WORKFLOW_STATUSES = new Set([
|
|
212
|
+
'COMPLETED',
|
|
213
|
+
'FAILED',
|
|
214
|
+
'CANCELLED',
|
|
215
|
+
'TERMINATED',
|
|
216
|
+
'CONTINUED_AS_NEW',
|
|
217
|
+
'TIMED_OUT',
|
|
218
|
+
]);
|
|
29
219
|
/**
|
|
30
220
|
* Abstract base class for session adapters.
|
|
31
221
|
*
|
|
@@ -142,6 +332,21 @@ class BaseAttachment {
|
|
|
142
332
|
this.leaseRevokedListeners.splice(i, 1);
|
|
143
333
|
};
|
|
144
334
|
}
|
|
335
|
+
/**
|
|
336
|
+
* Hypothesis A telemetry — capture the adapter state included in
|
|
337
|
+
* process-lifecycle log frames. Public so the module-level
|
|
338
|
+
* `snapshotLiveAdapters()` helper can read private fields without an
|
|
339
|
+
* `any` cast; consumers other than the telemetry path should not call it.
|
|
340
|
+
*/
|
|
341
|
+
_captureTelemetrySnapshot() {
|
|
342
|
+
return {
|
|
343
|
+
attachmentId: this.token?.attachmentId ?? null,
|
|
344
|
+
workflowId: this.pinnedHandle?.workflowId ?? null,
|
|
345
|
+
runId: this.token?.runId ?? null,
|
|
346
|
+
heartbeatsSent: this.heartbeatsSent,
|
|
347
|
+
phaseTicksDone: this.phaseTicksDone,
|
|
348
|
+
};
|
|
349
|
+
}
|
|
145
350
|
/**
|
|
146
351
|
* Subscribe to terminal events — `WorkflowNotFound` (§9.4) and phase `gone`.
|
|
147
352
|
* Terminal fires at most once per instance. Subclasses stop delivery + exit.
|
|
@@ -187,6 +392,13 @@ class BaseAttachment {
|
|
|
187
392
|
}],
|
|
188
393
|
});
|
|
189
394
|
this.pinnedHandle = this.client.workflow.getHandle(workflowId, this.token.runId);
|
|
395
|
+
// Hypothesis A telemetry — register this adapter so a future process-
|
|
396
|
+
// lifecycle handler (exit / SIGTERM / uncaughtException / …) can
|
|
397
|
+
// include its state in the structured log. `installProcessLifecycleTelemetry`
|
|
398
|
+
// is idempotent + env-gated; first call wires the handlers, subsequent
|
|
399
|
+
// calls no-op.
|
|
400
|
+
liveAdapters.add(this);
|
|
401
|
+
installProcessLifecycleTelemetry();
|
|
190
402
|
// #249: reset the per-attachment diagnostic counters so the next tick emits
|
|
191
403
|
// `heartbeat#1 delivered` on the freshly live lease. Without this reset a
|
|
192
404
|
// renewal path (e.g. restart → renewed claim) would never re-log first-heartbeat.
|
|
@@ -211,6 +423,10 @@ class BaseAttachment {
|
|
|
211
423
|
if (this.stopped)
|
|
212
424
|
return;
|
|
213
425
|
this.stopped = true;
|
|
426
|
+
// Hypothesis A telemetry — keep `liveAdapters` accurate so a subsequent
|
|
427
|
+
// process-lifecycle handler firing after stop doesn't include a
|
|
428
|
+
// already-torn-down adapter in its frame.
|
|
429
|
+
liveAdapters.delete(this);
|
|
214
430
|
if (this.heartbeatTimer) {
|
|
215
431
|
clearTimeout(this.heartbeatTimer);
|
|
216
432
|
this.heartbeatTimer = null;
|
|
@@ -416,7 +632,7 @@ class BaseAttachment {
|
|
|
416
632
|
}
|
|
417
633
|
// Phase `gone` is terminal — workflow destroyed. Never recoverable.
|
|
418
634
|
if (info.phase === 'gone') {
|
|
419
|
-
this.fireTerminal('destroy');
|
|
635
|
+
this.fireTerminal('destroy', 'tickPhaseWatcher:phase-gone');
|
|
420
636
|
return;
|
|
421
637
|
}
|
|
422
638
|
}
|
|
@@ -460,7 +676,7 @@ class BaseAttachment {
|
|
|
460
676
|
}
|
|
461
677
|
// No CAN event in the closed run's history → truly terminal (COMPLETED /
|
|
462
678
|
// TERMINATED / FAILED / workflow-id GC'd).
|
|
463
|
-
this.fireTerminal('destroy');
|
|
679
|
+
this.fireTerminal('destroy', 'handleRunEndError:no-can-successor');
|
|
464
680
|
return true;
|
|
465
681
|
}
|
|
466
682
|
/**
|
|
@@ -491,11 +707,44 @@ class BaseAttachment {
|
|
|
491
707
|
return null;
|
|
492
708
|
}
|
|
493
709
|
}
|
|
494
|
-
|
|
710
|
+
/**
|
|
711
|
+
* Fire the terminal hook — the adapter is going dark and won't recover.
|
|
712
|
+
*
|
|
713
|
+
* #258: emits a structured log line on every fire so the next post-CAN
|
|
714
|
+
* silence incident is unambiguous in logs. Pre-#258, a `fireTerminal`
|
|
715
|
+
* from an unexpected source (the root cause was a silent destroy from
|
|
716
|
+
* the reconnect-loop pre-check on a transient terminal-class error) was
|
|
717
|
+
* indistinguishable from process death in workflow history — both produced
|
|
718
|
+
* "no further heartbeats." The structured log includes:
|
|
719
|
+
*
|
|
720
|
+
* - `reason` — the existing DetachReason
|
|
721
|
+
* - `callsite` — the calling function or rationale (passed by every
|
|
722
|
+
* callsite so the source is grep-able without parsing stack traces)
|
|
723
|
+
* - `attachmentId` / `workflowId` / `runId` — for cross-referencing
|
|
724
|
+
* against workflow history when bisecting an incident
|
|
725
|
+
* - `heartbeatsSent` / `phaseTicksDone` — the existing #249 counters
|
|
726
|
+
* so an operator can correlate "loop alive at N heartbeats, then
|
|
727
|
+
* terminal fired at this callsite" without external context
|
|
728
|
+
*
|
|
729
|
+
* Idempotent — repeat calls (e.g. reconnect-exhausted re-fires after
|
|
730
|
+
* destroy) early-return without re-logging. The first fire wins.
|
|
731
|
+
*/
|
|
732
|
+
fireTerminal(reason, callsite = 'unspecified') {
|
|
495
733
|
if (this.terminalFired)
|
|
496
734
|
return;
|
|
497
735
|
this.terminalFired = true;
|
|
498
736
|
this.stopped = true;
|
|
737
|
+
// Hypothesis A telemetry — same reasoning as `stopV2Lifecycle`.
|
|
738
|
+
liveAdapters.delete(this);
|
|
739
|
+
log(`terminal fire:`, JSON.stringify({
|
|
740
|
+
reason,
|
|
741
|
+
callsite,
|
|
742
|
+
attachmentId: this.token?.attachmentId ?? null,
|
|
743
|
+
workflowId: this.pinnedHandle?.workflowId ?? null,
|
|
744
|
+
runId: this.token?.runId ?? null,
|
|
745
|
+
heartbeatsSent: this.heartbeatsSent,
|
|
746
|
+
phaseTicksDone: this.phaseTicksDone,
|
|
747
|
+
}));
|
|
499
748
|
if (this.heartbeatTimer) {
|
|
500
749
|
clearTimeout(this.heartbeatTimer);
|
|
501
750
|
this.heartbeatTimer = null;
|
|
@@ -514,6 +763,56 @@ class BaseAttachment {
|
|
|
514
763
|
}
|
|
515
764
|
}
|
|
516
765
|
}
|
|
766
|
+
/**
|
|
767
|
+
* #258 tiebreaker: confirm whether a workflow is genuinely terminal after
|
|
768
|
+
* the reconnect-loop pre-check threw a terminal-class error. Used to
|
|
769
|
+
* distinguish a real workflow-gone state from a transient gRPC /
|
|
770
|
+
* visibility-API blip that classified as terminal.
|
|
771
|
+
*
|
|
772
|
+
* Returns:
|
|
773
|
+
* - `{ kind: 'running', statusName }` — workflow is alive (any
|
|
774
|
+
* non-terminal status). Caller should treat the original error as
|
|
775
|
+
* transient and continue the reconnect loop.
|
|
776
|
+
* - `{ kind: 'terminal', statusName }` — workflow is in a terminal
|
|
777
|
+
* status (`COMPLETED` / `FAILED` / `CANCELLED` / `TERMINATED` /
|
|
778
|
+
* `CONTINUED_AS_NEW` / `TIMED_OUT`). Caller should fire destroy.
|
|
779
|
+
* - `{ kind: 'describe-threw' }` — `describe()` itself failed. Treat
|
|
780
|
+
* as terminal (fire destroy) — consistent with pre-#258 semantics
|
|
781
|
+
* when classification is ambiguous, and avoids spinning forever on
|
|
782
|
+
* a workflow we can't reach.
|
|
783
|
+
* - `{ kind: 'timed-out' }` — `describe()` exceeded
|
|
784
|
+
* {@link DESCRIBE_TIMEOUT_MS}. Treat as terminal (fire destroy) —
|
|
785
|
+
* same rationale: prefer clean shutdown to a hung loop.
|
|
786
|
+
*
|
|
787
|
+
* The unpinned handle follows any CAN chain to the latest run, so
|
|
788
|
+
* `desc.status.name === 'CONTINUED_AS_NEW'` here means the workflow
|
|
789
|
+
* id itself is closed (no successor) — genuinely terminal.
|
|
790
|
+
*/
|
|
791
|
+
async confirmWorkflowTerminal(unpinned) {
|
|
792
|
+
let timer = null;
|
|
793
|
+
try {
|
|
794
|
+
const desc = await Promise.race([
|
|
795
|
+
unpinned.describe(),
|
|
796
|
+
new Promise((resolve) => {
|
|
797
|
+
timer = setTimeout(() => resolve('timeout'), DESCRIBE_TIMEOUT_MS);
|
|
798
|
+
}),
|
|
799
|
+
]);
|
|
800
|
+
if (desc === 'timeout')
|
|
801
|
+
return { kind: 'timed-out' };
|
|
802
|
+
const statusName = desc.status?.name ?? 'UNKNOWN';
|
|
803
|
+
if (TERMINAL_WORKFLOW_STATUSES.has(statusName)) {
|
|
804
|
+
return { kind: 'terminal', statusName };
|
|
805
|
+
}
|
|
806
|
+
return { kind: 'running', statusName };
|
|
807
|
+
}
|
|
808
|
+
catch {
|
|
809
|
+
return { kind: 'describe-threw' };
|
|
810
|
+
}
|
|
811
|
+
finally {
|
|
812
|
+
if (timer)
|
|
813
|
+
clearTimeout(timer);
|
|
814
|
+
}
|
|
815
|
+
}
|
|
517
816
|
// ───────────────────────────────────────────────────────────────────────────
|
|
518
817
|
// #201 reconnect machinery. Subclasses opt in by overriding `shouldReconnect`.
|
|
519
818
|
// ───────────────────────────────────────────────────────────────────────────
|
|
@@ -605,7 +904,7 @@ class BaseAttachment {
|
|
|
605
904
|
if (this.stopped || this.terminalFired || this.reconnecting)
|
|
606
905
|
return;
|
|
607
906
|
if (!this.shouldReconnect(reason)) {
|
|
608
|
-
this.fireTerminal(reason);
|
|
907
|
+
this.fireTerminal(reason, 'fireTerminalOrReconnect:not-recoverable');
|
|
609
908
|
return;
|
|
610
909
|
}
|
|
611
910
|
// Pause the heartbeat + watcher loops for the duration of the reconnect.
|
|
@@ -627,14 +926,14 @@ class BaseAttachment {
|
|
|
627
926
|
void this.runCanRebind(canSuccessorRunId).catch((err) => {
|
|
628
927
|
log(`CAN rebind crashed:`, err?.message ?? err);
|
|
629
928
|
this.reconnecting = false;
|
|
630
|
-
this.fireTerminal('reconnect-exhausted');
|
|
929
|
+
this.fireTerminal('reconnect-exhausted', 'runCanRebind:crashed');
|
|
631
930
|
});
|
|
632
931
|
return;
|
|
633
932
|
}
|
|
634
933
|
void this.runReconnectLoop(reason).catch((err) => {
|
|
635
934
|
log(`reconnect loop crashed:`, err?.message ?? err);
|
|
636
935
|
this.reconnecting = false;
|
|
637
|
-
this.fireTerminal('reconnect-exhausted');
|
|
936
|
+
this.fireTerminal('reconnect-exhausted', 'runReconnectLoop:crashed');
|
|
638
937
|
});
|
|
639
938
|
}
|
|
640
939
|
/**
|
|
@@ -658,7 +957,7 @@ class BaseAttachment {
|
|
|
658
957
|
try {
|
|
659
958
|
if (!this.client || !this.pinnedHandle || !this.token) {
|
|
660
959
|
log('runCanRebind: missing client/handle/token — firing terminal');
|
|
661
|
-
this.fireTerminal('reconnect-exhausted');
|
|
960
|
+
this.fireTerminal('reconnect-exhausted', 'runCanRebind:missing-state');
|
|
662
961
|
return;
|
|
663
962
|
}
|
|
664
963
|
const workflowId = this.pinnedHandle.workflowId;
|
|
@@ -735,7 +1034,7 @@ class BaseAttachment {
|
|
|
735
1034
|
try {
|
|
736
1035
|
if (!this.client || !this.host || !this.token || !this.pinnedHandle) {
|
|
737
1036
|
log('runReconnectLoop: missing client/host/token/handle — aborting');
|
|
738
|
-
this.fireTerminal('reconnect-exhausted');
|
|
1037
|
+
this.fireTerminal('reconnect-exhausted', 'runReconnectLoop:missing-state');
|
|
739
1038
|
return;
|
|
740
1039
|
}
|
|
741
1040
|
const workflowId = this.pinnedHandle.workflowId;
|
|
@@ -774,14 +1073,40 @@ class BaseAttachment {
|
|
|
774
1073
|
}
|
|
775
1074
|
catch (err) {
|
|
776
1075
|
if ((0, terminal_error_1.isTerminalWorkflowError)(err)) {
|
|
777
|
-
// #
|
|
778
|
-
//
|
|
779
|
-
//
|
|
780
|
-
//
|
|
781
|
-
//
|
|
782
|
-
//
|
|
783
|
-
|
|
784
|
-
|
|
1076
|
+
// #258: ONE terminal-class pre-check error is not enough evidence
|
|
1077
|
+
// to destroy the adapter. The classifier matches phrasings
|
|
1078
|
+
// (`WorkflowNotFound`, `NOT_FOUND`, "workflow execution already
|
|
1079
|
+
// completed") that can ALSO surface from transient gRPC blips and
|
|
1080
|
+
// momentary visibility-API hiccups. Pre-#258, this branch fired
|
|
1081
|
+
// `fireTerminal('destroy')` immediately — a single transient
|
|
1082
|
+
// error orphaned the adapter for the rest of the session
|
|
1083
|
+
// (heartbeat + watcher dead via `terminalFired`, poller torn
|
|
1084
|
+
// down by `onReconnectStart` + `onTerminal` listener).
|
|
1085
|
+
//
|
|
1086
|
+
// Tiebreaker: confirm with `describe()` against the same unpinned
|
|
1087
|
+
// handle. If the workflow is genuinely gone, `describe()` will
|
|
1088
|
+
// either return a closed status (COMPLETED/TERMINATED/...) or
|
|
1089
|
+
// itself throw — fire destroy with confidence. If it returns
|
|
1090
|
+
// RUNNING (or any non-terminal status), the original error was
|
|
1091
|
+
// transient — log and continue the loop. Bounded by
|
|
1092
|
+
// `DESCRIBE_TIMEOUT_MS` so a slow visibility-API call can't hang
|
|
1093
|
+
// the reconnect path indefinitely.
|
|
1094
|
+
const errClass = err?.name ?? 'unknown';
|
|
1095
|
+
const errMsg = err?.message ?? String(err);
|
|
1096
|
+
const tiebreak = await this.confirmWorkflowTerminal(unpinned);
|
|
1097
|
+
if (tiebreak.kind === 'running') {
|
|
1098
|
+
log(`reconnect: pre-check threw ${errClass} but describe() shows ` +
|
|
1099
|
+
`${tiebreak.statusName} — treating as transient, continuing loop ` +
|
|
1100
|
+
`(originalError="${errMsg}")`);
|
|
1101
|
+
backoff = Math.min(backoff * this.reconnectBackoffFactor, this.reconnectMaxMs);
|
|
1102
|
+
continue;
|
|
1103
|
+
}
|
|
1104
|
+
const confirmDesc = tiebreak.kind === 'terminal'
|
|
1105
|
+
? `describe() confirmed ${tiebreak.statusName}`
|
|
1106
|
+
: `describe() ${tiebreak.kind === 'describe-threw' ? 'threw' : 'timed out'}`;
|
|
1107
|
+
log(`reconnect: pre-check terminal (${errClass}) and ${confirmDesc} — firing destroy ` +
|
|
1108
|
+
`(originalError="${errMsg}")`);
|
|
1109
|
+
this.fireTerminal('destroy', 'runReconnectLoop:precheck-terminal-confirmed');
|
|
785
1110
|
return;
|
|
786
1111
|
}
|
|
787
1112
|
backoff = Math.min(backoff * this.reconnectBackoffFactor, this.reconnectMaxMs);
|
|
@@ -790,12 +1115,12 @@ class BaseAttachment {
|
|
|
790
1115
|
}
|
|
791
1116
|
if (info.phase === 'gone') {
|
|
792
1117
|
log('reconnect: phase=gone — giving up');
|
|
793
|
-
this.fireTerminal('destroy');
|
|
1118
|
+
this.fireTerminal('destroy', 'runReconnectLoop:phase-gone');
|
|
794
1119
|
return;
|
|
795
1120
|
}
|
|
796
1121
|
if (info.currentAttachment && info.currentAttachment.attachmentId !== oldAttachmentId) {
|
|
797
1122
|
log(`reconnect: another adapter holds the lease (${info.currentAttachment.attachmentId}) — bailing`);
|
|
798
|
-
this.fireTerminal('superseded');
|
|
1123
|
+
this.fireTerminal('superseded', 'runReconnectLoop:other-holder');
|
|
799
1124
|
return;
|
|
800
1125
|
}
|
|
801
1126
|
if (info.phase === 'draining') {
|
|
@@ -849,7 +1174,7 @@ class BaseAttachment {
|
|
|
849
1174
|
catch (err) {
|
|
850
1175
|
if ((0, terminal_error_1.isTerminalWorkflowError)(err)) {
|
|
851
1176
|
log('reconnect: workflow gone during claim');
|
|
852
|
-
this.fireTerminal('destroy');
|
|
1177
|
+
this.fireTerminal('destroy', 'runReconnectLoop:claim-terminal');
|
|
853
1178
|
return;
|
|
854
1179
|
}
|
|
855
1180
|
backoff = Math.min(backoff * this.reconnectBackoffFactor, this.reconnectMaxMs);
|
|
@@ -858,7 +1183,7 @@ class BaseAttachment {
|
|
|
858
1183
|
}
|
|
859
1184
|
// Budget exhausted — give up cleanly.
|
|
860
1185
|
log(`reconnect budget exhausted after ${attempt} attempt(s)`);
|
|
861
|
-
this.fireTerminal('reconnect-exhausted');
|
|
1186
|
+
this.fireTerminal('reconnect-exhausted', 'runReconnectLoop:budget-exhausted');
|
|
862
1187
|
}
|
|
863
1188
|
finally {
|
|
864
1189
|
// Guarantee state reset regardless of which path we exited on. Safe to
|
|
@@ -45,3 +45,7 @@ export interface DaemonOpts extends CliOverrides {
|
|
|
45
45
|
force?: boolean;
|
|
46
46
|
}
|
|
47
47
|
export declare function daemon(opts: DaemonOpts): Promise<void>;
|
|
48
|
+
/** Pretty-print a byte count as MB (rounded). Pure helper, exported for tests. */
|
|
49
|
+
export declare function formatBytesAsMb(n: unknown): string;
|
|
50
|
+
/** Pretty-print a millisecond uptime as `Xh Ym Zs`. Pure helper, exported for tests. */
|
|
51
|
+
export declare function formatUptime(ms: unknown): string;
|