claude-tempo 0.27.0 → 0.28.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/CLAUDE.md +7 -1
  2. package/dist/adapters/base.d.ts +84 -0
  3. package/dist/adapters/base.js +345 -20
  4. package/dist/cli/daemon-command.d.ts +4 -0
  5. package/dist/cli/daemon-command.js +102 -1
  6. package/dist/client/core.d.ts +24 -0
  7. package/dist/client/core.js +1033 -0
  8. package/dist/client/ensure-conductor-spawned.d.ts +9 -6
  9. package/dist/client/ensure-conductor-spawned.js +7 -4
  10. package/dist/client/index.d.ts +32 -4
  11. package/dist/client/index.js +15 -1030
  12. package/dist/client/interface.d.ts +99 -19
  13. package/dist/client/subscribe.d.ts +108 -0
  14. package/dist/client/subscribe.js +598 -0
  15. package/dist/client/with-spawn.d.ts +27 -0
  16. package/dist/client/with-spawn.js +87 -0
  17. package/dist/config.d.ts +21 -0
  18. package/dist/config.js +12 -0
  19. package/dist/daemon.d.ts +19 -0
  20. package/dist/daemon.js +117 -1
  21. package/dist/http/aggregate.d.ts +161 -0
  22. package/dist/http/aggregate.js +466 -0
  23. package/dist/http/auth.d.ts +67 -0
  24. package/dist/http/auth.js +177 -0
  25. package/dist/http/cors.d.ts +42 -0
  26. package/dist/http/cors.js +111 -0
  27. package/dist/http/event-bus.d.ts +217 -0
  28. package/dist/http/event-bus.js +365 -0
  29. package/dist/http/event-id.d.ts +77 -0
  30. package/dist/http/event-id.js +117 -0
  31. package/dist/http/event-types.d.ts +280 -0
  32. package/dist/http/event-types.js +36 -0
  33. package/dist/http/index.d.ts +21 -0
  34. package/dist/http/index.js +61 -0
  35. package/dist/http/port-file.d.ts +22 -0
  36. package/dist/http/port-file.js +132 -0
  37. package/dist/http/responses.d.ts +27 -0
  38. package/dist/http/responses.js +40 -0
  39. package/dist/http/ring-buffer.d.ts +41 -0
  40. package/dist/http/ring-buffer.js +80 -0
  41. package/dist/http/server.d.ts +101 -0
  42. package/dist/http/server.js +368 -0
  43. package/dist/http/snapshot.d.ts +51 -0
  44. package/dist/http/snapshot.js +109 -0
  45. package/dist/http/sse-handler.d.ts +75 -0
  46. package/dist/http/sse-handler.js +276 -0
  47. package/dist/reconcile/orphans.d.ts +3 -6
  48. package/dist/reconcile/orphans.js +10 -4
  49. package/dist/server.js +21 -6
  50. package/dist/tools/destroy.js +25 -7
  51. package/dist/tools/restore.js +20 -4
  52. package/dist/tui/App.js +91 -86
  53. package/dist/tui/commands.d.ts +0 -16
  54. package/dist/tui/commands.js +0 -20
  55. package/dist/tui/index.js +9 -0
  56. package/dist/tui/sse-handler.d.ts +52 -0
  57. package/dist/tui/sse-handler.js +156 -0
  58. package/dist/tui/store.d.ts +54 -3
  59. package/dist/tui/store.js +87 -3
  60. package/dist/tui/utils/history.d.ts +0 -5
  61. package/dist/tui/utils/history.js +0 -17
  62. package/dist/tui/utils/platform.d.ts +0 -11
  63. package/dist/tui/utils/platform.js +0 -37
  64. package/dist/utils/hosts.d.ts +32 -8
  65. package/dist/utils/hosts.js +52 -21
  66. package/dist/workflows/maestro-signals.d.ts +21 -0
  67. package/dist/workflows/maestro-signals.js +19 -1
  68. package/dist/workflows/maestro.js +7 -0
  69. package/dist/workflows/session.js +41 -4
  70. package/package.json +4 -2
  71. package/workflow-bundle.js +68 -6
  72. package/dist/tools/detach.d.ts +0 -4
  73. package/dist/tools/detach.js +0 -45
  74. package/dist/tui/components/CommandOverlay.d.ts +0 -15
  75. package/dist/tui/components/CommandOverlay.js +0 -34
  76. package/dist/tui/components/ScheduleOverlay.d.ts +0 -13
  77. package/dist/tui/components/ScheduleOverlay.js +0 -113
package/CLAUDE.md CHANGED
@@ -76,7 +76,6 @@ src/
76
76
  │ └── helpers.ts # Zod/MCP tool registration wrapper
77
77
  ├── tui/
78
78
  │ ├── App.tsx / store.ts / commands.ts # TUI root, state, slash commands
79
- │ ├── client.ts # Backward-compat shim → src/client/
80
79
  │ ├── components/ # Ink components — see docs/tui.md for inventory
81
80
  │ └── utils/ # format, platform, theme, fullscreen, history
82
81
  ├── utils/
@@ -108,6 +107,13 @@ npm test
108
107
  > surveys or migrations, always grep **both** `test/` and `tests/` or you will miss
109
108
  > mocks and assertions that only live in one directory.
110
109
 
110
+ > **Test-only hooks live with the module they reset and follow the
111
+ > `__<verb><Noun>ForTests` naming convention** — see
112
+ > [docs/adr/0006-test-hooks-naming.md](docs/adr/0006-test-hooks-naming.md). The
113
+ > double-underscore prefix telegraphs "test escape hatch, do not call from
114
+ > production code"; the hook's doc-comment should restate that explicitly. Hooks
115
+ > are never surfaced through barrels or `TempoClient`.
116
+
111
117
  See [docs/development.md](docs/development.md) for full setup (Temporal dev server command,
112
118
  daemon worker notes, `npx ts-node` dev runner).
113
119
 
package/dist/adapters/base.d.ts CHANGED
@@ -14,6 +14,35 @@
14
14
  */
15
15
  import type { Client, WorkflowHandle } from '@temporalio/client';
16
16
  import type { AdapterClass, AdapterDescriptor, AttachmentToken, AttachmentPhase, DetachReason } from '../types';
17
+ /** Snapshot of adapter state included in every telemetry frame. */
18
+ interface AdapterTelemetrySnapshot {
19
+ attachmentId: string | null;
20
+ workflowId: string | null;
21
+ runId: string | null;
22
+ heartbeatsSent: number;
23
+ phaseTicksDone: number;
24
+ }
25
+ /**
26
+ * Build the structured frame emitted by every lifecycle handler. Pure
27
+ * function — exposed for unit tests that don't want to spawn a child
28
+ * process.
29
+ */
30
+ export declare function buildProcessTerminatingFrame(signal: string, errorMessage?: string, snapshot?: AdapterTelemetrySnapshot[]): string;
31
+ /**
32
+ * Install the process-lifecycle telemetry handlers. Idempotent. Skipped
33
+ * by default in test environments (see {@link shouldInstallLifecycleTelemetry}).
34
+ *
35
+ * Production callers (and the first `startV2Lifecycle()` call on any
36
+ * adapter) invoke without arguments. Unit tests pass `{ force: true }`
37
+ * to bypass the env gate.
38
+ */
39
+ export declare function installProcessLifecycleTelemetry(opts?: {
40
+ force?: boolean;
41
+ }): void;
42
+ /** Test-only — uninstall handlers + reset state. */
43
+ export declare function _resetProcessLifecycleTelemetryForTest(): void;
44
+ /** Test-only — direct access to the live-adapter set. */
45
+ export declare function _liveAdaptersForTest(): ReadonlySet<BaseAttachment>;
17
46
  /**
18
47
  * Override bundle for the reconnect loop timing (#201). Production defaults are
19
48
  * tuned for laptop-sleep cycles (15-min elapsed budget, 10s base, 60s cap). Tests
@@ -124,6 +153,13 @@ export declare abstract class BaseAttachment {
124
153
  onPhaseChange(listener: (phase: AttachmentPhase) => void): () => void;
125
154
  /** Subscribe to lease-revocation events (§9.3 split-brain resolution). */
126
155
  onLeaseRevoked(listener: (reason: DetachReason) => void): () => void;
156
+ /**
157
+ * Hypothesis A telemetry — capture the adapter state included in
158
+ * process-lifecycle log frames. Public so the module-level
159
+ * `snapshotLiveAdapters()` helper can read private fields without an
160
+ * `any` cast; consumers other than the telemetry path should not call it.
161
+ */
162
+ _captureTelemetrySnapshot(): AdapterTelemetrySnapshot;
127
163
  /**
128
164
  * Subscribe to terminal events — `WorkflowNotFound` (§9.4) and phase `gone`.
129
165
  * Terminal fires at most once per instance. Subclasses stop delivery + exit.
@@ -216,7 +252,55 @@ export declare abstract class BaseAttachment {
216
252
  * once per terminal — not on every tick.
217
253
  */
218
254
  private findCanSuccessorRunId;
255
+ /**
256
+ * Fire the terminal hook — the adapter is going dark and won't recover.
257
+ *
258
+ * #258: emits a structured log line on every fire so the next post-CAN
259
+ * silence incident is unambiguous in logs. Pre-#258, a `fireTerminal`
260
+ * from an unexpected source (the root cause was a silent destroy from
261
+ * the reconnect-loop pre-check on a transient terminal-class error) was
262
+ * indistinguishable from process death in workflow history — both produced
263
+ * "no further heartbeats." The structured log includes:
264
+ *
265
+ * - `reason` — the existing DetachReason
266
+ * - `callsite` — the calling function or rationale (passed by every
267
+ * callsite so the source is grep-able without parsing stack traces)
268
+ * - `attachmentId` / `workflowId` / `runId` — for cross-referencing
269
+ * against workflow history when bisecting an incident
270
+ * - `heartbeatsSent` / `phaseTicksDone` — the existing #249 counters
271
+ * so an operator can correlate "loop alive at N heartbeats, then
272
+ * terminal fired at this callsite" without external context
273
+ *
274
+ * Idempotent — repeat calls (e.g. reconnect-exhausted re-fires after
275
+ * destroy) early-return without re-logging. The first fire wins.
276
+ */
219
277
  private fireTerminal;
278
+ /**
279
+ * #258 tiebreaker: confirm whether a workflow is genuinely terminal after
280
+ * the reconnect-loop pre-check threw a terminal-class error. Used to
281
+ * distinguish a real workflow-gone state from a transient gRPC /
282
+ * visibility-API blip that classified as terminal.
283
+ *
284
+ * Returns:
285
+ * - `{ kind: 'running', statusName }` — workflow is alive (any
286
+ * non-terminal status). Caller should treat the original error as
287
+ * transient and continue the reconnect loop.
288
+ * - `{ kind: 'terminal', statusName }` — workflow is in a terminal
289
+ * status (`COMPLETED` / `FAILED` / `CANCELLED` / `TERMINATED` /
290
+ * `CONTINUED_AS_NEW` / `TIMED_OUT`). Caller should fire destroy.
291
+ * - `{ kind: 'describe-threw' }` — `describe()` itself failed. Treat
292
+ * as terminal (fire destroy) — consistent with pre-#258 semantics
293
+ * when classification is ambiguous, and avoids spinning forever on
294
+ * a workflow we can't reach.
295
+ * - `{ kind: 'timed-out' }` — `describe()` exceeded
296
+ * {@link DESCRIBE_TIMEOUT_MS}. Treat as terminal (fire destroy) —
297
+ * same rationale: prefer clean shutdown to a hung loop.
298
+ *
299
+ * The unpinned handle follows any CAN chain to the latest run, so
300
+ * `desc.status.name === 'CONTINUED_AS_NEW'` here means the workflow
301
+ * id itself is closed (no successor) — genuinely terminal.
302
+ */
303
+ private confirmWorkflowTerminal;
220
304
  /**
221
305
  * Opt-in reconnect policy. Default: return `false` — the base class behaves
222
306
  * exactly as it did before #201 (fire terminal, tear down). Subclasses that
package/dist/adapters/base.js CHANGED
@@ -1,9 +1,174 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.AdapterRegistry = exports.BaseAttachment = void 0;
4
+ exports.buildProcessTerminatingFrame = buildProcessTerminatingFrame;
5
+ exports.installProcessLifecycleTelemetry = installProcessLifecycleTelemetry;
6
+ exports._resetProcessLifecycleTelemetryForTest = _resetProcessLifecycleTelemetryForTest;
7
+ exports._liveAdaptersForTest = _liveAdaptersForTest;
4
8
  const signals_1 = require("../workflows/signals");
5
9
  const terminal_error_1 = require("./terminal-error");
6
10
  const log = (...args) => console.error('[claude-tempo:adapter]', ...args);
11
+ // ── Hypothesis A telemetry (#258 follow-up) ─────────────────────────────
12
+ //
13
+ // The structured `terminal fire` log shipped in #258 made the next
14
+ // adapter-silence incident self-describing — but only for cases where
15
+ // `fireTerminal` actually fires. Hypothesis A (process death — crash, OOM,
16
+ // Windows sleep, terminal close, SIGKILL) wouldn't produce that log
17
+ // because the process never reached the code path. The handlers below
18
+ // close that gap: a future #258 recurrence with no `fireTerminal` log AND
19
+ // no `adapter-process-terminating` log narrows to a distinct hypothesis
20
+ // (likely SIGKILL / abrupt OS termination — file separately).
21
+ //
22
+ // Design tenets:
23
+ // - **Idempotent registration**: a module-level boolean ensures multiple
24
+ // adapter instances spawning in the same process never double-register
25
+ // handlers. Repeated `installProcessLifecycleTelemetry()` calls no-op.
26
+ // - **Additive only**: every `process.on(...)` call appends; nothing
27
+ // calls `removeAllListeners`. Coexists with the test-cleanup chain in
28
+ // `test/helpers.ts` (#312) and the daemon's own SIGTERM/SIGINT
29
+ // shutdown function.
30
+ // - **Synchronous logging on terminal signals**: process termination
31
+ // doesn't await async log flushes. `console.error` to stderr is
32
+ // synchronous on POSIX + Windows, which is enough.
33
+ // - **No behavior change on uncaughtException**: we register
34
+ // `uncaughtExceptionMonitor` (Node 13.7+) to telemeter without
35
+ // suppressing Node's default crash. No `uncaughtException` fallback is
36
+ // installed: the package's `engines` requirement (Node 20+) guarantees
37
+ // the monitor event is available, so the default crash is never swallowed.
38
+ // - **Test gating**: mocha defines `it` globally (vitest with
39
+ // `globals: false` does not). Skip auto-install whenever the test
40
+ // framework signal is present so we don't fight the existing zombie
41
+ // reap in `test/helpers.ts`. The unit tests for these handlers spawn
42
+ // a dedicated child Node process where the gate doesn't fire.
43
+ /**
44
+ * Live adapters in this process. Populated by `startV2Lifecycle` after a
45
+ * successful claim; an adapter is removed on `stopV2Lifecycle` and `fireTerminal`.
46
+ * Each lifecycle handler iterates this set to build the per-adapter
47
+ * snapshot in the structured log.
48
+ */
49
+ const liveAdapters = new Set();
50
+ let processLifecycleTelemetryInstalled = false;
51
+ let processLifecycleHandlerRefs = [];
52
+ /**
53
+ * Should `installProcessLifecycleTelemetry()` actually wire up handlers?
54
+ *
55
+ * - Forced on by `CLAUDE_TEMPO_LIFECYCLE_TELEMETRY=1` or `=true` (used by the
56
+ * child-process tests for these handlers — see
57
+ * `test/adapter-process-lifecycle-telemetry.test.ts`).
58
+ * - Forced off by `CLAUDE_TEMPO_LIFECYCLE_TELEMETRY=0` or `=false`.
59
+ * - Off when running under mocha (detected via `globalThis.it` —
60
+ * mocha defines this; vitest with `globals: false` does not).
61
+ * - Off when `NODE_ENV === 'test'` — belt and suspenders.
62
+ * - Otherwise on.
63
+ */
64
+ function shouldInstallLifecycleTelemetry(force) {
65
+ if (force)
66
+ return true;
67
+ const flag = process.env.CLAUDE_TEMPO_LIFECYCLE_TELEMETRY;
68
+ if (flag === '1' || flag === 'true')
69
+ return true;
70
+ if (flag === '0' || flag === 'false')
71
+ return false;
72
+ // Mocha exposes BDD globals (`it`, `describe`, …) on the global object;
73
+ // our vitest config opts out of globals so it doesn't trigger this gate.
74
+ if (typeof globalThis.it === 'function')
75
+ return false;
76
+ if (process.env.NODE_ENV === 'test')
77
+ return false;
78
+ return true;
79
+ }
80
+ function snapshotLiveAdapters() {
81
+ const out = [];
82
+ for (const adapter of liveAdapters) {
83
+ out.push(adapter._captureTelemetrySnapshot());
84
+ }
85
+ return out;
86
+ }
87
+ /**
88
+ * Build the structured frame emitted by every lifecycle handler. Pure
89
+ * function — exposed for unit tests that don't want to spawn a child
90
+ * process.
91
+ */
92
+ function buildProcessTerminatingFrame(signal, errorMessage, snapshot = snapshotLiveAdapters()) {
93
+ return JSON.stringify({
94
+ event: 'adapter-process-terminating',
95
+ signal,
96
+ ...(errorMessage !== undefined ? { errorMessage } : {}),
97
+ adapterCount: snapshot.length,
98
+ adapters: snapshot,
99
+ });
100
+ }
101
+ function emitTerminatingLog(signal, errorMessage) {
102
+ // `console.error` synchronously writes to stderr on POSIX + Windows.
103
+ // The `[claude-tempo:adapter]` prefix matches the rest of the adapter
104
+ // logs so a single grep surfaces both the existing `terminal fire`
105
+ // line and these new lifecycle lines for the same incident.
106
+ log(`adapter-process-terminating: ${buildProcessTerminatingFrame(signal, errorMessage)}`);
107
+ }
108
+ /**
109
+ * Install the process-lifecycle telemetry handlers. Idempotent. Skipped
110
+ * by default in test environments (see {@link shouldInstallLifecycleTelemetry}).
111
+ *
112
+ * Production callers (and the first `startV2Lifecycle()` call on any
113
+ * adapter) invoke without arguments. Unit tests pass `{ force: true }`
114
+ * to bypass the env gate.
115
+ */
116
+ function installProcessLifecycleTelemetry(opts = {}) {
117
+ if (processLifecycleTelemetryInstalled)
118
+ return;
119
+ if (!shouldInstallLifecycleTelemetry(opts.force === true))
120
+ return;
121
+ processLifecycleTelemetryInstalled = true;
122
+ const handlers = [];
123
+ // `exit` — synchronous, last chance. Don't do async work; just log.
124
+ const onExit = () => emitTerminatingLog('exit');
125
+ process.on('exit', onExit);
126
+ handlers.push({ event: 'exit', handler: onExit });
127
+ // `beforeExit` — event loop is empty but Node hasn't exited yet.
128
+ const onBeforeExit = () => emitTerminatingLog('beforeExit');
129
+ process.on('beforeExit', onBeforeExit);
130
+ handlers.push({ event: 'beforeExit', handler: onBeforeExit });
131
+ // SIGTERM — graceful termination request (kill, supervisord, systemd).
132
+ const onSigterm = () => emitTerminatingLog('SIGTERM');
133
+ process.on('SIGTERM', onSigterm);
134
+ handlers.push({ event: 'SIGTERM', handler: onSigterm });
135
+ // SIGINT — Ctrl+C from a controlling terminal.
136
+ const onSigint = () => emitTerminatingLog('SIGINT');
137
+ process.on('SIGINT', onSigint);
138
+ handlers.push({ event: 'SIGINT', handler: onSigint });
139
+ // `uncaughtExceptionMonitor` lets us telemeter without suppressing
140
+ // Node's default crash behavior. The default action runs unchanged:
141
+ // print stack, exit non-zero. The codebase's `engines` requirement
142
+ // (Node 20+) guarantees this event is available — Node 13.7+.
143
+ const onUncaughtMonitor = (err) => {
144
+ emitTerminatingLog('uncaughtException', err instanceof Error ? err.message : String(err));
145
+ };
146
+ process.on('uncaughtExceptionMonitor', onUncaughtMonitor);
147
+ handlers.push({ event: 'uncaughtExceptionMonitor', handler: onUncaughtMonitor });
148
+ // `unhandledRejection` — log only. Adding a listener prevents Node's
149
+ // default crash on unhandled promise rejections (Node 15+); that's
150
+ // intentional per the brief ("log + don't crash").
151
+ const onUnhandled = (reason) => {
152
+ const msg = reason instanceof Error ? reason.message : String(reason);
153
+ emitTerminatingLog('unhandledRejection', msg);
154
+ };
155
+ process.on('unhandledRejection', onUnhandled);
156
+ handlers.push({ event: 'unhandledRejection', handler: onUnhandled });
157
+ processLifecycleHandlerRefs = handlers;
158
+ }
159
+ /** Test-only — uninstall handlers + reset state. */
160
+ function _resetProcessLifecycleTelemetryForTest() {
161
+ for (const { event, handler } of processLifecycleHandlerRefs) {
162
+ process.off(event, handler);
163
+ }
164
+ processLifecycleHandlerRefs = [];
165
+ processLifecycleTelemetryInstalled = false;
166
+ liveAdapters.clear();
167
+ }
168
+ /** Test-only — direct access to the live-adapter set. */
169
+ function _liveAdaptersForTest() {
170
+ return liveAdapters;
171
+ }
7
172
  /** Backoff tuning for the heartbeat + phase-watcher loops on transient errors. */
8
173
  const LOOP_BACKOFF_FACTOR = 1.5;
9
174
  const LOOP_BACKOFF_MAX_MS = 30_000;
@@ -26,6 +191,31 @@ const RECONNECT_TOTAL_BUDGET_MS = 15 * 60_000;
26
191
  const RECONNECT_BASE_MS = 10_000;
27
192
  const RECONNECT_MAX_MS = 60_000;
28
193
  const RECONNECT_BACKOFF_FACTOR = 1.5;
194
+ /**
195
+ * #258: tiebreaker timeout for the `describe()` confirmation that gates
196
+ * `fireTerminal('destroy')` from the reconnect-loop pre-check. The Temporal
197
+ * SDK's per-call default is conservative (10s+); we'd rather conclude
198
+ * "describe is hung, treat as terminal" in 3s than freeze the reconnect
199
+ * loop on a slow visibility-API call.
200
+ */
201
+ const DESCRIBE_TIMEOUT_MS = 3_000;
202
+ /**
203
+ * Workflow execution statuses that are unambiguously terminal — used by
204
+ * the #258 `describe()` tiebreaker to decide whether a transient
205
+ * pre-check error reflects a genuinely-gone workflow (fire destroy) or a
206
+ * transient blip (continue the loop). Anything not in this set, including
207
+ * `RUNNING`, `PAUSED`, `UNSPECIFIED`, and `UNKNOWN`, is treated as
208
+ * non-terminal — conservatively keeps the loop alive when classification
209
+ * is ambiguous.
210
+ */
211
+ const TERMINAL_WORKFLOW_STATUSES = new Set([
212
+ 'COMPLETED',
213
+ 'FAILED',
214
+ 'CANCELLED',
215
+ 'TERMINATED',
216
+ 'CONTINUED_AS_NEW',
217
+ 'TIMED_OUT',
218
+ ]);
29
219
  /**
30
220
  * Abstract base class for session adapters.
31
221
  *
@@ -142,6 +332,21 @@ class BaseAttachment {
142
332
  this.leaseRevokedListeners.splice(i, 1);
143
333
  };
144
334
  }
335
+ /**
336
+ * Hypothesis A telemetry — capture the adapter state included in
337
+ * process-lifecycle log frames. Public so the module-level
338
+ * `snapshotLiveAdapters()` helper can read private fields without an
339
+ * `any` cast; consumers other than the telemetry path should not call it.
340
+ */
341
+ _captureTelemetrySnapshot() {
342
+ return {
343
+ attachmentId: this.token?.attachmentId ?? null,
344
+ workflowId: this.pinnedHandle?.workflowId ?? null,
345
+ runId: this.token?.runId ?? null,
346
+ heartbeatsSent: this.heartbeatsSent,
347
+ phaseTicksDone: this.phaseTicksDone,
348
+ };
349
+ }
145
350
  /**
146
351
  * Subscribe to terminal events — `WorkflowNotFound` (§9.4) and phase `gone`.
147
352
  * Terminal fires at most once per instance. Subclasses stop delivery + exit.
@@ -187,6 +392,13 @@ class BaseAttachment {
187
392
  }],
188
393
  });
189
394
  this.pinnedHandle = this.client.workflow.getHandle(workflowId, this.token.runId);
395
+ // Hypothesis A telemetry — register this adapter so a future process-
396
+ // lifecycle handler (exit / SIGTERM / uncaughtException / …) can
397
+ // include its state in the structured log. `installProcessLifecycleTelemetry`
398
+ // is idempotent + env-gated; first call wires the handlers, subsequent
399
+ // calls no-op.
400
+ liveAdapters.add(this);
401
+ installProcessLifecycleTelemetry();
190
402
  // #249: reset the per-attachment diagnostic counters so the next tick emits
191
403
  // `heartbeat#1 delivered` on the freshly live lease. Without this reset a
192
404
  // renewal path (e.g. restart → renewed claim) would never re-log first-heartbeat.
@@ -211,6 +423,10 @@ class BaseAttachment {
211
423
  if (this.stopped)
212
424
  return;
213
425
  this.stopped = true;
426
+ // Hypothesis A telemetry — keep `liveAdapters` accurate so a subsequent
427
+ // process-lifecycle handler firing after stop doesn't include an
428
+ // already-torn-down adapter in its frame.
429
+ liveAdapters.delete(this);
214
430
  if (this.heartbeatTimer) {
215
431
  clearTimeout(this.heartbeatTimer);
216
432
  this.heartbeatTimer = null;
@@ -416,7 +632,7 @@ class BaseAttachment {
416
632
  }
417
633
  // Phase `gone` is terminal — workflow destroyed. Never recoverable.
418
634
  if (info.phase === 'gone') {
419
- this.fireTerminal('destroy');
635
+ this.fireTerminal('destroy', 'tickPhaseWatcher:phase-gone');
420
636
  return;
421
637
  }
422
638
  }
@@ -460,7 +676,7 @@ class BaseAttachment {
460
676
  }
461
677
  // No CAN event in the closed run's history → truly terminal (COMPLETED /
462
678
  // TERMINATED / FAILED / workflow-id GC'd).
463
- this.fireTerminal('destroy');
679
+ this.fireTerminal('destroy', 'handleRunEndError:no-can-successor');
464
680
  return true;
465
681
  }
466
682
  /**
@@ -491,11 +707,44 @@ class BaseAttachment {
491
707
  return null;
492
708
  }
493
709
  }
494
- fireTerminal(reason) {
710
+ /**
711
+ * Fire the terminal hook — the adapter is going dark and won't recover.
712
+ *
713
+ * #258: emits a structured log line on every fire so the next post-CAN
714
+ * silence incident is unambiguous in logs. Pre-#258, a `fireTerminal`
715
+ * from an unexpected source (the root cause was a silent destroy from
716
+ * the reconnect-loop pre-check on a transient terminal-class error) was
717
+ * indistinguishable from process death in workflow history — both produced
718
+ * "no further heartbeats." The structured log includes:
719
+ *
720
+ * - `reason` — the existing DetachReason
721
+ * - `callsite` — the calling function or rationale (passed by every
722
+ * callsite so the source is grep-able without parsing stack traces)
723
+ * - `attachmentId` / `workflowId` / `runId` — for cross-referencing
724
+ * against workflow history when bisecting an incident
725
+ * - `heartbeatsSent` / `phaseTicksDone` — the existing #249 counters
726
+ * so an operator can correlate "loop alive at N heartbeats, then
727
+ * terminal fired at this callsite" without external context
728
+ *
729
+ * Idempotent — repeat calls (e.g. reconnect-exhausted re-fires after
730
+ * destroy) early-return without re-logging. The first fire wins.
731
+ */
732
+ fireTerminal(reason, callsite = 'unspecified') {
495
733
  if (this.terminalFired)
496
734
  return;
497
735
  this.terminalFired = true;
498
736
  this.stopped = true;
737
+ // Hypothesis A telemetry — same reasoning as `stopV2Lifecycle`.
738
+ liveAdapters.delete(this);
739
+ log(`terminal fire:`, JSON.stringify({
740
+ reason,
741
+ callsite,
742
+ attachmentId: this.token?.attachmentId ?? null,
743
+ workflowId: this.pinnedHandle?.workflowId ?? null,
744
+ runId: this.token?.runId ?? null,
745
+ heartbeatsSent: this.heartbeatsSent,
746
+ phaseTicksDone: this.phaseTicksDone,
747
+ }));
499
748
  if (this.heartbeatTimer) {
500
749
  clearTimeout(this.heartbeatTimer);
501
750
  this.heartbeatTimer = null;
@@ -514,6 +763,56 @@ class BaseAttachment {
514
763
  }
515
764
  }
516
765
  }
766
+ /**
767
+ * #258 tiebreaker: confirm whether a workflow is genuinely terminal after
768
+ * the reconnect-loop pre-check threw a terminal-class error. Used to
769
+ * distinguish a real workflow-gone state from a transient gRPC /
770
+ * visibility-API blip that classified as terminal.
771
+ *
772
+ * Returns:
773
+ * - `{ kind: 'running', statusName }` — workflow is alive (any
774
+ * non-terminal status). Caller should treat the original error as
775
+ * transient and continue the reconnect loop.
776
+ * - `{ kind: 'terminal', statusName }` — workflow is in a terminal
777
+ * status (`COMPLETED` / `FAILED` / `CANCELLED` / `TERMINATED` /
778
+ * `CONTINUED_AS_NEW` / `TIMED_OUT`). Caller should fire destroy.
779
+ * - `{ kind: 'describe-threw' }` — `describe()` itself failed. Treat
780
+ * as terminal (fire destroy) — consistent with pre-#258 semantics
781
+ * when classification is ambiguous, and avoids spinning forever on
782
+ * a workflow we can't reach.
783
+ * - `{ kind: 'timed-out' }` — `describe()` exceeded
784
+ * {@link DESCRIBE_TIMEOUT_MS}. Treat as terminal (fire destroy) —
785
+ * same rationale: prefer clean shutdown to a hung loop.
786
+ *
787
+ * The unpinned handle follows any CAN chain to the latest run, so
788
+ * `desc.status.name === 'CONTINUED_AS_NEW'` here means the workflow
789
+ * id itself is closed (no successor) — genuinely terminal.
790
+ */
791
+ async confirmWorkflowTerminal(unpinned) {
792
+ let timer = null;
793
+ try {
794
+ const desc = await Promise.race([
795
+ unpinned.describe(),
796
+ new Promise((resolve) => {
797
+ timer = setTimeout(() => resolve('timeout'), DESCRIBE_TIMEOUT_MS);
798
+ }),
799
+ ]);
800
+ if (desc === 'timeout')
801
+ return { kind: 'timed-out' };
802
+ const statusName = desc.status?.name ?? 'UNKNOWN';
803
+ if (TERMINAL_WORKFLOW_STATUSES.has(statusName)) {
804
+ return { kind: 'terminal', statusName };
805
+ }
806
+ return { kind: 'running', statusName };
807
+ }
808
+ catch {
809
+ return { kind: 'describe-threw' };
810
+ }
811
+ finally {
812
+ if (timer)
813
+ clearTimeout(timer);
814
+ }
815
+ }
517
816
  // ───────────────────────────────────────────────────────────────────────────
518
817
  // #201 reconnect machinery. Subclasses opt in by overriding `shouldReconnect`.
519
818
  // ───────────────────────────────────────────────────────────────────────────
@@ -605,7 +904,7 @@ class BaseAttachment {
605
904
  if (this.stopped || this.terminalFired || this.reconnecting)
606
905
  return;
607
906
  if (!this.shouldReconnect(reason)) {
608
- this.fireTerminal(reason);
907
+ this.fireTerminal(reason, 'fireTerminalOrReconnect:not-recoverable');
609
908
  return;
610
909
  }
611
910
  // Pause the heartbeat + watcher loops for the duration of the reconnect.
@@ -627,14 +926,14 @@ class BaseAttachment {
627
926
  void this.runCanRebind(canSuccessorRunId).catch((err) => {
628
927
  log(`CAN rebind crashed:`, err?.message ?? err);
629
928
  this.reconnecting = false;
630
- this.fireTerminal('reconnect-exhausted');
929
+ this.fireTerminal('reconnect-exhausted', 'runCanRebind:crashed');
631
930
  });
632
931
  return;
633
932
  }
634
933
  void this.runReconnectLoop(reason).catch((err) => {
635
934
  log(`reconnect loop crashed:`, err?.message ?? err);
636
935
  this.reconnecting = false;
637
- this.fireTerminal('reconnect-exhausted');
936
+ this.fireTerminal('reconnect-exhausted', 'runReconnectLoop:crashed');
638
937
  });
639
938
  }
640
939
  /**
@@ -658,7 +957,7 @@ class BaseAttachment {
658
957
  try {
659
958
  if (!this.client || !this.pinnedHandle || !this.token) {
660
959
  log('runCanRebind: missing client/handle/token — firing terminal');
661
- this.fireTerminal('reconnect-exhausted');
960
+ this.fireTerminal('reconnect-exhausted', 'runCanRebind:missing-state');
662
961
  return;
663
962
  }
664
963
  const workflowId = this.pinnedHandle.workflowId;
@@ -735,7 +1034,7 @@ class BaseAttachment {
735
1034
  try {
736
1035
  if (!this.client || !this.host || !this.token || !this.pinnedHandle) {
737
1036
  log('runReconnectLoop: missing client/host/token/handle — aborting');
738
- this.fireTerminal('reconnect-exhausted');
1037
+ this.fireTerminal('reconnect-exhausted', 'runReconnectLoop:missing-state');
739
1038
  return;
740
1039
  }
741
1040
  const workflowId = this.pinnedHandle.workflowId;
@@ -774,14 +1073,40 @@ class BaseAttachment {
774
1073
  }
775
1074
  catch (err) {
776
1075
  if ((0, terminal_error_1.isTerminalWorkflowError)(err)) {
777
- // #226: either terminal kind inside the reconnect loop's pre-check
778
- // ends the loop. We don't recurse into another CAN rebind here —
779
- // that path is only for the pinned-handle tick ticks where we can
780
- // read the closed run's history. Inside the loop the unpinned
781
- // query has already followed any CAN chain, so a gone error means
782
- // the workflow id is truly absent.
783
- log('reconnect: workflow gone during pre-check');
784
- this.fireTerminal('destroy');
1076
+ // #258: ONE terminal-class pre-check error is not enough evidence
1077
+ // to destroy the adapter. The classifier matches phrasings
1078
+ // (`WorkflowNotFound`, `NOT_FOUND`, "workflow execution already
1079
+ // completed") that can ALSO surface from transient gRPC blips and
1080
+ // momentary visibility-API hiccups. Pre-#258, this branch fired
1081
+ // `fireTerminal('destroy')` immediately, so a single transient
1082
+ // error orphaned the adapter for the rest of the session
1083
+ // (heartbeat + watcher dead via `terminalFired`, poller torn
1084
+ // down by `onReconnectStart` + `onTerminal` listener).
1085
+ //
1086
+ // Tiebreaker: confirm with `describe()` against the same unpinned
1087
+ // handle. If the workflow is genuinely gone, `describe()` will
1088
+ // either return a closed status (COMPLETED/TERMINATED/...) or
1089
+ // itself throw — fire destroy with confidence. If it returns
1090
+ // RUNNING (or any non-terminal status), the original error was
1091
+ // transient — log and continue the loop. Bounded by
1092
+ // `DESCRIBE_TIMEOUT_MS` so a slow visibility-API call can't hang
1093
+ // the reconnect path indefinitely.
1094
+ const errClass = err?.name ?? 'unknown';
1095
+ const errMsg = err?.message ?? String(err);
1096
+ const tiebreak = await this.confirmWorkflowTerminal(unpinned);
1097
+ if (tiebreak.kind === 'running') {
1098
+ log(`reconnect: pre-check threw ${errClass} but describe() shows ` +
1099
+ `${tiebreak.statusName} — treating as transient, continuing loop ` +
1100
+ `(originalError="${errMsg}")`);
1101
+ backoff = Math.min(backoff * this.reconnectBackoffFactor, this.reconnectMaxMs);
1102
+ continue;
1103
+ }
1104
+ const confirmDesc = tiebreak.kind === 'terminal'
1105
+ ? `describe() confirmed ${tiebreak.statusName}`
1106
+ : `describe() ${tiebreak.kind === 'describe-threw' ? 'threw' : 'timed out'}`;
1107
+ log(`reconnect: pre-check terminal (${errClass}) and ${confirmDesc} — firing destroy ` +
1108
+ `(originalError="${errMsg}")`);
1109
+ this.fireTerminal('destroy', 'runReconnectLoop:precheck-terminal-confirmed');
785
1110
  return;
786
1111
  }
787
1112
  backoff = Math.min(backoff * this.reconnectBackoffFactor, this.reconnectMaxMs);
@@ -790,12 +1115,12 @@ class BaseAttachment {
790
1115
  }
791
1116
  if (info.phase === 'gone') {
792
1117
  log('reconnect: phase=gone — giving up');
793
- this.fireTerminal('destroy');
1118
+ this.fireTerminal('destroy', 'runReconnectLoop:phase-gone');
794
1119
  return;
795
1120
  }
796
1121
  if (info.currentAttachment && info.currentAttachment.attachmentId !== oldAttachmentId) {
797
1122
  log(`reconnect: another adapter holds the lease (${info.currentAttachment.attachmentId}) — bailing`);
798
- this.fireTerminal('superseded');
1123
+ this.fireTerminal('superseded', 'runReconnectLoop:other-holder');
799
1124
  return;
800
1125
  }
801
1126
  if (info.phase === 'draining') {
@@ -849,7 +1174,7 @@ class BaseAttachment {
849
1174
  catch (err) {
850
1175
  if ((0, terminal_error_1.isTerminalWorkflowError)(err)) {
851
1176
  log('reconnect: workflow gone during claim');
852
- this.fireTerminal('destroy');
1177
+ this.fireTerminal('destroy', 'runReconnectLoop:claim-terminal');
853
1178
  return;
854
1179
  }
855
1180
  backoff = Math.min(backoff * this.reconnectBackoffFactor, this.reconnectMaxMs);
@@ -858,7 +1183,7 @@ class BaseAttachment {
858
1183
  }
859
1184
  // Budget exhausted — give up cleanly.
860
1185
  log(`reconnect budget exhausted after ${attempt} attempt(s)`);
861
- this.fireTerminal('reconnect-exhausted');
1186
+ this.fireTerminal('reconnect-exhausted', 'runReconnectLoop:budget-exhausted');
862
1187
  }
863
1188
  finally {
864
1189
  // Guarantee state reset regardless of which path we exited on. Safe to
package/dist/cli/daemon-command.d.ts CHANGED
@@ -45,3 +45,7 @@ export interface DaemonOpts extends CliOverrides {
45
45
  force?: boolean;
46
46
  }
47
47
  export declare function daemon(opts: DaemonOpts): Promise<void>;
48
+ /** Pretty-print a byte count as MB (rounded). Pure helper, exported for tests. */
49
+ export declare function formatBytesAsMb(n: unknown): string;
50
+ /** Pretty-print a millisecond uptime as `Xh Ym Zs`. Pure helper, exported for tests. */
51
+ export declare function formatUptime(ms: unknown): string;