agent-tempo 1.7.0-beta.11 → 1.7.0-beta.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "agent-tempo-dashboard",
3
3
  "private": true,
4
- "version": "1.7.0-beta.11",
4
+ "version": "1.7.0-beta.13",
5
5
  "type": "module",
6
6
  "description": "Web dashboard for agent-tempo. Bundled into the npm package; served by the daemon at /dashboard/*.",
7
7
  "scripts": {
@@ -46,6 +46,7 @@ const git_info_1 = require("../git-info");
46
46
  const spawn_1 = require("../spawn");
47
47
  const config_2 = require("../config");
48
48
  const resolve_1 = require("./resolve");
49
+ const visibility_deadline_1 = require("../utils/visibility-deadline");
49
50
  const action_counters_1 = require("../utils/action-counters");
50
51
  const search_attributes_1 = require("../utils/search-attributes");
51
52
  const agent_types_1 = require("../ensemble/agent-types");
@@ -86,6 +87,15 @@ function isRetryableTemporalError(err) {
86
87
  // to call unconditionally.
87
88
  if (err instanceof activity_1.ApplicationFailure)
88
89
  return false;
90
+ // #845 Mode A: a truncated visibility scan (resolveSession's deadline
91
+ // tripped mid-scan, #336/#529) is a LATENCY failure — the target may well
92
+ // exist; the scan just didn't finish. Treat it as transient so the
93
+ // activity retry policy re-runs resolveSession with a fresh 10s deadline
94
+ // (backoff-bounded) instead of collapsing it into a permanent
95
+ // "No active session found". This must NOT re-add an in-resolver retry
96
+ // loop — the bounding lives in Temporal's policy, not a hot-path scan.
97
+ if ((0, visibility_deadline_1.isVisibilityTimeout)(err))
98
+ return true;
89
99
  const e = err;
90
100
  const name = e?.name ?? '';
91
101
  const msg = e?.message ?? '';
@@ -1,5 +1,13 @@
1
1
  import { Client, WorkflowHandle } from '@temporalio/client';
2
2
  import { AttachmentPhase } from '../types';
3
+ /**
4
+ * Mode-B describe-by-id timeout (#845). The strongly-consistent
5
+ * `describe()` fallback on `resolveSession`'s not-found branch is a single
6
+ * O(1) RPC; 2s mirrors {@link DEFAULT_QUERY_TIMEOUT_MS} — two orders of
7
+ * magnitude over a healthy describe — so a wedged frontend can't re-hang
8
+ * the outbox loop the visibility deadline (#336/#529) was added to bound.
9
+ */
10
+ export declare const RESOLVE_DESCRIBE_TIMEOUT_MS = 2000;
3
11
  /** Shared query for listing running session workflows. Exported for the
4
12
  * ensemble-scoped variants in `client/core.ts` (#751). */
5
13
  export declare const SESSION_LIST_QUERY = "WorkflowType = \"agentSessionWorkflow\" AND ExecutionStatus = \"Running\"";
@@ -18,15 +26,36 @@ export declare const SESSION_LIST_QUERY = "WorkflowType = \"agentSessionWorkflow
18
26
  * in `scanEnsembleSessionsCloud`. Enforced by
19
27
  * tests/conformance/decision-path-fence.test.ts.
20
28
  *
21
- * **Deadline (#336/#529):** the visibility iterator is bounded by
22
- * `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On timeout,
23
- * throws `VisibilityIteratorTimeoutError` rather than returning `null`
24
- * — silent `null` on a partially-scanned set would be indistinguishable
25
- * from "definitely not found," producing false "Player not found" errors
26
- * upstream. Every existing caller wraps this in a try/catch (outbox
27
- * activities, MCP tools' `defineTool` helper, CLI dev-verbs); the throw
28
- * propagates as a retryable / user-visible "lookup timed out" rather
29
- * than the misleading "player not found."
29
+ * **Mode A — deadline truncation (#336/#529):** the visibility iterator is
30
+ * bounded by `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On
31
+ * timeout it throws `VisibilityIteratorTimeoutError` rather than returning
32
+ * `null` — silent `null` on a partially-scanned set would be
33
+ * indistinguishable from "definitely not found." The throw is classified
34
+ * **retryable** by the outbox activity (`isRetryableTemporalError`), so
35
+ * Temporal's activity retry policy re-runs the lookup with a fresh
36
+ * deadline rather than collapsing it to a permanent "player not found."
37
+ * Synchronous tool/CLI callers surface it as a distinct "resolution
38
+ * incomplete — retry," never "not found."
39
+ *
40
+ * **Mode B — visibility-index lag (#845):** `list()` can complete normally
41
+ * (no throw) yet miss a freshly-started workflow because the visibility
42
+ * index trails the workflow store (observed live as a 3/8→8/8 roster
43
+ * during post-restart worker warmup). An early-exhausting scan is NOT
44
+ * proof of absence. So on the not-found branch we do **exactly one**
45
+ * strongly-consistent `describe()` against the *derived* workflow id —
46
+ * an O(1) read by primary key that bypasses the lagging index. This is a
47
+ * point lookup, NOT a re-scan: it cannot re-introduce the unbounded-scan
48
+ * hang the deadline guard was added to prevent.
49
+ *
50
+ * **Documented Mode-B limitation:** the derived id
51
+ * `agent-session-{ensemble}-{playerName}` is minted from a player's
52
+ * INITIAL name at spawn; `set_name` does not change the workflow id. So
53
+ * describe-by-derived-id false-negatives for a player that was both
54
+ * RENAMED and is currently index-lagged — it falls back to `null` (looks
55
+ * absent) for that narrow intersection. Accepted by design: it closes the
56
+ * gap for the cold-boot/warmup incident class (nobody renames mid-boot),
57
+ * and a second full re-scan to cover renamed∩lagged would put scan cost on
58
+ * every genuine typo'd-name lookup. See issue #845.
30
59
  */
31
60
  export declare function resolveSession(client: Client, ensemble: string, playerName: string): Promise<WorkflowHandle | null>;
32
61
  /** Info returned for each session by scanEnsembleSessions. */
@@ -82,19 +111,55 @@ export interface EnsembleSessionInfo {
82
111
  */
83
112
  export declare function scanEnsembleSessionsCloud(client: Client, ensemble: string, log?: (...args: unknown[]) => void): Promise<EnsembleSessionInfo[]>;
84
113
  /**
85
- * Scan all running session workflows in an ensemble.
86
- * Returns metadata + part for each session. Shared by the ensemble MCP tool
87
- * and the Maestro refresh activity.
114
+ * Result of {@link scanEnsembleSessionsWithStatus} — the session rows plus
115
+ * whether the visibility scan completed or was cut short (#845).
116
+ *
117
+ * `truncated` is the Mode-A signal: a `VisibilityIteratorTimeoutError`
118
+ * fired (the wall-clock deadline tripped mid-scan), so `sessions` is a
119
+ * partial snapshot, NOT the full roster. Callers that render a roster (the
120
+ * `ensemble` tool) MUST surface this so a partial set is never mistaken
121
+ * for a complete one. NOTE: this does NOT cover Mode B (visibility-index
122
+ * lag) — there the scan completes normally and `truncated` is `false` even
123
+ * though a freshly-started workflow may be missing; that's best-effort by
124
+ * design and self-heals on the next tick.
125
+ *
126
+ * `scanned` is the number of running workflows the iterator visited before
127
+ * completing or timing out — useful for warn logs ("partial: 3 of ≥N").
128
+ */
129
+ export interface EnsembleScanResult {
130
+ sessions: EnsembleSessionInfo[];
131
+ truncated: boolean;
132
+ scanned: number;
133
+ }
134
+ /**
135
+ * Scan all running session workflows in an ensemble, reporting whether the
136
+ * scan completed or was truncated by the visibility deadline (#845).
137
+ *
138
+ * This is the single source of truth for the local-profile ensemble scan;
139
+ * {@link scanEnsembleSessions} is a thin array-facade over it that drops
140
+ * the status fields for the many callers that don't need them.
88
141
  *
89
142
  * **Deadline (#336/#529):** the iterator is bounded by
90
- * `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On
91
- * timeout, returns the partial result accumulated so far and emits a
92
- * warn log. This site is **partial-tolerant by design** the caller
93
- * (maestro refresh, ensemble MCP tool) treats the result as a
94
- * best-effort snapshot that the next tick / re-invocation will fill in.
143
+ * `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On timeout
144
+ * the accumulated rows are returned with `truncated: true` and a warn log
145
+ * — the scan is **partial-tolerant by design**, but the truncation is now
146
+ * SIGNALLED rather than silent so a roster renderer can flag it.
95
147
  *
96
148
  * T0.1 (#748): this legacy shape is the `costProfile: 'local'` path —
97
- * byte-identical to pre-#748 behavior. The cloud profile uses
149
+ * byte-identical row data to pre-#748 behavior. The cloud profile uses
98
150
  * {@link scanEnsembleSessionsCloud}.
99
151
  */
152
+ export declare function scanEnsembleSessionsWithStatus(client: Client, ensemble: string, log?: (...args: unknown[]) => void): Promise<EnsembleScanResult>;
153
+ /**
154
+ * Scan all running session workflows in an ensemble — array facade over
155
+ * {@link scanEnsembleSessionsWithStatus}.
156
+ *
157
+ * Returns just the session rows; the truncation/scan-status fields are
158
+ * dropped. This is the byte-identical shape the maestro refresh activity,
159
+ * the #785 upgrade-snapshot, and the other roster consumers already depend
160
+ * on — keeping it a thin delegate means the truncation-signalling work
161
+ * (#845) does NOT ripple through those call sites. Callers that need to
162
+ * know whether the scan was complete (the `ensemble` tool) call the rich
163
+ * sibling directly.
164
+ */
100
165
  export declare function scanEnsembleSessions(client: Client, ensemble: string, log?: (...args: unknown[]) => void): Promise<EnsembleSessionInfo[]>;
@@ -1,13 +1,23 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.SESSION_LIST_QUERY = void 0;
3
+ exports.SESSION_LIST_QUERY = exports.RESOLVE_DESCRIBE_TIMEOUT_MS = void 0;
4
4
  exports.resolveSession = resolveSession;
5
5
  exports.scanEnsembleSessionsCloud = scanEnsembleSessionsCloud;
6
+ exports.scanEnsembleSessionsWithStatus = scanEnsembleSessionsWithStatus;
6
7
  exports.scanEnsembleSessions = scanEnsembleSessions;
8
+ const config_1 = require("../config");
7
9
  const search_attributes_1 = require("../utils/search-attributes");
8
10
  const signals_1 = require("../workflows/signals");
9
11
  const query_timeout_1 = require("../utils/query-timeout");
10
12
  const visibility_deadline_1 = require("../utils/visibility-deadline");
13
+ /**
14
+ * Mode-B describe-by-id timeout (#845). The strongly-consistent
15
+ * `describe()` fallback on `resolveSession`'s not-found branch is a single
16
+ * O(1) RPC; 2s mirrors {@link DEFAULT_QUERY_TIMEOUT_MS} — two orders of
17
+ * magnitude over a healthy describe — so a wedged frontend can't re-hang
18
+ * the outbox loop the visibility deadline (#336/#529) was added to bound.
19
+ */
20
+ exports.RESOLVE_DESCRIBE_TIMEOUT_MS = 2000;
11
21
  /** Shared query for listing running session workflows. Exported for the
12
22
  * ensemble-scoped variants in `client/core.ts` (#751). */
13
23
  exports.SESSION_LIST_QUERY = `WorkflowType = "agentSessionWorkflow" AND ExecutionStatus = "Running"`;
@@ -26,15 +36,36 @@ exports.SESSION_LIST_QUERY = `WorkflowType = "agentSessionWorkflow" AND Executio
26
36
  * in `scanEnsembleSessionsCloud`. Enforced by
27
37
  * tests/conformance/decision-path-fence.test.ts.
28
38
  *
29
- * **Deadline (#336/#529):** the visibility iterator is bounded by
30
- * `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On timeout,
31
- * throws `VisibilityIteratorTimeoutError` rather than returning `null`
32
- * — silent `null` on a partially-scanned set would be indistinguishable
33
- * from "definitely not found," producing false "Player not found" errors
34
- * upstream. Every existing caller wraps this in a try/catch (outbox
35
- * activities, MCP tools' `defineTool` helper, CLI dev-verbs); the throw
36
- * propagates as a retryable / user-visible "lookup timed out" rather
37
- * than the misleading "player not found."
39
+ * **Mode A — deadline truncation (#336/#529):** the visibility iterator is
40
+ * bounded by `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On
41
+ * timeout it throws `VisibilityIteratorTimeoutError` rather than returning
42
+ * `null` — silent `null` on a partially-scanned set would be
43
+ * indistinguishable from "definitely not found." The throw is classified
44
+ * **retryable** by the outbox activity (`isRetryableTemporalError`), so
45
+ * Temporal's activity retry policy re-runs the lookup with a fresh
46
+ * deadline rather than collapsing it to a permanent "player not found."
47
+ * Synchronous tool/CLI callers surface it as a distinct "resolution
48
+ * incomplete — retry," never "not found."
49
+ *
50
+ * **Mode B — visibility-index lag (#845):** `list()` can complete normally
51
+ * (no throw) yet miss a freshly-started workflow because the visibility
52
+ * index trails the workflow store (observed live as a 3/8→8/8 roster
53
+ * during post-restart worker warmup). An early-exhausting scan is NOT
54
+ * proof of absence. So on the not-found branch we do **exactly one**
55
+ * strongly-consistent `describe()` against the *derived* workflow id —
56
+ * an O(1) read by primary key that bypasses the lagging index. This is a
57
+ * point lookup, NOT a re-scan: it cannot re-introduce the unbounded-scan
58
+ * hang the deadline guard was added to prevent.
59
+ *
60
+ * **Documented Mode-B limitation:** the derived id
61
+ * `agent-session-{ensemble}-{playerName}` is minted from a player's
62
+ * INITIAL name at spawn; `set_name` does not change the workflow id. So
63
+ * describe-by-derived-id false-negatives for a player that was both
64
+ * RENAMED and is currently index-lagged — it falls back to `null` (looks
65
+ * absent) for that narrow intersection. Accepted by design: it closes the
66
+ * gap for the cold-boot/warmup incident class (nobody renames mid-boot),
67
+ * and a second full re-scan to cover renamed∩lagged would put scan cost on
68
+ * every genuine typo'd-name lookup. See issue #845.
38
69
  */
39
70
  async function resolveSession(client, ensemble, playerName) {
40
71
  for await (const wf of (0, visibility_deadline_1.iterateWithDeadline)(client.workflow.list({ query: exports.SESSION_LIST_QUERY }), visibility_deadline_1.VISIBILITY_DEADLINES_MS.resolveSession, 'resolveSession')) {
@@ -49,16 +80,77 @@ async function resolveSession(client, ensemble, playerName) {
49
80
  }
50
81
  }
51
82
  catch (err) {
52
- // Re-throw deadline timeouts — callers that wrap us in try/catch
53
- // already treat unknown throws as a soft "lookup failed" path,
54
- // and the typed error name makes the failure mode legible in
55
- // outbox logs / user-facing tool errors.
83
+ // Re-throw deadline timeouts (Mode A) — callers that wrap us in
84
+ // try/catch treat the typed throw as a soft "lookup timed out" path,
85
+ // distinct from the not-found `null` below.
56
86
  if ((0, visibility_deadline_1.isVisibilityTimeout)(err))
57
87
  throw err;
58
88
  // Workflow may have just completed, or worker is wedged (#433) — skip
59
89
  }
60
90
  }
61
- return null;
91
+ // Mode B (#845): the scan completed without a match, but the visibility
92
+ // index may simply be lagging a just-started workflow. One strongly-
93
+ // consistent describe-by-derived-id disambiguates "index lag" from
94
+ // "genuinely absent" without a second scan.
95
+ return resolveByDerivedId(client, ensemble, playerName);
96
+ }
97
+ /**
98
+ * Mode-B (#845) strongly-consistent fallback for {@link resolveSession}.
99
+ *
100
+ * Reads the session workflow by its *derived* id
101
+ * (`agent-session-{ensemble}-{playerName}`) via a single bounded
102
+ * `describe()` — a primary-key lookup that bypasses the eventually-
103
+ * consistent visibility index. Returns the handle whenever the execution
104
+ * is `RUNNING`; otherwise `null` (genuinely absent, terminated/completed,
105
+ * renamed-false-negative, or describe timed out).
106
+ *
107
+ * Deliberately RUNNING-only — NO attachment-phase filter (#845 JC2): a
108
+ * `gone` player has a LIVE workflow with a terminal adapter, which the
109
+ * #822/#834 deliverability contract handles as warn-but-queue, not
110
+ * "not found". Filtering it here would regress #834 for the lagged-gone
111
+ * window and diverge from the main scan loop (which has no phase filter).
112
+ */
113
+ async function resolveByDerivedId(client, ensemble, playerName) {
114
+ let timer;
115
+ try {
116
+ // `getHandle` is a lazy, no-RPC handle construction in the real client;
117
+ // kept inside the try purely so a defensive throw can never escape the
118
+ // fallback (it must only ever upgrade a null to a handle, never error).
119
+ const handle = client.workflow.getHandle((0, config_1.sessionWorkflowId)(ensemble, playerName));
120
+ const timeout = new Promise((_, reject) => {
121
+ timer = setTimeout(() => reject(new Error('describe-by-id timed out')), exports.RESOLVE_DESCRIBE_TIMEOUT_MS);
122
+ timer.unref?.();
123
+ });
124
+ const desc = await Promise.race([handle.describe(), timeout]);
125
+ // Only a live (RUNNING) execution is a valid resolve target. A
126
+ // COMPLETED/TERMINATED latest run at this id means the player is gone,
127
+ // or the id was reused by a since-closed run → null. A RUNNING run
128
+ // under a reused id is legitimately the current player → return it.
129
+ //
130
+ // No attachment-phase filter (#845 JC2, architect ruling): the main
131
+ // scan loop returns the handle for ANY running session — phase=`gone`
132
+ // included — and #822/#834 treat `gone` as warn-but-QUEUE (the cue
133
+ // durably queues and auto-redelivers on re-attach), NOT "not found".
134
+ // Returning null for a lagged-`gone` player would bypass #822, re-
135
+ // introduce the false-not-found #834 fixed, and make resolution depend
136
+ // on visibility-index timing. The "don't deliver to a torn-down
137
+ // adapter" concern lives at the deliverability layer, not here.
138
+ if (desc.status.name !== 'RUNNING')
139
+ return null;
140
+ return handle;
141
+ }
142
+ catch {
143
+ // NotFound → genuinely absent (or the renamed∩lagged false-negative
144
+ // documented on resolveSession). Timeout/other → treat as absent; the
145
+ // caller's not-found path (or the activity retry policy for Mode A)
146
+ // handles it. We never throw from the fallback — it can only upgrade a
147
+ // null to a found handle, never turn a clean lookup into an error.
148
+ return null;
149
+ }
150
+ finally {
151
+ if (timer)
152
+ clearTimeout(timer);
153
+ }
62
154
  }
63
155
  /**
64
156
  * T0.1 (#748) — cloud-profile ensemble scan. Observation path ONLY (see the
@@ -162,25 +254,30 @@ async function scanEnsembleSessionsCloud(client, ensemble, log = () => { }) {
162
254
  return sessions;
163
255
  }
164
256
  /**
165
- * Scan all running session workflows in an ensemble.
166
- * Returns metadata + part for each session. Shared by the ensemble MCP tool
167
- * and the Maestro refresh activity.
257
+ * Scan all running session workflows in an ensemble, reporting whether the
258
+ * scan completed or was truncated by the visibility deadline (#845).
259
+ *
260
+ * This is the single source of truth for the local-profile ensemble scan;
261
+ * {@link scanEnsembleSessions} is a thin array-facade over it that drops
262
+ * the status fields for the many callers that don't need them.
168
263
  *
169
264
  * **Deadline (#336/#529):** the iterator is bounded by
170
- * `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On
171
- * timeout, returns the partial result accumulated so far and emits a
172
- * warn log. This site is **partial-tolerant by design** the caller
173
- * (maestro refresh, ensemble MCP tool) treats the result as a
174
- * best-effort snapshot that the next tick / re-invocation will fill in.
265
+ * `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On timeout
266
+ * the accumulated rows are returned with `truncated: true` and a warn log
267
+ * — the scan is **partial-tolerant by design**, but the truncation is now
268
+ * SIGNALLED rather than silent so a roster renderer can flag it.
175
269
  *
176
270
  * T0.1 (#748): this legacy shape is the `costProfile: 'local'` path —
177
- * byte-identical to pre-#748 behavior. The cloud profile uses
271
+ * byte-identical row data to pre-#748 behavior. The cloud profile uses
178
272
  * {@link scanEnsembleSessionsCloud}.
179
273
  */
180
- async function scanEnsembleSessions(client, ensemble, log = () => { }) {
274
+ async function scanEnsembleSessionsWithStatus(client, ensemble, log = () => { }) {
181
275
  const sessions = [];
276
+ let truncated = false;
277
+ let scanned = 0;
182
278
  try {
183
279
  for await (const workflow of (0, visibility_deadline_1.iterateWithDeadline)(client.workflow.list({ query: exports.SESSION_LIST_QUERY }), visibility_deadline_1.VISIBILITY_DEADLINES_MS.scanEnsembleSessions, 'scanEnsembleSessions')) {
280
+ scanned++;
184
281
  try {
185
282
  const handle = client.workflow.getHandle(workflow.workflowId);
186
283
  // Issue #433 — bound the metadata + part queries so a single wedged
@@ -234,11 +331,27 @@ async function scanEnsembleSessions(client, ensemble, log = () => { }) {
234
331
  }
235
332
  catch (err) {
236
333
  if ((0, visibility_deadline_1.isVisibilityTimeout)(err)) {
334
+ truncated = true;
237
335
  log(`scanEnsembleSessions: ${err.message} — returning partial (${sessions.length} sessions)`);
238
336
  }
239
337
  else {
240
338
  throw err;
241
339
  }
242
340
  }
243
- return sessions;
341
+ return { sessions, truncated, scanned };
342
+ }
343
+ /**
344
+ * Scan all running session workflows in an ensemble — array facade over
345
+ * {@link scanEnsembleSessionsWithStatus}.
346
+ *
347
+ * Returns just the session rows; the truncation/scan-status fields are
348
+ * dropped. This is the byte-identical shape the maestro refresh activity,
349
+ * the #785 upgrade-snapshot, and the other roster consumers already depend
350
+ * on — keeping it a thin delegate means the truncation-signalling work
351
+ * (#845) does NOT ripple through those call sites. Callers that need to
352
+ * know whether the scan was complete (the `ensemble` tool) call the rich
353
+ * sibling directly.
354
+ */
355
+ async function scanEnsembleSessions(client, ensemble, log = () => { }) {
356
+ return (await scanEnsembleSessionsWithStatus(client, ensemble, log)).sessions;
244
357
  }
@@ -65,6 +65,7 @@ const croner_1 = require("croner");
65
65
  const client_1 = require("@temporalio/client");
66
66
  const spawn_1 = require("../spawn");
67
67
  const probe_1 = require("../pi/probe");
68
+ const install_1 = require("../pi/install");
68
69
  const config_1 = require("../config");
69
70
  const git_info_1 = require("../git-info");
70
71
  const connection_1 = require("../connection");
@@ -1318,10 +1319,33 @@ async function up(opts) {
1318
1319
  if (!process.env.ANTHROPIC_API_KEY) {
1319
1320
  out.warn('ANTHROPIC_API_KEY is not set — the Pi conductor will fall back to Pi\'s own auth/default model. Set it if Pi needs an Anthropic key.');
1320
1321
  }
1322
+ // #825 — extension-registration guard (mirrors command-center's #820 Bug-2
1323
+ // guard). `up --agent pi` no longer passes an inline `-e` (that risked a
1324
+ // divergent-copy double-load, #825); it now relies on the player extension
1325
+ // being registered in Pi's settings.json. On a box that never ran `install-pi`,
1326
+ // a plain `pi` would launch with NO extension — no claim/heartbeat, a silent
1327
+ // non-conductor (the #820 Bug-2 failure, transplanted to the conductor). So
1328
+ // auto-install idempotently before spawning; fail loud with the manual command
1329
+ // if the write fails. (Checks the GLOBAL settings.json, like command-center; a
1330
+ // user who ran `install-pi --project` still works — `pi` loads the project
1331
+ // path and the same realpath dedupes, so the redundant global install is a
1332
+ // harmless idempotent write, never a second load.)
1333
+ if (!(0, install_1.arePiExtensionsRegistered)()) {
1334
+ try {
1335
+ const result = (0, install_1.installPiExtensions)();
1336
+ out.log(out.dim(` Registered the Pi extensions in ${result.settingsPath} (first-run install-pi).`));
1337
+ }
1338
+ catch (err) {
1339
+ out.error('Cannot start Pi conductor — the Pi extensions are not registered and auto-install failed: ' +
1340
+ `${err instanceof Error ? err.message : String(err)}. Run \`agent-tempo install-pi\` manually, then retry.`);
1341
+ process.exit(1);
1342
+ }
1343
+ }
1321
1344
  let piSpawn;
1322
1345
  try {
1323
- // resolvePiInteractiveBinary / resolvePiExtensionPath throw fail-clean
1324
- // (Pi CLI missing / extension unbuilt) caught here, no terminal launched.
1346
+ // resolvePiInteractiveBinary throws fail-clean (Pi CLI missing) — caught
1347
+ // here, no terminal launched. #825: no more `-e`/extension resolution — the
1348
+ // player extension loads from settings.json, registered + guarded just above.
1325
1349
  piSpawn = (0, spawn_1.buildPiConductorSpawn)({
1326
1350
  ensemble: opts.ensemble,
1327
1351
  sessionName,
@@ -81,6 +81,16 @@ export interface SubscribeDeps {
81
81
  * present (Node 20), the wrapper falls back to fetch.
82
82
  */
83
83
  EventSourceImpl?: typeof EventSource;
84
+ /**
85
+ * #826 — force the fetch transport even when a native `EventSource` is
86
+ * available and no token is set. The fetch path is the only one that
87
+ * surfaces a permanent **401/404** as a thrown {@link SubscribeHttpError};
88
+ * native `EventSource` swallows those into its own silent reconnect cycle.
89
+ * The mission-control board needs that hard-error visibility (404 → `gone`,
90
+ * 401 → auth hint), so it sets this. TUI / dashboard leave it unset and keep
91
+ * the auto-selection (native `EventSource` on a tokenless loopback board).
92
+ */
93
+ forceFetch?: boolean;
84
94
  /**
85
95
  * Override sleep — used by tests to fast-forward backoff. Accepts an
86
96
  * `AbortSignal` so the wrapper can wake early on abort.
@@ -216,6 +216,8 @@ function makeIterator(args) {
216
216
  * for `Authorization: Bearer …` and is the only option in Node 20.
217
217
  */
218
218
  function canUseEventSource(deps) {
219
+ if (deps.forceFetch)
220
+ return false; // #826 — caller needs throw-on-permanent
219
221
  if (deps.token)
220
222
  return false;
221
223
  return resolveEventSource(deps) !== undefined;
@@ -41,14 +41,15 @@ export declare const DEFAULT_TAIL_LIMIT = 200;
41
41
  *
42
42
  * - `'connecting'` — initial / post-rebind, before the first coarse event lands.
43
43
  * - `'live'` — at least one coarse event has arrived on the current connection.
44
- * - `'reconnecting'` — the coarse stream ENDED (a non-404 error, a 401, or a
45
- * defensive normal-end) and the loop has exited. Rows are KEPT (rendered stale,
46
- * not cleared). NOTE (#827 review): this is terminal-until-rebind today — the
47
- * stream does NOT auto-resubscribe (genuine transient blips are swallowed
48
- * INSIDE `createSubscribe`, so the board stays `'live'` through them). The
49
- * renderer therefore labels it "STREAM ENDED reopens on re-bind", not
50
- * "reconnecting". Auto-re-arm with backoff is tracked in #828; restore the
51
- * reconnecting wording if/when the loop re-subscribes on this transition.
44
+ * - `'reconnecting'` — the coarse stream ended OR went silent past the watchdog
45
+ * threshold (#826), and the board is RE-ARMING. Rows are KEPT (rendered stale,
46
+ * not cleared). #828: the extension now auto-re-subscribes with bounded
47
+ * equal-jitter backoff (genuine transient blips are still swallowed INSIDE
48
+ * `createSubscribe`, so the board only reaches here on a real stream-death).
49
+ * The variant is carried on `connectionDetail` (no new enum value): an arming
50
+ * detail `[RECONNECTING]`, a settled detail (re-arm capped at 30s) →
51
+ * `[STREAM DOWN]`, and the 401-auth path (which does NOT auto-re-arm a
52
+ * re-sub would just 401 again) keeps the `[STREAM ENDED]` + set-token hint.
52
53
  * - `'gone'` — a hard 404 on the per-ensemble stream: the ensemble's maestro is
53
54
  * gone. {@link setConnection} CLEARS the player list on this transition and the
54
55
  * extension STOPS the stream; the renderer shows "ENSEMBLE DESTROYED".
@@ -1,4 +1,5 @@
1
1
  import { type PiRole } from '../../config';
2
+ import { createSubscribe } from '../../client/subscribe';
2
3
  import { type BoardModel, type CommandLevel } from './board';
3
4
  import { MissionControlActions, type ActionResult } from './actions';
4
5
  import { type InfraProgress } from '../../cli/ensure-infra';
@@ -32,6 +33,14 @@ export interface MissionControlDeps {
32
33
  * `'player'` and `'none'` both keep it dormant.
33
34
  */
34
35
  role?: PiRole;
36
+ /**
37
+ * #826/#828 — override the coarse-stream subscribe factory (test seam).
38
+ * Defaults to {@link createSubscribe}. Lets a fake-timer test inject a mock
39
+ * `subscribe` generator to drive the watchdog + re-arm loop deterministically
40
+ * and assert the single-loop invariant (subscribe called exactly N times, not
41
+ * N+1). Production never sets it.
42
+ */
43
+ createSubscribeImpl?: typeof createSubscribe;
35
44
  }
36
45
  /**
37
46
  * Infra-bootstrap seam (#700 P1). Defaults to the real {@link ensureInfra}; the
@@ -93,6 +102,61 @@ export declare function classifyCoarseStreamEnd(err: unknown, aborted: boolean):
93
102
  connection: 'gone' | 'reconnecting';
94
103
  detail?: string;
95
104
  } | null;
105
+ /** #826 — watchdog poll cadence (how often we compare now − lastCoarseEventAt). */
106
+ export declare const WATCHDOG_TICK_MS = 5000;
107
+ /**
108
+ * #826 — board-level staleness threshold. The daemon emits a `heartbeat` SSE
109
+ * event every ≤10s on a live `/v1/events` stream, so >35s of TOTAL silence
110
+ * (3.5× heartbeat) means the stream is wedged/dead — a half-open socket from a
111
+ * hard `agent-tempo down` (ECONNREFUSED / dead TCP), which neither a 404 nor
112
+ * force-fetch's INTERNAL retry surfaces (that loop reconnects forever, never
113
+ * throws). Sits ABOVE the fetch loop's 30s internal backoff cap, so a healthy
114
+ * cycling loop still receiving heartbeats never trips it — this gap IS the
115
+ * no-double-retry boundary (watchdog = safety net ABOVE the transport).
116
+ */
117
+ export declare const COARSE_STALE_MS = 35000;
118
+ /**
119
+ * #828 — after this many consecutive failed re-arms the board stops claiming
120
+ * it's actively "reconnecting" and settles to the honest "[STREAM DOWN] —
121
+ * retrying every 30s" wording. Re-arm itself NEVER stops (a permanently silent
122
+ * wedge is the #752 silent-wedge class); only the label changes. ~5 steps take
123
+ * the backoff ramp to its 30s cap.
124
+ */
125
+ export declare const REARM_SETTLE_THRESHOLD = 5;
126
+ /**
127
+ * #828 — equal-jitter backoff for the Nth re-arm attempt: `b/2 + rand(0, b/2)`
128
+ * where `b = min(1s·2^attempt, 30s)`. `Math.random()` is fine here — this is
129
+ * client code, not workflow code (the determinism rule does not apply). Jitter
130
+ * spreads re-arms so a fleet of boards doesn't thundering-herd a recovering
131
+ * daemon. `randomFn` is injectable for deterministic tests.
132
+ */
133
+ export declare function rearmDelayMs(attempt: number, randomFn?: () => number): number;
134
+ /**
135
+ * #828 — the reconnecting sub-variant wording for the Nth re-arm attempt: still
136
+ * ramping (< {@link REARM_SETTLE_THRESHOLD}) → "attempting to reconnect…";
137
+ * settled (≥) → "retrying every 30s". Carried on the model's `connectionDetail`
138
+ * (NO new BoardConnection enum value) and read by the renderer to pick the
139
+ * marker. Pure + exported for unit testing.
140
+ */
141
+ export declare function reconnectDetailForAttempt(attempt: number): string;
142
+ /**
143
+ * #828 — should a coarse stream-END auto-re-arm? Gate (architect ruling):
144
+ * - `null` (aborted teardown/rebind) → no
145
+ * - `gone` (404 — maestro torn down; a re-sub just 404s) → no (terminal by design)
146
+ * - `reconnecting` WITH a detail (the 401 auth path — tight-looping a
147
+ * guaranteed-fail) → no; keep the set-token hint
148
+ * - `reconnecting` WITHOUT a detail (generic stream-drop / normal-end) → yes
149
+ * Pure + exported for unit testing.
150
+ */
151
+ export declare function shouldRearmOnStreamEnd(end: {
152
+ connection: 'gone' | 'reconnecting';
153
+ detail?: string;
154
+ } | null): boolean;
155
+ /**
156
+ * #826 — is the coarse stream stale (silent past {@link COARSE_STALE_MS})?
157
+ * `lastEventAt === 0` means "not connected yet" → never stale. Pure.
158
+ */
159
+ export declare function isCoarseStale(lastEventAt: number, now: number): boolean;
96
160
  /**
97
161
  * The operator-command + board controller. Holds the model + the action client;
98
162
  * command methods are independently unit-testable with a fake actions + ctx.