agent-tempo 1.7.0-beta.12 → 1.7.0-beta.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "agent-tempo-dashboard",
3
3
  "private": true,
4
- "version": "1.7.0-beta.12",
4
+ "version": "1.7.0-beta.13",
5
5
  "type": "module",
6
6
  "description": "Web dashboard for agent-tempo. Bundled into the npm package; served by the daemon at /dashboard/*.",
7
7
  "scripts": {
@@ -46,6 +46,7 @@ const git_info_1 = require("../git-info");
46
46
  const spawn_1 = require("../spawn");
47
47
  const config_2 = require("../config");
48
48
  const resolve_1 = require("./resolve");
49
+ const visibility_deadline_1 = require("../utils/visibility-deadline");
49
50
  const action_counters_1 = require("../utils/action-counters");
50
51
  const search_attributes_1 = require("../utils/search-attributes");
51
52
  const agent_types_1 = require("../ensemble/agent-types");
@@ -86,6 +87,15 @@ function isRetryableTemporalError(err) {
86
87
  // to call unconditionally.
87
88
  if (err instanceof activity_1.ApplicationFailure)
88
89
  return false;
90
+ // #845 Mode A: a truncated visibility scan (resolveSession's deadline
91
+ // tripped mid-scan, #336/#529) is a LATENCY failure — the target may well
92
+ // exist; the scan just didn't finish. Treat it as transient so the
93
+ // activity retry policy re-runs resolveSession with a fresh 10s deadline
94
+ // (backoff-bounded) instead of collapsing it into a permanent
95
+ // "No active session found". This must NOT re-add an in-resolver retry
96
+ // loop — the bounding lives in Temporal's policy, not a hot-path scan.
97
+ if ((0, visibility_deadline_1.isVisibilityTimeout)(err))
98
+ return true;
89
99
  const e = err;
90
100
  const name = e?.name ?? '';
91
101
  const msg = e?.message ?? '';
@@ -1,5 +1,13 @@
1
1
  import { Client, WorkflowHandle } from '@temporalio/client';
2
2
  import { AttachmentPhase } from '../types';
3
+ /**
4
+ * Mode-B describe-by-id timeout (#845). The strongly-consistent
5
+ * `describe()` fallback on `resolveSession`'s not-found branch is a single
6
+ * O(1) RPC; 2s mirrors {@link DEFAULT_QUERY_TIMEOUT_MS} — two orders of
7
+ * magnitude over a healthy describe — so a wedged frontend can't re-hang
8
+ * the outbox loop the visibility deadline (#336/#529) was added to bound.
9
+ */
10
+ export declare const RESOLVE_DESCRIBE_TIMEOUT_MS = 2000;
3
11
  /** Shared query for listing running session workflows. Exported for the
4
12
  * ensemble-scoped variants in `client/core.ts` (#751). */
5
13
  export declare const SESSION_LIST_QUERY = "WorkflowType = \"agentSessionWorkflow\" AND ExecutionStatus = \"Running\"";
@@ -18,15 +26,36 @@ export declare const SESSION_LIST_QUERY = "WorkflowType = \"agentSessionWorkflow
18
26
  * in `scanEnsembleSessionsCloud`. Enforced by
19
27
  * tests/conformance/decision-path-fence.test.ts.
20
28
  *
21
- * **Deadline (#336/#529):** the visibility iterator is bounded by
22
- * `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On timeout,
23
- * throws `VisibilityIteratorTimeoutError` rather than returning `null`
24
- * — silent `null` on a partially-scanned set would be indistinguishable
25
- * from "definitely not found," producing false "Player not found" errors
26
- * upstream. Every existing caller wraps this in a try/catch (outbox
27
- * activities, MCP tools' `defineTool` helper, CLI dev-verbs); the throw
28
- * propagates as a retryable / user-visible "lookup timed out" rather
29
- * than the misleading "player not found."
29
+ * **Mode A — deadline truncation (#336/#529):** the visibility iterator is
30
+ * bounded by `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On
31
+ * timeout it throws `VisibilityIteratorTimeoutError` rather than returning
32
+ * `null` — silent `null` on a partially-scanned set would be
33
+ * indistinguishable from "definitely not found." The throw is classified
34
+ * **retryable** by the outbox activity (`isRetryableTemporalError`), so
35
+ * Temporal's activity retry policy re-runs the lookup with a fresh
36
+ * deadline rather than collapsing it to a permanent "player not found."
37
+ * Synchronous tool/CLI callers surface it as a distinct "resolution
38
+ * incomplete — retry," never "not found."
39
+ *
40
+ * **Mode B — visibility-index lag (#845):** `list()` can complete normally
41
+ * (no throw) yet miss a freshly-started workflow because the visibility
42
+ * index trails the workflow store (observed live as a 3/8→8/8 roster
43
+ * during post-restart worker warmup). An early-exhausting scan is NOT
44
+ * proof of absence. So on the not-found branch we do **exactly one**
45
+ * strongly-consistent `describe()` against the *derived* workflow id —
46
+ * an O(1) read by primary key that bypasses the lagging index. This is a
47
+ * point lookup, NOT a re-scan: it cannot re-introduce the unbounded-scan
48
+ * hang the deadline guard was added to prevent.
49
+ *
50
+ * **Documented Mode-B limitation:** the derived id
51
+ * `agent-session-{ensemble}-{playerName}` is minted from a player's
52
+ * INITIAL name at spawn; `set_name` does not change the workflow id. So
53
+ * describe-by-derived-id false-negatives for a player that was both
54
+ * RENAMED and is currently index-lagged — it falls back to `null` (looks
55
+ * absent) for that narrow intersection. Accepted by design: it closes the
56
+ * gap for the cold-boot/warmup incident class (nobody renames mid-boot),
57
+ * and a second full re-scan to cover renamed∩lagged would put scan cost on
58
+ * every genuine typo'd-name lookup. See issue #845.
30
59
  */
31
60
  export declare function resolveSession(client: Client, ensemble: string, playerName: string): Promise<WorkflowHandle | null>;
32
61
  /** Info returned for each session by scanEnsembleSessions. */
@@ -82,19 +111,55 @@ export interface EnsembleSessionInfo {
82
111
  */
83
112
  export declare function scanEnsembleSessionsCloud(client: Client, ensemble: string, log?: (...args: unknown[]) => void): Promise<EnsembleSessionInfo[]>;
84
113
  /**
85
- * Scan all running session workflows in an ensemble.
86
- * Returns metadata + part for each session. Shared by the ensemble MCP tool
87
- * and the Maestro refresh activity.
114
+ * Result of {@link scanEnsembleSessionsWithStatus} — the session rows plus
115
+ * whether the visibility scan completed or was cut short (#845).
116
+ *
117
+ * `truncated` is the Mode-A signal: a `VisibilityIteratorTimeoutError`
118
+ * fired (the wall-clock deadline tripped mid-scan), so `sessions` is a
119
+ * partial snapshot, NOT the full roster. Callers that render a roster (the
120
+ * `ensemble` tool) MUST surface this so a partial set is never mistaken
121
+ * for a complete one. NOTE: this does NOT cover Mode B (visibility-index
122
+ * lag) — there the scan completes normally and `truncated` is `false` even
123
+ * though a freshly-started workflow may be missing; that's best-effort by
124
+ * design and self-heals on the next tick.
125
+ *
126
+ * `scanned` is the number of running workflows the iterator visited before
127
+ * completing or timing out — useful for warn logs ("partial: 3 of ≥N").
128
+ */
129
+ export interface EnsembleScanResult {
130
+ sessions: EnsembleSessionInfo[];
131
+ truncated: boolean;
132
+ scanned: number;
133
+ }
134
+ /**
135
+ * Scan all running session workflows in an ensemble, reporting whether the
136
+ * scan completed or was truncated by the visibility deadline (#845).
137
+ *
138
+ * This is the single source of truth for the local-profile ensemble scan;
139
+ * {@link scanEnsembleSessions} is a thin array-facade over it that drops
140
+ * the status fields for the many callers that don't need them.
88
141
  *
89
142
  * **Deadline (#336/#529):** the iterator is bounded by
90
- * `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On
91
- * timeout, returns the partial result accumulated so far and emits a
92
- * warn log. This site is **partial-tolerant by design** the caller
93
- * (maestro refresh, ensemble MCP tool) treats the result as a
94
- * best-effort snapshot that the next tick / re-invocation will fill in.
143
+ * `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On timeout
144
+ * the accumulated rows are returned with `truncated: true` and a warn log —
145
+ * the scan is **partial-tolerant by design**, but the truncation is now
146
+ * SIGNALLED rather than silent so a roster renderer can flag it.
95
147
  *
96
148
  * T0.1 (#748): this legacy shape is the `costProfile: 'local'` path —
97
- * byte-identical to pre-#748 behavior. The cloud profile uses
149
+ * byte-identical row data to pre-#748 behavior. The cloud profile uses
98
150
  * {@link scanEnsembleSessionsCloud}.
99
151
  */
152
+ export declare function scanEnsembleSessionsWithStatus(client: Client, ensemble: string, log?: (...args: unknown[]) => void): Promise<EnsembleScanResult>;
153
+ /**
154
+ * Scan all running session workflows in an ensemble — array facade over
155
+ * {@link scanEnsembleSessionsWithStatus}.
156
+ *
157
+ * Returns just the session rows; the truncation/scan-status fields are
158
+ * dropped. This is the byte-identical shape the maestro refresh activity,
159
+ * the #785 upgrade-snapshot, and the other roster consumers already depend
160
+ * on — keeping it a thin delegate means the truncation-signalling work
161
+ * (#845) does NOT ripple through those call sites. Callers that need to
162
+ * know whether the scan was complete (the `ensemble` tool) call the rich
163
+ * sibling directly.
164
+ */
100
165
  export declare function scanEnsembleSessions(client: Client, ensemble: string, log?: (...args: unknown[]) => void): Promise<EnsembleSessionInfo[]>;
@@ -1,13 +1,23 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.SESSION_LIST_QUERY = void 0;
3
+ exports.SESSION_LIST_QUERY = exports.RESOLVE_DESCRIBE_TIMEOUT_MS = void 0;
4
4
  exports.resolveSession = resolveSession;
5
5
  exports.scanEnsembleSessionsCloud = scanEnsembleSessionsCloud;
6
+ exports.scanEnsembleSessionsWithStatus = scanEnsembleSessionsWithStatus;
6
7
  exports.scanEnsembleSessions = scanEnsembleSessions;
8
+ const config_1 = require("../config");
7
9
  const search_attributes_1 = require("../utils/search-attributes");
8
10
  const signals_1 = require("../workflows/signals");
9
11
  const query_timeout_1 = require("../utils/query-timeout");
10
12
  const visibility_deadline_1 = require("../utils/visibility-deadline");
13
+ /**
14
+ * Mode-B describe-by-id timeout (#845). The strongly-consistent
15
+ * `describe()` fallback on `resolveSession`'s not-found branch is a single
16
+ * O(1) RPC; 2s mirrors {@link DEFAULT_QUERY_TIMEOUT_MS} — two orders of
17
+ * magnitude over a healthy describe — so a wedged frontend can't re-hang
18
+ * the outbox loop the visibility deadline (#336/#529) was added to bound.
19
+ */
20
+ exports.RESOLVE_DESCRIBE_TIMEOUT_MS = 2000;
11
21
  /** Shared query for listing running session workflows. Exported for the
12
22
  * ensemble-scoped variants in `client/core.ts` (#751). */
13
23
  exports.SESSION_LIST_QUERY = `WorkflowType = "agentSessionWorkflow" AND ExecutionStatus = "Running"`;
@@ -26,15 +36,36 @@ exports.SESSION_LIST_QUERY = `WorkflowType = "agentSessionWorkflow" AND Executio
26
36
  * in `scanEnsembleSessionsCloud`. Enforced by
27
37
  * tests/conformance/decision-path-fence.test.ts.
28
38
  *
29
- * **Deadline (#336/#529):** the visibility iterator is bounded by
30
- * `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On timeout,
31
- * throws `VisibilityIteratorTimeoutError` rather than returning `null`
32
- * — silent `null` on a partially-scanned set would be indistinguishable
33
- * from "definitely not found," producing false "Player not found" errors
34
- * upstream. Every existing caller wraps this in a try/catch (outbox
35
- * activities, MCP tools' `defineTool` helper, CLI dev-verbs); the throw
36
- * propagates as a retryable / user-visible "lookup timed out" rather
37
- * than the misleading "player not found."
39
+ * **Mode A — deadline truncation (#336/#529):** the visibility iterator is
40
+ * bounded by `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On
41
+ * timeout it throws `VisibilityIteratorTimeoutError` rather than returning
42
+ * `null` — silent `null` on a partially-scanned set would be
43
+ * indistinguishable from "definitely not found." The throw is classified
44
+ * **retryable** by the outbox activity (`isRetryableTemporalError`), so
45
+ * Temporal's activity retry policy re-runs the lookup with a fresh
46
+ * deadline rather than collapsing it to a permanent "player not found."
47
+ * Synchronous tool/CLI callers surface it as a distinct "resolution
48
+ * incomplete — retry," never "not found."
49
+ *
50
+ * **Mode B — visibility-index lag (#845):** `list()` can complete normally
51
+ * (no throw) yet miss a freshly-started workflow because the visibility
52
+ * index trails the workflow store (observed live as a 3/8→8/8 roster
53
+ * during post-restart worker warmup). An early-exhausting scan is NOT
54
+ * proof of absence. So on the not-found branch we do **exactly one**
55
+ * strongly-consistent `describe()` against the *derived* workflow id —
56
+ * an O(1) read by primary key that bypasses the lagging index. This is a
57
+ * point lookup, NOT a re-scan: it cannot re-introduce the unbounded-scan
58
+ * hang the deadline guard was added to prevent.
59
+ *
60
+ * **Documented Mode-B limitation:** the derived id
61
+ * `agent-session-{ensemble}-{playerName}` is minted from a player's
62
+ * INITIAL name at spawn; `set_name` does not change the workflow id. So
63
+ * describe-by-derived-id false-negatives for a player that was both
64
+ * RENAMED and is currently index-lagged — it falls back to `null` (looks
65
+ * absent) for that narrow intersection. Accepted by design: it closes the
66
+ * gap for the cold-boot/warmup incident class (nobody renames mid-boot),
67
+ * and a second full re-scan to cover renamed∩lagged would put scan cost on
68
+ * every genuine typo'd-name lookup. See issue #845.
38
69
  */
39
70
  async function resolveSession(client, ensemble, playerName) {
40
71
  for await (const wf of (0, visibility_deadline_1.iterateWithDeadline)(client.workflow.list({ query: exports.SESSION_LIST_QUERY }), visibility_deadline_1.VISIBILITY_DEADLINES_MS.resolveSession, 'resolveSession')) {
@@ -49,16 +80,77 @@ async function resolveSession(client, ensemble, playerName) {
49
80
  }
50
81
  }
51
82
  catch (err) {
52
- // Re-throw deadline timeouts — callers that wrap us in try/catch
53
- // already treat unknown throws as a soft "lookup failed" path,
54
- // and the typed error name makes the failure mode legible in
55
- // outbox logs / user-facing tool errors.
83
+ // Re-throw deadline timeouts (Mode A) — callers that wrap us in
84
+ // try/catch treat the typed throw as a soft "lookup timed out" path,
85
+ // distinct from the not-found `null` below.
56
86
  if ((0, visibility_deadline_1.isVisibilityTimeout)(err))
57
87
  throw err;
58
88
  // Workflow may have just completed, or worker is wedged (#433) — skip
59
89
  }
60
90
  }
61
- return null;
91
+ // Mode B (#845): the scan completed without a match, but the visibility
92
+ // index may simply be lagging a just-started workflow. One strongly-
93
+ // consistent describe-by-derived-id disambiguates "index lag" from
94
+ // "genuinely absent" without a second scan.
95
+ return resolveByDerivedId(client, ensemble, playerName);
96
+ }
97
+ /**
98
+ * Mode-B (#845) strongly-consistent fallback for {@link resolveSession}.
99
+ *
100
+ * Reads the session workflow by its *derived* id
101
+ * (`agent-session-{ensemble}-{playerName}`) via a single bounded
102
+ * `describe()` — a primary-key lookup that bypasses the eventually-
103
+ * consistent visibility index. Returns the handle whenever the execution
104
+ * is `RUNNING`; otherwise `null` (genuinely absent, terminated/completed,
105
+ * renamed-false-negative, or describe timed out).
106
+ *
107
+ * Deliberately RUNNING-only — NO attachment-phase filter (#845 JC2): a
108
+ * `gone` player has a LIVE workflow with a terminal adapter, which the
109
+ * #822/#834 deliverability contract handles as warn-but-queue, not
110
+ * "not found". Filtering it here would regress #834 for the lagged-gone
111
+ * window and diverge from the main scan loop (which has no phase filter).
112
+ */
113
+ async function resolveByDerivedId(client, ensemble, playerName) {
114
+ let timer;
115
+ try {
116
+ // `getHandle` is a lazy, no-RPC handle construction in the real client;
117
+ // kept inside the try purely so a defensive throw can never escape the
118
+ // fallback (it must only ever upgrade a null to a handle, never error).
119
+ const handle = client.workflow.getHandle((0, config_1.sessionWorkflowId)(ensemble, playerName));
120
+ const timeout = new Promise((_, reject) => {
121
+ timer = setTimeout(() => reject(new Error('describe-by-id timed out')), exports.RESOLVE_DESCRIBE_TIMEOUT_MS);
122
+ timer.unref?.();
123
+ });
124
+ const desc = await Promise.race([handle.describe(), timeout]);
125
+ // Only a live (RUNNING) execution is a valid resolve target. A
126
+ // COMPLETED/TERMINATED latest run at this id means the player is gone,
127
+ // or the id was reused by a since-closed run → null. A RUNNING run
128
+ // under a reused id is legitimately the current player → return it.
129
+ //
130
+ // No attachment-phase filter (#845 JC2, architect ruling): the main
131
+ // scan loop returns the handle for ANY running session — phase=`gone`
132
+ // included — and #822/#834 treat `gone` as warn-but-QUEUE (the cue
133
+ // durably queues and auto-redelivers on re-attach), NOT "not found".
134
+ // Returning null for a lagged-`gone` player would bypass #822, re-
135
+ // introduce the false-not-found #834 fixed, and make resolution depend
136
+ // on visibility-index timing. The "don't deliver to a torn-down
137
+ // adapter" concern lives at the deliverability layer, not here.
138
+ if (desc.status.name !== 'RUNNING')
139
+ return null;
140
+ return handle;
141
+ }
142
+ catch {
143
+ // NotFound → genuinely absent (or the renamed∩lagged false-negative
144
+ // documented on resolveSession). Timeout/other → treat as absent; the
145
+ // caller's not-found path (or the activity retry policy for Mode A)
146
+ // handles it. We never throw from the fallback — it can only upgrade a
147
+ // null to a found handle, never turn a clean lookup into an error.
148
+ return null;
149
+ }
150
+ finally {
151
+ if (timer)
152
+ clearTimeout(timer);
153
+ }
62
154
  }
63
155
  /**
64
156
  * T0.1 (#748) — cloud-profile ensemble scan. Observation path ONLY (see the
@@ -162,25 +254,30 @@ async function scanEnsembleSessionsCloud(client, ensemble, log = () => { }) {
162
254
  return sessions;
163
255
  }
164
256
  /**
165
- * Scan all running session workflows in an ensemble.
166
- * Returns metadata + part for each session. Shared by the ensemble MCP tool
167
- * and the Maestro refresh activity.
257
+ * Scan all running session workflows in an ensemble, reporting whether the
258
+ * scan completed or was truncated by the visibility deadline (#845).
259
+ *
260
+ * This is the single source of truth for the local-profile ensemble scan;
261
+ * {@link scanEnsembleSessions} is a thin array-facade over it that drops
262
+ * the status fields for the many callers that don't need them.
168
263
  *
169
264
  * **Deadline (#336/#529):** the iterator is bounded by
170
- * `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On
171
- * timeout, returns the partial result accumulated so far and emits a
172
- * warn log. This site is **partial-tolerant by design** the caller
173
- * (maestro refresh, ensemble MCP tool) treats the result as a
174
- * best-effort snapshot that the next tick / re-invocation will fill in.
265
+ * `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On timeout
266
+ * the accumulated rows are returned with `truncated: true` and a warn log —
267
+ * the scan is **partial-tolerant by design**, but the truncation is now
268
+ * SIGNALLED rather than silent so a roster renderer can flag it.
175
269
  *
176
270
  * T0.1 (#748): this legacy shape is the `costProfile: 'local'` path —
177
- * byte-identical to pre-#748 behavior. The cloud profile uses
271
+ * byte-identical row data to pre-#748 behavior. The cloud profile uses
178
272
  * {@link scanEnsembleSessionsCloud}.
179
273
  */
180
- async function scanEnsembleSessions(client, ensemble, log = () => { }) {
274
+ async function scanEnsembleSessionsWithStatus(client, ensemble, log = () => { }) {
181
275
  const sessions = [];
276
+ let truncated = false;
277
+ let scanned = 0;
182
278
  try {
183
279
  for await (const workflow of (0, visibility_deadline_1.iterateWithDeadline)(client.workflow.list({ query: exports.SESSION_LIST_QUERY }), visibility_deadline_1.VISIBILITY_DEADLINES_MS.scanEnsembleSessions, 'scanEnsembleSessions')) {
280
+ scanned++;
184
281
  try {
185
282
  const handle = client.workflow.getHandle(workflow.workflowId);
186
283
  // Issue #433 — bound the metadata + part queries so a single wedged
@@ -234,11 +331,27 @@ async function scanEnsembleSessions(client, ensemble, log = () => { }) {
234
331
  }
235
332
  catch (err) {
236
333
  if ((0, visibility_deadline_1.isVisibilityTimeout)(err)) {
334
+ truncated = true;
237
335
  log(`scanEnsembleSessions: ${err.message} — returning partial (${sessions.length} sessions)`);
238
336
  }
239
337
  else {
240
338
  throw err;
241
339
  }
242
340
  }
243
- return sessions;
341
+ return { sessions, truncated, scanned };
342
+ }
343
+ /**
344
+ * Scan all running session workflows in an ensemble — array facade over
345
+ * {@link scanEnsembleSessionsWithStatus}.
346
+ *
347
+ * Returns just the session rows; the truncation/scan-status fields are
348
+ * dropped. This is the byte-identical shape the maestro refresh activity,
349
+ * the #785 upgrade-snapshot, and the other roster consumers already depend
350
+ * on — keeping it a thin delegate means the truncation-signalling work
351
+ * (#845) does NOT ripple through those call sites. Callers that need to
352
+ * know whether the scan was complete (the `ensemble` tool) call the rich
353
+ * sibling directly.
354
+ */
355
+ async function scanEnsembleSessions(client, ensemble, log = () => { }) {
356
+ return (await scanEnsembleSessionsWithStatus(client, ensemble, log)).sessions;
244
357
  }
@@ -65,6 +65,7 @@ const croner_1 = require("croner");
65
65
  const client_1 = require("@temporalio/client");
66
66
  const spawn_1 = require("../spawn");
67
67
  const probe_1 = require("../pi/probe");
68
+ const install_1 = require("../pi/install");
68
69
  const config_1 = require("../config");
69
70
  const git_info_1 = require("../git-info");
70
71
  const connection_1 = require("../connection");
@@ -1318,10 +1319,33 @@ async function up(opts) {
1318
1319
  if (!process.env.ANTHROPIC_API_KEY) {
1319
1320
  out.warn('ANTHROPIC_API_KEY is not set — the Pi conductor will fall back to Pi\'s own auth/default model. Set it if Pi needs an Anthropic key.');
1320
1321
  }
1322
+ // #825 — extension-registration guard (mirrors command-center's #820 Bug-2
1323
+ // guard). `up --agent pi` no longer passes an inline `-e` (that risked a
1324
+ // divergent-copy double-load, #825); it now relies on the player extension
1325
+ // being registered in Pi's settings.json. On a box that never ran `install-pi`,
1326
+ // a plain `pi` would launch with NO extension — no claim/heartbeat, a silent
1327
+ // non-conductor (the #820 Bug-2 failure, transplanted to the conductor). So
1328
+ // auto-install idempotently before spawning; fail loud with the manual command
1329
+ // if the write fails. (Checks the GLOBAL settings.json, like command-center; a
1330
+ // user who ran `install-pi --project` still works — `pi` loads the project
1331
+ // path and the same realpath dedupes, so the redundant global install is a
1332
+ // harmless idempotent write, never a second load.)
1333
+ if (!(0, install_1.arePiExtensionsRegistered)()) {
1334
+ try {
1335
+ const result = (0, install_1.installPiExtensions)();
1336
+ out.log(out.dim(` Registered the Pi extensions in ${result.settingsPath} (first-run install-pi).`));
1337
+ }
1338
+ catch (err) {
1339
+ out.error('Cannot start Pi conductor — the Pi extensions are not registered and auto-install failed: ' +
1340
+ `${err instanceof Error ? err.message : String(err)}. Run \`agent-tempo install-pi\` manually, then retry.`);
1341
+ process.exit(1);
1342
+ }
1343
+ }
1321
1344
  let piSpawn;
1322
1345
  try {
1323
- // resolvePiInteractiveBinary / resolvePiExtensionPath throw fail-clean
1324
- // (Pi CLI missing / extension unbuilt) caught here, no terminal launched.
1346
+ // resolvePiInteractiveBinary throws fail-clean (Pi CLI missing) — caught
1347
+ // here, no terminal launched. #825: no more `-e`/extension resolution — the
1348
+ // player extension loads from settings.json, registered + guarded just above.
1325
1349
  piSpawn = (0, spawn_1.buildPiConductorSpawn)({
1326
1350
  ensemble: opts.ensemble,
1327
1351
  sessionName,
package/dist/spawn.d.ts CHANGED
@@ -129,18 +129,6 @@ export declare function resolvePiInteractiveBinary(deps?: {
129
129
  cmd: string;
130
130
  args: string[];
131
131
  };
132
- /**
133
- * Resolve the absolute path to the BUNDLED `dist/pi/extension.js` for `pi -e <abs>`
134
- * (#666). Pi loads the BUILT CommonJS extension even in dev. Mirrors
135
- * {@link resolvePiPath}'s dev/prod `__dirname` split: prod `__dirname` = `dist/`
136
- * (→ `dist/pi/extension.js`); dev `__dirname` = `src/` (→ sibling `dist/pi/…`).
137
- * Existence-checked + fail-clean ("run npm run build"). Injectable for tests.
138
- */
139
- export declare function resolvePiExtensionPath(deps?: {
140
- exists?: (p: string) => boolean;
141
- isDev?: boolean;
142
- baseDir?: string;
143
- }): string;
144
132
  /** Inputs for {@link buildPiConductorSpawn} (pure — unit-tested without spawning). */
145
133
  export interface PiConductorSpawnOpts {
146
134
  ensemble: string;
@@ -154,20 +142,28 @@ export interface PiConductorSpawnOpts {
154
142
  conductorTypeName?: string;
155
143
  /** Forwarded if set (warn-not-fail upstream when unset). */
156
144
  anthropicApiKey?: string;
157
- /** Injectable resolvers (default to the real ones, which fail-clean on miss). */
145
+ /** Injectable binary resolver (defaults to the real one, which fails-clean on miss). */
158
146
  resolveBinary?: () => {
159
147
  cmd: string;
160
148
  args: string[];
161
149
  };
162
- resolveExtension?: () => string;
163
150
  }
164
151
  /**
165
152
  * Build the interactive Pi conductor spawn spec — `{ cmd, args, env }` for
166
153
  * {@link launchInTerminal} (#666 C3). PURE + injectable so the env/args mapping is
167
- * unit-tested. The default resolvers THROW fail-clean (binary missing / extension
168
- * unbuilt) BEFORE a terminal is launched. `args` = `[...binArgs, '-e', <ext>]`;
169
- * conductor INSTRUCTIONS arrive via the lineup-baked workflow messages → cue pump
170
- * (no `--system-prompt` for the MVP).
154
+ * unit-tested. The default binary resolver THROWS fail-clean (binary missing)
155
+ * BEFORE a terminal is launched.
156
+ *
157
+ * #825 — NO inline `-e <ext>`. `up --agent pi` now relies on the player extension
158
+ * being registered in Pi's `settings.json` (by `installPiExtensions`, guarded
159
+ * before launch in the `up` pi branch) + the `resolvePiRole`→`'player'` gate
160
+ * (`PLAYER_NAME` is set in the env below). This collapses the two Pi-launch paths
161
+ * onto ONE registration source, so no divergent on-disk copy (e.g. dev `node
162
+ * dist/cli.js`'s repo `dist/pi/extension.js` vs the global settings.json copy) can
163
+ * escape Pi's realpath-dedup and double-load the player factory. Mirrors
164
+ * {@link buildPiCommandCenterSpawn}. `args` = `[...binArgs]`; conductor
165
+ * INSTRUCTIONS arrive via the lineup-baked workflow messages → cue pump (no
166
+ * `--system-prompt` for the MVP).
171
167
  */
172
168
  export declare function buildPiConductorSpawn(opts: PiConductorSpawnOpts): {
173
169
  cmd: string;
@@ -196,11 +192,19 @@ export interface PiCommandCenterSpawnOpts {
196
192
  * Build the interactive Pi COMMAND-CENTER (mission-control) spawn spec —
197
193
  * `{ cmd, args, env }` for {@link launchInTerminal} (#729). PURE + injectable.
198
194
  *
199
- * Unlike {@link buildPiConductorSpawn}, this passes NO `-e <ext>`: install-pi
200
- * registers BOTH Pi extensions in `~/.pi/agent/settings.json`, so a plain `pi`
201
- * auto-loads them and {@link resolvePiRole} (via the env below) picks exactly one.
202
- * Passing `-e` here would DOUBLE-LOAD mission-control (settings.json + `-e`) → a
203
- * command re-registration error. The env carries the OPERATOR subset only:
195
+ * Like {@link buildPiConductorSpawn} (post-#825), this passes NO `-e <ext>`:
196
+ * install-pi registers BOTH Pi extensions in `~/.pi/agent/settings.json`, so a
197
+ * plain `pi` auto-loads them and {@link resolvePiRole} (via the env below) picks
198
+ * exactly one.
199
+ *
200
+ * #825 (comment correction): a SAME-path `-e` would NOT cause a re-registration
201
+ * error — the #825 spike found Pi realpath-dedupes CLI `-e` paths against
202
+ * `settings.json` (`mergePaths` → `canonicalizePath`/`realpathSync`), and even an
203
+ * un-deduped duplicate is first-registration-wins at the tool layer (no throw,
204
+ * Pi 0.79.x). The real reason both spawn specs OMIT `-e` is a SINGLE registration
205
+ * source: it prevents a DIVERGENT on-disk copy (a different physical path that
206
+ * escapes realpath-dedup) from double-loading the extension factory. The env
207
+ * carries the OPERATOR subset only:
204
208
  * - `AGENT_TEMPO_PI_ROLE=command-center` → the DETERMINISTIC role force (top of
205
209
  * {@link resolvePiRole}'s precedence — beats an inherited `PLAYER_NAME`).
206
210
  * - `AGENT_TEMPO_MISSION_CONTROL=1` → the role opt-in (kept for legacy parity /
package/dist/spawn.js CHANGED
@@ -14,7 +14,6 @@ exports.buildTerminalCommand = buildTerminalCommand;
14
14
  exports.launchInTerminal = launchInTerminal;
15
15
  exports.spawnInTerminal = spawnInTerminal;
16
16
  exports.resolvePiInteractiveBinary = resolvePiInteractiveBinary;
17
- exports.resolvePiExtensionPath = resolvePiExtensionPath;
18
17
  exports.buildPiConductorSpawn = buildPiConductorSpawn;
19
18
  exports.buildPiCommandCenterSpawn = buildPiCommandCenterSpawn;
20
19
  exports.spawnCopilotBridge = spawnCopilotBridge;
@@ -305,7 +304,18 @@ function writeSecretEnvFile(secretEnv, opts) {
305
304
  content = keys.map((k) => `set -gx ${k} ${fishQuote(secretEnv[k])}`).join('\n') + '\n';
306
305
  }
307
306
  else if (opts.syntax === 'cmd') {
308
- content = keys.map((k) => `set "${k}=${cmdEscape(secretEnv[k])}"`).join('\r\n') + '\r\n';
307
+ // #847 — `@`-prefix EVERY line at the GENERATOR level (a structural map, not a
308
+ // per-line author choice). cmd `call`s this file into the persistent `cmd /k`
309
+ // session under its default echo-ON, which echoes each line of a called batch
310
+ // FILE to the terminal — so an un-prefixed `set "ANTHROPIC_API_KEY=…"` printed
311
+ // the SECRET VALUE to scrollback (the #847 leak; #689 had closed only the
312
+ // command-line/history vector). The per-line `@` is self-contained: unlike
313
+ // `@echo off`, it does NOT persist echo state to the caller (`call` shares the
314
+ // parent echo scope, and neither `call` nor `setlocal` scopes echo), so the
315
+ // trailing `del`, the bin launch, and the user's prompt still echo normally.
316
+ // Mapping at the generator means any FUTURE non-`set` line is suppressed too.
317
+ const lines = keys.map((k) => `set "${k}=${cmdEscape(secretEnv[k])}"`);
318
+ content = lines.map((line) => `@${line}`).join('\r\n') + '\r\n';
309
319
  }
310
320
  else {
311
321
  content = keys.map((k) => `export ${k}=${shellQuote(secretEnv[k])}`).join('\n') + '\n';
@@ -627,37 +637,27 @@ function resolvePiInteractiveBinary(deps = {}) {
627
637
  throw new Error('Pi CLI not found. Install it with `npm install -g pi-ai` and ensure `pi` is on PATH ' +
628
638
  '(or add the @earendil-works/pi-coding-agent package). The conductor needs the interactive Pi CLI.');
629
639
  }
630
- /**
631
- * Resolve the absolute path to the BUNDLED `dist/pi/extension.js` for `pi -e <abs>`
632
- * (#666). Pi loads the BUILT CommonJS extension even in dev. Mirrors
633
- * {@link resolvePiPath}'s dev/prod `__dirname` split: prod `__dirname` = `dist/`
634
- * (→ `dist/pi/extension.js`); dev `__dirname` = `src/` (→ sibling `dist/pi/…`).
635
- * Existence-checked + fail-clean ("run npm run build"). Injectable for tests.
636
- */
637
- function resolvePiExtensionPath(deps = {}) {
638
- const exists = deps.exists ?? fs_1.existsSync;
639
- const isDev = deps.isDev ?? __filename.endsWith('.ts');
640
- const base = deps.baseDir ?? __dirname;
641
- const extPath = isDev
642
- ? (0, path_1.resolve)(base, '..', 'dist', 'pi', 'extension.js') // dev: src/ → repo/dist/pi/extension.js
643
- : (0, path_1.resolve)(base, 'pi', 'extension.js'); // prod: dist/ → dist/pi/extension.js
644
- if (!exists(extPath)) {
645
- throw new Error(`Pi conductor extension not found at ${extPath}. Run \`npm run build\` first.`);
646
- }
647
- return extPath;
648
- }
649
640
  /**
650
641
  * Build the interactive Pi conductor spawn spec — `{ cmd, args, env }` for
651
642
  * {@link launchInTerminal} (#666 C3). PURE + injectable so the env/args mapping is
652
- * unit-tested. The default resolvers THROW fail-clean (binary missing / extension
653
- * unbuilt) BEFORE a terminal is launched. `args` = `[...binArgs, '-e', <ext>]`;
654
- * conductor INSTRUCTIONS arrive via the lineup-baked workflow messages → cue pump
655
- * (no `--system-prompt` for the MVP).
643
+ * unit-tested. The default binary resolver THROWS fail-clean (binary missing)
644
+ * BEFORE a terminal is launched.
645
+ *
646
+ * #825 — NO inline `-e <ext>`. `up --agent pi` now relies on the player extension
647
+ * being registered in Pi's `settings.json` (by `installPiExtensions`, guarded
648
+ * before launch in the `up` pi branch) + the `resolvePiRole`→`'player'` gate
649
+ * (`PLAYER_NAME` is set in the env below). This collapses the two Pi-launch paths
650
+ * onto ONE registration source, so no divergent on-disk copy (e.g. dev `node
651
+ * dist/cli.js`'s repo `dist/pi/extension.js` vs the global settings.json copy) can
652
+ * escape Pi's realpath-dedup and double-load the player factory. Mirrors
653
+ * {@link buildPiCommandCenterSpawn}. `args` = `[...binArgs]`; conductor
654
+ * INSTRUCTIONS arrive via the lineup-baked workflow messages → cue pump (no
655
+ * `--system-prompt` for the MVP).
656
656
  */
657
657
  function buildPiConductorSpawn(opts) {
658
658
  const { cmd, args: binArgs } = (opts.resolveBinary ?? resolvePiInteractiveBinary)();
659
- const extPath = (opts.resolveExtension ?? resolvePiExtensionPath)();
660
- const args = [...binArgs, '-e', extPath];
659
+ // #825 single registration source: no inline `-e` (see the doc-comment above).
660
+ const args = [...binArgs];
661
661
  const env = {
662
662
  ...opts.temporalEnvVars,
663
663
  [config_1.ENV.TASK_QUEUE]: opts.taskQueue,
@@ -678,11 +678,19 @@ function buildPiConductorSpawn(opts) {
678
678
  * Build the interactive Pi COMMAND-CENTER (mission-control) spawn spec —
679
679
  * `{ cmd, args, env }` for {@link launchInTerminal} (#729). PURE + injectable.
680
680
  *
681
- * Unlike {@link buildPiConductorSpawn}, this passes NO `-e <ext>`: install-pi
682
- * registers BOTH Pi extensions in `~/.pi/agent/settings.json`, so a plain `pi`
683
- * auto-loads them and {@link resolvePiRole} (via the env below) picks exactly one.
684
- * Passing `-e` here would DOUBLE-LOAD mission-control (settings.json + `-e`) → a
685
- * command re-registration error. The env carries the OPERATOR subset only:
681
+ * Like {@link buildPiConductorSpawn} (post-#825), this passes NO `-e <ext>`:
682
+ * install-pi registers BOTH Pi extensions in `~/.pi/agent/settings.json`, so a
683
+ * plain `pi` auto-loads them and {@link resolvePiRole} (via the env below) picks
684
+ * exactly one.
685
+ *
686
+ * #825 (comment correction): a SAME-path `-e` would NOT cause a re-registration
687
+ * error — the #825 spike found Pi realpath-dedupes CLI `-e` paths against
688
+ * `settings.json` (`mergePaths` → `canonicalizePath`/`realpathSync`), and even an
689
+ * un-deduped duplicate is first-registration-wins at the tool layer (no throw,
690
+ * Pi 0.79.x). The real reason both spawn specs OMIT `-e` is a SINGLE registration
691
+ * source: it prevents a DIVERGENT on-disk copy (a different physical path that
692
+ * escapes realpath-dedup) from double-loading the extension factory. The env
693
+ * carries the OPERATOR subset only:
686
694
  * - `AGENT_TEMPO_PI_ROLE=command-center` → the DETERMINISTIC role force (top of
687
695
  * {@link resolvePiRole}'s precedence — beats an inherited `PLAYER_NAME`).
688
696
  * - `AGENT_TEMPO_MISSION_CONTROL=1` → the role opt-in (kept for legacy parity /
package/dist/tools/cue.js CHANGED
@@ -10,6 +10,7 @@ const resolve_1 = require("./resolve");
10
10
  const resolve_2 = require("../activities/resolve");
11
11
  const signals_1 = require("../workflows/signals");
12
12
  const query_timeout_1 = require("../utils/query-timeout");
13
+ const visibility_deadline_1 = require("../utils/visibility-deadline");
13
14
  const descriptor_1 = require("./descriptor");
14
15
  const validation_1 = require("../utils/validation");
15
16
  const suspension_1 = require("../utils/suspension");
@@ -139,6 +140,17 @@ function buildCueTool(client, config, getPlayerId, handle) {
139
140
  return (0, descriptor_1.ok)(`Message sent to ${playerId}. (outbox: ${entryId})`);
140
141
  }
141
142
  catch (err) {
143
+ // #845 Mode A: a truncated roster scan is NOT "player not found" —
144
+ // the target may exist; the visibility scan just hit its deadline
145
+ // (e.g. post-restart worker warmup). Surface a DISTINCT, actionable
146
+ // "resolution incomplete — retry" so the operator doesn't conclude
147
+ // the player vanished. (The `if (!resolved)` not-found path above
148
+ // only fires on a clean `null`; the timeout throws past it to here.)
149
+ if ((0, visibility_deadline_1.isVisibilityTimeout)(err)) {
150
+ return (0, descriptor_1.fail)(`Could not resolve "${playerId}": roster resolution incomplete — the ` +
151
+ `visibility scan hit its deadline (likely worker warmup), not a ` +
152
+ `"player not found". Retry in a moment.`);
153
+ }
142
154
  return (0, descriptor_1.fail)(`Failed to send message to ${playerId}: ${(0, descriptor_1.formatError)(err)}`);
143
155
  }
144
156
  },
@@ -42,6 +42,7 @@ const resolve_1 = require("../activities/resolve");
42
42
  const descriptor_1 = require("./descriptor");
43
43
  const duration_1 = require("../utils/duration");
44
44
  const suspension_1 = require("../utils/suspension");
45
+ const visibility_deadline_1 = require("../utils/visibility-deadline");
45
46
  /**
46
47
  * Default dormancy threshold (1 hour). Per #563: a `detached` player whose
47
48
  * last activity is older than this is considered dormant. `phase === 'gone'`
@@ -102,12 +103,29 @@ function buildEnsembleTool(client, config, getPlayerId, ownWorkflowId) {
102
103
  self: client.workflow.getHandle(ownWorkflowId),
103
104
  });
104
105
  let sessions;
106
+ let truncated = false;
107
+ let scanned = 0;
105
108
  try {
106
- sessions = await (0, resolve_1.scanEnsembleSessions)(client, config.ensemble);
109
+ const scan = await (0, resolve_1.scanEnsembleSessionsWithStatus)(client, config.ensemble);
110
+ sessions = scan.sessions;
111
+ truncated = scan.truncated;
112
+ scanned = scan.scanned;
107
113
  }
108
114
  catch (err) {
109
115
  return (0, descriptor_1.fail)(`Error listing workflows: ${(0, descriptor_1.formatError)(err)}`);
110
116
  }
117
+ // #845 Mode A — when the visibility scan hit its wall-clock deadline,
118
+ // `sessions` is a PARTIAL roster. Surface that explicitly so an
119
+ // operator never mistakes a mid-scan snapshot for the full ensemble
120
+ // (the incident: a 3/8 roster read as "5 players vanished"). Report
121
+ // `scanned` (workflows enumerated before the deadline) rather than the
122
+ // shown-row count — the shown count is post scope/dormancy filtering,
123
+ // so "N shown" would understate how far the scan actually got.
124
+ const partialBanner = truncated
125
+ ? `⚠ partial roster — ${scanned} workflow(s) enumerated before the ` +
126
+ `${Math.round(visibility_deadline_1.VISIBILITY_DEADLINES_MS.scanEnsembleSessions / 1000)}s visibility deadline ` +
127
+ `(likely worker warmup); some players may be missing — re-run to refresh.`
128
+ : undefined;
111
129
  // Apply scope filters
112
130
  let ownGitRoot;
113
131
  if (scope === 'repo') {
@@ -138,12 +156,25 @@ function buildEnsembleTool(client, config, getPlayerId, ownWorkflowId) {
138
156
  // #752: PAUSED/HELD banner leads the output so it can't be missed.
139
157
  const banner = (0, suspension_1.formatSuspensionBanner)(await suspensionPromise, config.ensemble);
140
158
  if (active.length === 0 && dormant.length === 0) {
159
+ // #845 CRITICAL: check truncation FIRST. A truncated scan that
160
+ // yielded zero rows must NOT render as "No active sessions found" —
161
+ // false-empty is the most dangerous case (an operator concludes the
162
+ // whole ensemble died and takes destructive action). Surface the
163
+ // partial banner instead.
164
+ if (partialBanner) {
165
+ return (0, descriptor_1.ok)([banner, partialBanner].filter(Boolean).join('\n\n'));
166
+ }
141
167
  return (0, descriptor_1.ok)(banner ? `${banner}\n\nNo active sessions found.` : 'No active sessions found.');
142
168
  }
143
169
  // #563 summary line — surface both counts so operators can see what's
144
170
  // being hidden behind the dormant filter without re-running.
145
171
  const summary = `**${config.ensemble}**: ${active.length} active, ${dormant.length} dormant`;
146
- const sections = banner ? [banner, summary] : [summary];
172
+ // Lead banners (suspension #752 + partial-roster #845) precede the
173
+ // summary so neither can be missed above the roster.
174
+ const sections = [
175
+ ...[banner, partialBanner].filter(Boolean),
176
+ summary,
177
+ ];
147
178
  const showActive = dormantFilter !== 'show-only';
148
179
  const showDormant = dormantFilter !== 'hide';
149
180
  if (showActive) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-tempo",
3
- "version": "1.7.0-beta.12",
3
+ "version": "1.7.0-beta.13",
4
4
  "description": "Many agents, one tempo. Durable coordination for multi-agent work via Temporal.",
5
5
  "keywords": [
6
6
  "mcp",