agent-tempo 1.7.0-beta.11 → 1.7.0-beta.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/package.json +1 -1
- package/dist/activities/outbox.js +10 -0
- package/dist/activities/resolve.d.ts +83 -18
- package/dist/activities/resolve.js +139 -26
- package/dist/cli/commands.js +26 -2
- package/dist/client/subscribe.d.ts +10 -0
- package/dist/client/subscribe.js +2 -0
- package/dist/pi/mission-control/board.d.ts +9 -8
- package/dist/pi/mission-control/extension.d.ts +64 -0
- package/dist/pi/mission-control/extension.js +191 -22
- package/dist/pi/mission-control/render.d.ts +12 -0
- package/dist/pi/mission-control/render.js +40 -15
- package/dist/spawn.d.ts +27 -23
- package/dist/spawn.js +40 -32
- package/dist/tools/cue.js +12 -0
- package/dist/tools/ensemble.js +33 -2
- package/package.json +3 -2
package/dashboard/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-tempo-dashboard",
|
|
3
3
|
"private": true,
|
|
4
|
-
"version": "1.7.0-beta.11",
|
|
4
|
+
"version": "1.7.0-beta.13",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"description": "Web dashboard for agent-tempo. Bundled into the npm package; served by the daemon at /dashboard/*.",
|
|
7
7
|
"scripts": {
|
|
package/dist/activities/outbox.js
CHANGED
|
@@ -46,6 +46,7 @@ const git_info_1 = require("../git-info");
|
|
|
46
46
|
const spawn_1 = require("../spawn");
|
|
47
47
|
const config_2 = require("../config");
|
|
48
48
|
const resolve_1 = require("./resolve");
|
|
49
|
+
const visibility_deadline_1 = require("../utils/visibility-deadline");
|
|
49
50
|
const action_counters_1 = require("../utils/action-counters");
|
|
50
51
|
const search_attributes_1 = require("../utils/search-attributes");
|
|
51
52
|
const agent_types_1 = require("../ensemble/agent-types");
|
|
@@ -86,6 +87,15 @@ function isRetryableTemporalError(err) {
|
|
|
86
87
|
// to call unconditionally.
|
|
87
88
|
if (err instanceof activity_1.ApplicationFailure)
|
|
88
89
|
return false;
|
|
90
|
+
// #845 Mode A: a truncated visibility scan (resolveSession's deadline
|
|
91
|
+
// tripped mid-scan, #336/#529) is a LATENCY failure — the target may well
|
|
92
|
+
// exist; the scan just didn't finish. Treat it as transient so the
|
|
93
|
+
// activity retry policy re-runs resolveSession with a fresh 10s deadline
|
|
94
|
+
// (backoff-bounded) instead of collapsing it into a permanent
|
|
95
|
+
// "No active session found". This must NOT re-add an in-resolver retry
|
|
96
|
+
// loop — the bounding lives in Temporal's policy, not a hot-path scan.
|
|
97
|
+
if ((0, visibility_deadline_1.isVisibilityTimeout)(err))
|
|
98
|
+
return true;
|
|
89
99
|
const e = err;
|
|
90
100
|
const name = e?.name ?? '';
|
|
91
101
|
const msg = e?.message ?? '';
|
|
package/dist/activities/resolve.d.ts
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
import { Client, WorkflowHandle } from '@temporalio/client';
|
|
2
2
|
import { AttachmentPhase } from '../types';
|
|
3
|
+
/**
|
|
4
|
+
* Mode-B describe-by-id timeout (#845). The strongly-consistent
|
|
5
|
+
* `describe()` fallback on `resolveSession`'s not-found branch is a single
|
|
6
|
+
* O(1) RPC; 2s mirrors {@link DEFAULT_QUERY_TIMEOUT_MS} — two orders of
|
|
7
|
+
* magnitude over a healthy describe — so a wedged frontend can't re-hang
|
|
8
|
+
* the outbox loop the visibility deadline (#336/#529) was added to bound.
|
|
9
|
+
*/
|
|
10
|
+
export declare const RESOLVE_DESCRIBE_TIMEOUT_MS = 2000;
|
|
3
11
|
/** Shared query for listing running session workflows. Exported for the
|
|
4
12
|
* ensemble-scoped variants in `client/core.ts` (#751). */
|
|
5
13
|
export declare const SESSION_LIST_QUERY = "WorkflowType = \"agentSessionWorkflow\" AND ExecutionStatus = \"Running\"";
|
|
@@ -18,15 +26,36 @@ export declare const SESSION_LIST_QUERY = "WorkflowType = \"agentSessionWorkflow
|
|
|
18
26
|
* in `scanEnsembleSessionsCloud`. Enforced by
|
|
19
27
|
* tests/conformance/decision-path-fence.test.ts.
|
|
20
28
|
*
|
|
21
|
-
* **
|
|
22
|
-
* `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On
|
|
23
|
-
* throws `VisibilityIteratorTimeoutError` rather than returning
|
|
24
|
-
* — silent `null` on a partially-scanned set would be
|
|
25
|
-
* from "definitely not found
|
|
26
|
-
*
|
|
27
|
-
*
|
|
28
|
-
*
|
|
29
|
-
*
|
|
29
|
+
* **Mode A — deadline truncation (#336/#529):** the visibility iterator is
|
|
30
|
+
* bounded by `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On
|
|
31
|
+
* timeout it throws `VisibilityIteratorTimeoutError` rather than returning
|
|
32
|
+
* `null` — silent `null` on a partially-scanned set would be
|
|
33
|
+
* indistinguishable from "definitely not found." The throw is classified
|
|
34
|
+
* **retryable** by the outbox activity (`isRetryableTemporalError`), so
|
|
35
|
+
* Temporal's activity retry policy re-runs the lookup with a fresh
|
|
36
|
+
* deadline rather than collapsing it to a permanent "player not found."
|
|
37
|
+
* Synchronous tool/CLI callers surface it as a distinct "resolution
|
|
38
|
+
* incomplete — retry," never "not found."
|
|
39
|
+
*
|
|
40
|
+
* **Mode B — visibility-index lag (#845):** `list()` can complete normally
|
|
41
|
+
* (no throw) yet miss a freshly-started workflow because the visibility
|
|
42
|
+
* index trails the workflow store (observed live as a 3/8→8/8 roster
|
|
43
|
+
* during post-restart worker warmup). An early-exhausting scan is NOT
|
|
44
|
+
* proof of absence. So on the not-found branch we do **exactly one**
|
|
45
|
+
* strongly-consistent `describe()` against the *derived* workflow id —
|
|
46
|
+
* an O(1) read by primary key that bypasses the lagging index. This is a
|
|
47
|
+
* point lookup, NOT a re-scan: it cannot re-introduce the unbounded-scan
|
|
48
|
+
* hang the deadline guard was added to prevent.
|
|
49
|
+
*
|
|
50
|
+
* **Documented Mode-B limitation:** the derived id
|
|
51
|
+
* `agent-session-{ensemble}-{playerName}` is minted from a player's
|
|
52
|
+
* INITIAL name at spawn; `set_name` does not change the workflow id. So
|
|
53
|
+
* describe-by-derived-id false-negatives for a player that was both
|
|
54
|
+
* RENAMED and is currently index-lagged — it falls back to `null` (looks
|
|
55
|
+
* absent) for that narrow intersection. Accepted by design: it closes the
|
|
56
|
+
* gap for the cold-boot/warmup incident class (nobody renames mid-boot),
|
|
57
|
+
* and a second full re-scan to cover renamed∩lagged would put scan cost on
|
|
58
|
+
* every genuine typo'd-name lookup. See issue #845.
|
|
30
59
|
*/
|
|
31
60
|
export declare function resolveSession(client: Client, ensemble: string, playerName: string): Promise<WorkflowHandle | null>;
|
|
32
61
|
/** Info returned for each session by scanEnsembleSessions. */
|
|
@@ -82,19 +111,55 @@ export interface EnsembleSessionInfo {
|
|
|
82
111
|
*/
|
|
83
112
|
export declare function scanEnsembleSessionsCloud(client: Client, ensemble: string, log?: (...args: unknown[]) => void): Promise<EnsembleSessionInfo[]>;
|
|
84
113
|
/**
|
|
85
|
-
*
|
|
86
|
-
*
|
|
87
|
-
*
|
|
114
|
+
* Result of {@link scanEnsembleSessionsWithStatus} — the session rows plus
|
|
115
|
+
* whether the visibility scan completed or was cut short (#845).
|
|
116
|
+
*
|
|
117
|
+
* `truncated` is the Mode-A signal: a `VisibilityIteratorTimeoutError`
|
|
118
|
+
* fired (the wall-clock deadline tripped mid-scan), so `sessions` is a
|
|
119
|
+
* partial snapshot, NOT the full roster. Callers that render a roster (the
|
|
120
|
+
* `ensemble` tool) MUST surface this so a partial set is never mistaken
|
|
121
|
+
* for a complete one. NOTE: this does NOT cover Mode B (visibility-index
|
|
122
|
+
* lag) — there the scan completes normally and `truncated` is `false` even
|
|
123
|
+
* though a freshly-started workflow may be missing; that's best-effort by
|
|
124
|
+
* design and self-heals on the next tick.
|
|
125
|
+
*
|
|
126
|
+
* `scanned` is the number of running workflows the iterator visited before
|
|
127
|
+
* completing or timing out — useful for warn logs ("partial: 3 of ≥N").
|
|
128
|
+
*/
|
|
129
|
+
export interface EnsembleScanResult {
|
|
130
|
+
sessions: EnsembleSessionInfo[];
|
|
131
|
+
truncated: boolean;
|
|
132
|
+
scanned: number;
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* Scan all running session workflows in an ensemble, reporting whether the
|
|
136
|
+
* scan completed or was truncated by the visibility deadline (#845).
|
|
137
|
+
*
|
|
138
|
+
* This is the single source of truth for the local-profile ensemble scan;
|
|
139
|
+
* {@link scanEnsembleSessions} is a thin array-facade over it that drops
|
|
140
|
+
* the status fields for the many callers that don't need them.
|
|
88
141
|
*
|
|
89
142
|
* **Deadline (#336/#529):** the iterator is bounded by
|
|
90
|
-
* `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On
|
|
91
|
-
*
|
|
92
|
-
*
|
|
93
|
-
*
|
|
94
|
-
* best-effort snapshot that the next tick / re-invocation will fill in.
|
|
143
|
+
* `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On timeout
|
|
144
|
+
* the accumulated rows are returned with `truncated: true` and a warn log
|
|
145
|
+
* — the scan is **partial-tolerant by design**, but the truncation is now
|
|
146
|
+
* SIGNALLED rather than silent so a roster renderer can flag it.
|
|
95
147
|
*
|
|
96
148
|
* T0.1 (#748): this legacy shape is the `costProfile: 'local'` path —
|
|
97
|
-
* byte-identical to pre-#748 behavior. The cloud profile uses
|
|
149
|
+
* byte-identical row data to pre-#748 behavior. The cloud profile uses
|
|
98
150
|
* {@link scanEnsembleSessionsCloud}.
|
|
99
151
|
*/
|
|
152
|
+
export declare function scanEnsembleSessionsWithStatus(client: Client, ensemble: string, log?: (...args: unknown[]) => void): Promise<EnsembleScanResult>;
|
|
153
|
+
/**
|
|
154
|
+
* Scan all running session workflows in an ensemble — array facade over
|
|
155
|
+
* {@link scanEnsembleSessionsWithStatus}.
|
|
156
|
+
*
|
|
157
|
+
* Returns just the session rows; the truncation/scan-status fields are
|
|
158
|
+
* dropped. This is the byte-identical shape the maestro refresh activity,
|
|
159
|
+
* the #785 upgrade-snapshot, and the other roster consumers already depend
|
|
160
|
+
* on — keeping it a thin delegate means the truncation-signalling work
|
|
161
|
+
* (#845) does NOT ripple through those call sites. Callers that need to
|
|
162
|
+
* know whether the scan was complete (the `ensemble` tool) call the rich
|
|
163
|
+
* sibling directly.
|
|
164
|
+
*/
|
|
100
165
|
export declare function scanEnsembleSessions(client: Client, ensemble: string, log?: (...args: unknown[]) => void): Promise<EnsembleSessionInfo[]>;
|
|
package/dist/activities/resolve.js
CHANGED
|
@@ -1,13 +1,23 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.SESSION_LIST_QUERY = void 0;
|
|
3
|
+
exports.SESSION_LIST_QUERY = exports.RESOLVE_DESCRIBE_TIMEOUT_MS = void 0;
|
|
4
4
|
exports.resolveSession = resolveSession;
|
|
5
5
|
exports.scanEnsembleSessionsCloud = scanEnsembleSessionsCloud;
|
|
6
|
+
exports.scanEnsembleSessionsWithStatus = scanEnsembleSessionsWithStatus;
|
|
6
7
|
exports.scanEnsembleSessions = scanEnsembleSessions;
|
|
8
|
+
const config_1 = require("../config");
|
|
7
9
|
const search_attributes_1 = require("../utils/search-attributes");
|
|
8
10
|
const signals_1 = require("../workflows/signals");
|
|
9
11
|
const query_timeout_1 = require("../utils/query-timeout");
|
|
10
12
|
const visibility_deadline_1 = require("../utils/visibility-deadline");
|
|
13
|
+
/**
|
|
14
|
+
* Mode-B describe-by-id timeout (#845). The strongly-consistent
|
|
15
|
+
* `describe()` fallback on `resolveSession`'s not-found branch is a single
|
|
16
|
+
* O(1) RPC; 2s mirrors {@link DEFAULT_QUERY_TIMEOUT_MS} — two orders of
|
|
17
|
+
* magnitude over a healthy describe — so a wedged frontend can't re-hang
|
|
18
|
+
* the outbox loop the visibility deadline (#336/#529) was added to bound.
|
|
19
|
+
*/
|
|
20
|
+
exports.RESOLVE_DESCRIBE_TIMEOUT_MS = 2000;
|
|
11
21
|
/** Shared query for listing running session workflows. Exported for the
|
|
12
22
|
* ensemble-scoped variants in `client/core.ts` (#751). */
|
|
13
23
|
exports.SESSION_LIST_QUERY = `WorkflowType = "agentSessionWorkflow" AND ExecutionStatus = "Running"`;
|
|
@@ -26,15 +36,36 @@ exports.SESSION_LIST_QUERY = `WorkflowType = "agentSessionWorkflow" AND Executio
|
|
|
26
36
|
* in `scanEnsembleSessionsCloud`. Enforced by
|
|
27
37
|
* tests/conformance/decision-path-fence.test.ts.
|
|
28
38
|
*
|
|
29
|
-
* **
|
|
30
|
-
* `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On
|
|
31
|
-
* throws `VisibilityIteratorTimeoutError` rather than returning
|
|
32
|
-
* — silent `null` on a partially-scanned set would be
|
|
33
|
-
* from "definitely not found
|
|
34
|
-
*
|
|
35
|
-
*
|
|
36
|
-
*
|
|
37
|
-
*
|
|
39
|
+
* **Mode A — deadline truncation (#336/#529):** the visibility iterator is
|
|
40
|
+
* bounded by `VISIBILITY_DEADLINES_MS.resolveSession` (default 10s). On
|
|
41
|
+
* timeout it throws `VisibilityIteratorTimeoutError` rather than returning
|
|
42
|
+
* `null` — silent `null` on a partially-scanned set would be
|
|
43
|
+
* indistinguishable from "definitely not found." The throw is classified
|
|
44
|
+
* **retryable** by the outbox activity (`isRetryableTemporalError`), so
|
|
45
|
+
* Temporal's activity retry policy re-runs the lookup with a fresh
|
|
46
|
+
* deadline rather than collapsing it to a permanent "player not found."
|
|
47
|
+
* Synchronous tool/CLI callers surface it as a distinct "resolution
|
|
48
|
+
* incomplete — retry," never "not found."
|
|
49
|
+
*
|
|
50
|
+
* **Mode B — visibility-index lag (#845):** `list()` can complete normally
|
|
51
|
+
* (no throw) yet miss a freshly-started workflow because the visibility
|
|
52
|
+
* index trails the workflow store (observed live as a 3/8→8/8 roster
|
|
53
|
+
* during post-restart worker warmup). An early-exhausting scan is NOT
|
|
54
|
+
* proof of absence. So on the not-found branch we do **exactly one**
|
|
55
|
+
* strongly-consistent `describe()` against the *derived* workflow id —
|
|
56
|
+
* an O(1) read by primary key that bypasses the lagging index. This is a
|
|
57
|
+
* point lookup, NOT a re-scan: it cannot re-introduce the unbounded-scan
|
|
58
|
+
* hang the deadline guard was added to prevent.
|
|
59
|
+
*
|
|
60
|
+
* **Documented Mode-B limitation:** the derived id
|
|
61
|
+
* `agent-session-{ensemble}-{playerName}` is minted from a player's
|
|
62
|
+
* INITIAL name at spawn; `set_name` does not change the workflow id. So
|
|
63
|
+
* describe-by-derived-id false-negatives for a player that was both
|
|
64
|
+
* RENAMED and is currently index-lagged — it falls back to `null` (looks
|
|
65
|
+
* absent) for that narrow intersection. Accepted by design: it closes the
|
|
66
|
+
* gap for the cold-boot/warmup incident class (nobody renames mid-boot),
|
|
67
|
+
* and a second full re-scan to cover renamed∩lagged would put scan cost on
|
|
68
|
+
* every genuine typo'd-name lookup. See issue #845.
|
|
38
69
|
*/
|
|
39
70
|
async function resolveSession(client, ensemble, playerName) {
|
|
40
71
|
for await (const wf of (0, visibility_deadline_1.iterateWithDeadline)(client.workflow.list({ query: exports.SESSION_LIST_QUERY }), visibility_deadline_1.VISIBILITY_DEADLINES_MS.resolveSession, 'resolveSession')) {
|
|
@@ -49,16 +80,77 @@ async function resolveSession(client, ensemble, playerName) {
|
|
|
49
80
|
}
|
|
50
81
|
}
|
|
51
82
|
catch (err) {
|
|
52
|
-
// Re-throw deadline timeouts — callers that wrap us in
|
|
53
|
-
//
|
|
54
|
-
//
|
|
55
|
-
// outbox logs / user-facing tool errors.
|
|
83
|
+
// Re-throw deadline timeouts (Mode A) — callers that wrap us in
|
|
84
|
+
// try/catch treat the typed throw as a soft "lookup timed out" path,
|
|
85
|
+
// distinct from the not-found `null` below.
|
|
56
86
|
if ((0, visibility_deadline_1.isVisibilityTimeout)(err))
|
|
57
87
|
throw err;
|
|
58
88
|
// Workflow may have just completed, or worker is wedged (#433) — skip
|
|
59
89
|
}
|
|
60
90
|
}
|
|
61
|
-
|
|
91
|
+
// Mode B (#845): the scan completed without a match, but the visibility
|
|
92
|
+
// index may simply be lagging a just-started workflow. One strongly-
|
|
93
|
+
// consistent describe-by-derived-id disambiguates "index lag" from
|
|
94
|
+
// "genuinely absent" without a second scan.
|
|
95
|
+
return resolveByDerivedId(client, ensemble, playerName);
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Mode-B (#845) strongly-consistent fallback for {@link resolveSession}.
|
|
99
|
+
*
|
|
100
|
+
* Reads the session workflow by its *derived* id
|
|
101
|
+
* (`agent-session-{ensemble}-{playerName}`) via a single bounded
|
|
102
|
+
* `describe()` — a primary-key lookup that bypasses the eventually-
|
|
103
|
+
* consistent visibility index. Returns the handle whenever the execution
|
|
104
|
+
* is `RUNNING`; otherwise `null` (genuinely absent, terminated/completed,
|
|
105
|
+
* renamed-false-negative, or describe timed out).
|
|
106
|
+
*
|
|
107
|
+
* Deliberately RUNNING-only — NO attachment-phase filter (#845 JC2): a
|
|
108
|
+
* `gone` player has a LIVE workflow with a terminal adapter, which the
|
|
109
|
+
* #822/#834 deliverability contract handles as warn-but-queue, not
|
|
110
|
+
* "not found". Filtering it here would regress #834 for the lagged-gone
|
|
111
|
+
* window and diverge from the main scan loop (which has no phase filter).
|
|
112
|
+
*/
|
|
113
|
+
async function resolveByDerivedId(client, ensemble, playerName) {
|
|
114
|
+
let timer;
|
|
115
|
+
try {
|
|
116
|
+
// `getHandle` is a lazy, no-RPC handle construction in the real client;
|
|
117
|
+
// kept inside the try purely so a defensive throw can never escape the
|
|
118
|
+
// fallback (it must only ever upgrade a null to a handle, never error).
|
|
119
|
+
const handle = client.workflow.getHandle((0, config_1.sessionWorkflowId)(ensemble, playerName));
|
|
120
|
+
const timeout = new Promise((_, reject) => {
|
|
121
|
+
timer = setTimeout(() => reject(new Error('describe-by-id timed out')), exports.RESOLVE_DESCRIBE_TIMEOUT_MS);
|
|
122
|
+
timer.unref?.();
|
|
123
|
+
});
|
|
124
|
+
const desc = await Promise.race([handle.describe(), timeout]);
|
|
125
|
+
// Only a live (RUNNING) execution is a valid resolve target. A
|
|
126
|
+
// COMPLETED/TERMINATED latest run at this id means the player is gone,
|
|
127
|
+
// or the id was reused by a since-closed run → null. A RUNNING run
|
|
128
|
+
// under a reused id is legitimately the current player → return it.
|
|
129
|
+
//
|
|
130
|
+
// No attachment-phase filter (#845 JC2, architect ruling): the main
|
|
131
|
+
// scan loop returns the handle for ANY running session — phase=`gone`
|
|
132
|
+
// included — and #822/#834 treat `gone` as warn-but-QUEUE (the cue
|
|
133
|
+
// durably queues and auto-redelivers on re-attach), NOT "not found".
|
|
134
|
+
// Returning null for a lagged-`gone` player would bypass #822, re-
|
|
135
|
+
// introduce the false-not-found #834 fixed, and make resolution depend
|
|
136
|
+
// on visibility-index timing. The "don't deliver to a torn-down
|
|
137
|
+
// adapter" concern lives at the deliverability layer, not here.
|
|
138
|
+
if (desc.status.name !== 'RUNNING')
|
|
139
|
+
return null;
|
|
140
|
+
return handle;
|
|
141
|
+
}
|
|
142
|
+
catch {
|
|
143
|
+
// NotFound → genuinely absent (or the renamed∩lagged false-negative
|
|
144
|
+
// documented on resolveSession). Timeout/other → treat as absent; the
|
|
145
|
+
// caller's not-found path (or the activity retry policy for Mode A)
|
|
146
|
+
// handles it. We never throw from the fallback — it can only upgrade a
|
|
147
|
+
// null to a found handle, never turn a clean lookup into an error.
|
|
148
|
+
return null;
|
|
149
|
+
}
|
|
150
|
+
finally {
|
|
151
|
+
if (timer)
|
|
152
|
+
clearTimeout(timer);
|
|
153
|
+
}
|
|
62
154
|
}
|
|
63
155
|
/**
|
|
64
156
|
* T0.1 (#748) — cloud-profile ensemble scan. Observation path ONLY (see the
|
|
@@ -162,25 +254,30 @@ async function scanEnsembleSessionsCloud(client, ensemble, log = () => { }) {
|
|
|
162
254
|
return sessions;
|
|
163
255
|
}
|
|
164
256
|
/**
|
|
165
|
-
* Scan all running session workflows in an ensemble
|
|
166
|
-
*
|
|
167
|
-
*
|
|
257
|
+
* Scan all running session workflows in an ensemble, reporting whether the
|
|
258
|
+
* scan completed or was truncated by the visibility deadline (#845).
|
|
259
|
+
*
|
|
260
|
+
* This is the single source of truth for the local-profile ensemble scan;
|
|
261
|
+
* {@link scanEnsembleSessions} is a thin array-facade over it that drops
|
|
262
|
+
* the status fields for the many callers that don't need them.
|
|
168
263
|
*
|
|
169
264
|
* **Deadline (#336/#529):** the iterator is bounded by
|
|
170
|
-
* `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On
|
|
171
|
-
*
|
|
172
|
-
*
|
|
173
|
-
*
|
|
174
|
-
* best-effort snapshot that the next tick / re-invocation will fill in.
|
|
265
|
+
* `VISIBILITY_DEADLINES_MS.scanEnsembleSessions` (default 15s). On timeout
|
|
266
|
+
* the accumulated rows are returned with `truncated: true` and a warn log
|
|
267
|
+
* — the scan is **partial-tolerant by design**, but the truncation is now
|
|
268
|
+
* SIGNALLED rather than silent so a roster renderer can flag it.
|
|
175
269
|
*
|
|
176
270
|
* T0.1 (#748): this legacy shape is the `costProfile: 'local'` path —
|
|
177
|
-
* byte-identical to pre-#748 behavior. The cloud profile uses
|
|
271
|
+
* byte-identical row data to pre-#748 behavior. The cloud profile uses
|
|
178
272
|
* {@link scanEnsembleSessionsCloud}.
|
|
179
273
|
*/
|
|
180
|
-
async function scanEnsembleSessions(client, ensemble, log = () => { }) {
|
|
274
|
+
async function scanEnsembleSessionsWithStatus(client, ensemble, log = () => { }) {
|
|
181
275
|
const sessions = [];
|
|
276
|
+
let truncated = false;
|
|
277
|
+
let scanned = 0;
|
|
182
278
|
try {
|
|
183
279
|
for await (const workflow of (0, visibility_deadline_1.iterateWithDeadline)(client.workflow.list({ query: exports.SESSION_LIST_QUERY }), visibility_deadline_1.VISIBILITY_DEADLINES_MS.scanEnsembleSessions, 'scanEnsembleSessions')) {
|
|
280
|
+
scanned++;
|
|
184
281
|
try {
|
|
185
282
|
const handle = client.workflow.getHandle(workflow.workflowId);
|
|
186
283
|
// Issue #433 — bound the metadata + part queries so a single wedged
|
|
@@ -234,11 +331,27 @@ async function scanEnsembleSessions(client, ensemble, log = () => { }) {
|
|
|
234
331
|
}
|
|
235
332
|
catch (err) {
|
|
236
333
|
if ((0, visibility_deadline_1.isVisibilityTimeout)(err)) {
|
|
334
|
+
truncated = true;
|
|
237
335
|
log(`scanEnsembleSessions: ${err.message} — returning partial (${sessions.length} sessions)`);
|
|
238
336
|
}
|
|
239
337
|
else {
|
|
240
338
|
throw err;
|
|
241
339
|
}
|
|
242
340
|
}
|
|
243
|
-
return sessions;
|
|
341
|
+
return { sessions, truncated, scanned };
|
|
342
|
+
}
|
|
343
|
+
/**
|
|
344
|
+
* Scan all running session workflows in an ensemble — array facade over
|
|
345
|
+
* {@link scanEnsembleSessionsWithStatus}.
|
|
346
|
+
*
|
|
347
|
+
* Returns just the session rows; the truncation/scan-status fields are
|
|
348
|
+
* dropped. This is the byte-identical shape the maestro refresh activity,
|
|
349
|
+
* the #785 upgrade-snapshot, and the other roster consumers already depend
|
|
350
|
+
* on — keeping it a thin delegate means the truncation-signalling work
|
|
351
|
+
* (#845) does NOT ripple through those call sites. Callers that need to
|
|
352
|
+
* know whether the scan was complete (the `ensemble` tool) call the rich
|
|
353
|
+
* sibling directly.
|
|
354
|
+
*/
|
|
355
|
+
async function scanEnsembleSessions(client, ensemble, log = () => { }) {
|
|
356
|
+
return (await scanEnsembleSessionsWithStatus(client, ensemble, log)).sessions;
|
|
244
357
|
}
|
package/dist/cli/commands.js
CHANGED
|
@@ -65,6 +65,7 @@ const croner_1 = require("croner");
|
|
|
65
65
|
const client_1 = require("@temporalio/client");
|
|
66
66
|
const spawn_1 = require("../spawn");
|
|
67
67
|
const probe_1 = require("../pi/probe");
|
|
68
|
+
const install_1 = require("../pi/install");
|
|
68
69
|
const config_1 = require("../config");
|
|
69
70
|
const git_info_1 = require("../git-info");
|
|
70
71
|
const connection_1 = require("../connection");
|
|
@@ -1318,10 +1319,33 @@ async function up(opts) {
|
|
|
1318
1319
|
if (!process.env.ANTHROPIC_API_KEY) {
|
|
1319
1320
|
out.warn('ANTHROPIC_API_KEY is not set — the Pi conductor will fall back to Pi\'s own auth/default model. Set it if Pi needs an Anthropic key.');
|
|
1320
1321
|
}
|
|
1322
|
+
// #825 — extension-registration guard (mirrors command-center's #820 Bug-2
|
|
1323
|
+
// guard). `up --agent pi` no longer passes an inline `-e` (that risked a
|
|
1324
|
+
// divergent-copy double-load, #825); it now relies on the player extension
|
|
1325
|
+
// being registered in Pi's settings.json. On a box that never ran `install-pi`,
|
|
1326
|
+
// a plain `pi` would launch with NO extension — no claim/heartbeat, a silent
|
|
1327
|
+
// non-conductor (the #820 Bug-2 failure, transplanted to the conductor). So
|
|
1328
|
+
// auto-install idempotently before spawning; fail loud with the manual command
|
|
1329
|
+
// if the write fails. (Checks the GLOBAL settings.json, like command-center; a
|
|
1330
|
+
// user who ran `install-pi --project` still works — `pi` loads the project
|
|
1331
|
+
// path and the same realpath dedupes, so the redundant global install is a
|
|
1332
|
+
// harmless idempotent write, never a second load.)
|
|
1333
|
+
if (!(0, install_1.arePiExtensionsRegistered)()) {
|
|
1334
|
+
try {
|
|
1335
|
+
const result = (0, install_1.installPiExtensions)();
|
|
1336
|
+
out.log(out.dim(` Registered the Pi extensions in ${result.settingsPath} (first-run install-pi).`));
|
|
1337
|
+
}
|
|
1338
|
+
catch (err) {
|
|
1339
|
+
out.error('Cannot start Pi conductor — the Pi extensions are not registered and auto-install failed: ' +
|
|
1340
|
+
`${err instanceof Error ? err.message : String(err)}. Run \`agent-tempo install-pi\` manually, then retry.`);
|
|
1341
|
+
process.exit(1);
|
|
1342
|
+
}
|
|
1343
|
+
}
|
|
1321
1344
|
let piSpawn;
|
|
1322
1345
|
try {
|
|
1323
|
-
// resolvePiInteractiveBinary
|
|
1324
|
-
//
|
|
1346
|
+
// resolvePiInteractiveBinary throws fail-clean (Pi CLI missing) — caught
|
|
1347
|
+
// here, no terminal launched. #825: no more `-e`/extension resolution — the
|
|
1348
|
+
// player extension loads from settings.json, registered + guarded just above.
|
|
1325
1349
|
piSpawn = (0, spawn_1.buildPiConductorSpawn)({
|
|
1326
1350
|
ensemble: opts.ensemble,
|
|
1327
1351
|
sessionName,
|
|
package/dist/client/subscribe.d.ts
CHANGED
|
@@ -81,6 +81,16 @@ export interface SubscribeDeps {
|
|
|
81
81
|
* present (Node 20), the wrapper falls back to fetch.
|
|
82
82
|
*/
|
|
83
83
|
EventSourceImpl?: typeof EventSource;
|
|
84
|
+
/**
|
|
85
|
+
* #826 — force the fetch transport even when a native `EventSource` is
|
|
86
|
+
* available and no token is set. The fetch path is the only one that
|
|
87
|
+
* surfaces a permanent **401/404** as a thrown {@link SubscribeHttpError};
|
|
88
|
+
* native `EventSource` swallows those into its own silent reconnect cycle.
|
|
89
|
+
* The mission-control board needs that hard-error visibility (404 → `gone`,
|
|
90
|
+
* 401 → auth hint), so it sets this. TUI / dashboard leave it unset and keep
|
|
91
|
+
* the auto-selection (native `EventSource` on a tokenless loopback board).
|
|
92
|
+
*/
|
|
93
|
+
forceFetch?: boolean;
|
|
84
94
|
/**
|
|
85
95
|
* Override sleep — used by tests to fast-forward backoff. Accepts an
|
|
86
96
|
* `AbortSignal` so the wrapper can wake early on abort.
|
package/dist/client/subscribe.js
CHANGED
|
@@ -216,6 +216,8 @@ function makeIterator(args) {
|
|
|
216
216
|
* for `Authorization: Bearer …` and is the only option in Node 20.
|
|
217
217
|
*/
|
|
218
218
|
function canUseEventSource(deps) {
|
|
219
|
+
if (deps.forceFetch)
|
|
220
|
+
return false; // #826 — caller needs throw-on-permanent
|
|
219
221
|
if (deps.token)
|
|
220
222
|
return false;
|
|
221
223
|
return resolveEventSource(deps) !== undefined;
|
|
package/dist/pi/mission-control/board.d.ts
CHANGED
|
@@ -41,14 +41,15 @@ export declare const DEFAULT_TAIL_LIMIT = 200;
|
|
|
41
41
|
*
|
|
42
42
|
* - `'connecting'` — initial / post-rebind, before the first coarse event lands.
|
|
43
43
|
* - `'live'` — at least one coarse event has arrived on the current connection.
|
|
44
|
-
* - `'reconnecting'` — the coarse stream
|
|
45
|
-
*
|
|
46
|
-
* not cleared).
|
|
47
|
-
*
|
|
48
|
-
*
|
|
49
|
-
*
|
|
50
|
-
*
|
|
51
|
-
*
|
|
44
|
+
* - `'reconnecting'` — the coarse stream ended OR went silent past the watchdog
|
|
45
|
+
* threshold (#826), and the board is RE-ARMING. Rows are KEPT (rendered stale,
|
|
46
|
+
* not cleared). #828: the extension now auto-re-subscribes with bounded
|
|
47
|
+
* equal-jitter backoff (genuine transient blips are still swallowed INSIDE
|
|
48
|
+
* `createSubscribe`, so the board only reaches here on a real stream-death).
|
|
49
|
+
* The variant is carried on `connectionDetail` (no new enum value): an arming
|
|
50
|
+
* detail → `[RECONNECTING]`, a settled detail (re-arm capped at 30s) →
|
|
51
|
+
* `[STREAM DOWN]`, and the 401-auth path (which does NOT auto-re-arm — a
|
|
52
|
+
* re-sub would just 401 again) keeps the `[STREAM ENDED]` + set-token hint.
|
|
52
53
|
* - `'gone'` — a hard 404 on the per-ensemble stream: the ensemble's maestro is
|
|
53
54
|
* gone. {@link setConnection} CLEARS the player list on this transition and the
|
|
54
55
|
* extension STOPS the stream; the renderer shows "ENSEMBLE DESTROYED".
|
|
package/dist/pi/mission-control/extension.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { type PiRole } from '../../config';
|
|
2
|
+
import { createSubscribe } from '../../client/subscribe';
|
|
2
3
|
import { type BoardModel, type CommandLevel } from './board';
|
|
3
4
|
import { MissionControlActions, type ActionResult } from './actions';
|
|
4
5
|
import { type InfraProgress } from '../../cli/ensure-infra';
|
|
@@ -32,6 +33,14 @@ export interface MissionControlDeps {
|
|
|
32
33
|
* `'player'` and `'none'` both keep it dormant.
|
|
33
34
|
*/
|
|
34
35
|
role?: PiRole;
|
|
36
|
+
/**
|
|
37
|
+
* #826/#828 — override the coarse-stream subscribe factory (test seam).
|
|
38
|
+
* Defaults to {@link createSubscribe}. Lets a fake-timer test inject a mock
|
|
39
|
+
* `subscribe` generator to drive the watchdog + re-arm loop deterministically
|
|
40
|
+
* and assert the single-loop invariant (subscribe called exactly N times, not
|
|
41
|
+
* N+1). Production never sets it.
|
|
42
|
+
*/
|
|
43
|
+
createSubscribeImpl?: typeof createSubscribe;
|
|
35
44
|
}
|
|
36
45
|
/**
|
|
37
46
|
* Infra-bootstrap seam (#700 P1). Defaults to the real {@link ensureInfra}; the
|
|
@@ -93,6 +102,61 @@ export declare function classifyCoarseStreamEnd(err: unknown, aborted: boolean):
|
|
|
93
102
|
connection: 'gone' | 'reconnecting';
|
|
94
103
|
detail?: string;
|
|
95
104
|
} | null;
|
|
105
|
+
/** #826 — watchdog poll cadence (how often we compare now − lastCoarseEventAt). */
|
|
106
|
+
export declare const WATCHDOG_TICK_MS = 5000;
|
|
107
|
+
/**
|
|
108
|
+
* #826 — board-level staleness threshold. The daemon emits a `heartbeat` SSE
|
|
109
|
+
* event every ≤10s on a live `/v1/events` stream, so >35s of TOTAL silence
|
|
110
|
+
* (3.5× heartbeat) means the stream is wedged/dead — a half-open socket from a
|
|
111
|
+
* hard `agent-tempo down` (ECONNREFUSED / dead TCP), which neither a 404 nor
|
|
112
|
+
* force-fetch's INTERNAL retry surfaces (that loop reconnects forever, never
|
|
113
|
+
* throws). Sits ABOVE the fetch loop's 30s internal backoff cap, so a healthy
|
|
114
|
+
* cycling loop still receiving heartbeats never trips it — this gap IS the
|
|
115
|
+
* no-double-retry boundary (watchdog = safety net ABOVE the transport).
|
|
116
|
+
*/
|
|
117
|
+
export declare const COARSE_STALE_MS = 35000;
|
|
118
|
+
/**
|
|
119
|
+
* #828 — after this many consecutive failed re-arms the board stops claiming
|
|
120
|
+
* it's actively "reconnecting" and settles to the honest "[STREAM DOWN] —
|
|
121
|
+
* retrying every 30s" wording. Re-arm itself NEVER stops (a permanently silent
|
|
122
|
+
* wedge is the #752 silent-wedge class); only the label changes. ~5 steps takes
|
|
123
|
+
* the backoff ramp to its 30s cap.
|
|
124
|
+
*/
|
|
125
|
+
export declare const REARM_SETTLE_THRESHOLD = 5;
|
|
126
|
+
/**
|
|
127
|
+
* #828 — equal-jitter backoff for the Nth re-arm attempt: `b/2 + rand(0, b/2)`
|
|
128
|
+
* where `b = min(1s·2^attempt, 30s)`. `Math.random()` is fine here — this is
|
|
129
|
+
* client code, not workflow code (the determinism rule does not apply). Jitter
|
|
130
|
+
* spreads re-arms so a fleet of boards doesn't thundering-herd a recovering
|
|
131
|
+
* daemon. `randomFn` is injectable for deterministic tests.
|
|
132
|
+
*/
|
|
133
|
+
export declare function rearmDelayMs(attempt: number, randomFn?: () => number): number;
|
|
134
|
+
/**
|
|
135
|
+
* #828 — the reconnecting sub-variant wording for the Nth re-arm attempt: still
|
|
136
|
+
* ramping (< {@link REARM_SETTLE_THRESHOLD}) → "attempting to reconnect…";
|
|
137
|
+
* settled (≥) → "retrying every 30s". Carried on the model's `connectionDetail`
|
|
138
|
+
* (NO new BoardConnection enum value) and read by the renderer to pick the
|
|
139
|
+
* marker. Pure + exported for unit testing.
|
|
140
|
+
*/
|
|
141
|
+
export declare function reconnectDetailForAttempt(attempt: number): string;
|
|
142
|
+
/**
|
|
143
|
+
* #828 — should a coarse stream-END auto-re-arm? Gate (architect ruling):
|
|
144
|
+
* - `null` (aborted teardown/rebind) → no
|
|
145
|
+
* - `gone` (404 — maestro torn down; a re-sub just 404s) → no (terminal by design)
|
|
146
|
+
* - `reconnecting` WITH a detail (the 401 auth path — tight-looping a
|
|
147
|
+
* guaranteed-fail) → no; keep the set-token hint
|
|
148
|
+
* - `reconnecting` WITHOUT a detail (generic stream-drop / normal-end) → yes
|
|
149
|
+
* Pure + exported for unit testing.
|
|
150
|
+
*/
|
|
151
|
+
export declare function shouldRearmOnStreamEnd(end: {
|
|
152
|
+
connection: 'gone' | 'reconnecting';
|
|
153
|
+
detail?: string;
|
|
154
|
+
} | null): boolean;
|
|
155
|
+
/**
|
|
156
|
+
* #826 — is the coarse stream stale (silent past {@link COARSE_STALE_MS})?
|
|
157
|
+
* `lastEventAt === 0` means "not connected yet" → never stale. Pure.
|
|
158
|
+
*/
|
|
159
|
+
export declare function isCoarseStale(lastEventAt: number, now: number): boolean;
|
|
96
160
|
/**
|
|
97
161
|
* The operator-command + board controller. Holds the model + the action client;
|
|
98
162
|
* command methods are independently unit-testable with a fake actions + ctx.
|