pi-crew 0.7.5 → 0.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +51 -0
- package/README.md +11 -11
- package/docs/commands-reference.md +14 -10
- package/docs/troubleshooting.md +131 -0
- package/docs/usage.md +9 -4
- package/package.json +1 -1
- package/src/config/config.ts +11 -4
- package/src/extension/action-suggestions.ts +71 -0
- package/src/extension/context-status-injection.ts +32 -1
- package/src/extension/register.ts +71 -65
- package/src/extension/team-tool/api.ts +3 -2
- package/src/extension/team-tool/cancel.ts +5 -4
- package/src/extension/team-tool/explain.ts +2 -1
- package/src/extension/team-tool/failure-patterns.ts +124 -0
- package/src/extension/team-tool/inspect.ts +10 -6
- package/src/extension/team-tool/lifecycle-actions.ts +5 -4
- package/src/extension/team-tool/respond.ts +4 -3
- package/src/extension/team-tool/run-not-found.ts +54 -0
- package/src/extension/team-tool/run.ts +26 -4
- package/src/extension/team-tool/status.ts +58 -4
- package/src/extension/team-tool.ts +5 -3
- package/src/runtime/async-runner.ts +7 -0
- package/src/runtime/background-runner.ts +7 -1
- package/src/runtime/chain-parser.ts +13 -5
- package/src/runtime/checkpoint.ts +13 -1
- package/src/runtime/child-pi.ts +9 -1
- package/src/runtime/live-session-runtime.ts +15 -1
- package/src/runtime/parent-guard.ts +2 -2
- package/src/runtime/stale-reconciler.ts +8 -3
- package/src/runtime/task-runner.ts +10 -1
- package/src/runtime/team-runner.ts +19 -2
- package/src/runtime/verification-gates.ts +21 -1
- package/src/schema/team-tool-schema.ts +9 -0
- package/src/state/blob-store.ts +12 -10
- package/src/state/event-log-rotation.ts +114 -93
- package/src/state/event-log.ts +79 -20
- package/src/state/health-store.ts +6 -1
- package/src/state/locks.ts +66 -16
- package/src/state/state-store.ts +14 -1
- package/src/ui/card-colors.ts +7 -3
- package/src/ui/dashboard-panes/agents-pane.ts +15 -2
- package/src/ui/live-duration.ts +58 -0
- package/src/ui/tool-render.ts +7 -11
- package/src/ui/tool-renderers/index.ts +6 -3
- package/src/ui/widget/widget-formatters.ts +2 -13
- package/src/utils/fs-watch.ts +11 -60
- package/src/utils/run-watcher-registry.ts +164 -0
- package/src/workflows/discover-workflows.ts +2 -1
- package/src/workflows/workflow-config.ts +5 -0
- package/src/runtime/dynamic-script-runner.ts +0 -497
- package/src/runtime/sandbox.ts +0 -335
package/src/state/event-log.ts
CHANGED
|
@@ -9,7 +9,7 @@ import { logInternalError } from "../utils/internal-error.ts";
|
|
|
9
9
|
import { readJsonlSince, type IncrementalReadState } from "../utils/incremental-reader.ts";
|
|
10
10
|
import { redactSecrets } from "../utils/redaction.ts";
|
|
11
11
|
import { sleepSync } from "../utils/sleep.ts";
|
|
12
|
-
import { needsRotation, compactEventLog, rotateEventLog } from "./event-log-rotation.ts";
|
|
12
|
+
import { needsRotation, compactEventLog, rotateEventLog, applyCompactionUnlocked, prepareCompaction, rotateEventLogUnlocked } from "./event-log-rotation.ts";
|
|
13
13
|
|
|
14
14
|
export type TeamEventProvenance = "live_worker" | "test" | "healthcheck" | "replay" | "api" | "background" | "team_runner";
|
|
15
15
|
export type TeamWatcherAction = "act" | "observe" | "ignore";
|
|
@@ -76,7 +76,7 @@ let overflowCounter = 0;
|
|
|
76
76
|
* `flushOneEventLogBuffer`, and `state/mailbox.ts`. Prefer the async alternative
|
|
77
77
|
* (`appendEventAsync`) for all new code.
|
|
78
78
|
*/
|
|
79
|
-
export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
|
|
79
|
+
export function withEventLogLockSync<T>(eventsPath: string, fn: () => T, options?: { timeoutMs?: number; staleMs?: number }): T {
|
|
80
80
|
// Ensure parent directory exists before attempting lock
|
|
81
81
|
fs.mkdirSync(path.dirname(eventsPath), { recursive: true });
|
|
82
82
|
const lockDir = `${eventsPath}.lock`;
|
|
@@ -86,8 +86,8 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
|
|
|
86
86
|
// event loop indefinitely. 500 retries × 10ms = 5s max. After timeout, we
|
|
87
87
|
// throw a clear error instead of blocking forever. This ensures AbortSignal
|
|
88
88
|
// handlers, SIGTERM, and graceful shutdown can fire within seconds.
|
|
89
|
-
const timeout = 5000;
|
|
90
|
-
const staleMs = 10000;
|
|
89
|
+
const timeout = options?.timeoutMs ?? 5000;
|
|
90
|
+
const staleMs = options?.staleMs ?? 10000;
|
|
91
91
|
let acquired = false;
|
|
92
92
|
while (true) {
|
|
93
93
|
try {
|
|
@@ -110,24 +110,35 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
|
|
|
110
110
|
// to check for orphaned .lock dirs / stale processes.
|
|
111
111
|
throw errors.eventLogLockTimeout(eventsPath, timeout);
|
|
112
112
|
}
|
|
113
|
-
//
|
|
113
|
+
// Round 26 (BUG 3): mtime-based stale check INDEPENDENT of pidFile.
|
|
114
|
+
// If the holder crashed between mkdir and writing pidFile, there is no
|
|
115
|
+
// pidFile to read — the old code just slept until the 5s timeout, then
|
|
116
|
+
// threw, leaving the dir orphaned FOREVER (every retry repeats the
|
|
117
|
+
// timeout). Now: if the lock dir's age (now - mtime) exceeds staleMs, reclaim it.
|
|
118
|
+
try {
|
|
119
|
+
const dirStat = fs.statSync(lockDir);
|
|
120
|
+
if (Date.now() - dirStat.mtimeMs > staleMs) {
|
|
121
|
+
fs.rmSync(lockDir, { recursive: true, force: true });
|
|
122
|
+
continue;
|
|
123
|
+
}
|
|
124
|
+
} catch { /* dir vanished — let loop retry */ }
|
|
125
|
+
// Round 26 (BUG 4): the mtime check was previously NESTED inside
|
|
126
|
+
// `if (!alive)`, so a recycled PID (crashed holder's PID reused by an
|
|
127
|
+
// unrelated live process) kept `alive=true` and the mtime check NEVER
|
|
128
|
+
// fired → permanent wedge. mtime is now checked FIRST (above) for ALL
|
|
129
|
+
// holders. The PID check below no longer triggers a steal: even if the holder
|
|
130
|
+
// PID is provably dead AND the lock isn't stale yet, we still wait
|
|
131
|
+
// (don't steal a fresh lock just because the pid lookup raced).
|
|
114
132
|
try {
|
|
115
133
|
const raw = fs.readFileSync(pidFile, "utf-8").trim();
|
|
116
134
|
const ownerPid = Number.parseInt(raw, 10);
|
|
117
135
|
if (!Number.isNaN(ownerPid) && ownerPid !== process.pid) {
|
|
118
136
|
let alive = false;
|
|
119
137
|
try { process.kill(ownerPid, 0); alive = true; } catch { /* dead */ }
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
const stat = fs.statSync(lockDir);
|
|
123
|
-
if (Date.now() - stat.mtimeMs > staleMs) {
|
|
124
|
-
fs.rmSync(lockDir, { recursive: true, force: true });
|
|
125
|
-
continue;
|
|
126
|
-
}
|
|
127
|
-
} catch { /* race — let loop sleep */ }
|
|
128
|
-
}
|
|
138
|
+
// (mtime already handled above; nothing to do here for dead-but-fresh.)
|
|
139
|
+
void alive;
|
|
129
140
|
}
|
|
130
|
-
} catch { /* no pid file —
|
|
141
|
+
} catch { /* no pid file — mtime check above already handles it */ }
|
|
131
142
|
sleepSync(10);
|
|
132
143
|
}
|
|
133
144
|
}
|
|
@@ -135,7 +146,19 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
|
|
|
135
146
|
return fn();
|
|
136
147
|
} finally {
|
|
137
148
|
if (acquired) {
|
|
138
|
-
|
|
149
|
+
// Round 26 (BUG 5): PID-guarded release. Previously the release
|
|
150
|
+
// was an UNCONDITIONAL rmSync. If our fn exceeded staleMs, another
|
|
151
|
+
// process could steal our lock (rm our dir, make its own); when our fn
|
|
152
|
+
// finished our finally block would then DELETE THE STEALER's dir → both
|
|
153
|
+
// in the critical section + lost lock. Verify the pidFile still records
|
|
154
|
+
// OUR pid before removing; if it doesn't, the lock was stolen and the
|
|
155
|
+
// current holder owns the dir.
|
|
156
|
+
try {
|
|
157
|
+
const currentPid = fs.readFileSync(pidFile, "utf-8").trim();
|
|
158
|
+
if (currentPid === String(process.pid)) {
|
|
159
|
+
fs.rmSync(lockDir, { recursive: true, force: true });
|
|
160
|
+
}
|
|
161
|
+
} catch { /* lock stolen or already gone — do not touch */ }
|
|
139
162
|
}
|
|
140
163
|
}
|
|
141
164
|
}
|
|
@@ -152,6 +175,29 @@ function evictOldestSequenceCacheEntries(): void {
|
|
|
152
175
|
}
|
|
153
176
|
}
|
|
154
177
|
|
|
178
|
+
/** @internal — exported for sequence-cache LRU testing (Round 19). */
|
|
179
|
+
export function __test__sequenceCacheSize(): number {
|
|
180
|
+
return sequenceCache.size;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
/** @internal — seed an entry into the sequence cache for testing. */
|
|
184
|
+
export function __test__seedSequenceCache(eventsPath: string, lastAccessMs: number): void {
|
|
185
|
+
sequenceCache.set(eventsPath, { size: 1, mtimeMs: 0, seq: 0, lastAccessMs });
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
/** @internal — expose eviction for testing. */
|
|
189
|
+
export function __test__evictOldestSequenceCacheEntries(): void {
|
|
190
|
+
evictOldestSequenceCacheEntries();
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
/** @internal — clear the sequence cache. */
|
|
194
|
+
export function __test__clearSequenceCache(): void {
|
|
195
|
+
sequenceCache.clear();
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
/** @internal — the max sequence cache entries bound. */
|
|
199
|
+
export const MAX_SEQUENCE_CACHE_ENTRIES_VALUE = MAX_SEQUENCE_CACHE_ENTRIES;
|
|
200
|
+
|
|
155
201
|
export function sequencePath(eventsPath: string): string {
|
|
156
202
|
return `${eventsPath}.seq`;
|
|
157
203
|
}
|
|
@@ -497,9 +543,14 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
|
|
|
497
543
|
if (!isTerminal && fs.existsSync(eventsPath)) {
|
|
498
544
|
const stat = fs.statSync(eventsPath);
|
|
499
545
|
if (stat.size > MAX_EVENTS_BYTES) {
|
|
500
|
-
// Try immediate compact (not waiting for counter % 100)
|
|
546
|
+
// Try immediate compact (not waiting for counter % 100).
|
|
547
|
+
// Round 24 (BUG 1): we are INSIDE withEventLogLockSync. Use the unlocked
|
|
548
|
+
// apply/rotate cores — the locked variants would deadlock (mkdir lock
|
|
549
|
+
// is not re-entrant → 5s timeout → compaction/rotation never ran →
|
|
550
|
+
// unbounded log growth → events silently dropped past 50MB).
|
|
501
551
|
try {
|
|
502
|
-
|
|
552
|
+
const prepared = prepareCompaction(eventsPath);
|
|
553
|
+
if (prepared) applyCompactionUnlocked(eventsPath, prepared);
|
|
503
554
|
} catch (error) {
|
|
504
555
|
logInternalError("event-log.immediate-compact", error, `eventsPath=${eventsPath}`);
|
|
505
556
|
}
|
|
@@ -507,7 +558,7 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
|
|
|
507
558
|
if (fs.existsSync(eventsPath)) {
|
|
508
559
|
const afterCompact = fs.statSync(eventsPath);
|
|
509
560
|
if (afterCompact.size > MAX_EVENTS_BYTES) {
|
|
510
|
-
|
|
561
|
+
rotateEventLogUnlocked(eventsPath);
|
|
511
562
|
}
|
|
512
563
|
}
|
|
513
564
|
}
|
|
@@ -555,7 +606,15 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
|
|
|
555
606
|
}
|
|
556
607
|
appendCounter++;
|
|
557
608
|
if (appendCounter % 100 === 0 && needsRotation(eventsPath)) {
|
|
558
|
-
|
|
609
|
+
// Round 24 (BUG 1): we are INSIDE withEventLogLockSync here (called via
|
|
610
|
+
// appendEventInsideLock). The mkdir lock is NOT re-entrant, so calling the
|
|
611
|
+
// locked compactEventLog would deadlock → 5s timeout → compaction never
|
|
612
|
+
// ran → unbounded log growth → events silently dropped past 50MB. Use the
|
|
613
|
+
// unlocked apply path instead (lock already held).
|
|
614
|
+
try {
|
|
615
|
+
const prepared = prepareCompaction(eventsPath);
|
|
616
|
+
if (prepared) applyCompactionUnlocked(eventsPath, prepared);
|
|
617
|
+
} catch (error) { logInternalError("event-log.rotation", error, `eventsPath=${eventsPath}`); }
|
|
559
618
|
}
|
|
560
619
|
try { emitFromTeamEvent(fullEvent); } catch (error) { logInternalError("event-log.emit", error); }
|
|
561
620
|
return fullEvent;
|
|
@@ -4,7 +4,12 @@ import type { RunHealth } from "../runtime/task-health.ts";
|
|
|
4
4
|
import { computeRunHealth } from "../runtime/task-health.ts";
|
|
5
5
|
import type { ManifestSummary } from "../runtime/task-health.ts";
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
// Relative to the crew root (`<cwd>/.crew`). BUG A fix (pts/2 hang
|
|
8
|
+
// investigation 2026-06-16): this was `.crew/state/health`, which double-joined
|
|
9
|
+
// to `<crewRoot>/state/.crew/state/health` because the caller passed the state
|
|
10
|
+
// dir (not the crew root). Now the caller passes the real crew root, so this is
|
|
11
|
+
// a plain `state/health` suffix.
|
|
12
|
+
const HEALTH_DIR = "state/health";
|
|
8
13
|
|
|
9
14
|
export interface HealthSnapshot {
|
|
10
15
|
runId: string;
|
package/src/state/locks.ts
CHANGED
|
@@ -66,6 +66,57 @@ function isLockHolderAlive(filePath: string): boolean {
|
|
|
66
66
|
}
|
|
67
67
|
}
|
|
68
68
|
|
|
69
|
+
/**
|
|
70
|
+
* Round 26 (BUG 1): read the lock file ONCE and evaluate staleness + holder
|
|
71
|
+
* liveness from that single snapshot.
|
|
72
|
+
*
|
|
73
|
+
* Previously `acquireLockWithRetry` called `isLockStale()` and
|
|
74
|
+
* `isLockHolderAlive()` separately, each performing its own `readFileSync`.
|
|
75
|
+
* Between those two reads the lock could transition stale→fresh (old holder
|
|
76
|
+
* released, new holder acquired): isLockStale saw the OLD createdAt → stale,
|
|
77
|
+
* isLockHolderAlive saw the NEW pid → alive, yielding `!stale && alive` =
|
|
78
|
+
* false → we forcibly rm the NEW holder's freshly-acquired lock and take it
|
|
79
|
+
* ourselves → BOTH in the critical section. Reading once closes the window.
|
|
80
|
+
*
|
|
81
|
+
* Returns `{ canSteal: true }` if the lock is stale OR the holder is dead
|
|
82
|
+
* (safe to forcibly remove); `{ canSteal: false }` if it is fresh AND held by
|
|
83
|
+
* a live process (must keep waiting).
|
|
84
|
+
*/
|
|
85
|
+
function readLockSnapshot(filePath: string, staleMs: number): { canSteal: boolean } {
|
|
86
|
+
let stat: fs.Stats | undefined;
|
|
87
|
+
let raw: string | undefined;
|
|
88
|
+
try {
|
|
89
|
+
stat = fs.statSync(filePath);
|
|
90
|
+
raw = fs.readFileSync(filePath, "utf-8");
|
|
91
|
+
} catch {
|
|
92
|
+
// File vanished between writeLockFile's EEXIST and now (holder released).
|
|
93
|
+
// Loop will retry the create; safe to signal "nothing to steal".
|
|
94
|
+
return { canSteal: false };
|
|
95
|
+
}
|
|
96
|
+
// Staleness from a single snapshot.
|
|
97
|
+
let createdAt = parseCreatedAtFromLock(raw);
|
|
98
|
+
if (createdAt === undefined) createdAt = stat.mtimeMs;
|
|
99
|
+
const isStale = Date.now() - createdAt > staleMs;
|
|
100
|
+
// Holder liveness from the SAME snapshot.
|
|
101
|
+
let isAlive = true; // Unknown holder — assume alive to be safe (matches isLockHolderAlive).
|
|
102
|
+
try {
|
|
103
|
+
const parsed = JSON.parse(raw) as { pid?: unknown };
|
|
104
|
+
const pid = typeof parsed.pid === "number" ? parsed.pid : undefined;
|
|
105
|
+
if (pid !== undefined) {
|
|
106
|
+
try {
|
|
107
|
+
process.kill(pid, 0);
|
|
108
|
+
isAlive = true;
|
|
109
|
+
} catch (error) {
|
|
110
|
+
const code = (error as NodeJS.ErrnoException).code;
|
|
111
|
+
// Any kill() error (EPERM and ESRCH alike; `code` is not inspected) → treat as not-alive (stealable), see isLockHolderAlive.
|
|
112
|
+
isAlive = false;
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
} catch { /* malformed payload — keep isAlive=true */ }
|
|
116
|
+
// Steal if stale OR holder dead — matches the original intent.
|
|
117
|
+
return { canSteal: isStale || !isAlive };
|
|
118
|
+
}
|
|
119
|
+
|
|
69
120
|
/**
|
|
70
121
|
* Lock file kinds. Discriminator written to the lock file payload so that:
|
|
71
122
|
* - Debugging tools (e.g. a future `pi-crew locks` command) can identify
|
|
@@ -180,9 +231,10 @@ function acquireLockWithRetry(filePath: string, staleMs: number, kind: LockKind
|
|
|
180
231
|
if (Date.now() > deadline) {
|
|
181
232
|
throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
|
|
182
233
|
}
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
234
|
+
// Round 26 (BUG 1): single-snapshot read closes the TOCTOU window between
|
|
235
|
+
// separate stale + alive reads (which could race stale→fresh).
|
|
236
|
+
const { canSteal } = readLockSnapshot(filePath, staleMs);
|
|
237
|
+
if (!canSteal) {
|
|
186
238
|
throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
|
|
187
239
|
}
|
|
188
240
|
// Stale or dead holder — forcibly remove the lock.
|
|
@@ -213,9 +265,9 @@ async function acquireLockWithRetryAsync(filePath: string, staleMs: number, kind
|
|
|
213
265
|
if (Date.now() > deadline) {
|
|
214
266
|
throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
|
|
215
267
|
}
|
|
216
|
-
|
|
217
|
-
const
|
|
218
|
-
if (!
|
|
268
|
+
// Round 26 (BUG 1): single-snapshot read (see sync variant).
|
|
269
|
+
const { canSteal } = readLockSnapshot(filePath, staleMs);
|
|
270
|
+
if (!canSteal) {
|
|
219
271
|
throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
|
|
220
272
|
}
|
|
221
273
|
// Stale or dead holder — forcibly remove the lock.
|
|
@@ -244,16 +296,14 @@ export function withFileLockSync<T>(filePath: string, fn: () => T, options: RunL
|
|
|
244
296
|
// Between mkdir and lock acquisition, an attacker could plant a symlink.
|
|
245
297
|
if (!isSymlinkSafePath(path.dirname(lockFile))) throw new Error("Refusing: parent of lock directory is a symlink");
|
|
246
298
|
fs.mkdirSync(path.dirname(lockFile), { recursive: true });
|
|
247
|
-
//
|
|
248
|
-
//
|
|
249
|
-
//
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
try { fs.rmSync(lockFile, { force: true }); } catch { /* ignore */ }
|
|
256
|
-
}
|
|
299
|
+
// Round 26 (BUG 2): REMOVED the pre-acquisition target-file-existence check.
|
|
300
|
+
// It was racy — between statSync(target) and acquire, a concurrent process
|
|
301
|
+
// could acquire the lock to CREATE the target, and we'd delete its active
|
|
302
|
+
// lock. It was also actively wrong for callers that pass a path already
|
|
303
|
+
// ending in `.lock` (config.ts: the checked "target" never exists, so the
|
|
304
|
+
// cleanup ALWAYS fired, deleting a fresh concurrent holder's lock). Genuine
|
|
305
|
+
// orphan locks (crashed holder) are reclaimed by acquireLockWithRetry's
|
|
306
|
+
// staleMs-based steal logic after at most `staleMs`.
|
|
257
307
|
// FIX (TOCTOU): Re-validate symlink safety before each lock acquisition
|
|
258
308
|
// attempt. Between our initial check and the acquisition (and between
|
|
259
309
|
// acquireLockWithRetry's internal retries), an attacker could plant a
|
package/src/state/state-store.ts
CHANGED
|
@@ -57,7 +57,7 @@ export interface RunPaths {
|
|
|
57
57
|
eventsPath: string;
|
|
58
58
|
}
|
|
59
59
|
|
|
60
|
-
interface ManifestCacheEntry {
|
|
60
|
+
export interface ManifestCacheEntry {
|
|
61
61
|
manifest: TeamRunManifest;
|
|
62
62
|
tasks: TeamTaskState[];
|
|
63
63
|
manifestMtimeMs: number;
|
|
@@ -76,6 +76,19 @@ const MANIFEST_CACHE_TTL_MS = 15 * 1000; // 15 seconds (FIX: increased from 5s f
|
|
|
76
76
|
const LOAD_MANIFEST_RETRY_LIMIT = 5; // Configurable retry limit for mtime/size stability checks under contention
|
|
77
77
|
const manifestCache = new Map<string, ManifestCacheEntry>();
|
|
78
78
|
|
|
79
|
+
/** @internal — exported for TTL-eviction unit testing (Round 19). */
|
|
80
|
+
export function __test__setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
|
|
81
|
+
setManifestCache(stateRoot, entry);
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/** @internal — exported for TTL-eviction unit testing (Round 19). */
|
|
85
|
+
export function __test__getManifestCacheEntry(stateRoot: string): ManifestCacheEntry | undefined {
|
|
86
|
+
return manifestCache.get(stateRoot);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/** @internal — the TTL in ms used for manifest cache eviction. */
|
|
90
|
+
export const MANIFEST_CACHE_TTL_MS_VALUE = MANIFEST_CACHE_TTL_MS;
|
|
91
|
+
|
|
79
92
|
function setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
|
|
80
93
|
if (manifestCache.has(stateRoot)) manifestCache.delete(stateRoot);
|
|
81
94
|
entry.cachedAt = Date.now();
|
package/src/ui/card-colors.ts
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
* 30-35% — word-level emphasis (prominent)
|
|
15
15
|
*/
|
|
16
16
|
import type { CrewTheme } from "./theme-adapter.ts";
|
|
17
|
+
import { visibleWidth as visualWidth } from "../utils/visual.ts";
|
|
17
18
|
|
|
18
19
|
// ── ANSI parsing ────────────────────────────────────────────────────────
|
|
19
20
|
|
|
@@ -96,12 +97,15 @@ export function deriveCardBackground(
|
|
|
96
97
|
|
|
97
98
|
// ── Helpers for padding lines with a background ─────────────────────────
|
|
98
99
|
|
|
99
|
-
const ANSI_SGR_RE = /\x1b\[[0-9;]*m/g;
|
|
100
100
|
const RESET = "\x1b[0m";
|
|
101
101
|
|
|
102
|
-
/** Strip ANSI SGR codes
|
|
102
|
+
/** Strip ANSI SGR codes then compute the VISUAL width (Unicode-aware).
|
|
103
|
+
* Round 23 (BUG 2): previously this used `.length` (UTF-16 code units), which
|
|
104
|
+
* under-counts CJK/emoji → wrong padding → broken frame borders in crew cards.
|
|
105
|
+
* Delegate to the canonical Unicode-aware visualWidth from utils/visual.ts
|
|
106
|
+
* used by every other renderer. */
|
|
103
107
|
export function visibleWidth(text: string): number {
|
|
104
|
-
return text
|
|
108
|
+
return visualWidth(text);
|
|
105
109
|
}
|
|
106
110
|
|
|
107
111
|
/**
|
|
@@ -3,7 +3,9 @@ import { iconForStatus } from "../status-colors.ts";
|
|
|
3
3
|
import type { RunUiSnapshot } from "../snapshot-types.ts";
|
|
4
4
|
import { spinnerFrame } from "../spinner.ts";
|
|
5
5
|
import type { CrewAgentRecord } from "../../runtime/crew-agent-runtime.ts";
|
|
6
|
+
import { formatCost } from "../../state/usage.ts";
|
|
6
7
|
import { listLiveAgents, listLiveAgentsByWorkspace, type LiveAgentHandle } from "../../runtime/live-agent-manager.ts";
|
|
8
|
+
import { computeLiveDurationMs } from "../live-duration.ts";
|
|
7
9
|
|
|
8
10
|
/**
|
|
9
11
|
* Returns true if this agent did real work (LLM call, tool use, or non-trivial duration).
|
|
@@ -82,15 +84,26 @@ export function renderAgentsPane(snapshot: RunUiSnapshot | undefined, options: R
|
|
|
82
84
|
: agent.status === "failed" ? (agent.error ?? "failed")
|
|
83
85
|
: "done";
|
|
84
86
|
|
|
85
|
-
// Stats: tokens + duration
|
|
87
|
+
// Stats: tokens + cost + duration
|
|
86
88
|
const stats: string[] = [];
|
|
87
89
|
const tokenTotal = (agent.usage?.input ?? 0) + (agent.usage?.output ?? 0) + (agent.usage?.cacheRead ?? 0) + (agent.usage?.cacheWrite ?? 0);
|
|
88
90
|
if (tokenTotal > 0) {
|
|
89
91
|
const tok = tokenTotal >= 1000 ? `${(tokenTotal / 1000).toFixed(1)}k` : `${tokenTotal}`;
|
|
90
92
|
stats.push(tok);
|
|
91
93
|
}
|
|
94
|
+
// Per-agent cost (Round 17 BS-1): the data is already on task.usage.cost;
|
|
95
|
+
// surface it live so the user sees $ burn per agent during a run.
|
|
96
|
+
if (agent.usage?.cost && agent.usage.cost > 0) {
|
|
97
|
+
stats.push(formatCost(agent.usage.cost));
|
|
98
|
+
}
|
|
92
99
|
if (liveHandle) {
|
|
93
|
-
|
|
100
|
+
// Round 23 (BUG 1): the duration math here was naive —
|
|
101
|
+
// (completedAtMs ?? Date.now()) - startedAtMs
|
|
102
|
+
// which produced a giant NEGATIVE duration whenever startedAtMs was
|
|
103
|
+
// 0/undefined/bad, or a race set completedAtMs < startedAtMs. This
|
|
104
|
+
// fired for EVERY running live agent in the dashboard. Use the shared,
|
|
105
|
+
// validated computeLiveDurationMs (mirrors widget-formatters.ts).
|
|
106
|
+
const ms = computeLiveDurationMs(liveHandle.activity);
|
|
94
107
|
stats.push(`${(ms / 1000).toFixed(1)}s`);
|
|
95
108
|
if (options.showModel !== false && liveHandle.modelName && liveHandle.modelName !== "default") {
|
|
96
109
|
stats.push(liveHandle.modelName);
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Round 23 (BUG 1 fix): live-agent duration computation.
|
|
3
|
+
*
|
|
4
|
+
* The naive `(completedAtMs ?? Date.now()) - startedAtMs` produced giant
|
|
5
|
+
* NEGATIVE durations for every running live agent whenever startedAtMs was
|
|
6
|
+
* 0/undefined/out-of-range, or a race set completedAtMs < startedAtMs.
|
|
7
|
+
*
|
|
8
|
+
* This module consolidates the validated duration math (previously duplicated
|
|
9
|
+
* between widget-formatters.ts and agents-pane.ts) into one pure, fully
|
|
10
|
+
* testable function: it normalizes seconds-vs-ms, sanity-checks the start
|
|
11
|
+
* timestamp against the current time, and never returns a negative value.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
export interface LiveActivity {
|
|
15
|
+
startedAtMs?: number;
|
|
16
|
+
completedAtMs?: number;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/** Normalize a raw timestamp that may be seconds or milliseconds. */
|
|
20
|
+
function toMs(v: number): number {
|
|
21
|
+
if (v <= 0) return 0;
|
|
22
|
+
// 1e9 < seconds < 1e10 → seconds, scale up
|
|
23
|
+
if (v > 1_000_000_000 && v < 10_000_000_000) return v * 1000;
|
|
24
|
+
// 1e11 < ms < 1e13 → already ms
|
|
25
|
+
if (v > 100_000_000_000 && v < 10_000_000_000_000) return v;
|
|
26
|
+
return v;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Compute the live elapsed duration in milliseconds for an agent activity.
|
|
31
|
+
*
|
|
32
|
+
* - Never negative (clamped to >= 0).
|
|
33
|
+
* - Returns 0 if the start timestamp is missing or implausible.
|
|
34
|
+
* - Uses `completedAtMs` when present and sane; otherwise `nowMs` (running).
|
|
35
|
+
*
|
|
36
|
+
* @param activity the live agent activity handle
|
|
37
|
+
* @param nowMs optional override for `Date.now()` (tests / determinism)
|
|
38
|
+
*/
|
|
39
|
+
export function computeLiveDurationMs(activity: LiveActivity, nowMs: number = Date.now()): number {
|
|
40
|
+
const rawStarted = activity.startedAtMs || 0;
|
|
41
|
+
const rawCompleted = activity.completedAtMs || 0;
|
|
42
|
+
const startedMs = toMs(rawStarted);
|
|
43
|
+
const completedMs = rawCompleted > 0 ? toMs(rawCompleted) : 0;
|
|
44
|
+
// A valid start is positive, not more than 1 minute in the future, and not
|
|
45
|
+
// more than ~1000 years in the past (guards against 0 / garbage / clock skew).
|
|
46
|
+
const isValidStarted =
|
|
47
|
+
startedMs > 0 &&
|
|
48
|
+
startedMs < nowMs + 60_000 &&
|
|
49
|
+
startedMs > nowMs - 31_556_926_000_000;
|
|
50
|
+
const end = completedMs > 0 && completedMs < nowMs + 60_000 ? completedMs : nowMs;
|
|
51
|
+
const ms = end - (isValidStarted ? startedMs : nowMs);
|
|
52
|
+
return Number.isFinite(ms) && ms >= 0 ? ms : 0;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/** Format a live duration in seconds, e.g. `12.3s`. Returns `0.0s` for 0. */
|
|
56
|
+
export function formatLiveDuration(activity: LiveActivity, nowMs: number = Date.now()): string {
|
|
57
|
+
return `${(computeLiveDurationMs(activity, nowMs) / 1000).toFixed(1)}s`;
|
|
58
|
+
}
|
package/src/ui/tool-render.ts
CHANGED
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
import { Container, Spacer, Text, visibleWidth } from "@earendil-works/pi-tui";
|
|
11
11
|
import type { CrewAgentRecord } from "../runtime/crew-agent-runtime.ts";
|
|
12
12
|
import { replaceTabs } from "./render-diff.ts";
|
|
13
|
+
import { truncateToWidth } from "../utils/visual.ts";
|
|
13
14
|
|
|
14
15
|
// ── Types ──────────────────────────────────────────────────────────────
|
|
15
16
|
export interface Theme {
|
|
@@ -68,17 +69,12 @@ function formatContextUsage(tokens: number, contextWindow: number | undefined):
|
|
|
68
69
|
|
|
69
70
|
export function truncLine(text: string, maxWidth: number): string {
|
|
70
71
|
if (text.includes("\n") || text.includes("\r")) text = text.replace(/\r?\n/g, "↵ ");
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
}
|
|
78
|
-
if (width >= maxWidth - 1) return result + "…";
|
|
79
|
-
result += text[i]; width++;
|
|
80
|
-
}
|
|
81
|
-
return result;
|
|
72
|
+
// Round 23 (BUG 4): previously this loop counted 1 visual column per UTF-16
|
|
73
|
+
// code unit and indexed text[i], so for CJK it emitted up to 2x the visual
|
|
74
|
+
// width (frame overflow) and for emoji it split surrogate pairs (U+FFFD).
|
|
75
|
+
// Delegate to the grapheme/ANSI-aware truncateToWidth (keeps ANSI codes,
|
|
76
|
+
// respects double-wide CJK + surrogate pairs, adds the '…' ellipsis).
|
|
77
|
+
return truncateToWidth(text, maxWidth);
|
|
82
78
|
}
|
|
83
79
|
|
|
84
80
|
export function formatToolPreview(name: string, args: Record<string, unknown>): string {
|
|
@@ -12,6 +12,7 @@ import type { CrewTheme } from "../theme-adapter.ts";
|
|
|
12
12
|
import { truncLine, formatTokens, formatDuration } from "../tool-render.ts";
|
|
13
13
|
import type { CrewAgentRecord } from "../../runtime/crew-agent-runtime.ts";
|
|
14
14
|
import { isBrief, briefToolResult } from "./brief-mode.ts";
|
|
15
|
+
import { truncateToWidth } from "../../utils/visual.ts";
|
|
15
16
|
|
|
16
17
|
// ── Types ──────────────────────────────────────────────────────────────
|
|
17
18
|
|
|
@@ -42,9 +43,11 @@ function padVisual(str: string, targetWidth: number): string {
|
|
|
42
43
|
/** Truncate a string (which may contain ANSI codes) to a target VISUAL width. */
|
|
43
44
|
function truncVisual(str: string, maxWidth: number): string {
|
|
44
45
|
if (visibleWidth(str) <= maxWidth) return str;
|
|
45
|
-
//
|
|
46
|
-
|
|
47
|
-
|
|
46
|
+
// Round 23 (BUG 3): previously used String.slice(0, maxWidth) which counts
|
|
47
|
+
// UTF-16 code units — for CJK that overflows the card by up to 2x, and for
|
|
48
|
+
// emoji it splits a surrogate pair (U+FFFD). Use the grapheme/ANSI-aware
|
|
49
|
+
// truncateToWidth with empty ellipsis (the caller appends its own '…').
|
|
50
|
+
return truncateToWidth(str, maxWidth, "");
|
|
48
51
|
}
|
|
49
52
|
|
|
50
53
|
// ── Visual primitives ──────────────────────────────────────────────────
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
import type { CrewAgentRecord } from "../../runtime/crew-agent-runtime.ts";
|
|
8
8
|
import type { LiveAgentHandle } from "../../runtime/live-agent-manager.ts";
|
|
9
9
|
import { getTaskUsage } from "../../runtime/usage-tracker.ts";
|
|
10
|
+
import { computeLiveDurationMs } from "../live-duration.ts";
|
|
10
11
|
|
|
11
12
|
// ── Token formatting ──────────────────────────────────────────────────
|
|
12
13
|
|
|
@@ -115,19 +116,7 @@ export function agentStats(agent: CrewAgentRecord, liveHandle?: LiveAgentHandle)
|
|
|
115
116
|
const ctxPct = stats?.contextUsage?.percent;
|
|
116
117
|
if (ctxPct != null) parts.push(`${Math.round(ctxPct)}% ctx`);
|
|
117
118
|
} catch { /* ignore */ }
|
|
118
|
-
const
|
|
119
|
-
const rawCompleted = act.completedAtMs || 0;
|
|
120
|
-
const nowMs = Date.now();
|
|
121
|
-
const toMs = (v: number): number => {
|
|
122
|
-
if (v <= 0) return 0;
|
|
123
|
-
if (v > 1000000000 && v < 10000000000) return v * 1000;
|
|
124
|
-
if (v > 100000000000 && v < 10000000000000) return v;
|
|
125
|
-
return v;
|
|
126
|
-
};
|
|
127
|
-
const startedMs = toMs(rawStarted);
|
|
128
|
-
const completedMs = rawCompleted > 0 ? toMs(rawCompleted) : 0;
|
|
129
|
-
const isValidStarted = startedMs > 0 && startedMs < nowMs + 60000 && startedMs > nowMs - 3155692600000;
|
|
130
|
-
const ms = (completedMs > 0 && completedMs < nowMs + 60000 ? completedMs : nowMs) - (isValidStarted ? startedMs : nowMs);
|
|
119
|
+
const ms = computeLiveDurationMs(act);
|
|
131
120
|
parts.push(`${(ms / 1000).toFixed(1)}s`);
|
|
132
121
|
} else {
|
|
133
122
|
if (agent.toolUses) parts.push(`${agent.toolUses} tools`);
|
package/src/utils/fs-watch.ts
CHANGED
|
@@ -1,9 +1,17 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
|
-
import * as path from "node:path";
|
|
3
2
|
import type { FSWatcher, WatchListener } from "node:fs";
|
|
4
3
|
|
|
5
|
-
/**
|
|
6
|
-
|
|
4
|
+
/**
|
|
5
|
+
* Filesystem watcher helpers (slimmed down — pts/2 hang fix 2026-06-16).
|
|
6
|
+
*
|
|
7
|
+
* The recursive-watcher helpers (createRecursiveWatcher / watchCrewState /
|
|
8
|
+
* runIdFromStateRelativePath) were REMOVED: a recursive fs.watch on the run
|
|
9
|
+
* state tree exploded to O(total run history) inotify watches on Linux and
|
|
10
|
+
* caused a permanent interactive-session busy-loop. The bounded
|
|
11
|
+
* {@link RunWatcherRegistry} (one non-recursive watcher per ACTIVE run) now
|
|
12
|
+
* replaces them. Only the two primitives below survive — they are still used by
|
|
13
|
+
* manifest-cache, result-watcher, and run-watcher-registry.
|
|
14
|
+
*/
|
|
7
15
|
|
|
8
16
|
export function closeWatcher(watcher: FSWatcher | null | undefined): void {
|
|
9
17
|
if (!watcher) {
|
|
@@ -31,60 +39,3 @@ export function watchWithErrorHandler(
|
|
|
31
39
|
return null;
|
|
32
40
|
}
|
|
33
41
|
}
|
|
34
|
-
|
|
35
|
-
/**
|
|
36
|
-
* 1.3 — Watch a directory recursively and invoke `onChange` when any file
|
|
37
|
-
* inside changes. Falls back to `null` on systems where `fs.watch` rejects
|
|
38
|
-
* recursive mode (e.g., Linux when running on older kernels via FUSE/network FS).
|
|
39
|
-
*
|
|
40
|
-
* Callers MUST handle null and fall back to polling. The watcher emits the
|
|
41
|
-
* filename relative to `rootDir` (forward-slash normalised on Windows).
|
|
42
|
-
*/
|
|
43
|
-
export function createRecursiveWatcher(
|
|
44
|
-
rootDir: string,
|
|
45
|
-
onChange: (relativePath: string) => void,
|
|
46
|
-
onError: (error: unknown) => void,
|
|
47
|
-
): FSWatcher | null {
|
|
48
|
-
try {
|
|
49
|
-
if (!fs.existsSync(rootDir)) fs.mkdirSync(rootDir, { recursive: true });
|
|
50
|
-
const watcher = fs.watch(rootDir, { recursive: true }, (_eventType, filename) => {
|
|
51
|
-
if (typeof filename !== "string" || filename.length === 0) return;
|
|
52
|
-
onChange(filename.replace(/\\/g, "/"));
|
|
53
|
-
});
|
|
54
|
-
watcher.on("error", (error) => {
|
|
55
|
-
try { watcher.close(); } catch { /* ignore */ }
|
|
56
|
-
onError(error);
|
|
57
|
-
});
|
|
58
|
-
return watcher;
|
|
59
|
-
} catch (error) {
|
|
60
|
-
onError(error);
|
|
61
|
-
return null;
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
/**
|
|
66
|
-
* Given a path relative to `<crewRoot>/state`, return the runId that owns
|
|
67
|
-
* the change, or undefined if the path doesn't match any tracked run layout.
|
|
68
|
-
*/
|
|
69
|
-
export function runIdFromStateRelativePath(relativePath: string): string | undefined {
|
|
70
|
-
const parts = relativePath.split("/");
|
|
71
|
-
// Layout is `runs/{runId}/...` — see docs/architecture.md state layer.
|
|
72
|
-
if (parts.length >= 2 && parts[0] === "runs" && parts[1]) return parts[1];
|
|
73
|
-
return undefined;
|
|
74
|
-
}
|
|
75
|
-
|
|
76
|
-
/** Convenience: combine the two helpers for `<crewRoot>/state` watching. */
|
|
77
|
-
export function watchCrewState(
|
|
78
|
-
stateDir: string,
|
|
79
|
-
onRunChange: (runId: string) => void,
|
|
80
|
-
onError: (error: unknown) => void,
|
|
81
|
-
): FSWatcher | null {
|
|
82
|
-
return createRecursiveWatcher(stateDir, (relativePath) => {
|
|
83
|
-
const runId = runIdFromStateRelativePath(relativePath);
|
|
84
|
-
if (runId) onRunChange(runId);
|
|
85
|
-
}, onError);
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
// Re-export path helper so callers don't pull node:path just for join.
|
|
89
|
-
/** @internal */
|
|
90
|
-
const joinPath = path.join;
|