pi-crew 0.7.4 → 0.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/CHANGELOG.md +79 -0
  2. package/README.md +11 -11
  3. package/docs/commands-reference.md +14 -10
  4. package/docs/troubleshooting.md +131 -0
  5. package/docs/usage.md +9 -4
  6. package/package.json +1 -1
  7. package/src/config/config.ts +11 -4
  8. package/src/config/types.ts +2 -0
  9. package/src/errors.ts +66 -0
  10. package/src/extension/action-suggestions.ts +71 -0
  11. package/src/extension/context-status-injection.ts +174 -0
  12. package/src/extension/knowledge-injection.ts +29 -1
  13. package/src/extension/register.ts +81 -65
  14. package/src/extension/team-tool/api.ts +3 -2
  15. package/src/extension/team-tool/cancel.ts +5 -4
  16. package/src/extension/team-tool/explain.ts +2 -1
  17. package/src/extension/team-tool/failure-patterns.ts +124 -0
  18. package/src/extension/team-tool/inspect.ts +10 -6
  19. package/src/extension/team-tool/lifecycle-actions.ts +5 -4
  20. package/src/extension/team-tool/respond.ts +4 -3
  21. package/src/extension/team-tool/run-not-found.ts +54 -0
  22. package/src/extension/team-tool/run.ts +26 -4
  23. package/src/extension/team-tool/status.ts +58 -4
  24. package/src/extension/team-tool.ts +5 -3
  25. package/src/runtime/async-runner.ts +7 -0
  26. package/src/runtime/background-runner.ts +7 -1
  27. package/src/runtime/chain-parser.ts +13 -5
  28. package/src/runtime/checkpoint.ts +13 -1
  29. package/src/runtime/child-pi.ts +9 -1
  30. package/src/runtime/live-session-runtime.ts +15 -1
  31. package/src/runtime/parent-guard.ts +2 -2
  32. package/src/runtime/pipeline-runner.ts +3 -1
  33. package/src/runtime/stale-reconciler.ts +28 -4
  34. package/src/runtime/task-runner.ts +50 -20
  35. package/src/runtime/team-runner.ts +19 -2
  36. package/src/runtime/verification-gates.ts +21 -1
  37. package/src/runtime/workspace-tree.ts +28 -2
  38. package/src/schema/team-tool-schema.ts +9 -0
  39. package/src/state/blob-store.ts +12 -10
  40. package/src/state/event-log-rotation.ts +114 -93
  41. package/src/state/event-log.ts +83 -23
  42. package/src/state/health-store.ts +6 -1
  43. package/src/state/locks.ts +66 -16
  44. package/src/state/state-store.ts +46 -2
  45. package/src/ui/card-colors.ts +7 -3
  46. package/src/ui/dashboard-panes/agents-pane.ts +15 -2
  47. package/src/ui/live-duration.ts +58 -0
  48. package/src/ui/tool-render.ts +7 -11
  49. package/src/ui/tool-renderers/index.ts +6 -3
  50. package/src/ui/widget/widget-formatters.ts +2 -13
  51. package/src/utils/fs-watch.ts +11 -60
  52. package/src/utils/run-watcher-registry.ts +164 -0
  53. package/src/workflows/discover-workflows.ts +2 -1
  54. package/src/workflows/workflow-config.ts +5 -0
  55. package/src/runtime/dynamic-script-runner.ts +0 -497
  56. package/src/runtime/sandbox.ts +0 -335
@@ -3,12 +3,13 @@ import * as fs from "node:fs";
3
3
  import * as path from "node:path";
4
4
  import { DEFAULT_EVENT_LOG } from "../config/defaults.ts";
5
5
  import { atomicWriteFile } from "./atomic-write.ts";
6
+ import { errors } from "../errors.ts";
6
7
  import { emitFromTeamEvent } from "../ui/run-event-bus.ts";
7
8
  import { logInternalError } from "../utils/internal-error.ts";
8
9
  import { readJsonlSince, type IncrementalReadState } from "../utils/incremental-reader.ts";
9
10
  import { redactSecrets } from "../utils/redaction.ts";
10
11
  import { sleepSync } from "../utils/sleep.ts";
11
- import { needsRotation, compactEventLog, rotateEventLog } from "./event-log-rotation.ts";
12
+ import { needsRotation, compactEventLog, rotateEventLog, applyCompactionUnlocked, prepareCompaction, rotateEventLogUnlocked } from "./event-log-rotation.ts";
12
13
 
13
14
  export type TeamEventProvenance = "live_worker" | "test" | "healthcheck" | "replay" | "api" | "background" | "team_runner";
14
15
  export type TeamWatcherAction = "act" | "observe" | "ignore";
@@ -75,7 +76,7 @@ let overflowCounter = 0;
75
76
  * `flushOneEventLogBuffer`, and `state/mailbox.ts`. Prefer the async alternative
76
77
  * (`appendEventAsync`) for all new code.
77
78
  */
78
- export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
79
+ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T, options?: { timeoutMs?: number; staleMs?: number }): T {
79
80
  // Ensure parent directory exists before attempting lock
80
81
  fs.mkdirSync(path.dirname(eventsPath), { recursive: true });
81
82
  const lockDir = `${eventsPath}.lock`;
@@ -85,8 +86,8 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
85
86
  // event loop indefinitely. 500 retries × 10ms = 5s max. After timeout, we
86
87
  // throw a clear error instead of blocking forever. This ensures AbortSignal
87
88
  // handlers, SIGTERM, and graceful shutdown can fire within seconds.
88
- const timeout = 5000;
89
- const staleMs = 10000;
89
+ const timeout = options?.timeoutMs ?? 5000;
90
+ const staleMs = options?.staleMs ?? 10000;
90
91
  let acquired = false;
91
92
  while (true) {
92
93
  try {
@@ -105,28 +106,39 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
105
106
  // SECURITY (HIGH #2 fix): Throw instead of continuing without lock.
106
107
  // Previously this logged and broke out of the loop, executing the
107
108
  // operation without lock protection. Now we throw so callers can retry.
108
- throw new Error(
109
- `Event log lock timeout for ${eventsPath}: could not acquire lock within ${timeout}ms`,
110
- );
109
+ // E1 (Round 15): structured CrewError (E010) with help hint so users know
110
+ // to check for orphaned .lock dirs / stale processes.
111
+ throw errors.eventLogLockTimeout(eventsPath, timeout);
111
112
  }
112
- // Stale detection: if the owning process is dead, remove the stale lock.
113
+ // Round 26 (BUG 3): mtime-based stale check INDEPENDENT of pidFile.
114
+ // If the holder crashed between mkdir and writing pidFile, there is no
115
+ // pidFile to read — the old code just slept until the 5s timeout, then
116
+ // threw, leaving the dir orphaned FOREVER (every retry repeats the
117
+ // timeout). Now: if the lock dir's mtime exceeds staleMs, reclaim it.
118
+ try {
119
+ const dirStat = fs.statSync(lockDir);
120
+ if (Date.now() - dirStat.mtimeMs > staleMs) {
121
+ fs.rmSync(lockDir, { recursive: true, force: true });
122
+ continue;
123
+ }
124
+ } catch { /* dir vanished — let loop retry */ }
125
+ // Round 26 (BUG 4): the mtime check was previously NESTED inside
126
+ // `if (!alive)`, so a recycled PID (crashed holder's PID reused by an
127
+ // unrelated live process) kept `alive=true` and the mtime check NEVER
128
+ // fired → permanent wedge. mtime is now checked FIRST (above) for ALL
129
+ // holders. The PID check below is a secondary fast-path: if the holder
130
+ // PID is provably dead AND the lock isn't stale yet, we still wait
131
+ // (don't steal a fresh lock just because the pid lookup raced).
113
132
  try {
114
133
  const raw = fs.readFileSync(pidFile, "utf-8").trim();
115
134
  const ownerPid = Number.parseInt(raw, 10);
116
135
  if (!Number.isNaN(ownerPid) && ownerPid !== process.pid) {
117
136
  let alive = false;
118
137
  try { process.kill(ownerPid, 0); alive = true; } catch { /* dead */ }
119
- if (!alive) {
120
- try {
121
- const stat = fs.statSync(lockDir);
122
- if (Date.now() - stat.mtimeMs > staleMs) {
123
- fs.rmSync(lockDir, { recursive: true, force: true });
124
- continue;
125
- }
126
- } catch { /* race — let loop sleep */ }
127
- }
138
+ // (mtime already handled above; nothing to do here for dead-but-fresh.)
139
+ void alive;
128
140
  }
129
- } catch { /* no pid file — fall through to sleep */ }
141
+ } catch { /* no pid file — mtime check above already handles it */ }
130
142
  sleepSync(10);
131
143
  }
132
144
  }
@@ -134,7 +146,19 @@ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
134
146
  return fn();
135
147
  } finally {
136
148
  if (acquired) {
137
- try { fs.rmSync(lockDir, { recursive: true, force: true }); } catch { /* best-effort */ }
149
+ // Round 26 (BUG 5): token/PID-guarded release. Previously the release
150
+ // was an UNCONDITIONAL rmSync. If our fn exceeded staleMs, another
151
+ // process could steal our lock (rm our dir, make its own); when our fn
152
+ // finished our finally block would then DELETE THE STEALER's dir → both
153
+ // in the critical section + lost lock. Verify the pidFile still records
154
+ // OUR pid before removing; if it doesn't, the lock was stolen and the
155
+ // current holder owns the dir.
156
+ try {
157
+ const currentPid = fs.readFileSync(pidFile, "utf-8").trim();
158
+ if (currentPid === String(process.pid)) {
159
+ fs.rmSync(lockDir, { recursive: true, force: true });
160
+ }
161
+ } catch { /* lock stolen or already gone — do not touch */ }
138
162
  }
139
163
  }
140
164
  }
@@ -151,6 +175,29 @@ function evictOldestSequenceCacheEntries(): void {
151
175
  }
152
176
  }
153
177
 
178
+ /** @internal — exported for sequence-cache LRU testing (Round 19). */
179
+ export function __test__sequenceCacheSize(): number {
180
+ return sequenceCache.size;
181
+ }
182
+
183
+ /** @internal — seed an entry into the sequence cache for testing. */
184
+ export function __test__seedSequenceCache(eventsPath: string, lastAccessMs: number): void {
185
+ sequenceCache.set(eventsPath, { size: 1, mtimeMs: 0, seq: 0, lastAccessMs });
186
+ }
187
+
188
+ /** @internal — expose eviction for testing. */
189
+ export function __test__evictOldestSequenceCacheEntries(): void {
190
+ evictOldestSequenceCacheEntries();
191
+ }
192
+
193
+ /** @internal — clear the sequence cache. */
194
+ export function __test__clearSequenceCache(): void {
195
+ sequenceCache.clear();
196
+ }
197
+
198
+ /** @internal — the max sequence cache entries bound. */
199
+ export const MAX_SEQUENCE_CACHE_ENTRIES_VALUE = MAX_SEQUENCE_CACHE_ENTRIES;
200
+
154
201
  export function sequencePath(eventsPath: string): string {
155
202
  return `${eventsPath}.seq`;
156
203
  }
@@ -496,9 +543,14 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
496
543
  if (!isTerminal && fs.existsSync(eventsPath)) {
497
544
  const stat = fs.statSync(eventsPath);
498
545
  if (stat.size > MAX_EVENTS_BYTES) {
499
- // Try immediate compact (not waiting for counter % 100)
546
+ // Try immediate compact (not waiting for counter % 100).
547
+ // Round 24 (BUG 1): we are INSIDE withEventLogLockSync. Use the unlocked
548
+ // apply/rotate cores — the locked variants would deadlock (mkdir lock
549
+ // is not re-entrant → 5s timeout → compaction/rotation never ran →
550
+ // unbounded log growth → events silently dropped past 50MB).
500
551
  try {
501
- compactEventLog(eventsPath);
552
+ const prepared = prepareCompaction(eventsPath);
553
+ if (prepared) applyCompactionUnlocked(eventsPath, prepared);
502
554
  } catch (error) {
503
555
  logInternalError("event-log.immediate-compact", error, `eventsPath=${eventsPath}`);
504
556
  }
@@ -506,7 +558,7 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
506
558
  if (fs.existsSync(eventsPath)) {
507
559
  const afterCompact = fs.statSync(eventsPath);
508
560
  if (afterCompact.size > MAX_EVENTS_BYTES) {
509
- rotateEventLog(eventsPath);
561
+ rotateEventLogUnlocked(eventsPath);
510
562
  }
511
563
  }
512
564
  }
@@ -554,7 +606,15 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
554
606
  }
555
607
  appendCounter++;
556
608
  if (appendCounter % 100 === 0 && needsRotation(eventsPath)) {
557
- try { compactEventLog(eventsPath); } catch (error) { logInternalError("event-log.rotation", error, `eventsPath=${eventsPath}`); }
609
+ // Round 24 (BUG 1): we are INSIDE withEventLogLockSync here (called via
610
+ // appendEventInsideLock). The mkdir lock is NOT re-entrant, so calling the
611
+ // locked compactEventLog would deadlock → 5s timeout → compaction never
612
+ // ran → unbounded log growth → events silently dropped past 50MB. Use the
613
+ // unlocked apply path instead (lock already held).
614
+ try {
615
+ const prepared = prepareCompaction(eventsPath);
616
+ if (prepared) applyCompactionUnlocked(eventsPath, prepared);
617
+ } catch (error) { logInternalError("event-log.rotation", error, `eventsPath=${eventsPath}`); }
558
618
  }
559
619
  try { emitFromTeamEvent(fullEvent); } catch (error) { logInternalError("event-log.emit", error); }
560
620
  return fullEvent;
@@ -4,7 +4,12 @@ import type { RunHealth } from "../runtime/task-health.ts";
4
4
  import { computeRunHealth } from "../runtime/task-health.ts";
5
5
  import type { ManifestSummary } from "../runtime/task-health.ts";
6
6
 
7
- const HEALTH_DIR = ".crew/state/health";
7
+ // Relative to the crew root (`<cwd>/.crew`). BUG A fix (pts/2 hang
8
+ // investigation 2026-06-16): this was `.crew/state/health`, which double-joined
9
+ // to `<crewRoot>/state/.crew/state/health` because the caller passed the state
10
+ // dir (not the crew root). Now the caller passes the real crew root, so this is
11
+ // a plain `state/health` suffix.
12
+ const HEALTH_DIR = "state/health";
8
13
 
9
14
  export interface HealthSnapshot {
10
15
  runId: string;
@@ -66,6 +66,57 @@ function isLockHolderAlive(filePath: string): boolean {
66
66
  }
67
67
  }
68
68
 
69
+ /**
70
+ * Round 26 (BUG 1): read the lock file ONCE and evaluate staleness + holder
71
+ * liveness from that single snapshot.
72
+ *
73
+ * Previously `acquireLockWithRetry` called `isLockStale()` and
74
+ * `isLockHolderAlive()` separately, each performing its own `readFileSync`.
75
+ * Between those two reads the lock could transition stale→fresh (old holder
76
+ * released, new holder acquired): isLockStale saw the OLD createdAt → stale,
77
+ * isLockHolderAlive saw the NEW pid → alive, yielding `!stale && alive` =
78
+ * false → we forcibly rm the NEW holder's freshly-acquired lock and take it
79
+ * ourselves → BOTH in the critical section. Reading once closes the window.
80
+ *
81
+ * Returns `{ canSteal: true }` if the lock is stale OR the holder is dead
82
+ * (safe to forcibly remove); `{ canSteal: false }` if it is fresh AND held by
83
+ * a live process (must keep waiting).
84
+ */
85
+ function readLockSnapshot(filePath: string, staleMs: number): { canSteal: boolean } {
86
+ let stat: fs.Stats | undefined;
87
+ let raw: string | undefined;
88
+ try {
89
+ stat = fs.statSync(filePath);
90
+ raw = fs.readFileSync(filePath, "utf-8");
91
+ } catch {
92
+ // File vanished between writeLockFile's EEXIST and now (holder released).
93
+ // Loop will retry the create; safe to signal "nothing to steal".
94
+ return { canSteal: false };
95
+ }
96
+ // Staleness from a single snapshot.
97
+ let createdAt = parseCreatedAtFromLock(raw);
98
+ if (createdAt === undefined) createdAt = stat.mtimeMs;
99
+ const isStale = Date.now() - createdAt > staleMs;
100
+ // Holder liveness from the SAME snapshot.
101
+ let isAlive = true; // Unknown holder — assume alive to be safe (matches isLockHolderAlive).
102
+ try {
103
+ const parsed = JSON.parse(raw) as { pid?: unknown };
104
+ const pid = typeof parsed.pid === "number" ? parsed.pid : undefined;
105
+ if (pid !== undefined) {
106
+ try {
107
+ process.kill(pid, 0);
108
+ isAlive = true;
109
+ } catch (error) {
110
+ const code = (error as NodeJS.ErrnoException).code;
111
+ // EPERM/ESRCH → treat as not-alive (stealable), see isLockHolderAlive.
112
+ isAlive = false;
113
+ }
114
+ }
115
+ } catch { /* malformed payload — keep isAlive=true */ }
116
+ // Steal if stale OR holder dead — matches the original intent.
117
+ return { canSteal: isStale || !isAlive };
118
+ }
119
+
69
120
  /**
70
121
  * Lock file kinds. Discriminator written to the lock file payload so that:
71
122
  * - Debugging tools (e.g. a future `pi-crew locks` command) can identify
@@ -180,9 +231,10 @@ function acquireLockWithRetry(filePath: string, staleMs: number, kind: LockKind
180
231
  if (Date.now() > deadline) {
181
232
  throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
182
233
  }
183
- const isStale = isLockStale(filePath, staleMs);
184
- const isHolderAlive = isLockHolderAlive(filePath);
185
- if (!isStale && isHolderAlive) {
234
+ // Round 26 (BUG 1): single-snapshot read closes the TOCTOU window between
235
+ // separate stale + alive reads (which could race stale→fresh).
236
+ const { canSteal } = readLockSnapshot(filePath, staleMs);
237
+ if (!canSteal) {
186
238
  throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
187
239
  }
188
240
  // Stale or dead holder — forcibly remove the lock.
@@ -213,9 +265,9 @@ async function acquireLockWithRetryAsync(filePath: string, staleMs: number, kind
213
265
  if (Date.now() > deadline) {
214
266
  throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
215
267
  }
216
- const isStale = isLockStale(filePath, staleMs);
217
- const isHolderAlive = isLockHolderAlive(filePath);
218
- if (!isStale && isHolderAlive) {
268
+ // Round 26 (BUG 1): single-snapshot read (see sync variant).
269
+ const { canSteal } = readLockSnapshot(filePath, staleMs);
270
+ if (!canSteal) {
219
271
  throw new Error(`Run '${path.basename(filePath)}' is locked by another operation.`);
220
272
  }
221
273
  // Stale or dead holder — forcibly remove the lock.
@@ -244,16 +296,14 @@ export function withFileLockSync<T>(filePath: string, fn: () => T, options: RunL
244
296
  // Between mkdir and lock acquisition, an attacker could plant a symlink.
245
297
  if (!isSymlinkSafePath(path.dirname(lockFile))) throw new Error("Refusing: parent of lock directory is a symlink");
246
298
  fs.mkdirSync(path.dirname(lockFile), { recursive: true });
247
- // FIX: Validate that the target file still exists. If it was deleted and
248
- // recreated since the last lock cycle, the old .lock file may be orphaned
249
- // and should not block the new cycle. Clean it up if the target is missing.
250
- try {
251
- fs.statSync(filePath);
252
- } catch {
253
- // Target file doesn't exist clean up any stale .lock file and proceed.
254
- // The lock will be acquired fresh for the new file (if fn creates it).
255
- try { fs.rmSync(lockFile, { force: true }); } catch { /* ignore */ }
256
- }
299
+ // Round 26 (BUG 2): REMOVED the pre-acquisition target-file-existence check.
300
+ // It was racy between statSync(target) and acquire, a concurrent process
301
+ // could acquire the lock to CREATE the target, and we'd delete its active
302
+ // lock. It was also actively wrong for callers that pass a path already
303
+ // ending in `.lock` (config.ts: the checked "target" never exists, so the
304
+ // cleanup ALWAYS fired, deleting a fresh concurrent holder's lock). Genuine
305
+ // orphan locks (crashed holder) are reclaimed by acquireLockWithRetry's
306
+ // staleMs-based steal logic after at most `staleMs`.
257
307
  // FIX (TOCTOU): Re-validate symlink safety before each lock acquisition
258
308
  // attempt. Between our initial check and the acquisition (and between
259
309
  // acquireLockWithRetry's internal retries), an attacker could plant a
@@ -17,6 +17,37 @@ import type { WorkflowConfig } from "../workflows/workflow-config.ts";
17
17
  import { toPiSessionId } from "../utils/session-utils.ts";
18
18
  import { HealthStore } from "./health-store.ts";
19
19
 
20
+ /**
21
+ * stat() the manifest with a brief retry on Windows for the AV-scan window.
22
+ *
23
+ * On the GitHub Actions windows-latest runner, Windows Defender real-time
24
+ * scanning can make a freshly-written manifest.json briefly invisible to
25
+ * statSync (ENOENT) even though the write succeeded and the file is on disk.
26
+ * loadRunManifestById is called right after createRunManifest in tests and in
27
+ * production (e.g. refreshPersistedSubagentRecord), so without a retry the
28
+ * caller sees a phantom "missing" run.
29
+ *
30
+ * On non-Windows, ENOENT means the file genuinely doesn't exist — passthrough
31
+ * (throw immediately) with no retry. On Windows, ENOENT/EPERM/EBUSY/EAGAIN get
32
+ * a handful of short retries (~30ms worst case) before giving up and throwing
33
+ * so the caller's catch returns undefined as before.
34
+ */
35
+ function statManifestWithWindowsRetry(manifestPath: string): fs.Stats {
36
+ if (process.platform !== "win32") return fs.statSync(manifestPath);
37
+ const retryable = new Set(["ENOENT", "EPERM", "EBUSY", "EAGAIN"]);
38
+ for (let attempt = 0; attempt < 5; attempt++) {
39
+ try {
40
+ return fs.statSync(manifestPath);
41
+ } catch (error) {
42
+ const code = (error as NodeJS.ErrnoException).code;
43
+ if (!retryable.has(code ?? "")) throw error;
44
+ const end = Date.now() + Math.min(8, 1 * 2 ** attempt);
45
+ while (Date.now() < end) { /* brief spin to ride out the AV scan window */ }
46
+ }
47
+ }
48
+ return fs.statSync(manifestPath); // last attempt — let caller's catch handle ENOENT
49
+ }
50
+
20
51
  export interface RunPaths {
21
52
  runId: string;
22
53
  stateRoot: string;
@@ -26,7 +57,7 @@ export interface RunPaths {
26
57
  eventsPath: string;
27
58
  }
28
59
 
29
- interface ManifestCacheEntry {
60
+ export interface ManifestCacheEntry {
30
61
  manifest: TeamRunManifest;
31
62
  tasks: TeamTaskState[];
32
63
  manifestMtimeMs: number;
@@ -45,6 +76,19 @@ const MANIFEST_CACHE_TTL_MS = 15 * 1000; // 15 seconds (FIX: increased from 5s f
45
76
  const LOAD_MANIFEST_RETRY_LIMIT = 5; // Configurable retry limit for mtime/size stability checks under contention
46
77
  const manifestCache = new Map<string, ManifestCacheEntry>();
47
78
 
79
+ /** @internal — exported for TTL-eviction unit testing (Round 19). */
80
+ export function __test__setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
81
+ setManifestCache(stateRoot, entry);
82
+ }
83
+
84
+ /** @internal — exported for TTL-eviction unit testing (Round 19). */
85
+ export function __test__getManifestCacheEntry(stateRoot: string): ManifestCacheEntry | undefined {
86
+ return manifestCache.get(stateRoot);
87
+ }
88
+
89
+ /** @internal — the TTL in ms used for manifest cache eviction. */
90
+ export const MANIFEST_CACHE_TTL_MS_VALUE = MANIFEST_CACHE_TTL_MS;
91
+
48
92
  function setManifestCache(stateRoot: string, entry: ManifestCacheEntry): void {
49
93
  if (manifestCache.has(stateRoot)) manifestCache.delete(stateRoot);
50
94
  entry.cachedAt = Date.now();
@@ -506,7 +550,7 @@ export function loadRunManifestById(cwd: string, runId: string): { manifest: Tea
506
550
 
507
551
  let manifestStat: fs.Stats;
508
552
  try {
509
- manifestStat = fs.statSync(manifestPath);
553
+ manifestStat = statManifestWithWindowsRetry(manifestPath);
510
554
  } catch {
511
555
  return undefined;
512
556
  }
@@ -14,6 +14,7 @@
14
14
  * 30-35% — word-level emphasis (prominent)
15
15
  */
16
16
  import type { CrewTheme } from "./theme-adapter.ts";
17
+ import { visibleWidth as visualWidth } from "../utils/visual.ts";
17
18
 
18
19
  // ── ANSI parsing ────────────────────────────────────────────────────────
19
20
 
@@ -96,12 +97,15 @@ export function deriveCardBackground(
96
97
 
97
98
  // ── Helpers for padding lines with a background ─────────────────────────
98
99
 
99
- const ANSI_SGR_RE = /\x1b\[[0-9;]*m/g;
100
100
  const RESET = "\x1b[0m";
101
101
 
102
- /** Strip ANSI SGR codes to get visible character count. */
102
+ /** Strip ANSI SGR codes then compute the VISUAL width (Unicode-aware).
103
+ * Round 23 (BUG 2): previously this used `.length` (UTF-16 code units), which
104
+ * under-counts CJK/emoji → wrong padding → broken frame borders in crew cards.
105
+ * Delegate to the canonical Unicode-aware visualWidth from utils/visual.ts
106
+ * used by every other renderer. */
103
107
  export function visibleWidth(text: string): number {
104
- return text.replace(ANSI_SGR_RE, "").length;
108
+ return visualWidth(text);
105
109
  }
106
110
 
107
111
  /**
@@ -3,7 +3,9 @@ import { iconForStatus } from "../status-colors.ts";
3
3
  import type { RunUiSnapshot } from "../snapshot-types.ts";
4
4
  import { spinnerFrame } from "../spinner.ts";
5
5
  import type { CrewAgentRecord } from "../../runtime/crew-agent-runtime.ts";
6
+ import { formatCost } from "../../state/usage.ts";
6
7
  import { listLiveAgents, listLiveAgentsByWorkspace, type LiveAgentHandle } from "../../runtime/live-agent-manager.ts";
8
+ import { computeLiveDurationMs } from "../live-duration.ts";
7
9
 
8
10
  /**
9
11
  * Returns true if this agent did real work (LLM call, tool use, or non-trivial duration).
@@ -82,15 +84,26 @@ export function renderAgentsPane(snapshot: RunUiSnapshot | undefined, options: R
82
84
  : agent.status === "failed" ? (agent.error ?? "failed")
83
85
  : "done";
84
86
 
85
- // Stats: tokens + duration only
87
+ // Stats: tokens + cost + duration
86
88
  const stats: string[] = [];
87
89
  const tokenTotal = (agent.usage?.input ?? 0) + (agent.usage?.output ?? 0) + (agent.usage?.cacheRead ?? 0) + (agent.usage?.cacheWrite ?? 0);
88
90
  if (tokenTotal > 0) {
89
91
  const tok = tokenTotal >= 1000 ? `${(tokenTotal / 1000).toFixed(1)}k` : `${tokenTotal}`;
90
92
  stats.push(tok);
91
93
  }
94
+ // Per-agent cost (Round 17 BS-1): the data is already on task.usage.cost;
95
+ // surface it live so the user sees $ burn per agent during a run.
96
+ if (agent.usage?.cost && agent.usage.cost > 0) {
97
+ stats.push(formatCost(agent.usage.cost));
98
+ }
92
99
  if (liveHandle) {
93
- const ms = (liveHandle.activity.completedAtMs ?? Date.now()) - liveHandle.activity.startedAtMs;
100
+ // Round 23 (BUG 1): the duration math here was naive —
101
+ // (completedAtMs ?? Date.now()) - startedAtMs
102
+ // which produced a giant NEGATIVE duration whenever startedAtMs was
103
+ // 0/undefined/bad, or a race set completedAtMs < startedAtMs. This
104
+ // fired for EVERY running live agent in the dashboard. Use the shared,
105
+ // validated computeLiveDurationMs (mirrors widget-formatters.ts).
106
+ const ms = computeLiveDurationMs(liveHandle.activity);
94
107
  stats.push(`${(ms / 1000).toFixed(1)}s`);
95
108
  if (options.showModel !== false && liveHandle.modelName && liveHandle.modelName !== "default") {
96
109
  stats.push(liveHandle.modelName);
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Round 23 (BUG 1 fix): live-agent duration computation.
3
+ *
4
+ * The naive `(completedAtMs ?? Date.now()) - startedAtMs` produced giant
5
+ * NEGATIVE durations for every running live agent whenever startedAtMs was
6
+ * 0/undefined/out-of-range, or a race set completedAtMs < startedAtMs.
7
+ *
8
+ * This module consolidates the validated duration math (previously duplicated
9
+ * between widget-formatters.ts and agents-pane.ts) into one pure, fully
10
+ * testable function: it normalizes seconds-vs-ms, sanity-checks the start
11
+ * timestamp against the current time, and never returns a negative value.
12
+ */
13
+
14
+ export interface LiveActivity {
15
+ startedAtMs?: number;
16
+ completedAtMs?: number;
17
+ }
18
+
19
+ /** Normalize a raw timestamp that may be seconds or milliseconds. */
20
+ function toMs(v: number): number {
21
+ if (v <= 0) return 0;
22
+ // 1e9 < seconds < 1e10 → seconds, scale up
23
+ if (v > 1_000_000_000 && v < 10_000_000_000) return v * 1000;
24
+ // 1e11 < ms < 1e13 → already ms
25
+ if (v > 100_000_000_000 && v < 10_000_000_000_000) return v;
26
+ return v;
27
+ }
28
+
29
+ /**
30
+ * Compute the live elapsed duration in milliseconds for an agent activity.
31
+ *
32
+ * - Never negative (clamped to >= 0).
33
+ * - Returns 0 if the start timestamp is missing or implausible.
34
+ * - Uses `completedAtMs` when present and sane; otherwise `nowMs` (running).
35
+ *
36
+ * @param activity the live agent activity handle
37
+ * @param nowMs optional override for `Date.now()` (tests / determinism)
38
+ */
39
+ export function computeLiveDurationMs(activity: LiveActivity, nowMs: number = Date.now()): number {
40
+ const rawStarted = activity.startedAtMs || 0;
41
+ const rawCompleted = activity.completedAtMs || 0;
42
+ const startedMs = toMs(rawStarted);
43
+ const completedMs = rawCompleted > 0 ? toMs(rawCompleted) : 0;
44
+ // A valid start is positive, not more than 1 minute in the future, and not
45
+ // more than ~1000 years in the past (guards against 0 / garbage / clock skew).
46
+ const isValidStarted =
47
+ startedMs > 0 &&
48
+ startedMs < nowMs + 60_000 &&
49
+ startedMs > nowMs - 31_556_926_000_000;
50
+ const end = completedMs > 0 && completedMs < nowMs + 60_000 ? completedMs : nowMs;
51
+ const ms = end - (isValidStarted ? startedMs : nowMs);
52
+ return Number.isFinite(ms) && ms >= 0 ? ms : 0;
53
+ }
54
+
55
+ /** Format a live duration in seconds, e.g. `12.3s`. Returns `0.0s` for 0. */
56
+ export function formatLiveDuration(activity: LiveActivity, nowMs: number = Date.now()): string {
57
+ return `${(computeLiveDurationMs(activity, nowMs) / 1000).toFixed(1)}s`;
58
+ }
@@ -10,6 +10,7 @@
10
10
  import { Container, Spacer, Text, visibleWidth } from "@earendil-works/pi-tui";
11
11
  import type { CrewAgentRecord } from "../runtime/crew-agent-runtime.ts";
12
12
  import { replaceTabs } from "./render-diff.ts";
13
+ import { truncateToWidth } from "../utils/visual.ts";
13
14
 
14
15
  // ── Types ──────────────────────────────────────────────────────────────
15
16
  export interface Theme {
@@ -68,17 +69,12 @@ function formatContextUsage(tokens: number, contextWindow: number | undefined):
68
69
 
69
70
  export function truncLine(text: string, maxWidth: number): string {
70
71
  if (text.includes("\n") || text.includes("\r")) text = text.replace(/\r?\n/g, "↵ ");
71
- if (visibleWidth(text) <= maxWidth) return text;
72
- let result = "", width = 0;
73
- for (let i = 0; i < text.length; i++) {
74
- if (text[i] === "\x1b") {
75
- const m = text.slice(i).match(/^\x1b\[[0-9;]*m/);
76
- if (m) { result += m[0]; i += m[0].length - 1; continue; }
77
- }
78
- if (width >= maxWidth - 1) return result + "…";
79
- result += text[i]; width++;
80
- }
81
- return result;
72
+ // Round 23 (BUG 4): previously this loop counted 1 visual column per UTF-16
73
+ // code unit and indexed text[i], so for CJK it emitted up to 2x the visual
74
+ // width (frame overflow) and for emoji it split surrogate pairs (U+FFFD).
75
+ // Delegate to the grapheme/ANSI-aware truncateToWidth (keeps ANSI codes,
76
+ // respects double-wide CJK + surrogate pairs, adds the '…' ellipsis).
77
+ return truncateToWidth(text, maxWidth);
82
78
  }
83
79
 
84
80
  export function formatToolPreview(name: string, args: Record<string, unknown>): string {
@@ -12,6 +12,7 @@ import type { CrewTheme } from "../theme-adapter.ts";
12
12
  import { truncLine, formatTokens, formatDuration } from "../tool-render.ts";
13
13
  import type { CrewAgentRecord } from "../../runtime/crew-agent-runtime.ts";
14
14
  import { isBrief, briefToolResult } from "./brief-mode.ts";
15
+ import { truncateToWidth } from "../../utils/visual.ts";
15
16
 
16
17
  // ── Types ──────────────────────────────────────────────────────────────
17
18
 
@@ -42,9 +43,11 @@ function padVisual(str: string, targetWidth: number): string {
42
43
  /** Truncate a string (which may contain ANSI codes) to a target VISUAL width. */
43
44
  function truncVisual(str: string, maxWidth: number): string {
44
45
  if (visibleWidth(str) <= maxWidth) return str;
45
- // Strip ANSI to truncate safely, then caller re-colors
46
- const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
47
- return stripped.slice(0, maxWidth);
46
+ // Round 23 (BUG 3): previously used String.slice(0, maxWidth) which counts
47
+ // UTF-16 code units — for CJK that overflows the card by up to 2x, and for
48
+ // emoji it splits a surrogate pair (U+FFFD). Use the grapheme/ANSI-aware
49
+ // truncateToWidth with empty ellipsis (the caller appends its own '…').
50
+ return truncateToWidth(str, maxWidth, "");
48
51
  }
49
52
 
50
53
  // ── Visual primitives ──────────────────────────────────────────────────
@@ -7,6 +7,7 @@
7
7
  import type { CrewAgentRecord } from "../../runtime/crew-agent-runtime.ts";
8
8
  import type { LiveAgentHandle } from "../../runtime/live-agent-manager.ts";
9
9
  import { getTaskUsage } from "../../runtime/usage-tracker.ts";
10
+ import { computeLiveDurationMs } from "../live-duration.ts";
10
11
 
11
12
  // ── Token formatting ──────────────────────────────────────────────────
12
13
 
@@ -115,19 +116,7 @@ export function agentStats(agent: CrewAgentRecord, liveHandle?: LiveAgentHandle)
115
116
  const ctxPct = stats?.contextUsage?.percent;
116
117
  if (ctxPct != null) parts.push(`${Math.round(ctxPct)}% ctx`);
117
118
  } catch { /* ignore */ }
118
- const rawStarted = act.startedAtMs || 0;
119
- const rawCompleted = act.completedAtMs || 0;
120
- const nowMs = Date.now();
121
- const toMs = (v: number): number => {
122
- if (v <= 0) return 0;
123
- if (v > 1000000000 && v < 10000000000) return v * 1000;
124
- if (v > 100000000000 && v < 10000000000000) return v;
125
- return v;
126
- };
127
- const startedMs = toMs(rawStarted);
128
- const completedMs = rawCompleted > 0 ? toMs(rawCompleted) : 0;
129
- const isValidStarted = startedMs > 0 && startedMs < nowMs + 60000 && startedMs > nowMs - 3155692600000;
130
- const ms = (completedMs > 0 && completedMs < nowMs + 60000 ? completedMs : nowMs) - (isValidStarted ? startedMs : nowMs);
119
+ const ms = computeLiveDurationMs(act);
131
120
  parts.push(`${(ms / 1000).toFixed(1)}s`);
132
121
  } else {
133
122
  if (agent.toolUses) parts.push(`${agent.toolUses} tools`);