pi-crew 0.7.5 → 0.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/CHANGELOG.md +51 -0
  2. package/README.md +11 -11
  3. package/docs/commands-reference.md +14 -10
  4. package/docs/troubleshooting.md +131 -0
  5. package/docs/usage.md +9 -4
  6. package/package.json +1 -1
  7. package/src/config/config.ts +11 -4
  8. package/src/extension/action-suggestions.ts +71 -0
  9. package/src/extension/context-status-injection.ts +32 -1
  10. package/src/extension/register.ts +71 -65
  11. package/src/extension/team-tool/api.ts +3 -2
  12. package/src/extension/team-tool/cancel.ts +5 -4
  13. package/src/extension/team-tool/explain.ts +2 -1
  14. package/src/extension/team-tool/failure-patterns.ts +124 -0
  15. package/src/extension/team-tool/inspect.ts +10 -6
  16. package/src/extension/team-tool/lifecycle-actions.ts +5 -4
  17. package/src/extension/team-tool/respond.ts +4 -3
  18. package/src/extension/team-tool/run-not-found.ts +54 -0
  19. package/src/extension/team-tool/run.ts +26 -4
  20. package/src/extension/team-tool/status.ts +58 -4
  21. package/src/extension/team-tool.ts +5 -3
  22. package/src/runtime/async-runner.ts +7 -0
  23. package/src/runtime/background-runner.ts +7 -1
  24. package/src/runtime/chain-parser.ts +13 -5
  25. package/src/runtime/checkpoint.ts +13 -1
  26. package/src/runtime/child-pi.ts +9 -1
  27. package/src/runtime/live-session-runtime.ts +15 -1
  28. package/src/runtime/parent-guard.ts +2 -2
  29. package/src/runtime/stale-reconciler.ts +8 -3
  30. package/src/runtime/task-runner.ts +10 -1
  31. package/src/runtime/team-runner.ts +19 -2
  32. package/src/runtime/verification-gates.ts +21 -1
  33. package/src/schema/team-tool-schema.ts +9 -0
  34. package/src/state/blob-store.ts +12 -10
  35. package/src/state/event-log-rotation.ts +114 -93
  36. package/src/state/event-log.ts +79 -20
  37. package/src/state/health-store.ts +6 -1
  38. package/src/state/locks.ts +66 -16
  39. package/src/state/state-store.ts +14 -1
  40. package/src/ui/card-colors.ts +7 -3
  41. package/src/ui/dashboard-panes/agents-pane.ts +15 -2
  42. package/src/ui/live-duration.ts +58 -0
  43. package/src/ui/tool-render.ts +7 -11
  44. package/src/ui/tool-renderers/index.ts +6 -3
  45. package/src/ui/widget/widget-formatters.ts +2 -13
  46. package/src/utils/fs-watch.ts +11 -60
  47. package/src/utils/run-watcher-registry.ts +164 -0
  48. package/src/workflows/discover-workflows.ts +2 -1
  49. package/src/workflows/workflow-config.ts +5 -0
  50. package/src/runtime/dynamic-script-runner.ts +0 -497
  51. package/src/runtime/sandbox.ts +0 -335
@@ -122,10 +122,10 @@ class ChainParser {
122
122
 
123
123
  parse(): ChainStep[] {
124
124
  const steps: ChainStep[] = [];
125
- steps.push(this.parseStep());
125
+ steps.push(this.parseStep(0));
126
126
  while (this.peek("ARROW")) {
127
127
  this.consume("ARROW");
128
- steps.push(this.parseStep());
128
+ steps.push(this.parseStep(0));
129
129
  }
130
130
  if (this.pos < this.tokens.length) {
131
131
  throw new Error(`Unexpected token '${this.tokens[this.pos]?.value}' at position ${this.pos}`);
@@ -133,16 +133,24 @@ class ChainParser {
133
133
  return steps;
134
134
  }
135
135
 
136
- private parseStep(): ChainStep {
136
+ private parseStep(depth: number = 0): ChainStep {
137
+ // Round 22 (BUG 2): guard against stack overflow on deeply nested input.
138
+ // Without this, a crafted 'parallel(parallel(parallel(...)))' input would
139
+ // recurse unbounded and crash the process with RangeError. Each nesting
140
+ // level needs >=9 chars, so ~130KB could overflow V8's ~15K-frame stack.
141
+ const MAX_CHAIN_NESTING = 100;
142
+ if (depth > MAX_CHAIN_NESTING) {
143
+ throw new Error(`Chain DSL nesting too deep (max ${MAX_CHAIN_NESTING}); likely unbalanced or malicious input`);
144
+ }
137
145
  // Check for parallel(...) construct
138
146
  if (this.peek("NAME", "parallel")) {
139
147
  this.consume("NAME"); // eat "parallel"
140
148
  this.consume("LPAREN");
141
149
  const parallel: ChainStep[] = [];
142
- parallel.push(this.parseStep());
150
+ parallel.push(this.parseStep(depth + 1));
143
151
  while (this.peek("COMMA")) {
144
152
  this.consume("COMMA");
145
- parallel.push(this.parseStep());
153
+ parallel.push(this.parseStep(depth + 1));
146
154
  }
147
155
  this.consume("RPAREN");
148
156
  const step: ChainStep = { name: "parallel", parallel };
@@ -64,7 +64,19 @@ export class FileCheckpointStore implements CheckpointStore {
64
64
  // Atomic write: write to temp file first, then rename, then fsync parent.
65
65
  // This guarantees either the old file or the new file, never a partial
66
66
  // write, even on network filesystems or certain journal modes.
67
- const tmp = path.join(this.checkpointDir(), ".tmp.checkpoint");
67
+ //
68
+ // Round 22 (BUG 1): the temp filename MUST be unique per save call.
69
+ // Previously a fixed '.tmp.checkpoint' was shared across ALL concurrent
70
+ // saves; pi-crew's multi-process architecture (main + detached background
71
+ // workers each checkpointing their own tasks) made this realistic: two
72
+ // processes writing '.tmp.checkpoint' at once → one's rename picks up the
73
+ // other's data (silent corruption) and the second rename hits ENOENT
74
+ // (silent data loss). Including taskId + pid + timestamp + a short random
75
+ // suffix keeps the name unique across processes, tasks, and same-millisecond saves.
76
+ const tmp = path.join(
77
+ this.checkpointDir(),
78
+ `.tmp.${checkpoint.taskId}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2, 8)}`,
79
+ );
68
80
  fs.writeFileSync(tmp, JSON.stringify(checkpoint, null, 2), "utf-8");
69
81
  fs.renameSync(tmp, p);
70
82
  // fsync parent directory to ensure the rename is durable
@@ -628,7 +628,14 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
628
628
  let graceTurns = input.graceTurns;
629
629
  if (graceTurns !== undefined && graceTurns > 1000) graceTurns = 1000;
630
630
  let abortDueToParentSignal = false;
631
- input.signal?.addEventListener("abort", () => { abortDueToParentSignal = true; }, { once: true });
631
+ // Round 27 (BUG 4): extract to a named handler so settle() can remove it.
632
+ // The previous anonymous listener was never removed → on runs with >10
633
+ // tasks sharing one AbortSignal (background-runner), Node emitted
634
+ // MaxListenersExceededWarning and each leaked listener pinned the task's
635
+ // stack frame (abortDueToParentSignal closure) in memory. { once: true }
636
+ // only auto-removes AFTER the signal fires; on normal completion it leaks.
637
+ const onParentAbort = (): void => { abortDueToParentSignal = true; };
638
+ input.signal?.addEventListener("abort", onParentAbort, { once: true });
632
639
  const restartNoResponseTimer = (): void => {
633
640
  if (responseTimeoutMs <= 0) return;
634
641
  if (noResponseTimer) clearTimeout(noResponseTimer);
@@ -747,6 +754,7 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
747
754
  clearChildPiTimeouts();
748
755
  lineObserver.flush();
749
756
  input.signal?.removeEventListener("abort", abort);
757
+ input.signal?.removeEventListener("abort", onParentAbort);
750
758
  try {
751
759
  cleanupTempDir(built.tempDir);
752
760
  } catch (error) {
@@ -384,6 +384,12 @@ export async function runLiveSessionTask(input: LiveSessionSpawnInput): Promise<
384
384
 
385
385
  const agentId = `${input.manifest.runId}:${input.task.id}`;
386
386
 
387
+ // Round 27 (BUG 4): hoisted to function scope so the finally block can remove
388
+ // it. const inside try{} is block-scoped and invisible to finally{}. The
389
+ // handler resolves `session` lazily at call time (it may be assigned later
390
+ // inside the try), so declaring it here is safe.
391
+ let onSignalAbort: (() => void) | undefined;
392
+
387
393
  try {
388
394
  const agentDir = typeof mod.getAgentDir === "function" ? mod.getAgentDir() : undefined;
389
395
  let resourceLoader: unknown;
@@ -545,9 +551,14 @@ export async function runLiveSessionTask(input: LiveSessionSpawnInput): Promise<
545
551
  }
546
552
  });
547
553
  }
554
+ // Round 27 (BUG 4): named abort handler (removed in finally below).
555
+ onSignalAbort = (): void => { void session?.abort?.(); };
548
556
  if (input.signal) {
549
557
  if (input.signal.aborted) await session.abort?.();
550
- else input.signal.addEventListener("abort", () => { void session?.abort?.(); }, { once: true });
558
+ // Round 27 (BUG 4): named handler so the finally block can remove it.
559
+ // The previous anonymous listener leaked on normal completion (only
560
+ // auto-removed by { once: true } AFTER the signal fires).
561
+ else input.signal.addEventListener("abort", onSignalAbort, { once: true });
551
562
  }
552
563
  const effectivePrompt = input.runtimeConfig?.inheritContext === true && input.parentContext ? `${input.parentContext}\n\n---\n# Live Subagent Task\n${input.prompt}` : input.prompt;
553
564
 
@@ -687,6 +698,9 @@ export async function runLiveSessionTask(input: LiveSessionSpawnInput): Promise<
687
698
  // H6: Unsubscribe listeners FIRST before clearing timer to prevent race
688
699
  unsubscribe?.();
689
700
  unsubscribeControlRealtime?.();
701
+ // Round 27 (BUG 4): remove the named abort listener to avoid leaking it
702
+ // on the shared AbortSignal across many live-session tasks.
703
+ if (onSignalAbort) input.signal?.removeEventListener("abort", onSignalAbort);
690
704
  if (controlTimer) clearInterval(controlTimer);
691
705
  streamOut?.close();
692
706
  if (input.signal?.aborted) {
@@ -29,8 +29,8 @@
29
29
  * signal, NOT a security boundary:
30
30
  * - It only causes the (already-compromised) child to exit earlier.
31
31
  * - A truly malicious child can simply not call `startParentGuard()`.
32
- * - Real protection against hostile children comes from the sandbox,
33
- * env-filter allowlist, and redaction — all enforced before spawn.
32
+ * - Real protection against hostile children comes from the env-filter
33
+ * allowlist and redaction — all enforced before spawn.
34
34
  *
35
35
  * The guard exists for the benign case: a parent dies (user closes the
36
36
  * terminal, pi crashes, machine loses power) and we want all detached
@@ -485,9 +485,13 @@ export interface OrphanReconcileResult {
485
485
  */
486
486
  export function reconcileOrphanedTempWorkspaces(
487
487
  now = Date.now(),
488
- options?: { cleanupOrphanedTempDirs?: boolean },
488
+ options?: { cleanupOrphanedTempDirs?: boolean; tmpDir?: string; scanBatchSize?: number },
489
489
  ): OrphanReconcileResult {
490
- const tmpDir = getSafeTempDir();
490
+ // Injectable tmpDir + scanBatchSize for deterministic unit testing
491
+ // (Round 19: tests must not depend on global /tmp cleanliness; the
492
+ // production ORPHAN_TEMP_SCAN_BATCH_SIZE cap could exclude a test's dir
493
+ // when leftover dirs accumulate). Defaults remain getSafeTempDir() + the cap.
494
+ const tmpDir = options?.tmpDir ?? getSafeTempDir();
491
495
  if (!tmpDir) return { repaired: 0, cleanedDirs: 0 };
492
496
  let repaired = 0;
493
497
  let cleanedDirs = 0;
@@ -496,10 +500,11 @@ export function reconcileOrphanedTempWorkspaces(
496
500
  // Sort for deterministic order; cap to ORPHAN_TEMP_SCAN_BATCH_SIZE per
497
501
  // tick to avoid main-thread stalls when /tmp has thousands of
498
502
  // pi-crew-* dirs from past interrupted test runs.
503
+ const scanBatch = options?.scanBatchSize ?? ORPHAN_TEMP_SCAN_BATCH_SIZE;
499
504
  const candidates = entries
500
505
  .filter((e) => e.isDirectory() && e.name.startsWith("pi-crew-"))
501
506
  .sort((a, b) => a.name.localeCompare(b.name))
502
- .slice(0, ORPHAN_TEMP_SCAN_BATCH_SIZE);
507
+ .slice(0, scanBatch);
503
508
  for (const entry of candidates) {
504
509
  if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-"))
505
510
  continue;
@@ -292,7 +292,16 @@ export async function runTeamTask(
292
292
  const exitCode = (err as NodeJS.ErrnoException & { status?: number }).status;
293
293
  // E1 (Round 15): structured CrewError with code E009 + help hint,
294
294
  // instead of a raw Error. Surfaces the script path, exit code, and stderr.
295
- throw errors.preStepFailed(input.step.preStepScript, exitCode, msg);
295
+ // Round 21 (E4): if preStepOptional is set, a failing hook is NON-FATAL.
296
+ // Log a warning + emit a 'warning' event, then proceed without the
297
+ // pre-step output rather than aborting the task (advisory hooks).
298
+ if (input.step.preStepOptional) {
299
+ const warnMsg = `[preStepOptional] pre-step hook '${input.step.preStepScript}' failed (exit ${exitCode ?? "?"}) but preStepOptional=true; continuing without its output.`;
300
+ try { appendEventFireAndForget(manifest.eventsPath, { type: "hook.pre_step_optional_failed", runId: manifest.runId, taskId: task.id, message: warnMsg, data: { script: input.step.preStepScript, exitCode: exitCode ?? null } }); } catch { /* best-effort event log */ }
301
+ preStepOutput = undefined;
302
+ } else {
303
+ throw errors.preStepFailed(input.step.preStepScript, exitCode, msg);
304
+ }
296
305
  }
297
306
  }
298
307
 
@@ -455,6 +455,15 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
455
455
 
456
456
  return result;
457
457
  } catch (error) {
458
+ // Round 27 (BUG 1): the success path calls stopTeamHeartbeat() but this
459
+ // catch path did NOT. The team heartbeat is a non-unref'd setInterval
460
+ // (30s) that deliberately keeps the event loop alive — without this
461
+ // call, a failed team run leaves the interval firing forever and the
462
+ // foreground pi process hangs (never returns to the prompt); in
463
+ // background-runner mode the worker never exits. clearInterval is
464
+ // idempotent so a double-call (if this runs after the success path)
465
+ // is harmless.
466
+ stopTeamHeartbeat();
458
467
  // P1: Catch unhandled errors — ensure manifest/tasks/agents are terminal so they don't stay "running" forever.
459
468
  const message = error instanceof Error ? error.message : String(error);
460
469
  // Reload manifest with lock to avoid stale data overwriting concurrent writes.
@@ -922,8 +931,16 @@ tasks = mergeResult.resultTasks;
922
931
  await saveRunTasksAsync(finalManifest, tasks);
923
932
  });
924
933
  manifest = finalManifest;
925
- // Save health snapshot on run completion
926
- const crewRoot = path.dirname(path.dirname(finalManifest.stateRoot));
934
+ // Save health snapshot on run completion.
935
+ // BUG A (pts/2 hang investigation 2026-06-16): stateRoot = `<crewRoot>/state/runs/<runId>`,
936
+ // so the crew root is THREE dirnames up, not two. Two dirnames gave `<crewRoot>/state`
937
+ // (the state dir), and HealthStore then joined HEALTH_DIR (`.crew/state/health`)
938
+ // onto it → `<crewRoot>/state/.crew/state/health` — a double-joined BOGUS path.
939
+ // That wrote health snapshots to a nonexistent subtree (silently breaking the
940
+ // health feature) AND created junk dirs that the recursive state watcher then
941
+ // attached extra inotify watches to. Fix: compute the real crew root (3 up)
942
+ // and make HEALTH_DIR relative to it.
943
+ const crewRoot = path.dirname(path.dirname(path.dirname(finalManifest.stateRoot)));
927
944
  const healthStore = new HealthStore(crewRoot);
928
945
  healthStore.saveSnapshot({
929
946
  runId: finalManifest.runId,
@@ -57,7 +57,12 @@ export const CARGO_RUST_GATES: Array<{ name: string; command: string; critical:
57
57
  * Execute a single command and capture output.
58
58
  */
59
59
  /** Characters/patterns that indicate dangerous shell metacharacters. */
60
- const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\b(eval|exec)\b|>>|<[^&])/;
60
+ // Round 25 (VULN-3/VULN-4): also block raw newlines (sh -c treats \n as a
61
+ // command separator -> injection) and bare $VARNAME references (can exfiltrate
62
+ // secrets into captured gate output, e.g. `echo $ANTHROPIC_API_KEY`).
63
+ // $+word-char is blocked; special vars like $?/$$/$! are left alone. Built-in
64
+ // gates use only `2>&1` (no $VAR), so this does not break them.
65
+ const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\$\w|\b(eval|exec)\b|>>|<[^^&]|[\r\n])/;
61
66
  // Note: single `>` is NOT blocked here because `2>&1` is a safe redirect used by built-in gates.
62
67
  // `>>` (append) is still blocked. `<` without `&` (input redirect) is still blocked.
63
68
 
@@ -66,7 +71,22 @@ const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\b(eval|exec)\b|>>|<[
66
71
  * Rejects commands with shell metacharacters that could enable injection.
67
72
  * Allows: pipes (|), redirection of stderr (2>&1), and basic npm/cargo/npx commands.
68
73
  */
74
+ /** @internal — exported for injection-guard unit testing (Round 25). */
75
+ export function __test__validateGateCommand(command: string): void {
76
+ validateGateCommand(command);
77
+ }
78
+
69
79
  function validateGateCommand(command: string): void {
80
+ // Round 25 (VULN-3): check the ORIGINAL command for raw newlines BEFORE
81
+ // normalization. The regex below runs on the NORMALIZED command (which
82
+ // collapses \s+ incl. newlines to a single space), so a newline would be
83
+ // hidden from it - but `sh -c` treats a raw newline as a command
84
+ // separator, enabling injection (e.g. `npm test\nrm -rf x`).
85
+ if (/[\r\n]/.test(command)) {
86
+ throw new Error(
87
+ `Security: verification gate command rejected (raw newline - potential command injection): ${JSON.stringify(command)}`,
88
+ );
89
+ }
70
90
  const normalized = command
71
91
  .replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '') // ANSI escape sequences
72
92
  .replace(/[\x00-\x08\x0b\x0c\x0e-\x1f]/g, '') // control chars
@@ -135,6 +135,13 @@ export const TeamToolParams = Type.Object({
135
135
  description: "Run in background when execution support is enabled.",
136
136
  }),
137
137
  ),
138
+ details: Type.Optional(
139
+ Type.Boolean({
140
+ default: true,
141
+ description:
142
+ "(status) Output detail level. true (default) = full status (task graph, agents, effectiveness, events). false = compact summary (status, goal, task counts, and only failed/attention task errors) for quick checks.",
143
+ }),
144
+ ),
138
145
  workspaceMode: Type.Optional(
139
146
  Type.Union([Type.Literal("single"), Type.Literal("worktree")], {
140
147
  description:
@@ -318,6 +325,8 @@ export interface TeamToolParamsValue {
318
325
  taskId?: string;
319
326
  message?: string;
320
327
  async?: boolean;
328
+ /** (status) Output detail level. false = compact summary. Default: true (full). */
329
+ details?: boolean;
321
330
  workspaceMode?: "single" | "worktree";
322
331
  context?: "fresh" | "fork";
323
332
  cwd?: string;
@@ -190,16 +190,18 @@ export function writeBlob(artifactsRoot: string, input: {
190
190
  metadataWritten = true;
191
191
  });
192
192
  } catch (error) {
193
- // Issue 4 fix: Clean up orphaned blob if metadata write fails.
194
- // If metadata write fails (e.g., concurrent conflict), the blob content
195
- // is orphaned since no metadata references it. Clean it up to reclaim space.
196
- // Issue 8 fix: Do NOT delete blob content on metadata failure.
197
- // If metadata write fails due to concurrent conflict (different values),
198
- // the blob content is still valid. Another process has written metadata
199
- // referencing this blob - deleting the blob would orphan their metadata.
200
- // The caller can retry the metadata write if needed.
201
- // However, if metadata was never written (metadataWritten === false),
202
- // the blob is orphaned and should be cleaned up.
193
+ // Round 24 (BUG 4 note): the catch block previously checked
194
+ // `if (!blobContentWritten)` the WRONG variable (the local comment said
195
+ // `metadataWritten === false`). For a CONTENT-ADDRESSED store the blob path
196
+ // is the content hash, so the blob may be referenced by another process's
197
+ // metadata even when OUR metadata write failed (e.g. a concurrent conflict
198
+ // where the peer already wrote metadata for the same hash). Deleting it
199
+ // would orphan their metadata. The safe behavior is therefore to NEVER
200
+ // delete on a metadata write failure and let the periodic
201
+ // cleanupOrphanedBlobs() reclaim genuinely-orphaned blobs. The guard below
202
+ // only removes a blob when its CONTENT was never written (a stray/partial
203
+ // file from a failed content write) — which is the only unambiguously-safe
204
+ // case to clean up here.
203
205
  if (!blobContentWritten) {
204
206
  try { fs.rmSync(blobPath, { force: true }); } catch { /* best-effort */ }
205
207
  }
@@ -1,5 +1,5 @@
1
1
  import * as fs from "node:fs";
2
- import { readEvents } from "./event-log.ts";
2
+ import { readEvents, type TeamEvent } from "./event-log.ts";
3
3
  import { atomicWriteFile } from "./atomic-write.ts";
4
4
  import { logInternalError } from "../utils/internal-error.ts";
5
5
  import { withEventLogLockSync } from "./event-log.ts";
@@ -65,6 +65,25 @@ export interface CompactionResult {
65
65
  * 6. Return compaction stats
66
66
  */
67
67
  export function compactEventLog(eventsPath: string, config?: Partial<RotationConfig>): CompactionResult | undefined {
68
+ const prepared = prepareCompaction(eventsPath, config);
69
+ if (!prepared) return undefined;
70
+ // FIX: Run the write+recover phase under the event-log lock. NOTE(review): the
71
+ // read in prepareCompaction() above happens BEFORE the lock is taken, so events
72
+ // appended between that read and the locked write may still be lost — confirm.
73
+ //
74
+ // NOTE (Round 24 BUG 1): callers ALREADY holding the event-log lock (e.g.
75
+ // appendEventInsideLock in event-log.ts) must call applyCompactionUnlocked
76
+ // directly — calling compactEventLog from inside the lock deadlocks (the
77
+ // mkdir lock is not re-entrant → 5s timeout → compaction never ran → the
78
+ // log grew unbounded until events were silently dropped past 50MB).
79
+ return withEventLogLockSync(eventsPath, () => applyCompactionUnlocked(eventsPath, prepared));
80
+ }
81
+
82
+ /** Round 24 (BUG 1): the lock-free pre-read for compaction. Safe to run
83
+ * outside the lock (read-only). Returns the compacted lines + stats needed
84
+ * for the write phase. */
85
+ export function prepareCompaction(eventsPath: string, config?: Partial<RotationConfig>):
86
+ { lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] } | undefined {
68
87
  if (!fs.existsSync(eventsPath)) return undefined;
69
88
  const cfg = resolveConfig(config);
70
89
  let originalSize: number;
@@ -74,79 +93,73 @@ export function compactEventLog(eventsPath: string, config?: Partial<RotationCon
74
93
  if (originalCount <= cfg.compactToCount) return undefined;
75
94
  const kept = allEvents.slice(-cfg.compactToCount);
76
95
  const lines = kept.map((e) => JSON.stringify(e)).join("\n") + "\n";
96
+ return { lines, originalSize, originalCount, kept };
97
+ }
77
98
 
78
- // FIX: Wrap entire read-compact-write-recover sequence in lock to prevent
79
- // event loss during compaction. Without lock, events can be appended between
80
- // read and write, lost silently.
81
- return withEventLogLockSync(eventsPath, () => {
82
- try {
83
- atomicWriteFile(eventsPath, lines);
84
- } catch (err) {
85
- // Concurrent write conflict — skip compaction this cycle
86
- logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
87
- return undefined;
88
- }
89
- // C2: Re-read to recover any events appended during the compaction window.
90
- // Events appended during the compaction window are preserved because they
91
- // appear in afterWrite and the condition afterWrite.length >= kept.length is
92
- // true, so they are included in the return stats without entering the
93
- // recovery branch.
94
- try {
95
- const afterWrite = readEvents(eventsPath);
96
- // FIX: Check if events were actually lost (afterWrite.length < kept.length)
97
- // rather than using appendedDuringWindow >= 0 which is always true.
98
- // Also use sequence numbers for comparison instead of JSON.stringify
99
- // which is fragile due to key ordering and floating point differences.
100
- if (afterWrite.length >= kept.length) {
101
- // No data loss either events were appended and kept, or nothing happened.
102
- return {
103
- originalSize,
104
- compactedSize: fs.statSync(eventsPath).size,
105
- eventsRemoved: originalCount - kept.length,
106
- eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
107
- };
108
- }
109
- // afterWrite.length < kept.length — events were lost during compaction window.
110
- // Find missing events and re-append them.
111
- // FIX: Use sequence numbers for comparison instead of JSON.stringify.
112
- const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
113
- const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
114
- let recoveredCount = 0;
115
- let recoveryFailed = false;
116
- if (missingEvents.length > 0) {
117
- // BUGFIX (Round 12 C2): the previous loop called atomicWriteFile PER event,
118
- // which REPLACES the entire file each iteration — destroying the
119
- // compacted log and all previously-recovered events, leaving only the
120
- // LAST missing event. FIX: accumulate all missing events into one
121
- // string and append in a single write (appendFileSync appends without
122
- // destroying existing content).
123
- const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
124
- try {
125
- fs.appendFileSync(eventsPath, recoveryLines);
126
- recoveredCount = missingEvents.length;
127
- } catch (err) {
128
- recoveryFailed = true;
129
- logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
130
- }
131
- }
99
+ /** Round 24 (BUG 1): the write+recover phase of compaction. Assumes the
100
+ * caller ALREADY holds the event-log lock (or accepts the unlocked race). */
101
+ export function applyCompactionUnlocked(
102
+ eventsPath: string,
103
+ prepared: { lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] },
104
+ ): CompactionResult | undefined {
105
+ const { lines, originalSize, originalCount, kept } = prepared;
106
+ try {
107
+ atomicWriteFile(eventsPath, lines);
108
+ } catch (err) {
109
+ // Concurrent write conflict — skip compaction this cycle
110
+ logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
111
+ return undefined;
112
+ }
113
+ // C2: Re-read to recover any events appended during the compaction window.
114
+ // Events appended during the compaction window are preserved because they
115
+ // appear in afterWrite and the condition afterWrite.length >= kept.length is
116
+ // true, so they are included in the return stats without entering the
117
+ // recovery branch.
118
+ try {
119
+ const afterWrite = readEvents(eventsPath);
120
+ // FIX: Check if events were actually lost (afterWrite.length < kept.length)
121
+ // rather than using appendedDuringWindow >= 0 which is always true.
122
+ // Also use sequence numbers for comparison instead of JSON.stringify
123
+ // which is fragile due to key ordering and floating point differences.
124
+ if (afterWrite.length >= kept.length) {
132
125
  return {
133
126
  originalSize,
134
127
  compactedSize: fs.statSync(eventsPath).size,
135
128
  eventsRemoved: originalCount - kept.length,
136
- eventsKept: kept.length + recoveredCount,
137
- recoveryFailed,
138
- };
139
- } catch {
140
- // Post-write verification failed — compaction likely succeeded.
141
- const compactedSize = fs.statSync(eventsPath).size;
142
- return {
143
- originalSize,
144
- compactedSize,
145
- eventsRemoved: originalCount - kept.length,
146
- eventsKept: kept.length,
129
+ eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
147
130
  };
148
131
  }
149
- });
132
+ // afterWrite.length < kept.length — events were lost during compaction window.
133
+ const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
134
+ const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
135
+ let recoveredCount = 0;
136
+ let recoveryFailed = false;
137
+ if (missingEvents.length > 0) {
138
+ const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
139
+ try {
140
+ fs.appendFileSync(eventsPath, recoveryLines);
141
+ recoveredCount = missingEvents.length;
142
+ } catch (err) {
143
+ recoveryFailed = true;
144
+ logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
145
+ }
146
+ }
147
+ return {
148
+ originalSize,
149
+ compactedSize: fs.statSync(eventsPath).size,
150
+ eventsRemoved: originalCount - kept.length,
151
+ eventsKept: kept.length + recoveredCount,
152
+ recoveryFailed,
153
+ };
154
+ } catch {
155
+ // Post-write verification failed — compaction likely succeeded.
156
+ return {
157
+ originalSize,
158
+ compactedSize: fs.statSync(eventsPath).size,
159
+ eventsRemoved: originalCount - kept.length,
160
+ eventsKept: kept.length,
161
+ };
162
+ }
150
163
  }
151
164
 
152
165
  /**
@@ -161,33 +174,41 @@ export function rotateEventLog(eventsPath: string): boolean {
161
174
  // FIX: Wrap rotation in lock to prevent race conditions with concurrent readers.
162
175
  // Order of operations: (1) create new empty file, (2) rename old file to archive.
163
176
  // This ensures eventsPath always exists — a reader never sees a missing file.
164
- return withEventLogLockSync(eventsPath, () => {
165
- try {
166
- const ts = new Date().toISOString().replace(/[:.]/g, "-");
167
- let archivePath = `${eventsPath}.${ts}.archive.jsonl`;
168
- // Round 12: avoid timestamp collisions when two rotations happen within
169
- // the same millisecond (copyFileSync would silently overwrite the
170
- // first archive). Append a counter until the path is free.
171
- let collision = 1;
172
- while (fs.existsSync(archivePath)) {
173
- archivePath = `${eventsPath}.${ts}.${collision}.archive.jsonl`;
174
- collision++;
175
- }
176
- // BUGFIX (Round 12 C1): the previous order (atomicWriteFile empty THEN
177
- // rename) destroyed ALL events atomicWriteFile replaces the file
178
- // in place, so the rename then moved an EMPTY file to the archive.
179
- // FIX: copy current content to the archive first (archive is populated,
180
- // original still intact), then truncate the original to empty in place.
181
- // copyFileSync + writeFileSync("") ensures eventsPath ALWAYS exists
182
- // (no missing-file window for concurrent readers).
183
- fs.copyFileSync(eventsPath, archivePath);
184
- fs.writeFileSync(eventsPath, "", "utf-8");
185
- return true;
186
- } catch (error) {
187
- logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
188
- return false;
177
+ //
178
+ // NOTE (Round 24 BUG 1): callers ALREADY holding the lock must call
179
+ // rotateEventLogUnlocked directly — this locked variant is NOT re-entrant.
180
+ return withEventLogLockSync(eventsPath, () => rotateEventLogUnlocked(eventsPath));
181
+ }
182
+
183
+ /** Round 24 (BUG 1): the lock-free core of rotation. Assumes the caller
184
+ * already holds the event-log lock (or accepts the unlocked race). */
185
+ export function rotateEventLogUnlocked(eventsPath: string): boolean {
186
+ if (!fs.existsSync(eventsPath)) return false;
187
+ try {
188
+ const ts = new Date().toISOString().replace(/[:.]/g, "-");
189
+ let archivePath = `${eventsPath}.${ts}.archive.jsonl`;
190
+ // Round 12: avoid timestamp collisions when two rotations happen within
191
+ // the same millisecond (copyFileSync would silently overwrite the
192
+ // first archive). Append a counter until the path is free.
193
+ let collision = 1;
194
+ while (fs.existsSync(archivePath)) {
195
+ archivePath = `${eventsPath}.${ts}.${collision}.archive.jsonl`;
196
+ collision++;
189
197
  }
190
- });
198
+ // BUGFIX (Round 12 C1): the previous order (atomicWriteFile empty THEN
199
+ // rename) destroyed ALL events — atomicWriteFile replaces the file
200
+ // in place, so the rename then moved an EMPTY file to the archive.
201
+ // FIX: copy current content to the archive first (archive is populated,
202
+ // original still intact), then truncate the original to empty in place.
203
+ // copyFileSync + writeFileSync("") ensures eventsPath ALWAYS exists
204
+ // (no missing-file window for concurrent readers).
205
+ fs.copyFileSync(eventsPath, archivePath);
206
+ fs.writeFileSync(eventsPath, "", "utf-8");
207
+ return true;
208
+ } catch (error) {
209
+ logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
210
+ return false;
211
+ }
191
212
  }
192
213
 
193
214
  export interface EventLogStats {