pi-crew 0.7.4 → 0.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/CHANGELOG.md +79 -0
  2. package/README.md +11 -11
  3. package/docs/commands-reference.md +14 -10
  4. package/docs/troubleshooting.md +131 -0
  5. package/docs/usage.md +9 -4
  6. package/package.json +1 -1
  7. package/src/config/config.ts +11 -4
  8. package/src/config/types.ts +2 -0
  9. package/src/errors.ts +66 -0
  10. package/src/extension/action-suggestions.ts +71 -0
  11. package/src/extension/context-status-injection.ts +174 -0
  12. package/src/extension/knowledge-injection.ts +29 -1
  13. package/src/extension/register.ts +81 -65
  14. package/src/extension/team-tool/api.ts +3 -2
  15. package/src/extension/team-tool/cancel.ts +5 -4
  16. package/src/extension/team-tool/explain.ts +2 -1
  17. package/src/extension/team-tool/failure-patterns.ts +124 -0
  18. package/src/extension/team-tool/inspect.ts +10 -6
  19. package/src/extension/team-tool/lifecycle-actions.ts +5 -4
  20. package/src/extension/team-tool/respond.ts +4 -3
  21. package/src/extension/team-tool/run-not-found.ts +54 -0
  22. package/src/extension/team-tool/run.ts +26 -4
  23. package/src/extension/team-tool/status.ts +58 -4
  24. package/src/extension/team-tool.ts +5 -3
  25. package/src/runtime/async-runner.ts +7 -0
  26. package/src/runtime/background-runner.ts +7 -1
  27. package/src/runtime/chain-parser.ts +13 -5
  28. package/src/runtime/checkpoint.ts +13 -1
  29. package/src/runtime/child-pi.ts +9 -1
  30. package/src/runtime/live-session-runtime.ts +15 -1
  31. package/src/runtime/parent-guard.ts +2 -2
  32. package/src/runtime/pipeline-runner.ts +3 -1
  33. package/src/runtime/stale-reconciler.ts +28 -4
  34. package/src/runtime/task-runner.ts +50 -20
  35. package/src/runtime/team-runner.ts +19 -2
  36. package/src/runtime/verification-gates.ts +21 -1
  37. package/src/runtime/workspace-tree.ts +28 -2
  38. package/src/schema/team-tool-schema.ts +9 -0
  39. package/src/state/blob-store.ts +12 -10
  40. package/src/state/event-log-rotation.ts +114 -93
  41. package/src/state/event-log.ts +83 -23
  42. package/src/state/health-store.ts +6 -1
  43. package/src/state/locks.ts +66 -16
  44. package/src/state/state-store.ts +46 -2
  45. package/src/ui/card-colors.ts +7 -3
  46. package/src/ui/dashboard-panes/agents-pane.ts +15 -2
  47. package/src/ui/live-duration.ts +58 -0
  48. package/src/ui/tool-render.ts +7 -11
  49. package/src/ui/tool-renderers/index.ts +6 -3
  50. package/src/ui/widget/widget-formatters.ts +2 -13
  51. package/src/utils/fs-watch.ts +11 -60
  52. package/src/utils/run-watcher-registry.ts +164 -0
  53. package/src/workflows/discover-workflows.ts +2 -1
  54. package/src/workflows/workflow-config.ts +5 -0
  55. package/src/runtime/dynamic-script-runner.ts +0 -497
  56. package/src/runtime/sandbox.ts +0 -335
@@ -2,6 +2,7 @@ import * as fs from "node:fs";
2
2
  import * as os from "node:os";
3
3
  import * as path from "node:path";
4
4
  import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
5
+ import { errors } from "../errors.ts";
5
6
  import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
6
7
  import { checkProcessLiveness } from "./process-status.ts";
7
8
  import { saveRunManifest } from "../state/state-store.ts";
@@ -272,6 +273,23 @@ function getRunningTaskStaleness(
272
273
  /**
273
274
  * Repair a stale run by marking it as failed and cancelling running tasks.
274
275
  */
276
+ /**
277
+ * E3/E1 (Round 15): Build a human-actionable error string for a stale-reconciled
278
+ * task. Explains WHY the run was marked stale (the detected reason) and gives
279
+ * concrete remediation, instead of the bare 'Stale run reconciled: <reason>'.
280
+ * Now returns a structured CrewError (E012) so callers also get a machine-
281
+ * readable code + help hint; `.message` carries the same rich text as before.
282
+ */
283
+ function buildStaleReconcileError(task: TeamTaskState, reason: string): Error {
284
+ const heartbeatAgeSeconds = task.heartbeat?.lastSeenAt ? Math.round((Date.now() - new Date(task.heartbeat.lastSeenAt).getTime()) / 1000) : undefined;
285
+ return errors.runStale(reason, heartbeatAgeSeconds);
286
+ }
287
+
288
+ /** @deprecated use buildStaleReconcileError (returns a structured CrewError). Kept for any external callers. */
289
+ function formatStaleReconcileError(task: TeamTaskState, reason: string): string {
290
+ return buildStaleReconcileError(task, reason).message;
291
+ }
292
+
275
293
  function repairStaleRun(
276
294
  manifest: TeamRunManifest,
277
295
  tasks: TeamTaskState[],
@@ -288,7 +306,8 @@ function repairStaleRun(
288
306
  ...task,
289
307
  status: "cancelled" as const,
290
308
  finishedAt: now,
291
- error: `Stale run reconciled: ${reason}`,
309
+ // E3/E1 (Round 15): structured CrewError (E012) with code + help hint.
310
+ error: buildStaleReconcileError(task, reason).message,
292
311
  };
293
312
  }
294
313
  return task;
@@ -466,9 +485,13 @@ export interface OrphanReconcileResult {
466
485
  */
467
486
  export function reconcileOrphanedTempWorkspaces(
468
487
  now = Date.now(),
469
- options?: { cleanupOrphanedTempDirs?: boolean },
488
+ options?: { cleanupOrphanedTempDirs?: boolean; tmpDir?: string; scanBatchSize?: number },
470
489
  ): OrphanReconcileResult {
471
- const tmpDir = getSafeTempDir();
490
+ // Injectable tmpDir + scanBatchSize for deterministic unit testing
491
+ // (Round 19: tests must not depend on global /tmp cleanliness; the
492
+ // production ORPHAN_TEMP_SCAN_BATCH_SIZE cap could exclude a test's dir
493
+ // when leftover dirs accumulate). Defaults remain os.tmpdir() + the cap.
494
+ const tmpDir = options?.tmpDir ?? getSafeTempDir();
472
495
  if (!tmpDir) return { repaired: 0, cleanedDirs: 0 };
473
496
  let repaired = 0;
474
497
  let cleanedDirs = 0;
@@ -477,10 +500,11 @@ export function reconcileOrphanedTempWorkspaces(
477
500
  // Sort for deterministic order; cap to ORPHAN_TEMP_SCAN_BATCH_SIZE per
478
501
  // tick to avoid main-thread stalls when /tmp has thousands of
479
502
  // pi-crew-* dirs from past interrupted test runs.
503
+ const scanBatch = options?.scanBatchSize ?? ORPHAN_TEMP_SCAN_BATCH_SIZE;
480
504
  const candidates = entries
481
505
  .filter((e) => e.isDirectory() && e.name.startsWith("pi-crew-"))
482
506
  .sort((a, b) => a.name.localeCompare(b.name))
483
- .slice(0, ORPHAN_TEMP_SCAN_BATCH_SIZE);
507
+ .slice(0, scanBatch);
484
508
  for (const entry of candidates) {
485
509
  if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-"))
486
510
  continue;
@@ -11,6 +11,7 @@ import type {
11
11
  VerificationEvidence,
12
12
  } from "../state/types.ts";
13
13
  import { logInternalError } from "../utils/internal-error.ts";
14
+ import { errors } from "../errors.ts";
14
15
  import { writeArtifact } from "../state/artifact-store.ts";
15
16
  import { appendEventAsync, appendEventFireAndForget } from "../state/event-log.ts";
16
17
  import { saveRunManifest } from "../state/state-store.ts";
@@ -288,7 +289,19 @@ export async function runTeamTask(
288
289
  });
289
290
  } catch (err) {
290
291
  const msg = err instanceof Error ? err.message : String(err);
291
- throw new Error(`preStepScript failed: ${input.step.preStepScript}: ${msg}`);
292
+ const exitCode = (err as NodeJS.ErrnoException & { status?: number }).status;
293
+ // E1 (Round 15): structured CrewError with code E009 + help hint,
294
+ // instead of a raw Error. Surfaces the script path, exit code, and stderr.
295
+ // Round 21 (E4): if preStepOptional is set, a failing hook is NON-FATAL.
296
+ // Log a warning + emit a 'warning' event, then proceed without the
297
+ // pre-step output rather than aborting the task (advisory hooks).
298
+ if (input.step.preStepOptional) {
299
+ const warnMsg = `[preStepOptional] pre-step hook '${input.step.preStepScript}' failed (exit ${exitCode ?? "?"}) but preStepOptional=true; continuing without its output.`;
300
+ try { appendEventFireAndForget(manifest.eventsPath, { type: "hook.pre_step_optional_failed", runId: manifest.runId, taskId: task.id, message: warnMsg, data: { script: input.step.preStepScript, exitCode: exitCode ?? null } }); } catch { /* best-effort event log */ }
301
+ preStepOutput = undefined;
302
+ } else {
303
+ throw errors.preStepFailed(input.step.preStepScript, exitCode, msg);
304
+ }
292
305
  }
293
306
  }
294
307
 
@@ -383,6 +396,7 @@ export async function runTeamTask(
383
396
  let lastAgentRecordPersistedAt = 0;
384
397
  let lastHeartbeatPersistedAt = 0;
385
398
  let lastRunProgressPersistedAt = 0;
399
+ let lastTaskProgressPersistedAt = 0;
386
400
  let lastRunProgressSummary: ProgressEventSummary | undefined;
387
401
  const persistHeartbeat = (force = false): void => {
388
402
  const now = Date.now();
@@ -573,26 +587,23 @@ export async function runTeamTask(
573
587
  const eventLine = typeof event === "object" && !Array.isArray(event) ? JSON.stringify(event) : String(event);
574
588
  fs.appendFileSync(bgLogPath, `${eventLine}\n`);
575
589
  }
576
- // Apply agentProgress update first, then persist, then update in-memory array.
577
- // This ensures disk state is always >= in-memory state, preventing
578
- // fresher in-memory state from being lost on crash.
579
- tasks = persistSingleTaskUpdate(manifest, tasks, {
580
- ...task,
581
- agentProgress: applyAgentProgressEvent(
582
- task.agentProgress ?? emptyCrewAgentProgress(),
583
- event,
584
- task.startedAt,
585
- ),
586
- });
587
- task = {
588
- ...task,
589
- agentProgress: applyAgentProgressEvent(
590
- task.agentProgress ?? emptyCrewAgentProgress(),
591
- event,
592
- task.startedAt,
593
- ),
594
- };
590
+ // Always keep in-memory agentProgress fresh (cheap) so the UI/events see
591
+ // the latest progress, but THROTTLE the disk persist. Previously this
592
+ // did a full locked read-parse-write of tasks.json on EVERY child JSON
593
+ // event — a 200-event task produced 200 such cycles (Round 15 P1).
594
+ // Final state is force-flushed on task completion (persistHeartbeat(true)).
595
+ const nextProgress = applyAgentProgressEvent(
596
+ task.agentProgress ?? emptyCrewAgentProgress(),
597
+ event,
598
+ task.startedAt,
599
+ );
600
+ task = { ...task, agentProgress: nextProgress };
595
601
  tasks = updateTask(tasks, task);
602
+ const progressNow = Date.now();
603
+ if (progressNow - lastTaskProgressPersistedAt >= 500) {
604
+ tasks = persistSingleTaskUpdate(manifest, tasks, task);
605
+ lastTaskProgressPersistedAt = progressNow;
606
+ }
596
607
  // Bridge event to UI event bus for near-instant updates
597
608
  const bridgeEvent = bridgeEventFromJsonEvent(
598
609
  manifest.runId,
@@ -701,6 +712,15 @@ export async function runTeamTask(
701
712
  ? childResult.stderr ||
702
713
  `Child Pi exited with ${childResult.exitCode}`
703
714
  : undefined);
715
+ // E1/E7 (Round 15): when the child timed out, surface a structured
716
+ // CrewError (E007) so users get a code + actionable help hint instead
717
+ // of a bare 'no new output for N ms'. We keep .message as the task error.
718
+ if (childResult.exitStatus?.timedOut) {
719
+ error = errors.childTimeout({
720
+ taskId: task.id,
721
+ stderr: childResult.stderr,
722
+ }).message;
723
+ }
704
724
  persistHeartbeat(true);
705
725
  persistChildProgress({ type: "attempt_finished" }, true);
706
726
  const attempt: ModelAttemptSummary = {
@@ -724,6 +744,16 @@ export async function runTeamTask(
724
744
  if (!nextModel || !isRetryableModelFailure(error)) break;
725
745
  logs.push(formatModelAttemptNote(attempt, nextModel), "");
726
746
  }
747
+ // E2 (Round 15): when the fallback chain was used and STILL failed, surface
748
+ // that explicitly. Without this the task error only shows the last
749
+ // attempt's raw failure, so users can't tell whether to fix an API key,
750
+ // upgrade a plan, or change the model config. Include the chain tried +
751
+ // the final reason.
752
+ if (error && modelAttempts.length > 1) {
753
+ // E2/E1 (Round 15): structured CrewError (E008). Build via the factory so
754
+ // the error carries a code + help hint; keep its .message as the task error.
755
+ error = errors.modelExhausted(modelAttempts.map((a) => a.model), error).message;
756
+ }
727
757
  // NEW-8 fix: register all attempt transcripts as artifacts, not just the used one.
728
758
  // Earlier failed attempts' transcripts exist on disk but were invisible to the artifact system.
729
759
  const successfulAttemptIndex = modelAttempts.findIndex(
@@ -455,6 +455,15 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
455
455
 
456
456
  return result;
457
457
  } catch (error) {
458
+ // Round 27 (BUG 1): the success path calls stopTeamHeartbeat() but this
459
+ // catch path did NOT. The team heartbeat is a non-unref'd setInterval
460
+ // (30s) that deliberately keeps the event loop alive — without this
461
+ // call, a failed team run leaves the interval firing forever and the
462
+ // foreground pi process hangs (never returns to the prompt); in
463
+ // background-runner mode the worker never exits. clearInterval is
464
+ // idempotent so a double-call (if this runs after the success path)
465
+ // is harmless.
466
+ stopTeamHeartbeat();
458
467
  // P1: Catch unhandled errors — ensure manifest/tasks/agents are terminal so they don't stay "running" forever.
459
468
  const message = error instanceof Error ? error.message : String(error);
460
469
  // Reload manifest with lock to avoid stale data overwriting concurrent writes.
@@ -922,8 +931,16 @@ tasks = mergeResult.resultTasks;
922
931
  await saveRunTasksAsync(finalManifest, tasks);
923
932
  });
924
933
  manifest = finalManifest;
925
- // Save health snapshot on run completion
926
- const crewRoot = path.dirname(path.dirname(finalManifest.stateRoot));
934
+ // Save health snapshot on run completion.
935
+ // BUG A (pts/2 hang investigation 2026-06-16): stateRoot = `<crewRoot>/state/runs/<runId>`,
936
+ // so the crew root is THREE dirnames up, not two. Two dirnames gave `<crewRoot>/state`
937
+ // (the state dir), and HealthStore then joined HEALTH_DIR (`.crew/state/health`)
938
+ // onto it → `<crewRoot>/state/.crew/state/health` — a double-joined BOGUS path.
939
+ // That wrote health snapshots to a nonexistent subtree (silently breaking the
940
+ // health feature) AND created junk dirs that the recursive state watcher then
941
+ // attached extra inotify watches to. Fix: compute the real crew root (3 up)
942
+ // and make HEALTH_DIR relative to it.
943
+ const crewRoot = path.dirname(path.dirname(path.dirname(finalManifest.stateRoot)));
927
944
  const healthStore = new HealthStore(crewRoot);
928
945
  healthStore.saveSnapshot({
929
946
  runId: finalManifest.runId,
@@ -57,7 +57,12 @@ export const CARGO_RUST_GATES: Array<{ name: string; command: string; critical:
57
57
  * Execute a single command and capture output.
58
58
  */
59
59
  /** Characters/patterns that indicate dangerous shell metacharacters. */
60
- const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\b(eval|exec)\b|>>|<[^&])/;
60
+ // Round 25 (VULN-3/VULN-4): also block raw newlines (sh -c treats \n as a
61
+ // command separator -> injection) and bare $VARNAME references (can exfiltrate
62
+ // secrets into captured gate output, e.g. `echo $ANTHROPIC_API_KEY`).
63
+ // $+word-char is blocked; special vars like $?/$$/$! are left alone. Built-in
64
+ // gates use only `2>&1` (no $VAR), so this does not break them.
65
+ const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\$\w|\b(eval|exec)\b|>>|<[^^&]|[\r\n])/;
61
66
  // Note: single `>` is NOT blocked here because `2>&1` is a safe redirect used by built-in gates.
62
67
  // `>>` (append) is still blocked. `<` without `&` (input redirect) is still blocked.
63
68
 
@@ -66,7 +71,22 @@ const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\b(eval|exec)\b|>>|<[
66
71
  * Rejects commands with shell metacharacters that could enable injection.
67
72
  * Allows: pipes (|), redirection of stderr (2>&1), and basic npm/cargo/npx commands.
68
73
  */
74
+ /** @internal — exported for injection-guard unit testing (Round 25). */
75
+ export function __test__validateGateCommand(command: string): void {
76
+ validateGateCommand(command);
77
+ }
78
+
69
79
  function validateGateCommand(command: string): void {
80
+ // Round 25 (VULN-3): check the ORIGINAL command for raw newlines BEFORE
81
+ // normalization. The regex below runs on the NORMALIZED command (which
82
+ // collapses \s+ incl. newlines to a single space), so a newline would be
83
+ // hidden from it - but `sh -c` treats a raw newline as a command
84
+ // separator, enabling injection (e.g. `npm test\nrm -rf x`).
85
+ if (/[\r\n]/.test(command)) {
86
+ throw new Error(
87
+ `Security: verification gate command rejected (raw newline - potential command injection): ${JSON.stringify(command)}`,
88
+ );
89
+ }
70
90
  const normalized = command
71
91
  .replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '') // ANSI escape sequences
72
92
  .replace(/[\x00-\x08\x0b\x0c\x0e-\x1f]/g, '') // control chars
@@ -252,7 +252,7 @@ function applyLineCap(
252
252
  return { lines: kept, elided: removable.length };
253
253
  }
254
254
 
255
- // ── Public API ─────────────────────────────────────────────────────────
255
+ // ── Public API ────────────────────────────────────────────────────────
256
256
 
257
257
  const emptyResult = (rootPath: string): WorkspaceTree => ({
258
258
  rootPath,
@@ -261,11 +261,35 @@ const emptyResult = (rootPath: string): WorkspaceTree => ({
261
261
  totalLines: 0,
262
262
  });
263
263
 
264
+ /**
265
+ * Per-cwd TTL cache for the rendered workspace tree. Workers in the same run
266
+ * share a cwd, so the recursive walk was previously repeated once per task
267
+ * (Round 15 P4). The tree is informational context for the worker; short-lived
268
+ * staleness is acceptable, so a 30s TTL is safe and keeps prompts fresh during
269
+ * long active runs while eliminating redundant walks.
270
+ */
271
+ const TREE_CACHE_TTL_MS = 30_000;
272
+ interface CachedTree {
273
+ tree: WorkspaceTree;
274
+ expiresAt: number;
275
+ }
276
+ const treeCache = new Map<string, CachedTree>();
277
+
278
+ function treeCacheKey(cwd: string, options?: WorkspaceTreeOptions): string {
279
+ // Cache is keyed on the inputs that affect the walk output.
280
+ return `${path.resolve(cwd)}|${options?.maxDepth ?? ""}|${options?.dirLimit ?? ""}|${options?.lineCap ?? ""}`;
281
+ }
282
+
264
283
  export async function buildWorkspaceTree(
265
284
  cwd: string,
266
285
  options?: WorkspaceTreeOptions,
267
286
  ): Promise<WorkspaceTree> {
268
287
  const rootPath = path.resolve(cwd);
288
+ const cacheKey = treeCacheKey(cwd, options);
289
+ const cached = treeCache.get(cacheKey);
290
+ if (cached && cached.expiresAt > Date.now()) {
291
+ return cached.tree;
292
+ }
269
293
  try {
270
294
  const maxDepth = options?.maxDepth ?? DEFAULT_MAX_DEPTH;
271
295
  const dirLimit = options?.dirLimit ?? DEFAULT_DIR_LIMIT;
@@ -286,12 +310,14 @@ export async function buildWorkspaceTree(
286
310
  const { lines: capped, elided } = applyLineCap(lines, lineCap);
287
311
  const rendered = capped.map((l) => l.text).join("\n");
288
312
 
289
- return {
313
+ const result: WorkspaceTree = {
290
314
  rootPath,
291
315
  rendered,
292
316
  truncated: dirTruncated || elided > 0,
293
317
  totalLines: capped.length,
294
318
  };
319
+ treeCache.set(cacheKey, { tree: result, expiresAt: Date.now() + TREE_CACHE_TTL_MS });
320
+ return result;
295
321
  } catch {
296
322
  return emptyResult(rootPath);
297
323
  }
@@ -135,6 +135,13 @@ export const TeamToolParams = Type.Object({
135
135
  description: "Run in background when execution support is enabled.",
136
136
  }),
137
137
  ),
138
+ details: Type.Optional(
139
+ Type.Boolean({
140
+ default: true,
141
+ description:
142
+ "(status) Output detail level. true (default) = full status (task graph, agents, effectiveness, events). false = compact summary (status, goal, task counts, and only failed/attention task errors) for quick checks.",
143
+ }),
144
+ ),
138
145
  workspaceMode: Type.Optional(
139
146
  Type.Union([Type.Literal("single"), Type.Literal("worktree")], {
140
147
  description:
@@ -318,6 +325,8 @@ export interface TeamToolParamsValue {
318
325
  taskId?: string;
319
326
  message?: string;
320
327
  async?: boolean;
328
+ /** (status) Output detail level. false = compact summary. Default: true (full). */
329
+ details?: boolean;
321
330
  workspaceMode?: "single" | "worktree";
322
331
  context?: "fresh" | "fork";
323
332
  cwd?: string;
@@ -190,16 +190,18 @@ export function writeBlob(artifactsRoot: string, input: {
190
190
  metadataWritten = true;
191
191
  });
192
192
  } catch (error) {
193
- // Issue 4 fix: Clean up orphaned blob if metadata write fails.
194
- // If metadata write fails (e.g., concurrent conflict), the blob content
195
- // is orphaned since no metadata references it. Clean it up to reclaim space.
196
- // Issue 8 fix: Do NOT delete blob content on metadata failure.
197
- // If metadata write fails due to concurrent conflict (different values),
198
- // the blob content is still valid. Another process has written metadata
199
- // referencing this blob - deleting the blob would orphan their metadata.
200
- // The caller can retry the metadata write if needed.
201
- // However, if metadata was never written (metadataWritten === false),
202
- // the blob is orphaned and should be cleaned up.
193
+ // Round 24 (BUG 4 note): the catch block previously checked
194
+ // `if (!blobContentWritten)` — the WRONG variable (the local comment said
195
+ // `metadataWritten === false`). For a CONTENT-ADDRESSED store the blob path
196
+ // is the content hash, so the blob may be referenced by another process's
197
+ // metadata even when OUR metadata write failed (e.g. a concurrent conflict
198
+ // where the peer already wrote metadata for the same hash). Deleting it
199
+ // would orphan their metadata. The safe behavior is therefore to NEVER
200
+ // delete on a metadata write failure and let the periodic
201
+ // cleanupOrphanedBlobs() reclaim genuinely-orphaned blobs. The guard below
202
+ // only removes a blob when its CONTENT was never written (a stray/partial
203
+ // file from a failed content write) — which is the only unambiguously-safe
204
+ // case to clean up here.
203
205
  if (!blobContentWritten) {
204
206
  try { fs.rmSync(blobPath, { force: true }); } catch { /* best-effort */ }
205
207
  }
@@ -1,5 +1,5 @@
1
1
  import * as fs from "node:fs";
2
- import { readEvents } from "./event-log.ts";
2
+ import { readEvents, type TeamEvent } from "./event-log.ts";
3
3
  import { atomicWriteFile } from "./atomic-write.ts";
4
4
  import { logInternalError } from "../utils/internal-error.ts";
5
5
  import { withEventLogLockSync } from "./event-log.ts";
@@ -65,6 +65,25 @@ export interface CompactionResult {
65
65
  * 6. Return compaction stats
66
66
  */
67
67
  export function compactEventLog(eventsPath: string, config?: Partial<RotationConfig>): CompactionResult | undefined {
68
+ const prepared = prepareCompaction(eventsPath, config);
69
+ if (!prepared) return undefined;
70
+ // FIX: Wrap entire read-compact-write-recover sequence in lock to prevent
71
+ // event loss during compaction. Without lock, events can be appended between
72
+ // read and write, lost silently.
73
+ //
74
+ // NOTE (Round 24 BUG 1): callers ALREADY holding the event-log lock (e.g.
75
+ // appendEventInsideLock in event-log.ts) must call applyCompactionUnlocked
76
+ // directly — calling compactEventLog from inside the lock deadlocks (the
77
+ // mkdir lock is not re-entrant → 5s timeout → compaction never ran → the
78
+ // log grew unbounded until events were silently dropped past 50MB).
79
+ return withEventLogLockSync(eventsPath, () => applyCompactionUnlocked(eventsPath, prepared));
80
+ }
81
+
82
+ /** Round 24 (BUG 1): the lock-free pre-read for compaction. Safe to run
83
+ * outside the lock (read-only). Returns the compacted lines + stats needed
84
+ * for the write phase. */
85
+ export function prepareCompaction(eventsPath: string, config?: Partial<RotationConfig>):
86
+ { lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] } | undefined {
68
87
  if (!fs.existsSync(eventsPath)) return undefined;
69
88
  const cfg = resolveConfig(config);
70
89
  let originalSize: number;
@@ -74,79 +93,73 @@ export function compactEventLog(eventsPath: string, config?: Partial<RotationCon
74
93
  if (originalCount <= cfg.compactToCount) return undefined;
75
94
  const kept = allEvents.slice(-cfg.compactToCount);
76
95
  const lines = kept.map((e) => JSON.stringify(e)).join("\n") + "\n";
96
+ return { lines, originalSize, originalCount, kept };
97
+ }
77
98
 
78
- // FIX: Wrap entire read-compact-write-recover sequence in lock to prevent
79
- // event loss during compaction. Without lock, events can be appended between
80
- // read and write, lost silently.
81
- return withEventLogLockSync(eventsPath, () => {
82
- try {
83
- atomicWriteFile(eventsPath, lines);
84
- } catch (err) {
85
- // Concurrent write conflict — skip compaction this cycle
86
- logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
87
- return undefined;
88
- }
89
- // C2: Re-read to recover any events appended during the compaction window.
90
- // Events appended during the compaction window are preserved because they
91
- // appear in afterWrite and the condition afterWrite.length >= kept.length is
92
- // true, so they are included in the return stats without entering the
93
- // recovery branch.
94
- try {
95
- const afterWrite = readEvents(eventsPath);
96
- // FIX: Check if events were actually lost (afterWrite.length < kept.length)
97
- // rather than using appendedDuringWindow >= 0 which is always true.
98
- // Also use sequence numbers for comparison instead of JSON.stringify
99
- // which is fragile due to key ordering and floating point differences.
100
- if (afterWrite.length >= kept.length) {
101
- // No data loss — either events were appended and kept, or nothing happened.
102
- return {
103
- originalSize,
104
- compactedSize: fs.statSync(eventsPath).size,
105
- eventsRemoved: originalCount - kept.length,
106
- eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
107
- };
108
- }
109
- // afterWrite.length < kept.length — events were lost during compaction window.
110
- // Find missing events and re-append them.
111
- // FIX: Use sequence numbers for comparison instead of JSON.stringify.
112
- const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
113
- const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
114
- let recoveredCount = 0;
115
- let recoveryFailed = false;
116
- if (missingEvents.length > 0) {
117
- // BUGFIX (Round 12 C2): the previous loop called atomicWriteFile PER event,
118
- // which REPLACES the entire file each iteration — destroying the
119
- // compacted log and all previously-recovered events, leaving only the
120
- // LAST missing event. FIX: accumulate all missing events into one
121
- // string and append in a single write (appendFileSync appends without
122
- // destroying existing content).
123
- const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
124
- try {
125
- fs.appendFileSync(eventsPath, recoveryLines);
126
- recoveredCount = missingEvents.length;
127
- } catch (err) {
128
- recoveryFailed = true;
129
- logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
130
- }
131
- }
99
+ /** Round 24 (BUG 1): the write+recover phase of compaction. Assumes the
100
+ * caller ALREADY holds the event-log lock (or accepts the unlocked race). */
101
+ export function applyCompactionUnlocked(
102
+ eventsPath: string,
103
+ prepared: { lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] },
104
+ ): CompactionResult | undefined {
105
+ const { lines, originalSize, originalCount, kept } = prepared;
106
+ try {
107
+ atomicWriteFile(eventsPath, lines);
108
+ } catch (err) {
109
+ // Concurrent write conflict — skip compaction this cycle
110
+ logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
111
+ return undefined;
112
+ }
113
+ // C2: Re-read to recover any events appended during the compaction window.
114
+ // Events appended during the compaction window are preserved because they
115
+ // appear in afterWrite and the condition afterWrite.length >= kept.length is
116
+ // true, so they are included in the return stats without entering the
117
+ // recovery branch.
118
+ try {
119
+ const afterWrite = readEvents(eventsPath);
120
+ // FIX: Check if events were actually lost (afterWrite.length < kept.length)
121
+ // rather than using appendedDuringWindow >= 0 which is always true.
122
+ // Also use sequence numbers for comparison instead of JSON.stringify
123
+ // which is fragile due to key ordering and floating point differences.
124
+ if (afterWrite.length >= kept.length) {
132
125
  return {
133
126
  originalSize,
134
127
  compactedSize: fs.statSync(eventsPath).size,
135
128
  eventsRemoved: originalCount - kept.length,
136
- eventsKept: kept.length + recoveredCount,
137
- recoveryFailed,
138
- };
139
- } catch {
140
- // Post-write verification failed — compaction likely succeeded.
141
- const compactedSize = fs.statSync(eventsPath).size;
142
- return {
143
- originalSize,
144
- compactedSize,
145
- eventsRemoved: originalCount - kept.length,
146
- eventsKept: kept.length,
129
+ eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
147
130
  };
148
131
  }
149
- });
132
+ // afterWrite.length < kept.length — events were lost during compaction window.
133
+ const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
134
+ const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
135
+ let recoveredCount = 0;
136
+ let recoveryFailed = false;
137
+ if (missingEvents.length > 0) {
138
+ const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
139
+ try {
140
+ fs.appendFileSync(eventsPath, recoveryLines);
141
+ recoveredCount = missingEvents.length;
142
+ } catch (err) {
143
+ recoveryFailed = true;
144
+ logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
145
+ }
146
+ }
147
+ return {
148
+ originalSize,
149
+ compactedSize: fs.statSync(eventsPath).size,
150
+ eventsRemoved: originalCount - kept.length,
151
+ eventsKept: kept.length + recoveredCount,
152
+ recoveryFailed,
153
+ };
154
+ } catch {
155
+ // Post-write verification failed — compaction likely succeeded.
156
+ return {
157
+ originalSize,
158
+ compactedSize: fs.statSync(eventsPath).size,
159
+ eventsRemoved: originalCount - kept.length,
160
+ eventsKept: kept.length,
161
+ };
162
+ }
150
163
  }
151
164
 
152
165
  /**
@@ -161,33 +174,41 @@ export function rotateEventLog(eventsPath: string): boolean {
161
174
  // FIX: Wrap rotation in lock to prevent race conditions with concurrent readers.
162
175
  // Order of operations: (1) create new empty file, (2) rename old file to archive.
163
176
  // This ensures eventsPath always exists — a reader never sees a missing file.
164
- return withEventLogLockSync(eventsPath, () => {
165
- try {
166
- const ts = new Date().toISOString().replace(/[:.]/g, "-");
167
- let archivePath = `${eventsPath}.${ts}.archive.jsonl`;
168
- // Round 12: avoid timestamp collisions when two rotations happen within
169
- // the same millisecond (copyFileSync would silently overwrite the
170
- // first archive). Append a counter until the path is free.
171
- let collision = 1;
172
- while (fs.existsSync(archivePath)) {
173
- archivePath = `${eventsPath}.${ts}.${collision}.archive.jsonl`;
174
- collision++;
175
- }
176
- // BUGFIX (Round 12 C1): the previous order (atomicWriteFile empty THEN
177
- // rename) destroyed ALL events — atomicWriteFile replaces the file
178
- // in place, so the rename then moved an EMPTY file to the archive.
179
- // FIX: copy current content to the archive first (archive is populated,
180
- // original still intact), then truncate the original to empty in place.
181
- // copyFileSync + writeFileSync("") ensures eventsPath ALWAYS exists
182
- // (no missing-file window for concurrent readers).
183
- fs.copyFileSync(eventsPath, archivePath);
184
- fs.writeFileSync(eventsPath, "", "utf-8");
185
- return true;
186
- } catch (error) {
187
- logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
188
- return false;
177
+ //
178
+ // NOTE (Round 24 BUG 1): callers ALREADY holding the lock must call
179
+ // rotateEventLogUnlocked directly — this locked variant is NOT re-entrant.
180
+ return withEventLogLockSync(eventsPath, () => rotateEventLogUnlocked(eventsPath));
181
+ }
182
+
183
+ /** Round 24 (BUG 1): the lock-free core of rotation. Assumes the caller
184
+ * already holds the event-log lock (or accepts the unlocked race). */
185
+ export function rotateEventLogUnlocked(eventsPath: string): boolean {
186
+ if (!fs.existsSync(eventsPath)) return false;
187
+ try {
188
+ const ts = new Date().toISOString().replace(/[:.]/g, "-");
189
+ let archivePath = `${eventsPath}.${ts}.archive.jsonl`;
190
+ // Round 12: avoid timestamp collisions when two rotations happen within
191
+ // the same millisecond (copyFileSync would silently overwrite the
192
+ // first archive). Append a counter until the path is free.
193
+ let collision = 1;
194
+ while (fs.existsSync(archivePath)) {
195
+ archivePath = `${eventsPath}.${ts}.${collision}.archive.jsonl`;
196
+ collision++;
189
197
  }
190
- });
198
+ // BUGFIX (Round 12 C1): the previous order (atomicWriteFile empty THEN
199
+ // rename) destroyed ALL events — atomicWriteFile replaces the file
200
+ // in place, so the rename then moved an EMPTY file to the archive.
201
+ // FIX: copy current content to the archive first (archive is populated,
202
+ // original still intact), then truncate the original to empty in place.
203
+ // copyFileSync + writeFileSync("") ensures eventsPath ALWAYS exists
204
+ // (no missing-file window for concurrent readers).
205
+ fs.copyFileSync(eventsPath, archivePath);
206
+ fs.writeFileSync(eventsPath, "", "utf-8");
207
+ return true;
208
+ } catch (error) {
209
+ logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
210
+ return false;
211
+ }
191
212
  }
192
213
 
193
214
  export interface EventLogStats {