pi-crew 0.5.14 → 0.5.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/CHANGELOG.md +117 -0
  2. package/README.md +1 -1
  3. package/docs/pi-crew-v0.5.16-audit-fix-plan.md +35 -0
  4. package/docs/pi-crew-v0.5.17-audit-fix-plan.md +80 -0
  5. package/docs/skills/REFERENCE.md +11 -0
  6. package/package.json +1 -1
  7. package/skills/iterative-audit/SKILL.md +330 -0
  8. package/src/extension/management.ts +1 -1
  9. package/src/extension/plan-orchestrate.ts +0 -1
  10. package/src/extension/register.ts +16 -7
  11. package/src/extension/registration/viewers.ts +1 -1
  12. package/src/extension/run-index.ts +1 -1
  13. package/src/extension/team-tool/explain.ts +0 -1
  14. package/src/extension/team-tool/handle-schedule.ts +0 -1
  15. package/src/extension/team-tool/health-monitor.ts +0 -1
  16. package/src/extension/team-tool/run.ts +2 -2
  17. package/src/extension/team-tool/status.ts +1 -1
  18. package/src/extension/team-tool.ts +2 -30
  19. package/src/observability/exporters/otlp-exporter.ts +11 -1
  20. package/src/runtime/child-pi.ts +1 -1
  21. package/src/runtime/crash-recovery.ts +1 -1
  22. package/src/runtime/crew-agent-records.ts +23 -3
  23. package/src/runtime/crew-hooks.ts +1 -1
  24. package/src/runtime/handoff-manager.ts +0 -1
  25. package/src/runtime/heartbeat-watcher.ts +1 -1
  26. package/src/runtime/live-session-runtime.ts +0 -1
  27. package/src/runtime/loop-gates.ts +0 -1
  28. package/src/runtime/mcp-proxy.ts +2 -2
  29. package/src/runtime/pipeline-runner.ts +1 -2
  30. package/src/runtime/task-runner/live-executor.ts +1 -2
  31. package/src/runtime/task-runner.ts +1 -1
  32. package/src/state/jsonl-writer.ts +24 -0
  33. package/src/state/locks.ts +66 -35
  34. package/src/state/run-metrics.ts +1 -2
  35. package/src/state/schedule.ts +13 -5
  36. package/src/state/state-store.ts +1 -1
  37. package/src/tools/safe-bash.ts +0 -1
  38. package/src/ui/crew-widget.ts +2 -2
  39. package/src/ui/render-diff.ts +1 -1
  40. package/src/ui/run-dashboard.ts +1 -2
  41. package/src/ui/tool-render.ts +20 -3
  42. package/src/utils/conflict-detect.ts +0 -1
  43. package/src/utils/gh-protocol.ts +0 -2
@@ -6,7 +6,7 @@ import type {
6
6
  ExtensionContext,
7
7
  } from "@earendil-works/pi-coding-agent";
8
8
  import { loadConfig } from "../config/config.ts";
9
- import { applyCrewSettingsToConfig, loadCrewSettings, saveCrewSettings } from "../runtime/settings-store.ts";
9
+ import { applyCrewSettingsToConfig, loadCrewSettings } from "../runtime/settings-store.ts";
10
10
  // 2.7: Lazy-load LiveRunSidebar — only constructed when the user actually opens
11
11
  // a live run sidebar overlay. The class pulls in transcript-viewer and other
12
12
  // heavy UI modules.
@@ -47,12 +47,9 @@ import {
47
47
  createMetricFileSink,
48
48
  type MetricSink,
49
49
  } from "../observability/metric-sink.ts";
50
- import { killProcessPid } from "../runtime/child-pi.ts";
51
50
  import { listLiveAgents } from "../runtime/live-agent-manager.ts";
52
51
  import { createManifestCache } from "../runtime/manifest-cache.ts";
53
- import { checkProcessLiveness } from "../runtime/process-status.ts";
54
52
  import { CrewScheduler } from "../runtime/scheduler.ts";
55
- import { appendEvent } from "../state/event-log.ts";
56
53
  import { loadRunManifestById, updateRunStatus } from "../state/state-store.ts";
57
54
  import type { TeamRunManifest } from "../state/types.ts";
58
55
  import { SubagentManager } from "../subagents/manager.ts";
@@ -128,9 +125,6 @@ import type {
128
125
  // deferred cleanup and cleanupRuntime. Each function is awaited inside an
129
126
  // async context that already runs after registration completes.
130
127
  import {
131
- cancelOrphanedRuns,
132
- detectInterruptedRuns,
133
- purgeStaleActiveRunIndex,
134
128
  reconcileAllStaleRuns,
135
129
  } from "../runtime/crash-recovery.ts";
136
130
  import { appendDeadletter } from "../runtime/deadletter.ts";
@@ -482,6 +476,13 @@ export function registerPiTeams(pi: ExtensionAPI): void {
482
476
  }
483
477
  };
484
478
  const autoRecoveryLast = new Map<string, number>();
479
+ // FIX (Round 22, defensive cap): Bound the cooldown-gate Map. Each run
480
+ // contributes up to 4 keys (one per maybeNotifyHealth kind). Without a cap,
481
+ // a long-running pi session that runs thousands of teams accumulates
482
+ // thousands of entries. Eviction: oldest insertion first — matches the
483
+ // 5-minute cooldown gate semantics, since once the gate has expired the
484
+ // entry is irrelevant.
485
+ const AUTO_RECOVERY_LAST_MAX_ENTRIES = 1000;
485
486
  const configureDeliveryCoordinator = (): void => {
486
487
  deliveryCoordinator?.dispose();
487
488
  deliveryCoordinator = undefined;
@@ -1531,6 +1532,14 @@ export function registerPiTeams(pi: ExtensionAPI): void {
1531
1532
  now - previous < 5 * 60_000
1532
1533
  )
1533
1534
  return;
1535
+ // Defensive cap: evict oldest entries before inserting
1536
+ // when size exceeds the limit. Map's natural insertion
1537
+ // order means the first key is the oldest.
1538
+ while (autoRecoveryLast.size >= AUTO_RECOVERY_LAST_MAX_ENTRIES) {
1539
+ const oldest = autoRecoveryLast.keys().next().value;
1540
+ if (oldest === undefined) break;
1541
+ autoRecoveryLast.delete(oldest);
1542
+ }
1534
1543
  autoRecoveryLast.set(key, now);
1535
1544
  notifyOperator({
1536
1545
  id: key,
@@ -2,7 +2,7 @@ import type { ExtensionCommandContext } from "@earendil-works/pi-coding-agent";
2
2
  import { loadRunManifestById } from "../../state/state-store.ts";
3
3
  import { readCrewAgents } from "../../runtime/crew-agent-records.ts";
4
4
  import { loadConfig } from "../../config/config.ts";
5
- import { listLiveAgents, type LiveAgentHandle } from "../../runtime/live-agent-manager.ts";
5
+ import { listLiveAgents } from "../../runtime/live-agent-manager.ts";
6
6
  import { LiveConversationOverlay } from "../../ui/live-conversation-overlay.ts";
7
7
  import { asCrewTheme } from "../../ui/theme-adapter.ts";
8
8
  // Lazy-loaded: DurableTranscriptViewer is 658ms — only needed for /crew transcript command
@@ -7,7 +7,7 @@ import { findRepoRoot, projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
7
7
  import { activeRunEntries } from "../state/active-run-registry.ts";
8
8
  import { isSafePathId, resolveRealContainedPath } from "../utils/safe-paths.ts";
9
9
  import { sharedScanCache } from "../utils/scan-cache.ts";
10
- import { CancellationToken, createCancellationToken } from "../runtime/cancellation-token.ts";
10
+ import { createCancellationToken } from "../runtime/cancellation-token.ts";
11
11
 
12
12
  function readManifest(filePath: string): TeamRunManifest | undefined {
13
13
  const cached = sharedScanCache.readAndCache("manifests", filePath, filePath);
@@ -1,6 +1,5 @@
1
1
  import * as fs from "node:fs";
2
2
  import * as path from "node:path";
3
- import { toolResult } from "../tool-result.ts";
4
3
  import { loadRunManifestById } from "../../state/state-store.ts";
5
4
  import type { TeamRunManifest, TeamTaskState } from "../../state/types.ts";
6
5
 
@@ -3,7 +3,6 @@ import type { PiTeamsToolResult } from "../tool-result.ts";
3
3
  import type { TeamToolParamsValue } from "../../schema/team-tool-schema.ts";
4
4
  import { result, type TeamContext } from "./context.ts";
5
5
  import { humanizeSchedule, nextRunTime, parseSchedule } from "../../runtime/scheduler.ts";
6
- import { loadConfig } from "../../config/config.ts";
7
6
  import { loadCrewSettings, saveCrewSettings } from "../../runtime/settings-store.ts";
8
7
 
9
8
  // Global key for cross-module scheduler access.
@@ -8,7 +8,6 @@ import { listRuns } from "../run-index.ts";
8
8
  import { readCrewAgents } from "../../runtime/crew-agent-records.ts";
9
9
  import {
10
10
  isActiveRunStatus,
11
- isFinishedRunStatus,
12
11
  hasStaleAsyncProcess,
13
12
  isLikelyOrphanedActiveRun,
14
13
  } from "../../runtime/process-status.ts";
@@ -8,7 +8,7 @@ import { registerActiveRun, unregisterActiveRun } from "../../state/active-run-r
8
8
  import { createRunManifest, loadRunManifestById, updateRunStatus } from "../../state/state-store.ts";
9
9
  import { atomicWriteJson } from "../../state/atomic-write.ts";
10
10
  import { validateWorkflowForTeam } from "../../workflows/validate-workflow.ts";
11
- import { PipelineRunner, type PipelineWorkflow, type PipelineStage } from "../../runtime/pipeline-runner.ts";
11
+ import { PipelineRunner, type PipelineWorkflow } from "../../runtime/pipeline-runner.ts";
12
12
  // Heavy runtime — lazy-loaded to avoid 1.4s import cost at extension registration.
13
13
  import type { executeTeamRun as ExecuteTeamRunFn } from "../../runtime/team-runner.ts";
14
14
  // eslint-disable-next-line @typescript-eslint/no-unused-vars -- type-only import for TS inference
@@ -24,7 +24,7 @@ async function executeTeamRun(...args: Parameters<typeof ExecuteTeamRunFn>): Pro
24
24
  return _cachedExecuteTeamRun(...args);
25
25
  }
26
26
  import { spawnBackgroundTeamRun } from "../../subagents/async-entry.ts";
27
- import { appendEvent, appendEventAsync, readEvents } from "../../state/event-log.ts";
27
+ import { appendEventAsync, readEvents } from "../../state/event-log.ts";
28
28
  import { resolveCrewRuntime, runtimeResolutionState } from "../../runtime/runtime-resolver.ts";
29
29
  import { normalizeSkillOverride } from "../../runtime/skill-instructions.ts";
30
30
  import { expandParallelResearchWorkflow } from "../../runtime/parallel-research.ts";
@@ -8,7 +8,7 @@ import { applyAttentionState, formatActivityAge, resolveCrewControlConfig } from
8
8
  import { readCrewAgents } from "../../runtime/crew-agent-records.ts";
9
9
  import { checkProcessLiveness, isActiveRunStatus } from "../../runtime/process-status.ts";
10
10
  import { formatTaskGraphLines, waitingReason } from "../../runtime/task-display.ts";
11
- import { verifyTaskCompletion, formatOutputPreview } from "../../runtime/completion-guard.ts";
11
+ import { verifyTaskCompletion } from "../../runtime/completion-guard.ts";
12
12
  import { evaluateRunEffectiveness } from "../../runtime/effectiveness.ts";
13
13
  import type { PiTeamsToolResult } from "../tool-result.ts";
14
14
  import { locateRunCwd } from "../team-tool.ts";
@@ -4,7 +4,6 @@ import type { AgentConfig } from "../agents/agent-config.ts";
4
4
  import {
5
5
  allAgents,
6
6
  discoverAgents,
7
- invalidateAgentDiscoveryCache,
8
7
  listDynamicAgents,
9
8
  registerDynamicAgent,
10
9
  unregisterDynamicAgent,
@@ -19,8 +18,8 @@ import {
19
18
  import type { executeTeamRun as _executeTeamRunFn } from "../runtime/team-runner.ts";
20
19
  import type { TeamToolParamsValue } from "../schema/team-tool-schema.ts";
21
20
  import { writeArtifact } from "../state/artifact-store.ts";
22
- import { appendEvent, readEvents } from "../state/event-log.ts";
23
- import { withRunLock, withRunLockSync } from "../state/locks.ts";
21
+ import { appendEvent } from "../state/event-log.ts";
22
+ import { withRunLock } from "../state/locks.ts";
24
23
  import { replayPendingMailboxMessages } from "../state/mailbox.ts";
25
24
  import {
26
25
  loadRunManifestById,
@@ -33,22 +32,15 @@ import type {
33
32
  TeamRunManifest,
34
33
  TeamTaskState,
35
34
  } from "../state/types.ts";
36
- import { aggregateUsage, formatUsage } from "../state/usage.ts";
37
35
  import { allTeams, discoverTeams } from "../teams/discover-teams.ts";
38
36
  import {
39
37
  allWorkflows,
40
38
  discoverWorkflows,
41
39
  } from "../workflows/discover-workflows.ts";
42
- import { validateWorkflowForTeam } from "../workflows/validate-workflow.ts";
43
- import { cleanupRunWorktrees } from "../worktree/cleanup.ts";
44
40
  import { piTeamsHelp } from "./help.ts";
45
- import { listImportedRuns } from "./import-index.ts";
46
41
  import { handleCreate, handleDelete, handleUpdate } from "./management.ts";
47
42
  import { initializeProject } from "./project-init.ts";
48
- import { exportRunBundle } from "./run-export.ts";
49
- import { importRunBundle } from "./run-import.ts";
50
43
  import { listRuns } from "./run-index.ts";
51
- import { pruneFinishedRuns } from "./run-maintenance.ts";
52
44
  import { formatRecommendation, recommendTeam } from "./team-recommendation.ts";
53
45
  import { handleSettings } from "./team-tool/handle-settings.ts";
54
46
  import type { PiTeamsToolResult } from "./tool-result.ts";
@@ -70,31 +62,12 @@ async function executeTeamRun(
70
62
  return _cachedExecuteTeamRun(...args);
71
63
  }
72
64
 
73
- import {
74
- applyAttentionState,
75
- formatActivityAge,
76
- resolveCrewControlConfig,
77
- } from "../runtime/agent-control.ts";
78
- import {
79
- readCrewAgents,
80
- recordFromTask,
81
- saveCrewAgents,
82
- } from "../runtime/crew-agent-records.ts";
83
65
  import { directTeamAndWorkflowFromRun } from "../runtime/direct-run.ts";
84
- import { writeForegroundInterruptRequest } from "../runtime/foreground-control.ts";
85
66
  import { parsePiJsonOutput } from "../runtime/pi-json-output.ts";
86
- import {
87
- checkProcessLiveness,
88
- isActiveRunStatus,
89
- } from "../runtime/process-status.ts";
90
67
  import {
91
68
  resolveCrewRuntime,
92
69
  runtimeResolutionState,
93
70
  } from "../runtime/runtime-resolver.ts";
94
- import {
95
- formatTaskGraphLines,
96
- waitingReason,
97
- } from "../runtime/task-display.ts";
98
71
  import { handleApi } from "./team-tool/api.ts";
99
72
  import {
100
73
  autonomousPatchFromConfig,
@@ -128,7 +101,6 @@ async function handleRun(
128
101
 
129
102
  import { waitForRun } from "../runtime/run-tracker.ts";
130
103
  import { normalizeSkillOverride } from "../runtime/skill-instructions.ts";
131
- import { logInternalError } from "../utils/internal-error.ts";
132
104
  import { searchAgents, searchTeams } from "../utils/bm25-search.ts";
133
105
  import { projectCrewRoot } from "../utils/paths.ts";
134
106
  import {
@@ -124,8 +124,18 @@ export class OTLPExporter implements MetricExporter {
124
124
  }
125
125
  }
126
126
 
127
- dispose(): void {
127
+ /**
128
+ * FIX (Round 23, resource cleanup): Make dispose() async and await the
129
+ * in-flight push so it completes (or aborts) before we return. The push
130
+ * itself is bounded by the 10s fetch timeout, so this won't hang
131
+ * indefinitely. Without this, dispose() would orphan an in-flight
132
+ * network request whose result is then discarded.
133
+ */
134
+ async dispose(): Promise<void> {
128
135
  if (this.timer) clearInterval(this.timer);
129
136
  this.timer = undefined;
137
+ if (this.inFlight) {
138
+ try { await this.inFlight; } catch { /* swallow — push() already logs errors */ }
139
+ }
130
140
  }
131
141
  }
@@ -8,7 +8,7 @@ import { getPiSpawnCommand } from "./pi-spawn.ts";
8
8
  import { DEFAULT_CHILD_PI } from "../config/defaults.ts";
9
9
  import { logInternalError } from "../utils/internal-error.ts";
10
10
  import { attachPostExitStdioGuard, trySignalChild } from "./post-exit-stdio-guard.ts";
11
- import { redactJsonLine, isSecretKey } from "../utils/redaction.ts";
11
+ import { redactJsonLine } from "../utils/redaction.ts";
12
12
  import { sanitizeEnvSecrets } from "../utils/env-filter.ts";
13
13
  import { registerChildProcess, unregisterChildProcess } from "../extension/crew-cleanup.ts";
14
14
 
@@ -11,7 +11,7 @@ import type { ManifestCache } from "./manifest-cache.ts";
11
11
  import { checkProcessLiveness } from "./process-status.ts";
12
12
  import { reconcileStaleRun, type ReconcileResult } from "./stale-reconciler.ts";
13
13
  import { executeHook, appendHookEvent } from "../hooks/registry.ts";
14
- import { activeRunEntries, unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
14
+ import { unregisterActiveRun, readActiveRunRegistry } from "../state/active-run-registry.ts";
15
15
  import { resolveRealContainedPath } from "../utils/safe-paths.ts";
16
16
  import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
17
17
  import { terminateLiveAgentsForRun } from "./live-agent-manager.ts";
@@ -263,8 +263,28 @@ export function readCrewAgentStatus(manifest: TeamRunManifest, taskOrAgentId: st
263
263
  }
264
264
 
265
265
  const agentEventSeqCache = new Map<string, { size: number; mtimeMs: number; seq: number }>();
266
+ // FIX (Round 22, defensive cap): Bound the per-file-path cache. Without a cap,
267
+ // a long-running pi-crew process that spawns 1000s of agents accumulates 1000s
268
+ // of entries. Mirrors the `asyncAgentReaderCache` pattern (above) and the
269
+ // `NotificationRouter.SEEN_MAP_MAX_SIZE` pattern.
270
+ const AGENT_EVENT_SEQ_CACHE_MAX_ENTRIES = 1000;
266
271
  const AGENT_EVENT_SEQ_SIDECAR = ".seq";
267
272
 
273
+ /**
274
+ * Set an entry in the seq cache, evicting the oldest entries when the cache
275
+ * exceeds the cap. Map's natural insertion order means the first key is the
276
+ * oldest — same as the pattern used in `asyncAgentReaderCache`.
277
+ */
278
+ function setAgentEventSeqCache(filePath: string, entry: { size: number; mtimeMs: number; seq: number }): void {
279
+ if (agentEventSeqCache.has(filePath)) agentEventSeqCache.delete(filePath);
280
+ agentEventSeqCache.set(filePath, entry);
281
+ while (agentEventSeqCache.size > AGENT_EVENT_SEQ_CACHE_MAX_ENTRIES) {
282
+ const oldest = agentEventSeqCache.keys().next().value;
283
+ if (oldest === undefined) break;
284
+ agentEventSeqCache.delete(oldest);
285
+ }
286
+ }
287
+
268
288
  function readSeqFromSidecar(filePath: string): number | undefined {
269
289
  try {
270
290
  const raw = fs.readFileSync(`${filePath}.${AGENT_EVENT_SEQ_SIDECAR}`, "utf-8");
@@ -295,7 +315,7 @@ function nextAgentEventSeq(filePath: string): number {
295
315
  // FIX: Try sidecar file for O(1) lookup before falling back to O(n) scan.
296
316
  const sidecarSeq = readSeqFromSidecar(filePath);
297
317
  if (sidecarSeq !== undefined) {
298
- agentEventSeqCache.set(filePath, { size: stat.size, mtimeMs: stat.mtimeMs, seq: sidecarSeq });
318
+ setAgentEventSeqCache(filePath, { size: stat.size, mtimeMs: stat.mtimeMs, seq: sidecarSeq });
299
319
  return sidecarSeq + 1;
300
320
  }
301
321
  let max = 0;
@@ -309,7 +329,7 @@ function nextAgentEventSeq(filePath: string): number {
309
329
  max += 1;
310
330
  }
311
331
  }
312
- agentEventSeqCache.set(filePath, { size: stat.size, mtimeMs: stat.mtimeMs, seq: max });
332
+ setAgentEventSeqCache(filePath, { size: stat.size, mtimeMs: stat.mtimeMs, seq: max });
313
333
  writeSeqToSidecar(filePath, max);
314
334
  return max + 1;
315
335
  }
@@ -321,7 +341,7 @@ export function appendCrewAgentEvent(manifest: TeamRunManifest, taskId: string,
321
341
  fs.appendFileSync(filePath, `${JSON.stringify(redactSecrets({ seq, time: new Date().toISOString(), event }))}\n`, "utf-8");
322
342
  try {
323
343
  const stat = fs.statSync(filePath);
324
- agentEventSeqCache.set(filePath, { size: stat.size, mtimeMs: stat.mtimeMs, seq });
344
+ setAgentEventSeqCache(filePath, { size: stat.size, mtimeMs: stat.mtimeMs, seq });
325
345
  writeSeqToSidecar(filePath, seq);
326
346
  } catch (error) {
327
347
  logInternalError("crew-agent-records.stat", error, `filePath=${filePath}`);
@@ -146,7 +146,7 @@ export class HookRegistry {
146
146
  emit(event: CrewHookEvent): void {
147
147
  // Validate event type using type guard
148
148
  if (!isValidEventType(event.type)) {
149
- console.warn(`[crew-hooks] Unknown event type: ${event.type}`);
149
+ logInternalError("crew-hooks.unknown-event-type", new Error(`Unknown event type: ${event.type}`));
150
150
  return;
151
151
  }
152
152
 
@@ -55,7 +55,6 @@ export function isValidHandoffSummary(value: unknown): value is HandoffSummary {
55
55
  */
56
56
 
57
57
  import type { TeamEvent } from "../state/event-log.ts";
58
- import { appendEventAsync } from "../state/event-log.ts";
59
58
 
60
59
  /**
61
60
  * Represents a key decision made during task execution.
@@ -6,7 +6,7 @@ import { loadRunManifestById } from "../state/state-store.ts";
6
6
  import type { TeamRunManifest } from "../state/types.ts";
7
7
  import { logInternalError } from "../utils/internal-error.ts";
8
8
  import type { ManifestCache } from "./manifest-cache.ts";
9
- import { classifyHeartbeat, DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
9
+ import { DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
10
10
 
11
11
  export interface HeartbeatWatcherRouter {
12
12
  enqueue(notification: NotificationDescriptor): boolean;
@@ -24,7 +24,6 @@ import { buildExtensionBridge } from "./live-extension-bridge.ts";
24
24
  import { logInternalError } from "../utils/internal-error.ts";
25
25
  // prose-compressor imported for custom tool descriptions below;
26
26
  // tool description compression for SDK-managed tools awaits SDK support.
27
- import { compressToolDescription } from "./prose-compressor.ts";
28
27
  import { buildSensitivePathConstraint } from "./sensitive-paths.ts";
29
28
  import { collectLiveSessionHealth, formatLiveSessionDiagnostics, type LiveSessionHealth } from "./live-session-health.ts";
30
29
  import { listLiveAgents } from "./live-agent-manager.ts";
@@ -8,7 +8,6 @@
8
8
  * Distilled from pi-autoresearch's dual-gate loop pattern.
9
9
  */
10
10
  import * as fs from "node:fs";
11
- import * as path from "node:path";
12
11
  import type { TeamTaskState } from "../state/types.ts";
13
12
 
14
13
  /**
@@ -16,8 +16,8 @@
16
16
  * when proxying from the parent.
17
17
  */
18
18
 
19
- import { defineTool, type ToolDefinition } from "@earendil-works/pi-coding-agent";
20
- import { Type, type Static, type TSchema } from "@sinclair/typebox";
19
+ import { type ToolDefinition } from "@earendil-works/pi-coding-agent";
20
+ import { type Static, type TSchema } from "@sinclair/typebox";
21
21
 
22
22
  export interface McpProxyConfig {
23
23
  /** Whether to enable MCP in the child session. */
@@ -2,8 +2,7 @@ import type { TeamTaskState } from "../state/types.ts";
2
2
  import type { WorkflowConfig, WorkflowStep } from "../workflows/workflow-config.ts";
3
3
  import type { TeamConfig } from "../teams/team-config.ts";
4
4
  import type { AgentConfig } from "../agents/agent-config.ts";
5
- import { writeArtifact } from "../state/artifact-store.ts";
6
- import { appendEvent, appendEventAsync } from "../state/event-log.ts";
5
+ import { appendEventAsync } from "../state/event-log.ts";
7
6
  import { mapConcurrent } from "./parallel-utils.ts";
8
7
 
9
8
  /**
@@ -3,7 +3,6 @@ import type { AgentConfig } from "../../agents/agent-config.ts";
3
3
  import type { CrewRuntimeConfig } from "../../config/config.ts";
4
4
  import { writeArtifact } from "../../state/artifact-store.ts";
5
5
  import {
6
- appendEvent,
7
6
  appendEventFireAndForget,
8
7
  } from "../../state/event-log.ts";
9
8
  import type {
@@ -11,7 +10,7 @@ import type {
11
10
  TeamRunManifest,
12
11
  TeamTaskState,
13
12
  } from "../../state/types.ts";
14
- import { loadRunManifestById, saveRunTasks } from "../../state/state-store.ts";
13
+ import { loadRunManifestById } from "../../state/state-store.ts";
15
14
  import { persistSingleTaskUpdate } from "./state-helpers.ts";
16
15
  import type { WorkflowStep } from "../../workflows/workflow-config.ts";
17
16
  import { appendCrewAgentEvent, appendCrewAgentOutput, emptyCrewAgentProgress, recordFromTask, upsertCrewAgent } from "../crew-agent-records.ts";
@@ -11,7 +11,7 @@ import type {
11
11
  } from "../state/types.ts";
12
12
  import { logInternalError } from "../utils/internal-error.ts";
13
13
  import { writeArtifact } from "../state/artifact-store.ts";
14
- import { appendEvent, appendEventAsync, appendEventFireAndForget } from "../state/event-log.ts";
14
+ import { appendEventAsync, appendEventFireAndForget } from "../state/event-log.ts";
15
15
  import { saveRunManifest } from "../state/state-store.ts";
16
16
  import { createTaskClaim } from "../state/task-claims.ts";
17
17
  import {
@@ -14,10 +14,17 @@ export interface JsonlWriteStream {
14
14
  }
15
15
 
16
16
  const DEFAULT_MAX_JSONL_BYTES = 50 * 1024 * 1024;
17
+ // FIX (Round 21, per-line cap): A single huge line could exhaust memory during
18
+ // redactJsonLine if an upstream caller constructs an enormous string. Cap each
19
+ // line at 1MB by default — large enough for any legitimate event payload, small
20
+ // enough to prevent memory blow-up. Mirrors the upstream oh-my-pi pattern of
21
+ // bounding chunk boundaries in Bun.file().writer().
22
+ const DEFAULT_MAX_LINE_BYTES = 1 * 1024 * 1024;
17
23
 
18
24
  export interface JsonlWriterDeps {
19
25
  createWriteStream?: (filePath: string) => JsonlWriteStream;
20
26
  maxBytes?: number;
27
+ maxLineBytes?: number;
21
28
  }
22
29
 
23
30
  export interface JsonlWriter {
@@ -47,7 +54,9 @@ export function createJsonlWriter(filePath: string | undefined, source: Drainabl
47
54
  let backpressured = false;
48
55
  let closed = false;
49
56
  let bytesWritten = 0;
57
+ let linesDroppedForSize = 0;
50
58
  const maxBytes = deps.maxBytes ?? DEFAULT_MAX_JSONL_BYTES;
59
+ const maxLineBytes = deps.maxLineBytes ?? DEFAULT_MAX_LINE_BYTES;
51
60
 
52
61
  return {
53
62
  writeLine(line: string) {
@@ -55,6 +64,21 @@ export function createJsonlWriter(filePath: string | undefined, source: Drainabl
55
64
  const safeLine = redactJsonLine(line);
56
65
  const chunk = `${safeLine}\n`;
57
66
  const chunkBytes = Buffer.byteLength(chunk, "utf-8");
67
+ // FIX (Round 21, per-line cap): Drop oversize lines. Without this, a
68
+ // single huge payload (e.g. a 100MB base64-encoded transcript) would
69
+ // be queued in the write stream. Note: this check runs after
70
+ // redactJsonLine, so it does not bound redaction's own memory use. The first drop and every 100th drop are logged so silent loss is visible.
71
+ if (chunkBytes > maxLineBytes) {
72
+ linesDroppedForSize++;
73
+ if (linesDroppedForSize === 1 || linesDroppedForSize % 100 === 0) {
74
+ logInternalError(
75
+ "jsonl-writer.lineTooLarge",
76
+ new Error(`line size ${chunkBytes} exceeds maxLineBytes ${maxLineBytes}`),
77
+ `file=${filePath} dropped=${linesDroppedForSize}`,
78
+ );
79
+ }
80
+ return;
81
+ }
58
82
  if (bytesWritten + chunkBytes > maxBytes) return;
59
83
  try {
60
84
  const ok = stream.write(chunk);
@@ -1,5 +1,6 @@
1
1
  import * as fs from "node:fs";
2
2
  import * as path from "node:path";
3
+ import { randomUUID } from "node:crypto";
3
4
  import type { TeamRunManifest } from "./types.ts";
4
5
  import { DEFAULT_LOCKS } from "../config/defaults.ts";
5
6
  import { sleepSync } from "../utils/sleep.ts";
@@ -59,22 +60,71 @@ function isLockHolderAlive(filePath: string): boolean {
59
60
  }
60
61
  }
61
62
 
62
- function writeLockFile(filePath: string): void {
63
+ /**
64
+ * Lock file kinds. Discriminator written to the lock file payload so that:
65
+ * - Debugging tools (e.g. a future `pi-crew locks` command) can identify
66
+ * what a lock is protecting.
67
+ * - Cross-kind ambiguity is prevented if two locks somehow resolve to the
68
+ * same path (defense in depth).
69
+ * - Forward compat: new lock types can be added without changing the
70
+ * on-disk format (the `kind` field is the only discriminator).
71
+ */
72
+ export type LockKind = "run" | "file";
73
+
74
+ function writeLockFile(filePath: string, token: string, kind: LockKind = "file"): void {
63
75
  const fd = fs.openSync(filePath, fs.constants.O_WRONLY | fs.constants.O_CREAT | fs.constants.O_EXCL, 0o644);
64
76
  try {
65
- fs.writeSync(fd, JSON.stringify({ pid: process.pid, createdAt: new Date().toISOString() }));
77
+ fs.writeSync(fd, JSON.stringify({ kind, pid: process.pid, createdAt: new Date().toISOString(), token }));
66
78
  } finally {
67
79
  fs.closeSync(fd);
68
80
  }
69
81
  }
70
82
 
71
- function acquireLockWithRetry(filePath: string, staleMs: number): void {
83
+ /**
84
+ * Read the token stored in a lock file. Returns undefined if the file
85
+ * cannot be read or parsed.
86
+ */
87
+ function readLockToken(filePath: string): string | undefined {
88
+ try {
89
+ const raw = fs.readFileSync(filePath, "utf-8");
90
+ const parsed = JSON.parse(raw) as { token?: unknown };
91
+ return typeof parsed.token === "string" ? parsed.token : undefined;
92
+ } catch {
93
+ return undefined;
94
+ }
95
+ }
96
+
97
+ /**
98
+ * Release a lock file, but ONLY if the stored token matches or no token can be read from the file. This prevents
99
+ * the "losing contender wipes winner's lock" race that occurs when:
100
+ * 1. Process A acquires lock with token T_A
101
+ * 2. Process B times out waiting, steals the lock (overwriting with T_B)
102
+ * 3. Process A finishes, tries to release — would otherwise rm Process B's lock
103
+ *
104
+ * With token matching, A's release is a no-op for B's lock.
105
+ */
106
+ function releaseLock(filePath: string, token: string): void {
107
+ const stored = readLockToken(filePath);
108
+ if (stored === undefined || stored === token) {
109
+ try {
110
+ fs.rmSync(filePath, { force: true });
111
+ } catch {
112
+ // Best-effort cleanup. Either someone else with the same token got
113
+ // there first, or the lock is already gone — both are fine.
114
+ }
115
+ }
116
+ // If the stored token does not match, our lock has been stolen
117
+ // (probably stale and overtaken). Do not touch it — the new holder owns it.
118
+ }
119
+
120
+ function acquireLockWithRetry(filePath: string, staleMs: number, kind: LockKind = "file"): string {
72
121
  let attempt = 0;
73
122
  const deadline = Date.now() + staleMs * 2;
74
123
  while (true) {
124
+ const token = randomUUID();
75
125
  try {
76
- writeLockFile(filePath);
77
- return;
126
+ writeLockFile(filePath, token, kind);
127
+ return token;
78
128
  } catch (error) {
79
129
  const code = (error as NodeJS.ErrnoException).code;
80
130
  if (code !== "EEXIST") throw error;
@@ -105,21 +155,14 @@ function sleep(ms: number): Promise<void> {
105
155
  return new Promise((resolve) => setTimeout(resolve, ms));
106
156
  }
107
157
 
108
- function readLockStateAsync(filePath: string, staleMs: number): void {
109
- try {
110
- if (isLockStale(filePath, staleMs)) fs.rmSync(filePath, { force: true });
111
- } catch {
112
- // Ignore stale-check races.
113
- }
114
- }
115
-
116
- async function acquireLockWithRetryAsync(filePath: string, staleMs: number): Promise<void> {
158
+ async function acquireLockWithRetryAsync(filePath: string, staleMs: number, kind: LockKind = "file"): Promise<string> {
117
159
  let attempt = 0;
118
160
  const deadline = Date.now() + staleMs * 2;
119
161
  while (true) {
162
+ const token = randomUUID();
120
163
  try {
121
- writeLockFile(filePath);
122
- return;
164
+ writeLockFile(filePath, token, kind);
165
+ return token;
123
166
  } catch (error) {
124
167
  const code = (error as NodeJS.ErrnoException).code;
125
168
  if (code !== "EEXIST") throw error;
@@ -139,7 +182,6 @@ async function acquireLockWithRetryAsync(filePath: string, staleMs: number): Pro
139
182
  try {
140
183
  fs.rmSync(filePath, { force: true });
141
184
  } catch { /* race — let loop retry */ }
142
- await readLockStateAsync(filePath, staleMs);
143
185
  const delay = Math.min(250, 25 * 2 ** attempt);
144
186
  await sleep(delay);
145
187
  attempt++;
@@ -159,15 +201,12 @@ export function withFileLockSync<T>(filePath: string, fn: () => T, options: RunL
159
201
  const lockFile = `${filePath}.lock`;
160
202
  const staleMs = options.staleMs ?? DEFAULT_STALE_MS;
161
203
  fs.mkdirSync(path.dirname(lockFile), { recursive: true });
162
- acquireLockWithRetry(lockFile, staleMs);
204
+ const token = acquireLockWithRetry(lockFile, staleMs, "file");
163
205
  try {
164
206
  return fn();
165
207
  } finally {
166
- try {
167
- fs.rmSync(lockFile, { force: true });
168
- } catch {
169
- // Best-effort lock cleanup.
170
- }
208
+ // Token-guarded release: don't rm the lock if it has been stolen.
209
+ releaseLock(lockFile, token);
171
210
  }
172
211
  }
173
212
 
@@ -175,15 +214,11 @@ export function withRunLockSync<T>(manifest: TeamRunManifest, fn: () => T, optio
175
214
  const filePath = lockPath(manifest);
176
215
  const staleMs = options.staleMs ?? DEFAULT_STALE_MS;
177
216
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
178
- acquireLockWithRetry(filePath, staleMs);
217
+ const token = acquireLockWithRetry(filePath, staleMs, "run");
179
218
  try {
180
219
  return fn();
181
220
  } finally {
182
- try {
183
- fs.rmSync(filePath, { force: true });
184
- } catch {
185
- // Best-effort lock cleanup.
186
- }
221
+ releaseLock(filePath, token);
187
222
  }
188
223
  }
189
224
 
@@ -191,14 +226,10 @@ export async function withRunLock<T>(manifest: TeamRunManifest, fn: () => Promis
191
226
  const filePath = lockPath(manifest);
192
227
  const staleMs = options.staleMs ?? DEFAULT_STALE_MS;
193
228
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
194
- await acquireLockWithRetryAsync(filePath, staleMs);
229
+ const token = await acquireLockWithRetryAsync(filePath, staleMs, "run");
195
230
  try {
196
231
  return await fn();
197
232
  } finally {
198
- try {
199
- fs.rmSync(filePath, { force: true });
200
- } catch {
201
- // Best-effort lock cleanup.
202
- }
233
+ releaseLock(filePath, token);
203
234
  }
204
235
  }
@@ -1,9 +1,8 @@
1
1
  import * as fs from "node:fs";
2
2
  import * as path from "node:path";
3
3
  import { loadRunManifestById } from "./state-store.ts";
4
- import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
4
+ import { projectCrewRoot } from "../utils/paths.ts";
5
5
  import { atomicWriteJson, readJsonFile } from "./atomic-write.ts";
6
- import { DEFAULT_PATHS } from "../config/defaults.ts";
7
6
 
8
7
  /**
9
8
  * Run metrics snapshot captured after a run completes (or on demand).