pi-crew 0.2.20 → 0.2.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/CHANGELOG.md +23 -10
  2. package/README.md +4 -2
  3. package/docs/PROJECT_REVIEW.md +271 -0
  4. package/docs/PROJECT_REVIEW_FIXES.md +343 -0
  5. package/docs/PROJECT_REVIEW_ROUND4.md +156 -0
  6. package/docs/PROJECT_REVIEW_ROUND5.md +86 -0
  7. package/docs/fixes/BATCH_A_H1_H2.md +86 -0
  8. package/docs/fixes/bug-006-foreground-cancel-concurrent.md +78 -0
  9. package/docs/fixes/bug-007-async-notifier-stale-ctx.md +112 -0
  10. package/docs/fixes/bug-008-child-process-silent-timeout.md +100 -0
  11. package/docs/fixes/bug-009-executor-yield-limit-needs-attention.md +75 -0
  12. package/docs/fixes/bug-010-child-process-api-key-filtered.md +109 -0
  13. package/docs/fixes/bug-011-spawn-pi-enoent.md +92 -0
  14. package/docs/fixes/bug-012-essential-env-stripped.md +89 -0
  15. package/docs/fixes/bug-013-background-runner-death.md +84 -0
  16. package/docs/fixes/bug-014-infinite-retry-loop-needs-attention.md +82 -0
  17. package/docs/fixes/bug-015-background-runner-sigterm.md +65 -0
  18. package/docs/fixes/bug-017-background-runner-session-shutdown.md +66 -0
  19. package/docs/fixes/bug-017-background-runner-sigkill-double-fork.md +28 -0
  20. package/docs/fixes/bug-018-child-pi-worker-stdin-hang.md +61 -0
  21. package/docs/fixes/bug-019-phantom-runs-temp-workspace.md +52 -0
  22. package/docs/pi-crew-bugs.md +954 -0
  23. package/docs/pi-crew-investigation-report.md +411 -0
  24. package/docs/pi-crew-test-final.md +120 -0
  25. package/docs/pi-crew-test-results.md +260 -0
  26. package/docs/pi-crew-test-round2.md +136 -0
  27. package/docs/pi-crew-test-round4.md +100 -0
  28. package/docs/pi-crew-test-round5.md +70 -0
  29. package/docs/pi-crew-test-round6.md +110 -0
  30. package/docs/usage.md +14 -0
  31. package/package.json +4 -2
  32. package/src/adapters/export-util.ts +12 -6
  33. package/src/agents/agent-config.ts +2 -0
  34. package/src/config/defaults.ts +1 -1
  35. package/src/config/markers.ts +22 -17
  36. package/src/config/resilient-parser.ts +1 -1
  37. package/src/extension/async-notifier.ts +4 -2
  38. package/src/extension/management.ts +52 -0
  39. package/src/extension/register.ts +47 -10
  40. package/src/extension/run-index.ts +20 -2
  41. package/src/extension/run-maintenance.ts +2 -2
  42. package/src/extension/team-tool/parallel-dispatch.ts +1 -1
  43. package/src/extension/team-tool/run.ts +3 -6
  44. package/src/extension/team-tool.ts +67 -11
  45. package/src/observability/event-to-metric.ts +2 -1
  46. package/src/runtime/async-runner.ts +42 -34
  47. package/src/runtime/background-runner.ts +165 -7
  48. package/src/runtime/child-pi.ts +111 -18
  49. package/src/runtime/code-summary.ts +1 -1
  50. package/src/runtime/crash-recovery.ts +1 -1
  51. package/src/runtime/crew-agent-runtime.ts +2 -1
  52. package/src/runtime/heartbeat-watcher.ts +4 -0
  53. package/src/runtime/live-agent-manager.ts +1 -1
  54. package/src/runtime/live-session-runtime.ts +2 -1
  55. package/src/runtime/manifest-cache.ts +2 -2
  56. package/src/runtime/model-fallback.ts +2 -1
  57. package/src/runtime/phase-progress.ts +1 -1
  58. package/src/runtime/pi-args.ts +3 -1
  59. package/src/runtime/pi-spawn.ts +6 -0
  60. package/src/runtime/prose-compressor.ts +1 -1
  61. package/src/runtime/result-extractor.ts +0 -1
  62. package/src/runtime/retry-executor.ts +1 -1
  63. package/src/runtime/runtime-resolver.ts +8 -3
  64. package/src/runtime/skill-instructions.ts +0 -1
  65. package/src/runtime/stale-reconciler.ts +30 -3
  66. package/src/runtime/subagent-manager.ts +2 -0
  67. package/src/runtime/task-display.ts +1 -1
  68. package/src/runtime/task-graph-scheduler.ts +1 -1
  69. package/src/runtime/task-runner/live-executor.ts +15 -0
  70. package/src/runtime/task-runner/tail-read.ts +26 -0
  71. package/src/runtime/task-runner.ts +1007 -383
  72. package/src/runtime/team-runner.ts +9 -5
  73. package/src/runtime/worker-startup.ts +3 -1
  74. package/src/schema/team-tool-schema.ts +2 -1
  75. package/src/state/active-run-registry.ts +8 -2
  76. package/src/state/atomic-write.ts +17 -0
  77. package/src/state/contracts.ts +5 -2
  78. package/src/state/event-log-rotation.ts +118 -31
  79. package/src/state/event-log.ts +33 -5
  80. package/src/state/event-reconstructor.ts +4 -2
  81. package/src/state/mailbox.ts +5 -1
  82. package/src/state/schedule.ts +146 -0
  83. package/src/state/types.ts +40 -0
  84. package/src/state/usage.ts +20 -0
  85. package/src/ui/crew-widget.ts +2 -2
  86. package/src/ui/run-event-bus.ts +1 -1
  87. package/src/ui/run-snapshot-cache.ts +2 -1
  88. package/src/ui/snapshot-types.ts +1 -0
  89. package/src/utils/gh-protocol.ts +2 -2
  90. package/src/utils/names.ts +1 -1
  91. package/src/utils/sse-parser.ts +0 -2
  92. package/src/worktree/branch-freshness.ts +1 -1
  93. package/src/worktree/cleanup.ts +54 -14
  94. package/src/worktree/worktree-manager.ts +19 -9
@@ -101,7 +101,9 @@ function shouldMergeTaskUpdate(current: TeamTaskState, updated: TeamTaskState):
101
101
  return updated.status !== current.status || updated.finishedAt !== current.finishedAt || updated.startedAt !== current.startedAt || Boolean(updated.resultArtifact) || Boolean(updated.error) || Boolean(updated.modelAttempts?.length) || Boolean(updated.usage) || Boolean(updated.attempts?.length);
102
102
  }
103
103
 
104
- export function __test__mergeTaskUpdates(base: TeamTaskState[], results: Array<{ tasks: TeamTaskState[] }>): TeamTaskState[] {
104
+ // H4 fix: rename to descriptive name. Kept __test__ as alias for backward
105
+ // compat test imports.
106
+ export function mergeTaskUpdatesPreservingTerminal(base: TeamTaskState[], results: Array<{ tasks: TeamTaskState[] }>): TeamTaskState[] {
105
107
  let merged = base;
106
108
  for (const result of results) {
107
109
  for (const updated of result.tasks) {
@@ -112,6 +114,8 @@ export function __test__mergeTaskUpdates(base: TeamTaskState[], results: Array<{
112
114
  }
113
115
  return refreshTaskGraphQueues(merged);
114
116
  }
117
+ /** @deprecated Use mergeTaskUpdatesPreservingTerminal. Kept for backward test import compat. */
118
+ export const __test__mergeTaskUpdates = mergeTaskUpdatesPreservingTerminal;
115
119
 
116
120
  // 2.8: adaptive-plan parsing/repair/injection moved to src/runtime/adaptive-plan.ts.
117
121
  // Re-export the test-only helpers so existing test imports still resolve.
@@ -260,10 +264,10 @@ function dagReadyTaskIds(tasks: TeamTaskState[], completedIds: Set<string>): str
260
264
  }
261
265
 
262
266
  export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ manifest: TeamRunManifest; tasks: TeamTaskState[] }> {
263
- let workflow = input.workflow;
267
+ const workflow = input.workflow;
264
268
  let manifest = updateRunStatus(input.manifest, "running", input.executeWorkers ? "Executing team workflow." : "Creating workflow prompts and placeholder results.");
265
269
 
266
- const runPromise = registerRunPromise(manifest.runId);
270
+ void registerRunPromise(manifest.runId);
267
271
 
268
272
  const cleanupUsage = (): void => {
269
273
  for (const task of input.tasks) clearTrackedTaskUsage(task.id);
@@ -541,10 +545,10 @@ async function executeTeamRunCore(
541
545
  );
542
546
  if (results.length === 0) break;
543
547
  manifest = { ...results.at(-1)!.manifest, artifacts: mergeArtifacts([manifest.artifacts, ...results.map((item) => item.manifest.artifacts)].flat()) };
544
- tasks = __test__mergeTaskUpdates(tasks, results);
548
+ tasks = mergeTaskUpdatesPreservingTerminal(tasks, results);
545
549
 
546
550
  // Advance workflow phases whose tasks are all in terminal state
547
- const terminalStatuses = new Set(["completed", "failed", "skipped", "cancelled"]);
551
+ const terminalStatuses = new Set(["completed", "failed", "skipped", "cancelled", "needs_attention"]);
548
552
  const phaseTaskMap = new Map<string, string[]>();
549
553
  for (const task of tasks) {
550
554
  if (!task.stepId) continue;
@@ -1,5 +1,5 @@
1
1
  export type WorkerLifecycleState = "spawning" | "trust_required" | "ready_for_prompt" | "running" | "finished" | "failed";
2
- export type StartupFailureClassification = "trust_required" | "prompt_misdelivery" | "prompt_acceptance_timeout" | "transport_dead" | "worker_crashed" | "unknown";
2
+ export type StartupFailureClassification = "trust_required" | "prompt_misdelivery" | "prompt_acceptance_timeout" | "transport_dead" | "worker_crashed" | "rate_limited" | "provider_error" | "unknown";
3
3
 
4
4
  export interface WorkerStartupEvidence {
5
5
  lastLifecycleState: WorkerLifecycleState;
@@ -20,6 +20,8 @@ export function detectTrustPrompt(text: string): boolean {
20
20
  }
21
21
 
22
22
  export function classifyStartupFailure(evidence: Omit<WorkerStartupEvidence, "classification">): StartupFailureClassification {
23
+ if (evidence.stderrPreview && /429|rate.?limit/i.test(evidence.stderrPreview)) return "rate_limited";
24
+ if (evidence.stderrPreview && /5\d{2}|server.?error|internal.?error|provider.?error/i.test(evidence.stderrPreview)) return "provider_error";
23
25
  if (!evidence.transportHealthy) return "transport_dead";
24
26
  if (evidence.trustPromptDetected || evidence.lastLifecycleState === "trust_required") return "trust_required";
25
27
  if (evidence.promptSentAt && !evidence.promptAccepted && evidence.childProcessAlive) return "prompt_acceptance_timeout";
@@ -49,6 +49,7 @@ export const TeamToolParams = Type.Object({
49
49
  Type.Literal("autonomy"),
50
50
  Type.Literal("api"),
51
51
  Type.Literal("settings"),
52
+ Type.Literal("steer"),
52
53
  ], { description: "Team action. Defaults to 'list' when omitted." })),
53
54
  resource: Type.Optional(Type.Union([
54
55
  Type.Literal("agent"),
@@ -93,7 +94,7 @@ export const TeamToolParams = Type.Object({
93
94
  });
94
95
 
95
96
  export interface TeamToolParamsValue {
96
- action?: "run" | "parallel" | "plan" | "status" | "list" | "get" | "cancel" | "retry" | "resume" | "respond" | "create" | "update" | "delete" | "doctor" | "cleanup" | "events" | "artifacts" | "worktrees" | "forget" | "summary" | "prune" | "export" | "import" | "imports" | "help" | "validate" | "config" | "init" | "recommend" | "autonomy" | "api" | "settings";
97
+ action?: "run" | "parallel" | "plan" | "status" | "list" | "get" | "cancel" | "retry" | "resume" | "respond" | "create" | "update" | "delete" | "doctor" | "cleanup" | "events" | "artifacts" | "worktrees" | "forget" | "summary" | "prune" | "export" | "import" | "imports" | "help" | "validate" | "config" | "init" | "recommend" | "autonomy" | "api" | "settings" | "steer";
97
98
  resource?: "agent" | "team" | "workflow";
98
99
  team?: string;
99
100
  workflow?: string;
@@ -121,7 +121,7 @@ export function readActiveRunRegistry(maxEntries = DEFAULT_CACHE.manifestMaxEntr
121
121
  }
122
122
  const entries = Array.isArray(parsed) ? parsed.map(normalizeEntry).filter((entry): entry is ActiveRunRegistryEntry => entry !== undefined) : [];
123
123
  const byId = new Map<string, ActiveRunRegistryEntry>();
124
- for (const entry of entries.sort((a, b) => b.updatedAt.localeCompare(a.updatedAt))) {
124
+ for (const entry of entries.sort((a, b) => (b.updatedAt ?? "").localeCompare(a.updatedAt ?? ""))) {
125
125
  if (!byId.has(entry.runId)) byId.set(entry.runId, entry);
126
126
  }
127
127
  return [...byId.values()].slice(0, Math.max(0, maxEntries));
@@ -157,12 +157,18 @@ function filterAliveEntries(entries: ActiveRunRegistryEntry[]): ActiveRunRegistr
157
157
  return false;
158
158
  }
159
159
  try {
160
- const raw = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8")) as { status?: string; async?: { pid?: number } };
160
+ const raw = JSON.parse(fs.readFileSync(entry.manifestPath, "utf-8")) as { status?: string; async?: { pid?: number }; updatedAt?: string };
161
161
  if (TERMINAL_STATUSES.has(raw.status ?? "")) return false;
162
162
  // Dead PID = stale async run
163
163
  if (raw.async?.pid) {
164
164
  try { process.kill(raw.async.pid, 0); } catch { return false; }
165
165
  }
166
+ // 2.19 — Stale non-async run: live-session/scaffold runs older than 30 min
167
+ // Without this, test runs that crash/leak would stay in the registry forever.
168
+ if (!raw.async) {
169
+ const updatedAt = typeof raw.updatedAt === 'string' ? Date.parse(raw.updatedAt) : NaN;
170
+ if (Number.isFinite(updatedAt) && Date.now() - updatedAt > 30 * 60 * 1000) return false;
171
+ }
166
172
  } catch {
167
173
  return false;
168
174
  }
@@ -114,6 +114,23 @@ export function atomicWriteFile(filePath: string, content: string): void {
114
114
  try {
115
115
  renameWithRetry(tempPath, filePath);
116
116
  } catch (renameError) {
117
+ // H3 fix: re-check symlink safety before fallback.
118
+ // Between isSymlinkSafePath at top and rename attempt, the file
119
+ // could have been replaced with a symlink (TOCTOU). Refuse if so.
120
+ try {
121
+ const lstat = fs.lstatSync(filePath);
122
+ if (lstat.isSymbolicLink()) {
123
+ try { fs.rmSync(tempPath, { force: true }); } catch { /* best-effort */ }
124
+ throw renameError;
125
+ }
126
+ } catch (checkError) {
127
+ // Only ENOENT / ENOTDIR means the file genuinely doesn't exist — safe to proceed.
128
+ // Re-throw everything else (EACCES, EPERM, EBUSY, etc.)
129
+ const code = (checkError as NodeJS.ErrnoException).code;
130
+ if (code !== "ENOENT" && code !== "ENOTDIR") {
131
+ throw checkError;
132
+ }
133
+ }
117
134
  // Fallback: if rename fails (Windows EPERM/EBUSY), try direct write.
118
135
  // This is less atomic but avoids data loss when concurrent writers contend.
119
136
  try {
@@ -1,11 +1,11 @@
1
1
  export const TEAM_RUN_STATUSES = ["queued", "planning", "running", "blocked", "completed", "failed", "cancelled"] as const;
2
2
  export type TeamRunStatus = typeof TEAM_RUN_STATUSES[number];
3
3
 
4
- export const TEAM_TASK_STATUSES = ["queued", "running", "waiting", "completed", "failed", "cancelled", "skipped"] as const;
4
+ export const TEAM_TASK_STATUSES = ["queued", "running", "waiting", "completed", "failed", "cancelled", "skipped", "needs_attention"] as const;
5
5
  export type TeamTaskStatus = typeof TEAM_TASK_STATUSES[number];
6
6
 
7
7
  export const TEAM_TERMINAL_RUN_STATUSES: ReadonlySet<TeamRunStatus> = new Set(["blocked", "completed", "failed", "cancelled"]);
8
- export const TEAM_TERMINAL_TASK_STATUSES: ReadonlySet<TeamTaskStatus> = new Set(["completed", "failed", "cancelled", "skipped"]);
8
+ export const TEAM_TERMINAL_TASK_STATUSES: ReadonlySet<TeamTaskStatus> = new Set(["completed", "failed", "cancelled", "skipped", "needs_attention"]);
9
9
 
10
10
  export const TEAM_RUN_STATUS_TRANSITIONS: Readonly<Record<TeamRunStatus, readonly TeamRunStatus[]>> = {
11
11
  queued: ["planning", "running", "cancelled", "failed"],
@@ -25,6 +25,7 @@ export const TEAM_TASK_STATUS_TRANSITIONS: Readonly<Record<TeamTaskStatus, reado
25
25
  failed: ["queued", "cancelled"],
26
26
  cancelled: ["queued"],
27
27
  skipped: ["queued", "cancelled"],
28
+ needs_attention: ["queued", "running"],
28
29
  };
29
30
 
30
31
  export const TEAM_EVENT_TYPES = [
@@ -46,6 +47,7 @@ export const TEAM_EVENT_TYPES = [
46
47
  "task.failed",
47
48
  "task.cancelled",
48
49
  "task.skipped",
50
+ "task.needs_attention",
49
51
  "review.approved",
50
52
  "review.rejected",
51
53
  "policy.action",
@@ -77,6 +79,7 @@ export const TEAM_WAKEABLE_EVENT_TYPES: ReadonlySet<TeamEventType> = new Set([
77
79
  "task.failed",
78
80
  "task.cancelled",
79
81
  "task.skipped",
82
+ "task.needs_attention",
80
83
  "async.completed",
81
84
  "async.failed",
82
85
  "async.stale",
@@ -1,6 +1,7 @@
1
1
  import * as fs from "node:fs";
2
2
  import { readEvents } from "./event-log.ts";
3
3
  import { atomicWriteFile } from "./atomic-write.ts";
4
+ import { logInternalError } from "../utils/internal-error.ts";
4
5
 
5
6
  export interface RotationConfig {
6
7
  maxFileSizeBytes: number;
@@ -77,24 +78,40 @@ export function compactEventLog(eventsPath: string, config?: Partial<RotationCon
77
78
  // Concurrent write conflict — skip compaction this cycle
78
79
  return undefined;
79
80
  }
80
- // C2: Re-read to recover any events appended between readEvents and atomicWriteFile
81
+ // C2: Re-read to recover any events appended during the compaction window.
82
+ // If events were appended and then overwritten by atomicWriteFile, they are LOST.
83
+ // Detect this and re-append any missing events.
81
84
  try {
82
85
  const afterWrite = readEvents(eventsPath);
83
- if (afterWrite.length > kept.length) {
84
- // Events were appended during the window — they're already in the file,
85
- // no data loss occurred since atomicWriteFile preserves appends after its write point
86
- }
87
86
  const appendedDuringWindow = afterWrite.length - kept.length;
88
- const eventsKept = kept.length + Math.max(0, appendedDuringWindow);
89
- const compactedSize = fs.statSync(eventsPath).size;
90
- return {
87
+ if (appendedDuringWindow >= 0) {
88
+ // No data loss — either events were appended and kept, or nothing happened.
89
+ return {
91
90
  originalSize,
92
- compactedSize,
93
- eventsRemoved: originalCount + Math.max(0, appendedDuringWindow) - eventsKept,
94
- eventsKept,
91
+ compactedSize: fs.statSync(eventsPath).size,
92
+ eventsRemoved: originalCount - kept.length,
93
+ eventsKept: kept.length + Math.max(0, appendedDuringWindow),
95
94
  };
95
+ }
96
+ // afterWrite.length < kept.length — events were lost during compaction window.
97
+ // Find missing events and re-append them.
98
+ const afterSet = new Set(afterWrite.map((e) => JSON.stringify(e)));
99
+ const missingEvents = kept.filter((e) => !afterSet.has(JSON.stringify(e)));
100
+ for (const event of missingEvents) {
101
+ try {
102
+ fs.appendFileSync(eventsPath, JSON.stringify(event) + "\n", "utf-8");
103
+ } catch {
104
+ // Append failed — best-effort recovery: swallow the error (nothing is logged here) and continue.
105
+ }
106
+ }
107
+ return {
108
+ originalSize,
109
+ compactedSize: fs.statSync(eventsPath).size,
110
+ eventsRemoved: originalCount - kept.length,
111
+ eventsKept: kept.length,
112
+ };
96
113
  } catch {
97
- // Post-write verification failed; compaction likely succeeded
114
+ // Post-write verification failed; compaction likely succeeded.
98
115
  const compactedSize = fs.statSync(eventsPath).size;
99
116
  return {
100
117
  originalSize,
@@ -105,6 +122,27 @@ export function compactEventLog(eventsPath: string, config?: Partial<RotationCon
105
122
  }
106
123
  }
107
124
 
125
/**
 * Archive the event log at `eventsPath` and leave an empty log in its place.
 *
 * The existing file is renamed to `<eventsPath>.<timestamp>.archive.jsonl`,
 * where the timestamp is the current ISO time with every ":" and "." replaced
 * by "-" (presumably to keep the name filesystem-safe — confirm). An empty
 * file is then written back at `eventsPath`.
 *
 * @param eventsPath - Path of the live event log file.
 * @returns `true` when both the rename and the re-creation succeeded; `false`
 *   when the log does not exist or either filesystem step threw (the error is
 *   passed to `logInternalError` and not re-thrown).
 */
export function rotateEventLog(eventsPath: string): boolean {
  const logPresent = fs.existsSync(eventsPath);
  if (logPresent) {
    try {
      const stamp = new Date().toISOString().replace(/[:.]/g, "-");
      fs.renameSync(eventsPath, `${eventsPath}.${stamp}.archive.jsonl`);
      // Re-create the live log as an empty file at the original path.
      fs.writeFileSync(eventsPath, "", "utf-8");
      return true;
    } catch (error) {
      logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
    }
  }
  return false;
}
145
+
108
146
  export interface EventLogStats {
109
147
  fileSizeBytes: number;
110
148
  eventCount: number;
@@ -125,29 +163,77 @@ export function getEventLogStats(eventsPath: string): EventLogStats | undefined
125
163
  return { fileSizeBytes: 0, eventCount: 0 };
126
164
  }
127
165
 
128
- // Count lines efficiently using readline-like scan
129
- const content = fs.readFileSync(eventsPath, "utf-8");
130
- const eventCount = content.split("\n").filter(Boolean).length;
131
-
132
- // Read first line for oldest timestamp
133
- let oldestTimestamp: string | undefined;
134
- try {
135
- const firstNewline = content.indexOf("\n");
136
- const firstLine = firstNewline === -1 ? content : content.slice(0, firstNewline);
137
- if (firstLine.trim()) {
138
- oldestTimestamp = (JSON.parse(firstLine) as { time: string }).time;
166
+ // NEW-9 fix: stream-scan for line count (no full-file load).
167
+ // Read last up-to-1KB for newest timestamp.
168
+ let newestTimestamp: string | undefined;
169
+ let lastLine = "";
170
+ const tailSize = Math.min(fileSizeBytes, 1024);
171
+ {
172
+ const tailBuf = Buffer.alloc(tailSize);
173
+ const fd = fs.openSync(eventsPath, "r");
174
+ try {
175
+ fs.readSync(fd, tailBuf, 0, tailSize, fileSizeBytes - tailSize);
176
+ } finally {
177
+ fs.closeSync(fd);
139
178
  }
140
- } catch { /* corrupt head */ }
179
+ const tailStr = tailBuf.toString("utf-8");
180
+ // JSONL files end with "\n", so the last newline bounds an empty string.
181
+ // Walk backwards to find the last non-empty line.
182
+ let searchFrom = tailStr.length;
183
+ for (;;) {
184
+ const nl = tailStr.lastIndexOf("\n", searchFrom - 1);
185
+ if (nl < 0) { lastLine = tailStr.trim(); break; }
186
+ const candidate = tailStr.slice(nl + 1, searchFrom).trim();
187
+ if (candidate) { lastLine = candidate; break; }
188
+ searchFrom = nl;
189
+ }
190
+ try {
191
+ if (lastLine) {
192
+ newestTimestamp = (JSON.parse(lastLine) as { time: string }).time;
193
+ }
194
+ } catch { /* corrupt tail */ }
195
+ }
141
196
 
142
- // Read last line for newest timestamp
143
- let newestTimestamp: string | undefined;
197
+ // Stream-scan to count newlines and find first line boundary.
198
+ let eventCount = 0;
199
+ let firstLineBytes = 0;
200
+ const buf = Buffer.alloc(8192);
201
+ let offset = 0;
202
+ let newlineCount = 0;
203
+ const scanFd = fs.openSync(eventsPath, "r");
144
204
  try {
145
- const lastNewline = content.lastIndexOf("\n", content.length - 2);
146
- const lastLine = content.slice(lastNewline + 1).trim();
147
- if (lastLine) {
148
- newestTimestamp = (JSON.parse(lastLine) as { time: string }).time;
205
+ let bytesRead: number;
206
+ while ((bytesRead = fs.readSync(scanFd, buf, 0, buf.length, offset)) > 0) {
207
+ for (let i = 0; i < bytesRead; i++) {
208
+ if (buf[i] === 10) {
209
+ if (newlineCount === 0) firstLineBytes = offset + i + 1;
210
+ newlineCount++;
211
+ }
212
+ }
213
+ offset += bytesRead;
214
+ }
215
+ } finally {
216
+ fs.closeSync(scanFd);
149
217
  }
150
- } catch { /* corrupt tail */ }
218
+ eventCount = newlineCount;
219
+
220
+ // Read first line for oldest timestamp.
221
+ let oldestTimestamp: string | undefined;
222
+ if (firstLineBytes > 0) {
223
+ try {
224
+ const firstBuf = Buffer.alloc(firstLineBytes);
225
+ const fd = fs.openSync(eventsPath, "r");
226
+ try {
227
+ fs.readSync(fd, firstBuf, 0, firstLineBytes, 0);
228
+ } finally {
229
+ fs.closeSync(fd);
230
+ }
231
+ const firstLine = firstBuf.toString("utf-8").trim();
232
+ if (firstLine) {
233
+ oldestTimestamp = (JSON.parse(firstLine) as { time: string }).time;
234
+ }
235
+ } catch { /* corrupt head */ }
236
+ }
151
237
 
152
238
  return {
153
239
  fileSizeBytes,
@@ -159,3 +245,4 @@ export function getEventLogStats(eventsPath: string): EventLogStats | undefined
159
245
  return undefined;
160
246
  }
161
247
  }
248
+
@@ -8,7 +8,7 @@ import { logInternalError } from "../utils/internal-error.ts";
8
8
  import { readJsonlSince, type IncrementalReadState } from "../utils/incremental-reader.ts";
9
9
  import { redactSecrets } from "../utils/redaction.ts";
10
10
  import { sleepSync } from "../utils/sleep.ts";
11
- import { needsRotation, compactEventLog } from "./event-log-rotation.ts";
11
+ import { needsRotation, compactEventLog, rotateEventLog } from "./event-log-rotation.ts";
12
12
 
13
13
  export type TeamEventProvenance = "live_worker" | "test" | "healthcheck" | "replay" | "api" | "background" | "team_runner";
14
14
  export type TeamWatcherAction = "act" | "observe" | "ignore";
@@ -64,7 +64,7 @@ let appendCounter = 0;
64
64
  /** Simple cross-process lock for an eventsPath to prevent JSONL interleave on concurrent append.
65
65
  * Detects stale locks by checking the owner PID written inside the lock directory.
66
66
  */
67
- function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
67
+ export function withEventLogLockSync<T>(eventsPath: string, fn: () => T): T {
68
68
  const lockDir = `${eventsPath}.lock`;
69
69
  const pidFile = path.join(lockDir, "pid");
70
70
  const start = Date.now();
@@ -208,15 +208,43 @@ function appendEventInsideLock(eventsPath: string, event: AppendTeamEvent): Team
208
208
  metadata = { ...metadata, fingerprint: baseMetadata?.fingerprint ?? computeEventFingerprint(fullEvent) };
209
209
  fullEvent.metadata = metadata;
210
210
  }
211
+ // H1 fix: handle overflow before appending.
212
+ // 1. Terminal events must always be persisted regardless of size.
213
+ // 2. Non-terminal events exceeding MAX_EVENTS_BYTES trigger immediate compact.
214
+ // 3. After compact, if still over limit, rotate.
215
+ const isTerminal = TERMINAL_EVENT_TYPES.has(fullEvent.type);
216
+ let skippedDueToSize = false;
217
+ if (!isTerminal && fs.existsSync(eventsPath)) {
218
+ const stat = fs.statSync(eventsPath);
219
+ if (stat.size > MAX_EVENTS_BYTES) {
220
+ // Try immediate compact (not waiting for counter % 100)
221
+ try {
222
+ compactEventLog(eventsPath);
223
+ } catch (error) {
224
+ logInternalError("event-log.immediate-compact", error, `eventsPath=${eventsPath}`);
225
+ }
226
+ // Check if still too large after compact — if so, rotate
227
+ if (fs.existsSync(eventsPath)) {
228
+ const afterCompact = fs.statSync(eventsPath);
229
+ if (afterCompact.size > MAX_EVENTS_BYTES) {
230
+ rotateEventLog(eventsPath);
231
+ }
232
+ }
233
+ }
234
+ }
211
235
  try {
212
236
  if (fs.existsSync(eventsPath) && fs.statSync(eventsPath).size > MAX_EVENTS_BYTES) {
213
- logInternalError("event-log.size-limit", new Error(`events file ${eventsPath} exceeds ${MAX_EVENTS_BYTES} bytes`), `eventsPath=${eventsPath}`);
214
- return { ...fullEvent, metadata: { ...(fullEvent.metadata ?? { seq: 0, provenance: "team_runner" }), appended: false } };
237
+ // Only reach here for non-terminal events that still overflow after compact+rotate.
238
+ // Log and mark as not appended.
239
+ logInternalError("event-log.size-limit", new Error(`events file ${eventsPath} exceeds ${MAX_EVENTS_BYTES} bytes after compaction`), `eventsPath=${eventsPath}`);
240
+ skippedDueToSize = true;
215
241
  }
216
242
  } catch (error) {
217
243
  logInternalError("event-log.size-check", error, `eventsPath=${eventsPath}`);
218
244
  }
219
- fs.appendFileSync(eventsPath, `${JSON.stringify(redactSecrets(fullEvent))}\n`, "utf-8");
245
+ if (!skippedDueToSize) {
246
+ fs.appendFileSync(eventsPath, `${JSON.stringify(redactSecrets(fullEvent))}\n`, "utf-8");
247
+ }
220
248
  appendCounter++;
221
249
  if (appendCounter % 100 === 0 && needsRotation(eventsPath)) {
222
250
  try { compactEventLog(eventsPath); } catch (error) { logInternalError("event-log.rotation", error, `eventsPath=${eventsPath}`); }
@@ -11,7 +11,7 @@ import type { TeamEvent } from "./event-log.ts";
11
11
  import { readEvents } from "./event-log.ts";
12
12
 
13
13
  /** Task status values that can be reconstructed from lifecycle events. */
14
- const RECONSTRUCTABLE_STATUSES = new Set(["created", "queued", "running", "completed", "failed", "cancelled", "skipped", "waiting"]);
14
+ const RECONSTRUCTABLE_STATUSES = new Set(["created", "queued", "running", "completed", "failed", "cancelled", "skipped", "waiting", "needs_attention"]);
15
15
 
16
16
  /** Event types that carry task lifecycle state transitions. */
17
17
  const TASK_LIFECYCLE_EVENT_TYPES = new Set([
@@ -21,6 +21,7 @@ const TASK_LIFECYCLE_EVENT_TYPES = new Set([
21
21
  "task.failed",
22
22
  "task.skipped",
23
23
  "task.cancelled",
24
+ "task.needs_attention",
24
25
  "task.waiting",
25
26
  "task.resumed",
26
27
  "task.retried",
@@ -31,7 +32,7 @@ const TASK_LIFECYCLE_EVENT_TYPES = new Set([
31
32
  ]);
32
33
 
33
34
  /** Terminal events that set finishedAt. */
34
- const TERMINAL_EVENTS = new Set(["task.completed", "task.failed", "task.cancelled", "task.skipped"]);
35
+ const TERMINAL_EVENTS = new Set(["task.completed", "task.failed", "task.cancelled", "task.skipped", "task.needs_attention"]);
35
36
 
36
37
  /** Mapping from event type to the reconstructed task status. */
37
38
  const EVENT_STATUS_MAP: Readonly<Record<string, string>> = {
@@ -41,6 +42,7 @@ const EVENT_STATUS_MAP: Readonly<Record<string, string>> = {
41
42
  "task.failed": "failed",
42
43
  "task.skipped": "skipped",
43
44
  "task.cancelled": "cancelled",
45
+ "task.needs_attention": "needs_attention",
44
46
  "task.waiting": "waiting",
45
47
  "task.resumed": "running",
46
48
  "task.retried": "queued",
@@ -5,6 +5,7 @@ import { resolveRealContainedPath } from "../utils/safe-paths.ts";
5
5
  import { redactSecrets } from "../utils/redaction.ts";
6
6
  import { logInternalError } from "../utils/internal-error.ts";
7
7
  import { atomicWriteFile } from "./atomic-write.ts";
8
+ import { withEventLogLockSync } from "./event-log.ts";
8
9
 
9
10
  export type MailboxDirection = "inbox" | "outbox";
10
11
  export type MailboxMessageStatus = "queued" | "delivered" | "acknowledged";
@@ -298,7 +299,10 @@ export function appendMailboxMessage(manifest: TeamRunManifest, message: Omit<Ma
298
299
  repliedAt: message.repliedAt,
299
300
  replyContent: message.replyContent,
300
301
  };
301
- fs.appendFileSync(mailboxFile(manifest, complete.direction, complete.taskId), `${JSON.stringify(redactSecrets(complete))}\n`, "utf-8");
302
+ // H2 fix: wrap append in cross-process lock to prevent interleaving on Windows.
303
+ withEventLogLockSync(mailboxFile(manifest, complete.direction, complete.taskId), () => {
304
+ fs.appendFileSync(mailboxFile(manifest, complete.direction, complete.taskId), `${JSON.stringify(redactSecrets(complete))}\n`, "utf-8");
305
+ });
302
306
  // 3.3 — rotate mailbox file if it has grown past 10 MB. Cheap stat
303
307
  // check; rotates at most once per append.
304
308
  rotateMailboxFileIfNeeded(mailboxFile(manifest, complete.direction, complete.taskId));
@@ -0,0 +1,146 @@
1
+ /**
2
+ * schedule.ts — Schedule detection and parsing utilities.
3
+ *
4
+ * Mirrors pi-subagents3's SubagentScheduler static methods:
5
+ * - detectSchedule(): sniff cron / interval / one-shot from string
6
+ * - validateCronExpression(): 6-field cron validation
7
+ * - parseRelativeTime(): "+10m" → ISO timestamp
8
+ * - parseInterval(): "5m" → milliseconds
9
+ */
10
+
11
import { existsSync, mkdirSync, readFileSync, renameSync, unlinkSync, writeFileSync } from "node:fs";
import { dirname } from "node:path";

import type { ScheduleStoreData, ScheduledTask } from "./types.ts";
12
+
13
+ export type DetectedSchedule =
14
+ | { type: "cron"; normalized: string }
15
+ | { type: "interval"; intervalMs: number; normalized: string }
16
+ | { type: "once"; normalized: string };
17
+
18
/**
 * Parse a relative one-shot offset such as "+10s", "+5m", "+1h" or "+2d".
 *
 * @param s - Candidate string; surrounding whitespace is ignored.
 * @returns ISO-8601 timestamp for `Date.now()` plus the offset, or `null` when
 *   `s` is not a relative time or the offset lands outside the range a `Date`
 *   can represent.
 */
export function parseRelativeTime(s: string): string | null {
  const m = s.trim().match(/^\+(\d+)(s|m|h|d)$/);
  if (!m) return null;
  const unitMs = { s: 1000, m: 60_000, h: 3_600_000, d: 86_400_000 } as const;
  const offsetMs = parseInt(m[1], 10) * unitMs[m[2] as keyof typeof unitMs];
  const when = new Date(Date.now() + offsetMs);
  // An out-of-range Date makes toISOString() throw a RangeError; report
  // "not a usable relative time" instead of crashing the caller.
  if (Number.isNaN(when.getTime())) return null;
  return when.toISOString();
}
25
+
26
/**
 * Parse a repeat interval such as "10s", "5m", "1h" or "2d".
 *
 * @param s - Candidate string; surrounding whitespace is ignored.
 * @returns The interval in milliseconds, or `null` when `s` is not an interval,
 *   describes a zero-length period, or is too large to represent exactly.
 */
export function parseInterval(s: string): number | null {
  const m = s.trim().match(/^(\d+)(s|m|h|d)$/);
  if (!m) return null;
  const unitMs = { s: 1000, m: 60_000, h: 3_600_000, d: 86_400_000 } as const;
  const ms = parseInt(m[1], 10) * unitMs[m[2] as keyof typeof unitMs];
  // "0s" is not a usable repeat period, and a product beyond the safe-integer
  // range has lost precision (or overflowed to Infinity).
  if (ms <= 0 || !Number.isSafeInteger(ms)) return null;
  return ms;
}
32
+
33
/**
 * Shape-check a 6-field cron expression ("second minute hour dom month dow").
 *
 * Only the number of whitespace-separated fields is verified here. The content
 * of each field is deliberately not inspected; per the original note, malformed
 * patterns are left for the cron library to reject at execution time.
 *
 * @param expr - Candidate cron expression; surrounding whitespace is ignored.
 * @returns `{ valid: true }` for exactly six fields, otherwise `{ valid: false, error }`.
 */
export function validateCronExpression(expr: string): { valid: boolean; error?: string } {
  const trimmed = expr.trim();
  // "".split(/\s+/) yields [""], which would be miscounted as one field.
  const fields = trimmed === "" ? [] : trimmed.split(/\s+/);
  if (fields.length !== 6) {
    return {
      valid: false,
      error: `Cron must have 6 fields (second minute hour dom month dow), got ${fields.length}. Example: "0 0 9 * * 1" for 9am every Monday.`,
    };
  }
  // After trim + split on whitespace runs no field can be empty, so no
  // further per-field emptiness check is needed.
  return { valid: true };
}
50
+
51
/**
 * Classify a schedule string as a one-shot, an interval, or a cron expression.
 *
 * Candidates are tried in a fixed order: relative offset ("+10m"), interval
 * ("5m"), ISO timestamp, then 6-field cron. The leading "+" is what separates
 * a relative one-shot from an interval, since both are digit+unit.
 *
 * @param s - Raw schedule string; surrounding whitespace is ignored.
 * @returns The detected schedule tagged with its type and normalized form.
 * @throws Error when an ISO timestamp is not in the future, or when the input
 *   matches none of the supported formats.
 */
export function detectSchedule(s: string): DetectedSchedule {
  const input = s.trim();

  // Relative one-shot, e.g. "+10m".
  const relativeIso = parseRelativeTime(input);
  if (relativeIso !== null) {
    return { type: "once", normalized: relativeIso };
  }

  // Repeating interval, e.g. "5m".
  const everyMs = parseInterval(input);
  if (everyMs !== null) {
    return { type: "interval", intervalMs: everyMs, normalized: input };
  }

  // Absolute one-shot: anything starting like an ISO date-time. Unparseable
  // dates fall through to the cron check below.
  const looksLikeIso = /^\d{4}-\d{2}-\d{2}T/.test(input);
  if (looksLikeIso) {
    const when = new Date(input);
    const epochMs = when.getTime();
    if (!Number.isNaN(epochMs)) {
      if (epochMs <= Date.now()) {
        throw new Error(`Scheduled time ${when.toISOString()} is in the past.`);
      }
      return { type: "once", normalized: when.toISOString() };
    }
  }

  // Last resort: 6-field cron.
  const { valid: isCron } = validateCronExpression(input);
  if (!isCron) {
    throw new Error(
      `Invalid schedule "${s}". Use 6-field cron (e.g. "0 0 9 * * 1" — 9am every Monday), interval ("5m"/"1h"), or one-shot ("+10m" / ISO).`,
    );
  }
  return { type: "cron", normalized: input };
}
81
+
82
/**
 * ScheduleStore: JSON-file-backed list of scheduled tasks.
 *
 * The file is read once on construction and rewritten after every mutation.
 * Writes go to a temporary sibling file that is then renamed over the target,
 * so a reader never observes a half-written store.
 */
export class ScheduleStore {
  private readonly path: string;
  private data: ScheduleStoreData;

  /**
   * @param path - Location of the JSON store file. A missing, unreadable or
   *   malformed file results in an empty store rather than an error.
   */
  constructor(path: string) {
    this.path = path;
    this.data = { version: 1, jobs: [] };
    try {
      if (existsSync(path)) {
        const parsed = JSON.parse(readFileSync(path, "utf-8"));
        // Require `jobs` to be a real array: every accessor below iterates it,
        // so `{ "jobs": null }` must not be accepted as valid data.
        if (parsed && typeof parsed === "object" && "version" in parsed && Array.isArray(parsed.jobs)) {
          this.data = parsed as ScheduleStoreData;
        }
      }
    } catch {
      // Corrupt or unreadable file — start fresh.
    }
  }

  /** Persist the current state; failures are logged as a warning, never thrown. */
  private save(): void {
    // The pid suffix keeps concurrent processes from sharing one temp file.
    const tmpPath = `${this.path}.${process.pid}.tmp`;
    try {
      mkdirSync(dirname(this.path), { recursive: true });
      writeFileSync(tmpPath, JSON.stringify(this.data, null, 2), "utf-8");
      renameSync(tmpPath, this.path);
    } catch (error) {
      try {
        unlinkSync(tmpPath);
      } catch {
        // Temp file was never created or is already gone.
      }
      console.warn(`[pi-crew] Failed to save schedule store: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  /** @returns A shallow copy of the job list (the job objects themselves are shared). */
  list(): ScheduledTask[] {
    return [...this.data.jobs];
  }

  /** @returns Whether any stored job has exactly this name. */
  hasName(name: string): boolean {
    return this.data.jobs.some(j => j.name === name);
  }

  /** @returns The job with the given id, or `undefined` when there is none. */
  get(id: string): ScheduledTask | undefined {
    return this.data.jobs.find(j => j.id === id);
  }

  /** Append a job and persist. No uniqueness check is performed here. */
  add(job: ScheduledTask): void {
    this.data.jobs.push(job);
    this.save();
  }

  /**
   * Shallow-merge `patch` into the job with the given id and persist.
   *
   * @returns The updated job, or `undefined` when the id is unknown.
   */
  update(id: string, patch: Partial<ScheduledTask>): ScheduledTask | undefined {
    const idx = this.data.jobs.findIndex(j => j.id === id);
    if (idx === -1) return undefined;
    this.data.jobs[idx] = { ...this.data.jobs[idx], ...patch };
    this.save();
    return this.data.jobs[idx];
  }

  /**
   * Remove every job with the given id; persists only when something changed.
   *
   * @returns `true` when at least one job was removed.
   */
  remove(id: string): boolean {
    const before = this.data.jobs.length;
    this.data.jobs = this.data.jobs.filter(j => j.id !== id);
    if (this.data.jobs.length !== before) {
      this.save();
      return true;
    }
    return false;
  }
}