pi-crew 0.7.4 → 0.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +79 -0
- package/README.md +11 -11
- package/docs/commands-reference.md +14 -10
- package/docs/troubleshooting.md +131 -0
- package/docs/usage.md +9 -4
- package/package.json +1 -1
- package/src/config/config.ts +11 -4
- package/src/config/types.ts +2 -0
- package/src/errors.ts +66 -0
- package/src/extension/action-suggestions.ts +71 -0
- package/src/extension/context-status-injection.ts +174 -0
- package/src/extension/knowledge-injection.ts +29 -1
- package/src/extension/register.ts +81 -65
- package/src/extension/team-tool/api.ts +3 -2
- package/src/extension/team-tool/cancel.ts +5 -4
- package/src/extension/team-tool/explain.ts +2 -1
- package/src/extension/team-tool/failure-patterns.ts +124 -0
- package/src/extension/team-tool/inspect.ts +10 -6
- package/src/extension/team-tool/lifecycle-actions.ts +5 -4
- package/src/extension/team-tool/respond.ts +4 -3
- package/src/extension/team-tool/run-not-found.ts +54 -0
- package/src/extension/team-tool/run.ts +26 -4
- package/src/extension/team-tool/status.ts +58 -4
- package/src/extension/team-tool.ts +5 -3
- package/src/runtime/async-runner.ts +7 -0
- package/src/runtime/background-runner.ts +7 -1
- package/src/runtime/chain-parser.ts +13 -5
- package/src/runtime/checkpoint.ts +13 -1
- package/src/runtime/child-pi.ts +9 -1
- package/src/runtime/live-session-runtime.ts +15 -1
- package/src/runtime/parent-guard.ts +2 -2
- package/src/runtime/pipeline-runner.ts +3 -1
- package/src/runtime/stale-reconciler.ts +28 -4
- package/src/runtime/task-runner.ts +50 -20
- package/src/runtime/team-runner.ts +19 -2
- package/src/runtime/verification-gates.ts +21 -1
- package/src/runtime/workspace-tree.ts +28 -2
- package/src/schema/team-tool-schema.ts +9 -0
- package/src/state/blob-store.ts +12 -10
- package/src/state/event-log-rotation.ts +114 -93
- package/src/state/event-log.ts +83 -23
- package/src/state/health-store.ts +6 -1
- package/src/state/locks.ts +66 -16
- package/src/state/state-store.ts +46 -2
- package/src/ui/card-colors.ts +7 -3
- package/src/ui/dashboard-panes/agents-pane.ts +15 -2
- package/src/ui/live-duration.ts +58 -0
- package/src/ui/tool-render.ts +7 -11
- package/src/ui/tool-renderers/index.ts +6 -3
- package/src/ui/widget/widget-formatters.ts +2 -13
- package/src/utils/fs-watch.ts +11 -60
- package/src/utils/run-watcher-registry.ts +164 -0
- package/src/workflows/discover-workflows.ts +2 -1
- package/src/workflows/workflow-config.ts +5 -0
- package/src/runtime/dynamic-script-runner.ts +0 -497
- package/src/runtime/sandbox.ts +0 -335
|
@@ -2,6 +2,7 @@ import * as fs from "node:fs";
|
|
|
2
2
|
import * as os from "node:os";
|
|
3
3
|
import * as path from "node:path";
|
|
4
4
|
import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
|
|
5
|
+
import { errors } from "../errors.ts";
|
|
5
6
|
import { recordFromTask, upsertCrewAgent } from "./crew-agent-records.ts";
|
|
6
7
|
import { checkProcessLiveness } from "./process-status.ts";
|
|
7
8
|
import { saveRunManifest } from "../state/state-store.ts";
|
|
@@ -272,6 +273,23 @@ function getRunningTaskStaleness(
|
|
|
272
273
|
/**
|
|
273
274
|
* Repair a stale run by marking it as failed and cancelling running tasks.
|
|
274
275
|
*/
|
|
276
|
+
/**
 * E3/E1 (Round 15): Build a structured, human-actionable error for a task that
 * is being cancelled by stale-run reconciliation.
 *
 * Explains WHY the run was marked stale (the detected reason) instead of the
 * bare 'Stale run reconciled: <reason>'. The error is produced by
 * `errors.runStale` (noted as CrewError E012), so callers get a machine-readable
 * code + help hint; callers that only need text read `.message`.
 *
 * @param task - Task being reconciled; only `heartbeat.lastSeenAt` is read.
 * @param reason - Detected staleness reason, forwarded unchanged.
 * @returns The error built by `errors.runStale(reason, heartbeatAgeSeconds)`.
 */
function buildStaleReconcileError(task: TeamTaskState, reason: string): Error {
	const lastSeenAt = task.heartbeat?.lastSeenAt;
	const lastSeenMs = lastSeenAt ? new Date(lastSeenAt).getTime() : Number.NaN;
	// An unparsable timestamp makes getTime() return NaN; report "age unknown"
	// (undefined) rather than leaking NaN into the user-facing message. A
	// heartbeat stamped in the future (clock skew) is clamped to 0 seconds so the
	// age is never negative.
	const heartbeatAgeSeconds = Number.isFinite(lastSeenMs)
		? Math.max(0, Math.round((Date.now() - lastSeenMs) / 1000))
		: undefined;
	return errors.runStale(reason, heartbeatAgeSeconds);
}
|
|
287
|
+
|
|
288
|
+
/** @deprecated Use buildStaleReconcileError (returns a structured CrewError). Retained only as a string-returning shim for callers that need just the message text. */
|
|
289
|
+
function formatStaleReconcileError(task: TeamTaskState, reason: string): string {
|
|
290
|
+
return buildStaleReconcileError(task, reason).message;
|
|
291
|
+
}
|
|
292
|
+
|
|
275
293
|
function repairStaleRun(
|
|
276
294
|
manifest: TeamRunManifest,
|
|
277
295
|
tasks: TeamTaskState[],
|
|
@@ -288,7 +306,8 @@ function repairStaleRun(
|
|
|
288
306
|
...task,
|
|
289
307
|
status: "cancelled" as const,
|
|
290
308
|
finishedAt: now,
|
|
291
|
-
|
|
309
|
+
// E3/E1 (Round 15): structured CrewError (E012) with code + help hint.
|
|
310
|
+
error: buildStaleReconcileError(task, reason).message,
|
|
292
311
|
};
|
|
293
312
|
}
|
|
294
313
|
return task;
|
|
@@ -466,9 +485,13 @@ export interface OrphanReconcileResult {
|
|
|
466
485
|
*/
|
|
467
486
|
export function reconcileOrphanedTempWorkspaces(
|
|
468
487
|
now = Date.now(),
|
|
469
|
-
options?: { cleanupOrphanedTempDirs?: boolean },
|
|
488
|
+
options?: { cleanupOrphanedTempDirs?: boolean; tmpDir?: string; scanBatchSize?: number },
|
|
470
489
|
): OrphanReconcileResult {
|
|
471
|
-
|
|
490
|
+
// Injectable tmpDir + scanBatchSize for deterministic unit testing
|
|
491
|
+
// (Round 19: tests must not depend on global /tmp cleanliness; the
|
|
492
|
+
// production ORPHAN_TEMP_SCAN_BATCH_SIZE cap could exclude a test's dir
|
|
493
|
+
// when leftover dirs accumulate). Defaults remain getSafeTempDir() + the cap.
|
|
494
|
+
const tmpDir = options?.tmpDir ?? getSafeTempDir();
|
|
472
495
|
if (!tmpDir) return { repaired: 0, cleanedDirs: 0 };
|
|
473
496
|
let repaired = 0;
|
|
474
497
|
let cleanedDirs = 0;
|
|
@@ -477,10 +500,11 @@ export function reconcileOrphanedTempWorkspaces(
|
|
|
477
500
|
// Sort for deterministic order; cap to ORPHAN_TEMP_SCAN_BATCH_SIZE per
|
|
478
501
|
// tick to avoid main-thread stalls when /tmp has thousands of
|
|
479
502
|
// pi-crew-* dirs from past interrupted test runs.
|
|
503
|
+
const scanBatch = options?.scanBatchSize ?? ORPHAN_TEMP_SCAN_BATCH_SIZE;
|
|
480
504
|
const candidates = entries
|
|
481
505
|
.filter((e) => e.isDirectory() && e.name.startsWith("pi-crew-"))
|
|
482
506
|
.sort((a, b) => a.name.localeCompare(b.name))
|
|
483
|
-
.slice(0,
|
|
507
|
+
.slice(0, scanBatch);
|
|
484
508
|
for (const entry of candidates) {
|
|
485
509
|
if (!entry.isDirectory() || !entry.name.startsWith("pi-crew-"))
|
|
486
510
|
continue;
|
|
@@ -11,6 +11,7 @@ import type {
|
|
|
11
11
|
VerificationEvidence,
|
|
12
12
|
} from "../state/types.ts";
|
|
13
13
|
import { logInternalError } from "../utils/internal-error.ts";
|
|
14
|
+
import { errors } from "../errors.ts";
|
|
14
15
|
import { writeArtifact } from "../state/artifact-store.ts";
|
|
15
16
|
import { appendEventAsync, appendEventFireAndForget } from "../state/event-log.ts";
|
|
16
17
|
import { saveRunManifest } from "../state/state-store.ts";
|
|
@@ -288,7 +289,19 @@ export async function runTeamTask(
|
|
|
288
289
|
});
|
|
289
290
|
} catch (err) {
|
|
290
291
|
const msg = err instanceof Error ? err.message : String(err);
|
|
291
|
-
|
|
292
|
+
const exitCode = (err as NodeJS.ErrnoException & { status?: number }).status;
|
|
293
|
+
// E1 (Round 15): structured CrewError with code E009 + help hint,
|
|
294
|
+
// instead of a raw Error. Surfaces the script path, exit code, and stderr.
|
|
295
|
+
// Round 21 (E4): if preStepOptional is set, a failing hook is NON-FATAL.
|
|
296
|
+
// Log a warning + emit a 'warning' event, then proceed without the
|
|
297
|
+
// pre-step output rather than aborting the task (advisory hooks).
|
|
298
|
+
if (input.step.preStepOptional) {
|
|
299
|
+
const warnMsg = `[preStepOptional] pre-step hook '${input.step.preStepScript}' failed (exit ${exitCode ?? "?"}) but preStepOptional=true; continuing without its output.`;
|
|
300
|
+
try { appendEventFireAndForget(manifest.eventsPath, { type: "hook.pre_step_optional_failed", runId: manifest.runId, taskId: task.id, message: warnMsg, data: { script: input.step.preStepScript, exitCode: exitCode ?? null } }); } catch { /* best-effort event log */ }
|
|
301
|
+
preStepOutput = undefined;
|
|
302
|
+
} else {
|
|
303
|
+
throw errors.preStepFailed(input.step.preStepScript, exitCode, msg);
|
|
304
|
+
}
|
|
292
305
|
}
|
|
293
306
|
}
|
|
294
307
|
|
|
@@ -383,6 +396,7 @@ export async function runTeamTask(
|
|
|
383
396
|
let lastAgentRecordPersistedAt = 0;
|
|
384
397
|
let lastHeartbeatPersistedAt = 0;
|
|
385
398
|
let lastRunProgressPersistedAt = 0;
|
|
399
|
+
let lastTaskProgressPersistedAt = 0;
|
|
386
400
|
let lastRunProgressSummary: ProgressEventSummary | undefined;
|
|
387
401
|
const persistHeartbeat = (force = false): void => {
|
|
388
402
|
const now = Date.now();
|
|
@@ -573,26 +587,23 @@ export async function runTeamTask(
|
|
|
573
587
|
const eventLine = typeof event === "object" && !Array.isArray(event) ? JSON.stringify(event) : String(event);
|
|
574
588
|
fs.appendFileSync(bgLogPath, `${eventLine}\n`);
|
|
575
589
|
}
|
|
576
|
-
//
|
|
577
|
-
//
|
|
578
|
-
//
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
}
|
|
587
|
-
task = {
|
|
588
|
-
...task,
|
|
589
|
-
agentProgress: applyAgentProgressEvent(
|
|
590
|
-
task.agentProgress ?? emptyCrewAgentProgress(),
|
|
591
|
-
event,
|
|
592
|
-
task.startedAt,
|
|
593
|
-
),
|
|
594
|
-
};
|
|
590
|
+
// Always keep in-memory agentProgress fresh (cheap) so the UI/events see
|
|
591
|
+
// the latest progress, but THROTTLE the disk persist. Previously this
|
|
592
|
+
// did a full locked read-parse-write of tasks.json on EVERY child JSON
|
|
593
|
+
// event — a 200-event task produced 200 such cycles (Round 15 P1).
|
|
594
|
+
// Final state is force-flushed on task completion (persistHeartbeat(true)).
|
|
595
|
+
const nextProgress = applyAgentProgressEvent(
|
|
596
|
+
task.agentProgress ?? emptyCrewAgentProgress(),
|
|
597
|
+
event,
|
|
598
|
+
task.startedAt,
|
|
599
|
+
);
|
|
600
|
+
task = { ...task, agentProgress: nextProgress };
|
|
595
601
|
tasks = updateTask(tasks, task);
|
|
602
|
+
const progressNow = Date.now();
|
|
603
|
+
if (progressNow - lastTaskProgressPersistedAt >= 500) {
|
|
604
|
+
tasks = persistSingleTaskUpdate(manifest, tasks, task);
|
|
605
|
+
lastTaskProgressPersistedAt = progressNow;
|
|
606
|
+
}
|
|
596
607
|
// Bridge event to UI event bus for near-instant updates
|
|
597
608
|
const bridgeEvent = bridgeEventFromJsonEvent(
|
|
598
609
|
manifest.runId,
|
|
@@ -701,6 +712,15 @@ export async function runTeamTask(
|
|
|
701
712
|
? childResult.stderr ||
|
|
702
713
|
`Child Pi exited with ${childResult.exitCode}`
|
|
703
714
|
: undefined);
|
|
715
|
+
// E1/E7 (Round 15): when the child timed out, surface a structured
|
|
716
|
+
// CrewError (E007) so users get a code + actionable help hint instead
|
|
717
|
+
// of a bare 'no new output for N ms'. We keep .message as the task error.
|
|
718
|
+
if (childResult.exitStatus?.timedOut) {
|
|
719
|
+
error = errors.childTimeout({
|
|
720
|
+
taskId: task.id,
|
|
721
|
+
stderr: childResult.stderr,
|
|
722
|
+
}).message;
|
|
723
|
+
}
|
|
704
724
|
persistHeartbeat(true);
|
|
705
725
|
persistChildProgress({ type: "attempt_finished" }, true);
|
|
706
726
|
const attempt: ModelAttemptSummary = {
|
|
@@ -724,6 +744,16 @@ export async function runTeamTask(
|
|
|
724
744
|
if (!nextModel || !isRetryableModelFailure(error)) break;
|
|
725
745
|
logs.push(formatModelAttemptNote(attempt, nextModel), "");
|
|
726
746
|
}
|
|
747
|
+
// E2 (Round 15): when the fallback chain was used and STILL failed, surface
|
|
748
|
+
// that explicitly. Without this the task error only shows the last
|
|
749
|
+
// attempt's raw failure, so users can't tell whether to fix an API key,
|
|
750
|
+
// upgrade a plan, or change the model config. Include the chain tried +
|
|
751
|
+
// the final reason.
|
|
752
|
+
if (error && modelAttempts.length > 1) {
|
|
753
|
+
// E2/E1 (Round 15): structured CrewError (E008). Build via the factory so
|
|
754
|
+
// the error carries a code + help hint; keep its .message as the task error.
|
|
755
|
+
error = errors.modelExhausted(modelAttempts.map((a) => a.model), error).message;
|
|
756
|
+
}
|
|
727
757
|
// NEW-8 fix: register all attempt transcripts as artifacts, not just the used one.
|
|
728
758
|
// Earlier failed attempts' transcripts exist on disk but were invisible to the artifact system.
|
|
729
759
|
const successfulAttemptIndex = modelAttempts.findIndex(
|
|
@@ -455,6 +455,15 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
|
|
|
455
455
|
|
|
456
456
|
return result;
|
|
457
457
|
} catch (error) {
|
|
458
|
+
// Round 27 (BUG 1): the success path calls stopTeamHeartbeat() but this
|
|
459
|
+
// catch path did NOT. The team heartbeat is a non-unref'd setInterval
|
|
460
|
+
// (30s) that deliberately keeps the event loop alive — without this
|
|
461
|
+
// call, a failed team run leaves the interval firing forever and the
|
|
462
|
+
// foreground pi process hangs (never returns to the prompt); in
|
|
463
|
+
// background-runner mode the worker never exits. clearInterval is
|
|
464
|
+
// idempotent so a double-call (if this runs after the success path)
|
|
465
|
+
// is harmless.
|
|
466
|
+
stopTeamHeartbeat();
|
|
458
467
|
// P1: Catch unhandled errors — ensure manifest/tasks/agents are terminal so they don't stay "running" forever.
|
|
459
468
|
const message = error instanceof Error ? error.message : String(error);
|
|
460
469
|
// Reload manifest with lock to avoid stale data overwriting concurrent writes.
|
|
@@ -922,8 +931,16 @@ tasks = mergeResult.resultTasks;
|
|
|
922
931
|
await saveRunTasksAsync(finalManifest, tasks);
|
|
923
932
|
});
|
|
924
933
|
manifest = finalManifest;
|
|
925
|
-
// Save health snapshot on run completion
|
|
926
|
-
|
|
934
|
+
// Save health snapshot on run completion.
|
|
935
|
+
// BUG A (pts/2 hang investigation 2026-06-16): stateRoot = `<crewRoot>/state/runs/<runId>`,
|
|
936
|
+
// so the crew root is THREE dirnames up, not two. Two dirnames gave `<crewRoot>/state`
|
|
937
|
+
// (the state dir), and HealthStore then joined HEALTH_DIR (`.crew/state/health`)
|
|
938
|
+
// onto it → `<crewRoot>/state/.crew/state/health` — a double-joined BOGUS path.
|
|
939
|
+
// That wrote health snapshots to a nonexistent subtree (silently breaking the
|
|
940
|
+
// health feature) AND created junk dirs that the recursive state watcher then
|
|
941
|
+
// attached extra inotify watches to. Fix: compute the real crew root (3 up)
|
|
942
|
+
// and make HEALTH_DIR relative to it.
|
|
943
|
+
const crewRoot = path.dirname(path.dirname(path.dirname(finalManifest.stateRoot)));
|
|
927
944
|
const healthStore = new HealthStore(crewRoot);
|
|
928
945
|
healthStore.saveSnapshot({
|
|
929
946
|
runId: finalManifest.runId,
|
|
@@ -57,7 +57,12 @@ export const CARGO_RUST_GATES: Array<{ name: string; command: string; critical:
|
|
|
57
57
|
* Execute a single command and capture output.
|
|
58
58
|
*/
|
|
59
59
|
/** Characters/patterns that indicate dangerous shell metacharacters. */
|
|
60
|
-
|
|
60
|
+
// Round 25 (VULN-3/VULN-4): also block raw newlines (sh -c treats \n as a
|
|
61
|
+
// command separator -> injection) and bare $VARNAME references (can exfiltrate
|
|
62
|
+
// secrets into captured gate output, e.g. `echo $ANTHROPIC_API_KEY`).
|
|
63
|
+
// $+word-char is blocked; special vars like $?/$$/$! are left alone. Built-in
|
|
64
|
+
// gates use only `2>&1` (no $VAR), so this does not break them.
|
|
65
|
+
const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\$\w|\b(eval|exec)\b|>>|<[^^&]|[\r\n])/;
|
|
61
66
|
// Note: single `>` is NOT blocked here because `2>&1` is a safe redirect used by built-in gates.
|
|
62
67
|
// `>>` (append) is still blocked. `<` without `&` (input redirect) is still blocked.
|
|
63
68
|
|
|
@@ -66,7 +71,22 @@ const DANGEROUS_SHELL_PATTERNS = /(?:;|&&|\|\||\$\(|`|\$\{|\b(eval|exec)\b|>>|<[
|
|
|
66
71
|
* Rejects commands with shell metacharacters that could enable injection.
|
|
67
72
|
* Allows: pipes (|), redirection of stderr (2>&1), and basic npm/cargo/npx commands.
|
|
68
73
|
*/
|
|
74
|
+
/** @internal — exported for injection-guard unit testing (Round 25). */
|
|
75
|
+
export function __test__validateGateCommand(command: string): void {
|
|
76
|
+
validateGateCommand(command);
|
|
77
|
+
}
|
|
78
|
+
|
|
69
79
|
function validateGateCommand(command: string): void {
|
|
80
|
+
// Round 25 (VULN-3): check the ORIGINAL command for raw newlines BEFORE
|
|
81
|
+
// normalization. The regex below runs on the NORMALIZED command (which
|
|
82
|
+
// collapses \s+ incl. newlines to a single space), so a newline would be
|
|
83
|
+
// hidden from it - but `sh -c` treats a raw newline as a command
|
|
84
|
+
// separator, enabling injection (e.g. `npm test\nrm -rf x`).
|
|
85
|
+
if (/[\r\n]/.test(command)) {
|
|
86
|
+
throw new Error(
|
|
87
|
+
`Security: verification gate command rejected (raw newline - potential command injection): ${JSON.stringify(command)}`,
|
|
88
|
+
);
|
|
89
|
+
}
|
|
70
90
|
const normalized = command
|
|
71
91
|
.replace(/\x1b\[[0-9;]*[a-zA-Z]/g, '') // ANSI escape sequences
|
|
72
92
|
.replace(/[\x00-\x08\x0b\x0c\x0e-\x1f]/g, '') // control chars
|
|
@@ -252,7 +252,7 @@ function applyLineCap(
|
|
|
252
252
|
return { lines: kept, elided: removable.length };
|
|
253
253
|
}
|
|
254
254
|
|
|
255
|
-
// ── Public API
|
|
255
|
+
// ── Public API ────────────────────────────────────────────────────────
|
|
256
256
|
|
|
257
257
|
const emptyResult = (rootPath: string): WorkspaceTree => ({
|
|
258
258
|
rootPath,
|
|
@@ -261,11 +261,35 @@ const emptyResult = (rootPath: string): WorkspaceTree => ({
|
|
|
261
261
|
totalLines: 0,
|
|
262
262
|
});
|
|
263
263
|
|
|
264
|
+
/**
|
|
265
|
+
* Per-cwd TTL cache for the rendered workspace tree. Workers in the same run
|
|
266
|
+
* share a cwd, so the recursive walk was previously repeated once per task
|
|
267
|
+
* (Round 15 P4). The tree is informational context for the worker; short-lived
|
|
268
|
+
* staleness is acceptable, so a 30s TTL is safe and keeps prompts fresh during
|
|
269
|
+
* long active runs while eliminating redundant walks.
|
|
270
|
+
*/
|
|
271
|
+
const TREE_CACHE_TTL_MS = 30_000;
|
|
272
|
+
interface CachedTree {
|
|
273
|
+
tree: WorkspaceTree;
|
|
274
|
+
expiresAt: number;
|
|
275
|
+
}
|
|
276
|
+
const treeCache = new Map<string, CachedTree>();
|
|
277
|
+
|
|
278
|
+
function treeCacheKey(cwd: string, options?: WorkspaceTreeOptions): string {
|
|
279
|
+
// Cache is keyed on the inputs that affect the walk output.
|
|
280
|
+
return `${path.resolve(cwd)}|${options?.maxDepth ?? ""}|${options?.dirLimit ?? ""}|${options?.lineCap ?? ""}`;
|
|
281
|
+
}
|
|
282
|
+
|
|
264
283
|
export async function buildWorkspaceTree(
|
|
265
284
|
cwd: string,
|
|
266
285
|
options?: WorkspaceTreeOptions,
|
|
267
286
|
): Promise<WorkspaceTree> {
|
|
268
287
|
const rootPath = path.resolve(cwd);
|
|
288
|
+
const cacheKey = treeCacheKey(cwd, options);
|
|
289
|
+
const cached = treeCache.get(cacheKey);
|
|
290
|
+
if (cached && cached.expiresAt > Date.now()) {
|
|
291
|
+
return cached.tree;
|
|
292
|
+
}
|
|
269
293
|
try {
|
|
270
294
|
const maxDepth = options?.maxDepth ?? DEFAULT_MAX_DEPTH;
|
|
271
295
|
const dirLimit = options?.dirLimit ?? DEFAULT_DIR_LIMIT;
|
|
@@ -286,12 +310,14 @@ export async function buildWorkspaceTree(
|
|
|
286
310
|
const { lines: capped, elided } = applyLineCap(lines, lineCap);
|
|
287
311
|
const rendered = capped.map((l) => l.text).join("\n");
|
|
288
312
|
|
|
289
|
-
|
|
313
|
+
const result: WorkspaceTree = {
|
|
290
314
|
rootPath,
|
|
291
315
|
rendered,
|
|
292
316
|
truncated: dirTruncated || elided > 0,
|
|
293
317
|
totalLines: capped.length,
|
|
294
318
|
};
|
|
319
|
+
treeCache.set(cacheKey, { tree: result, expiresAt: Date.now() + TREE_CACHE_TTL_MS });
|
|
320
|
+
return result;
|
|
295
321
|
} catch {
|
|
296
322
|
return emptyResult(rootPath);
|
|
297
323
|
}
|
|
@@ -135,6 +135,13 @@ export const TeamToolParams = Type.Object({
|
|
|
135
135
|
description: "Run in background when execution support is enabled.",
|
|
136
136
|
}),
|
|
137
137
|
),
|
|
138
|
+
details: Type.Optional(
|
|
139
|
+
Type.Boolean({
|
|
140
|
+
default: true,
|
|
141
|
+
description:
|
|
142
|
+
"(status) Output detail level. true (default) = full status (task graph, agents, effectiveness, events). false = compact summary (status, goal, task counts, and only failed/attention task errors) for quick checks.",
|
|
143
|
+
}),
|
|
144
|
+
),
|
|
138
145
|
workspaceMode: Type.Optional(
|
|
139
146
|
Type.Union([Type.Literal("single"), Type.Literal("worktree")], {
|
|
140
147
|
description:
|
|
@@ -318,6 +325,8 @@ export interface TeamToolParamsValue {
|
|
|
318
325
|
taskId?: string;
|
|
319
326
|
message?: string;
|
|
320
327
|
async?: boolean;
|
|
328
|
+
/** (status) Output detail level. false = compact summary. Default: true (full). */
|
|
329
|
+
details?: boolean;
|
|
321
330
|
workspaceMode?: "single" | "worktree";
|
|
322
331
|
context?: "fresh" | "fork";
|
|
323
332
|
cwd?: string;
|
package/src/state/blob-store.ts
CHANGED
|
@@ -190,16 +190,18 @@ export function writeBlob(artifactsRoot: string, input: {
|
|
|
190
190
|
metadataWritten = true;
|
|
191
191
|
});
|
|
192
192
|
} catch (error) {
|
|
193
|
-
//
|
|
194
|
-
//
|
|
195
|
-
//
|
|
196
|
-
//
|
|
197
|
-
//
|
|
198
|
-
// the
|
|
199
|
-
//
|
|
200
|
-
//
|
|
201
|
-
//
|
|
202
|
-
//
|
|
193
|
+
// Round 24 (BUG 4 note): the catch block previously checked
|
|
194
|
+
// `if (!blobContentWritten)` — the WRONG variable (the local comment said
|
|
195
|
+
// `metadataWritten === false`). For a CONTENT-ADDRESSED store the blob path
|
|
196
|
+
// is the content hash, so the blob may be referenced by another process's
|
|
197
|
+
// metadata even when OUR metadata write failed (e.g. a concurrent conflict
|
|
198
|
+
// where the peer already wrote metadata for the same hash). Deleting it
|
|
199
|
+
// would orphan their metadata. The safe behavior is therefore to NEVER
|
|
200
|
+
// delete on a metadata write failure and let the periodic
|
|
201
|
+
// cleanupOrphanedBlobs() reclaim genuinely-orphaned blobs. The guard below
|
|
202
|
+
// only removes a blob when its CONTENT was never written (a stray/partial
|
|
203
|
+
// file from a failed content write) — which is the only unambiguously-safe
|
|
204
|
+
// case to clean up here.
|
|
203
205
|
if (!blobContentWritten) {
|
|
204
206
|
try { fs.rmSync(blobPath, { force: true }); } catch { /* best-effort */ }
|
|
205
207
|
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
|
-
import { readEvents } from "./event-log.ts";
|
|
2
|
+
import { readEvents, type TeamEvent } from "./event-log.ts";
|
|
3
3
|
import { atomicWriteFile } from "./atomic-write.ts";
|
|
4
4
|
import { logInternalError } from "../utils/internal-error.ts";
|
|
5
5
|
import { withEventLogLockSync } from "./event-log.ts";
|
|
@@ -65,6 +65,25 @@ export interface CompactionResult {
|
|
|
65
65
|
* 6. Return compaction stats
|
|
66
66
|
*/
|
|
67
67
|
export function compactEventLog(eventsPath: string, config?: Partial<RotationConfig>): CompactionResult | undefined {
	// Fast path, no lock: if the log is missing or still under the compaction
	// threshold there is nothing to do. The log only grows between compactions,
	// so a stale "not needed" answer merely defers the work to the next call.
	if (!prepareCompaction(eventsPath, config)) return undefined;
	// The snapshot that gets WRITTEN must be taken while holding the lock.
	// applyCompactionUnlocked replaces the whole file with the snapshot's lines,
	// so a snapshot read before the lock was acquired would silently drop every
	// event appended in between. Re-reading under the lock costs one extra read,
	// and only on calls that actually compact.
	//
	// NOTE (Round 24 BUG 1): callers ALREADY holding the event-log lock (e.g.
	// appendEventInsideLock in event-log.ts) must call applyCompactionUnlocked
	// directly — calling compactEventLog from inside the lock deadlocks (the
	// mkdir lock is not re-entrant → 5s timeout → compaction never ran → the
	// log grew unbounded until events were silently dropped past 50MB).
	return withEventLogLockSync(eventsPath, () => {
		const prepared = prepareCompaction(eventsPath, config);
		return prepared ? applyCompactionUnlocked(eventsPath, prepared) : undefined;
	});
}
|
|
81
|
+
|
|
82
|
+
/** Round 24 (BUG 1): the lock-free pre-read for compaction. Safe to run
|
|
83
|
+
* outside the lock (read-only). Returns the compacted lines + stats needed
|
|
84
|
+
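* for the write phase. NOTE: the snapshot goes stale as soon as another event is appended, and the write phase replaces the whole file with it — so when it will be applied, take it under the same lock as the write. */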
|
|
85
|
+
export function prepareCompaction(eventsPath: string, config?: Partial<RotationConfig>):
|
|
86
|
+
{ lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] } | undefined {
|
|
68
87
|
if (!fs.existsSync(eventsPath)) return undefined;
|
|
69
88
|
const cfg = resolveConfig(config);
|
|
70
89
|
let originalSize: number;
|
|
@@ -74,79 +93,73 @@ export function compactEventLog(eventsPath: string, config?: Partial<RotationCon
|
|
|
74
93
|
if (originalCount <= cfg.compactToCount) return undefined;
|
|
75
94
|
const kept = allEvents.slice(-cfg.compactToCount);
|
|
76
95
|
const lines = kept.map((e) => JSON.stringify(e)).join("\n") + "\n";
|
|
96
|
+
return { lines, originalSize, originalCount, kept };
|
|
97
|
+
}
|
|
77
98
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
compactedSize: fs.statSync(eventsPath).size,
|
|
105
|
-
eventsRemoved: originalCount - kept.length,
|
|
106
|
-
eventsKept: kept.length + Math.max(0, afterWrite.length - kept.length),
|
|
107
|
-
};
|
|
108
|
-
}
|
|
109
|
-
// afterWrite.length < kept.length — events were lost during compaction window.
|
|
110
|
-
// Find missing events and re-append them.
|
|
111
|
-
// FIX: Use sequence numbers for comparison instead of JSON.stringify.
|
|
112
|
-
const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
|
|
113
|
-
const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
|
|
114
|
-
let recoveredCount = 0;
|
|
115
|
-
let recoveryFailed = false;
|
|
116
|
-
if (missingEvents.length > 0) {
|
|
117
|
-
// BUGFIX (Round 12 C2): the previous loop called atomicWriteFile PER event,
|
|
118
|
-
// which REPLACES the entire file each iteration — destroying the
|
|
119
|
-
// compacted log and all previously-recovered events, leaving only the
|
|
120
|
-
// LAST missing event. FIX: accumulate all missing events into one
|
|
121
|
-
// string and append in a single write (appendFileSync appends without
|
|
122
|
-
// destroying existing content).
|
|
123
|
-
const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
|
|
124
|
-
try {
|
|
125
|
-
fs.appendFileSync(eventsPath, recoveryLines);
|
|
126
|
-
recoveredCount = missingEvents.length;
|
|
127
|
-
} catch (err) {
|
|
128
|
-
recoveryFailed = true;
|
|
129
|
-
logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
|
|
130
|
-
}
|
|
131
|
-
}
|
|
99
|
+
/**
 * Round 24 (BUG 1): the write+recover phase of compaction.
 *
 * Assumes the caller ALREADY holds the event-log lock (or accepts the unlocked
 * race). Replaces the log with the prepared snapshot, re-reads it to verify,
 * and re-appends any kept event that did not survive the write.
 *
 * @param eventsPath - Path of the JSONL event log to overwrite.
 * @param prepared - Snapshot from prepareCompaction: the serialized `lines`,
 *   the pre-compaction size/count, and the `kept` events those lines encode.
 * @returns Compaction stats, or `undefined` when the snapshot could not be
 *   written (the failure is logged and compaction is skipped this cycle).
 */
export function applyCompactionUnlocked(
	eventsPath: string,
	prepared: { lines: string; originalSize: number; originalCount: number; kept: TeamEvent[] },
): CompactionResult | undefined {
	const { lines, originalSize, originalCount, kept } = prepared;
	const eventsRemoved = originalCount - kept.length;
	// Gathering stats must never turn a successful compaction into a throw. If
	// the file cannot be stat'ed, fall back to the byte length of what was just
	// written. (Previously the catch-all fallback below called statSync
	// unguarded, so a stat failure escaped to the caller.)
	const sizeOnDisk = (): number => {
		try {
			return fs.statSync(eventsPath).size;
		} catch {
			return Buffer.byteLength(lines, "utf-8");
		}
	};

	try {
		atomicWriteFile(eventsPath, lines);
	} catch (err) {
		// Concurrent write conflict — skip compaction this cycle
		logInternalError("event-log-rotation.compact", err, `eventsPath=${eventsPath}`);
		return undefined;
	}

	// C2: Re-read after the write. If the file holds at least as many events as
	// we kept, nothing was lost; any surplus (events appended during the
	// compaction window) is simply counted in the stats.
	try {
		const afterWrite = readEvents(eventsPath);
		if (afterWrite.length >= kept.length) {
			return {
				originalSize,
				compactedSize: sizeOnDisk(),
				eventsRemoved,
				// Same value as kept.length + max(0, surplus) in this branch.
				eventsKept: afterWrite.length,
			};
		}
		// afterWrite.length < kept.length — events were lost during compaction window.
		// Match by sequence number rather than JSON.stringify, which is fragile
		// (key ordering, floating point formatting).
		// NOTE(review): a kept event without metadata.seq is always treated as
		// missing and re-appended, which can duplicate it if it is in fact
		// present — confirm every persisted event carries a seq.
		const afterSeqs = new Set(afterWrite.map((e) => e.metadata?.seq).filter((s): s is number => s !== undefined));
		const missingEvents = kept.filter((e) => e.metadata?.seq === undefined || !afterSeqs.has(e.metadata.seq));
		let recoveredCount = 0;
		let recoveryFailed = false;
		if (missingEvents.length > 0) {
			// Single append for all missing events: appendFileSync adds to the
			// file without replacing the compacted content.
			const recoveryLines = missingEvents.map((e) => JSON.stringify(e) + "\n").join("");
			try {
				fs.appendFileSync(eventsPath, recoveryLines);
				recoveredCount = missingEvents.length;
			} catch (err) {
				recoveryFailed = true;
				logInternalError("event-log-rotation.recovery", err, `eventsPath=${eventsPath} lostEvents=${missingEvents.length}`);
			}
		}
		return {
			originalSize,
			compactedSize: sizeOnDisk(),
			eventsRemoved,
			// Count what is actually in the file: the events that survived the
			// write plus the ones re-appended. (kept.length + recoveredCount
			// counted the recovered events twice.)
			eventsKept: afterWrite.length + recoveredCount,
			recoveryFailed,
		};
	} catch {
		// Post-write verification failed — compaction likely succeeded.
		return {
			originalSize,
			compactedSize: sizeOnDisk(),
			eventsRemoved,
			eventsKept: kept.length,
		};
	}
}
|
|
151
164
|
|
|
152
165
|
/**
|
|
@@ -161,33 +174,41 @@ export function rotateEventLog(eventsPath: string): boolean {
|
|
|
161
174
|
// FIX: Wrap rotation in lock to prevent race conditions with concurrent readers.
|
|
162
175
|
// Order of operations: (1) copy the current log to a timestamped archive, (2) truncate the original in place.
|
|
163
176
|
// This ensures eventsPath always exists — a reader never sees a missing file.
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
fs.writeFileSync(eventsPath, "", "utf-8");
|
|
185
|
-
return true;
|
|
186
|
-
} catch (error) {
|
|
187
|
-
logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
|
|
188
|
-
return false;
|
|
177
|
+
//
|
|
178
|
+
// NOTE (Round 24 BUG 1): callers ALREADY holding the lock must call
|
|
179
|
+
// rotateEventLogUnlocked directly — this locked variant is NOT re-entrant.
|
|
180
|
+
return withEventLogLockSync(eventsPath, () => rotateEventLogUnlocked(eventsPath));
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
/**
 * Round 24 (BUG 1): the lock-free core of rotation.
 *
 * Assumes the caller already holds the event-log lock (or accepts the unlocked
 * race). Copies the current log to a timestamped `*.archive.jsonl` sibling and
 * then truncates the log in place, so `eventsPath` exists at every moment.
 *
 * @param eventsPath - Path of the JSONL event log to rotate.
 * @returns `true` when the log was archived and truncated; `false` when the
 *   log does not exist or any step failed (the failure is logged).
 */
export function rotateEventLogUnlocked(eventsPath: string): boolean {
	if (!fs.existsSync(eventsPath)) return false;
	try {
		const ts = new Date().toISOString().replace(/[:.]/g, "-");
		let archivePath = `${eventsPath}.${ts}.archive.jsonl`;
		// Round 12: two rotations within the same millisecond get the same
		// timestamp. COPYFILE_EXCL makes "pick a free name" and "create the
		// archive" one atomic step: the copy fails with EEXIST instead of silently
		// overwriting an archive that appeared after an existsSync() check. On a
		// collision, retry with a numeric suffix (.1, .2, …).
		//
		// BUGFIX (Round 12 C1): copy FIRST (archive populated, original intact),
		// then truncate the original in place — never write-empty-then-rename,
		// which archived an empty file and destroyed all events.
		for (let collision = 1; ; collision++) {
			try {
				fs.copyFileSync(eventsPath, archivePath, fs.constants.COPYFILE_EXCL);
				break;
			} catch (error) {
				if ((error as { code?: string }).code !== "EEXIST") throw error;
				archivePath = `${eventsPath}.${ts}.${collision}.archive.jsonl`;
			}
		}
		// Truncate in place so eventsPath ALWAYS exists (no missing-file window
		// for concurrent readers).
		fs.writeFileSync(eventsPath, "", "utf-8");
		return true;
	} catch (error) {
		logInternalError("event-log.rotate", error, `eventsPath=${eventsPath}`);
		return false;
	}
}
|
|
192
213
|
|
|
193
214
|
export interface EventLogStats {
|