pi-crew 0.5.2 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +183 -0
- package/README.md +17 -1
- package/docs/architecture.md +2 -0
- package/docs/bugs/cross-session-notification-leakage.md +82 -0
- package/docs/coding-agent-optimization.md +268 -0
- package/docs/deep-review-report.md +384 -0
- package/docs/distillation/cybersecurity-patterns.md +294 -0
- package/docs/migration-v0.4-v0.5.md +208 -0
- package/docs/optimization-plan.md +642 -0
- package/docs/pi-crew-v0.5.5-audit-fix-plan.md +133 -0
- package/docs/pi-mono-opportunities.md +969 -0
- package/docs/pi-mono-review.md +291 -0
- package/docs/skills/REFERENCE.md +144 -0
- package/package.json +12 -9
- package/skills/artifact-analysis-loop/SKILL.md +302 -0
- package/skills/async-worker-recovery/SKILL.md +19 -1
- package/skills/child-pi-spawning/SKILL.md +19 -6
- package/skills/context-artifact-hygiene/SKILL.md +19 -2
- package/skills/delegation-patterns/SKILL.md +68 -3
- package/skills/detection-pipeline-design/SKILL.md +285 -0
- package/skills/event-log-tracing/SKILL.md +20 -6
- package/skills/git-master/SKILL.md +20 -6
- package/skills/hunting-investigation-loop/SKILL.md +401 -0
- package/skills/incident-playbook-construction/SKILL.md +383 -0
- package/skills/live-agent-lifecycle/SKILL.md +20 -6
- package/skills/mailbox-interactive/SKILL.md +19 -6
- package/skills/model-routing-context/SKILL.md +19 -1
- package/skills/multi-perspective-review/SKILL.md +19 -4
- package/skills/observability-reliability/SKILL.md +19 -2
- package/skills/orchestration/SKILL.md +20 -2
- package/skills/ownership-session-security/SKILL.md +20 -2
- package/skills/pi-extension-lifecycle/SKILL.md +20 -2
- package/skills/post-mortem/SKILL.md +7 -2
- package/skills/read-only-explorer/SKILL.md +20 -6
- package/skills/requirements-to-task-packet/SKILL.md +23 -3
- package/skills/resource-discovery-config/SKILL.md +20 -2
- package/skills/runtime-state-reader/SKILL.md +20 -2
- package/skills/safe-bash/SKILL.md +21 -6
- package/skills/scrutinize/SKILL.md +20 -2
- package/skills/secure-agent-orchestration-review/SKILL.md +29 -2
- package/skills/security-review/SKILL.md +560 -0
- package/skills/state-mutation-locking/SKILL.md +22 -2
- package/skills/systematic-debugging/SKILL.md +8 -6
- package/skills/threat-hypothesis-framework/SKILL.md +175 -0
- package/skills/ui-render-performance/SKILL.md +20 -2
- package/skills/verification-before-done/SKILL.md +17 -2
- package/skills/widget-rendering/SKILL.md +21 -6
- package/skills/workspace-isolation/SKILL.md +20 -6
- package/skills/worktree-isolation/SKILL.md +20 -6
- package/src/agents/agent-config.ts +40 -1
- package/src/benchmark/benchmark-runner.ts +45 -0
- package/src/benchmark/feedback-loop.ts +5 -0
- package/src/config/config.ts +32 -5
- package/src/config/role-tools.ts +82 -0
- package/src/config/suggestions.ts +8 -0
- package/src/config/types.ts +4 -0
- package/src/extension/async-notifier.ts +10 -1
- package/src/extension/crew-cleanup.ts +114 -0
- package/src/extension/cross-extension-rpc.ts +1 -1
- package/src/extension/notification-router.ts +18 -0
- package/src/extension/register.ts +27 -19
- package/src/extension/registration/subagent-tools.ts +1 -1
- package/src/extension/team-tool/anchor.ts +201 -0
- package/src/extension/team-tool/api.ts +2 -1
- package/src/extension/team-tool/auto-summarize.ts +154 -0
- package/src/extension/team-tool/run.ts +42 -7
- package/src/extension/team-tool.ts +44 -2
- package/src/hooks/registry.ts +1 -3
- package/src/observability/event-bus.ts +69 -0
- package/src/observability/event-to-metric.ts +0 -2
- package/src/runtime/anchor-manager.ts +473 -0
- package/src/runtime/async-runner.ts +8 -4
- package/src/runtime/auto-summarize.ts +350 -0
- package/src/runtime/background-runner.ts +10 -3
- package/src/runtime/budget-tracker.ts +354 -0
- package/src/runtime/chain-runner.ts +507 -0
- package/src/runtime/child-pi.ts +123 -35
- package/src/runtime/crash-recovery.ts +5 -4
- package/src/runtime/crew-agent-runtime.ts +1 -0
- package/src/runtime/custom-tools/irc-tool.ts +13 -0
- package/src/runtime/custom-tools/submit-result-tool.ts +3 -2
- package/src/runtime/delivery-coordinator.ts +10 -3
- package/src/runtime/dynamic-script-runner.ts +482 -0
- package/src/runtime/foreground-control.ts +87 -17
- package/src/runtime/handoff-manager.ts +589 -0
- package/src/runtime/hidden-handoff.ts +424 -0
- package/src/runtime/live-agent-manager.ts +20 -4
- package/src/runtime/live-session-runtime.ts +39 -4
- package/src/runtime/manifest-cache.ts +2 -1
- package/src/runtime/model-resolver.ts +16 -4
- package/src/runtime/phase-tracker.ts +373 -0
- package/src/runtime/pi-args.ts +11 -1
- package/src/runtime/pi-json-output.ts +31 -0
- package/src/runtime/pipeline-runner.ts +514 -0
- package/src/runtime/progress-tracker.ts +124 -0
- package/src/runtime/retry-runner.ts +354 -0
- package/src/runtime/sandbox.ts +252 -0
- package/src/runtime/scheduler.ts +7 -2
- package/src/runtime/skill-effectiveness.ts +473 -0
- package/src/runtime/skill-instructions.ts +37 -3
- package/src/runtime/subagent-manager.ts +1 -1
- package/src/runtime/task-graph.ts +11 -1
- package/src/runtime/task-runner.ts +92 -18
- package/src/runtime/team-runner.ts +13 -12
- package/src/runtime/tool-progress.ts +10 -3
- package/src/runtime/verification-gates.ts +367 -0
- package/src/schema/team-tool-schema.ts +37 -0
- package/src/skills/discover-skills.ts +5 -0
- package/src/state/active-run-registry.ts +9 -2
- package/src/state/contracts.ts +9 -0
- package/src/state/crew-init.ts +3 -3
- package/src/state/decision-ledger.ts +98 -55
- package/src/state/event-log-rotation.ts +2 -2
- package/src/state/event-log.ts +144 -10
- package/src/state/hook-instinct-bridge.ts +5 -5
- package/src/state/mailbox.ts +10 -0
- package/src/state/run-cache.ts +18 -8
- package/src/state/state-store.ts +3 -1
- package/src/state/types.ts +4 -0
- package/src/tools/safe-bash-extension.ts +1 -0
- package/src/tools/safe-bash.ts +152 -20
- package/src/types/new-api-types.ts +34 -0
- package/src/ui/agent-management-overlay.ts +5 -1
- package/src/ui/crew-widget.ts +29 -15
- package/src/ui/overlays/mailbox-detail-overlay.ts +13 -2
- package/src/ui/powerbar-publisher.ts +101 -7
- package/src/ui/tool-render.ts +15 -15
- package/src/ui/transcript-cache.ts +13 -0
- package/src/utils/bm25-search.ts +16 -8
- package/src/utils/env-filter.ts +8 -5
- package/src/utils/redaction.ts +169 -15
- package/src/utils/session-utils.ts +52 -0
- package/src/utils/sse-parser.ts +10 -1
- package/src/worktree/cleanup.ts +6 -1
- package/src/worktree/worktree-manager.ts +32 -13
- package/workflows/chain.workflow.md +252 -0
- package/workflows/pipeline.workflow.md +27 -0
package/src/runtime/child-pi.ts
CHANGED
|
@@ -8,8 +8,9 @@ import { getPiSpawnCommand } from "./pi-spawn.ts";
|
|
|
8
8
|
import { DEFAULT_CHILD_PI } from "../config/defaults.ts";
|
|
9
9
|
import { logInternalError } from "../utils/internal-error.ts";
|
|
10
10
|
import { attachPostExitStdioGuard, trySignalChild } from "./post-exit-stdio-guard.ts";
|
|
11
|
-
import { redactJsonLine,
|
|
11
|
+
import { redactJsonLine, isSecretKey } from "../utils/redaction.ts";
|
|
12
12
|
import { sanitizeEnvSecrets } from "../utils/env-filter.ts";
|
|
13
|
+
import { registerChildProcess, unregisterChildProcess } from "../extension/crew-cleanup.ts";
|
|
13
14
|
|
|
14
15
|
const POST_EXIT_STDIO_GUARD_MS = DEFAULT_CHILD_PI.postExitStdioGuardMs;
|
|
15
16
|
const FINAL_DRAIN_MS = DEFAULT_CHILD_PI.finalDrainMs;
|
|
@@ -117,6 +118,8 @@ export interface ChildPiLifecycleEvent {
|
|
|
117
118
|
error?: string;
|
|
118
119
|
/** Stderr captured at timeout moment (for response_timeout events). */
|
|
119
120
|
stderr?: string;
|
|
121
|
+
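/** Tail of captured stderr for error context on exit/error events (e.g. the last 500 chars on a process error, the last 1000 on an unexpected exit); unset when stderr is empty. */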
|
|
122
|
+
stderrExcerpt?: string;
|
|
120
123
|
/** Timestamp (ISO). */
|
|
121
124
|
ts: string;
|
|
122
125
|
}
|
|
@@ -146,6 +149,16 @@ export interface ChildPiRunInput {
|
|
|
146
149
|
parentContext?: string;
|
|
147
150
|
/** When true, prepend parentContext to the task prompt. */
|
|
148
151
|
inheritContext?: boolean;
|
|
152
|
+
/** Pass to pi to mark certain commands as context-excluded. Default: false */
|
|
153
|
+
excludeContextBash?: boolean;
|
|
154
|
+
/** pi session ID for session naming (aligns with pi-crew run ID) */
|
|
155
|
+
sessionId?: string;
|
|
156
|
+
/** Run ID for cleanup tracking */
|
|
157
|
+
runId?: string;
|
|
158
|
+
/** Agent ID for cleanup tracking */
|
|
159
|
+
agentId?: string;
|
|
160
|
+
/** Role for tool restrictions (from role-tools.ts) */
|
|
161
|
+
role?: string;
|
|
149
162
|
}
|
|
150
163
|
|
|
151
164
|
export interface ChildPiRunResult {
|
|
@@ -168,18 +181,24 @@ export function buildChildPiSpawnOptions(cwd: string, env: NodeJS.ProcessEnv): S
|
|
|
168
181
|
// Bug #12 fix: essential env vars (PATH, HOME, etc.) are always preserved so child can find npm/node.
|
|
169
182
|
const filteredEnv = sanitizeEnvSecrets(env, {
|
|
170
183
|
allowList: [
|
|
171
|
-
// Model provider API keys (
|
|
172
|
-
"
|
|
173
|
-
"
|
|
174
|
-
"
|
|
175
|
-
"
|
|
176
|
-
"
|
|
177
|
-
"
|
|
178
|
-
"
|
|
179
|
-
"
|
|
180
|
-
"
|
|
181
|
-
"
|
|
182
|
-
"
|
|
184
|
+
// Model provider API keys (explicit list — do NOT use wildcards)
|
|
185
|
+
"MINIMAX_API_KEY",
|
|
186
|
+
"MINIMAX_GROUP_ID",
|
|
187
|
+
"OPENAI_API_KEY",
|
|
188
|
+
"OPENAI_ORG_ID",
|
|
189
|
+
"ANTHROPIC_API_KEY",
|
|
190
|
+
"GOOGLE_API_KEY",
|
|
191
|
+
"GOOGLE_GENERATIVE_LANGUAGE_API_KEY",
|
|
192
|
+
"AZURE_OPENAI_API_KEY",
|
|
193
|
+
"AZURE_OPENAI_ENDPOINT",
|
|
194
|
+
"AWS_ACCESS_KEY_ID",
|
|
195
|
+
"AWS_SECRET_ACCESS_KEY",
|
|
196
|
+
"AWS_REGION",
|
|
197
|
+
"ZEU_API_KEY",
|
|
198
|
+
"ZERODEV_API_KEY",
|
|
199
|
+
// SECURITY FIX: Removed dangerous wildcards "*_API_KEY", "*_TOKEN", "*_SECRET"
|
|
200
|
+
// These patterns would leak ALL secrets matching the pattern to child processes.
|
|
201
|
+
// Only add specific, intended provider keys above.
|
|
183
202
|
// Essential non-secret vars for child process to function
|
|
184
203
|
"PATH",
|
|
185
204
|
"HOME",
|
|
@@ -368,23 +387,31 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
368
387
|
if (depth.blocked) return { exitCode: 1, stdout: "", stderr: `pi-crew depth guard blocked child worker: depth ${depth.depth} >= max ${depth.maxDepth}` };
|
|
369
388
|
const mock = process.env.PI_TEAMS_MOCK_CHILD_PI;
|
|
370
389
|
if (mock) {
|
|
390
|
+
// SECURITY: Log mock mode activation prominently for audit trail
|
|
391
|
+
console.warn(`[⚠️ PI_CREW_MOCK_MODE] Mock mode active: ${mock} — NOT running real agents!`);
|
|
392
|
+
// SECURITY FIX: Mock mode only takes effect when PI_CREW_ALLOW_MOCK is "1" or "true" alongside PI_TEAMS_MOCK_CHILD_PI; otherwise the request is refused.
|
|
393
|
+
const allowMock = process.env.PI_CREW_ALLOW_MOCK === "1" || process.env.PI_CREW_ALLOW_MOCK === "true";
|
|
394
|
+
if (!allowMock) {
|
|
395
|
+
console.error(`[🚨 PI_CREW_MOCK_MODE] SECURITY: PI_TEAMS_MOCK_CHILD_PI is set but PI_CREW_ALLOW_MOCK is not "1". Ignoring mock request for safety.`);
|
|
396
|
+
return { exitCode: 1, stdout: "", stderr: "Mock mode requires PI_CREW_ALLOW_MOCK=1 alongside PI_TEAMS_MOCK_CHILD_PI" };
|
|
397
|
+
}
|
|
371
398
|
if (mock === "success") {
|
|
372
|
-
const stdout = `
|
|
399
|
+
const stdout = `[MOCK] Success for ${input.agent.name}\n`;
|
|
373
400
|
observeStdoutChunk(input, stdout);
|
|
374
401
|
return { exitCode: 0, stdout, stderr: "" };
|
|
375
402
|
}
|
|
376
403
|
if (mock === "json-success" || mock === "adaptive-plan") {
|
|
377
404
|
const text = mock === "adaptive-plan" && effectiveTask.includes("ADAPTIVE_PLAN_JSON_START")
|
|
378
|
-
? `Adaptive
|
|
379
|
-
: `
|
|
405
|
+
? `[MOCK] Adaptive plan\nADAPTIVE_PLAN_JSON_START\n${JSON.stringify({ phases: [{ name: "research", tasks: [{ role: "explorer", task: "Explore adaptive target" }, { role: "analyst", task: "Analyze adaptive target" }, { role: "planner", task: "Plan adaptive target" }] }, { name: "build", tasks: [{ role: "executor", task: "Implement adaptive target" }] }, { name: "check", tasks: [{ role: "reviewer", task: "Review adaptive target" }, { role: "test-engineer", task: "Test adaptive target" }, { role: "writer", task: "Summarize adaptive target" }] }] })}\nADAPTIVE_PLAN_JSON_END`
|
|
406
|
+
: `[MOCK] JSON success for ${input.agent.name}`;
|
|
380
407
|
const stdout = `${JSON.stringify({ type: "message", message: { role: "assistant", content: [{ type: "text", text }] } })}\n${JSON.stringify({ type: "message_end", usage: { input: 10, output: 5, cost: 0.001, turns: 1 } })}\n`;
|
|
381
408
|
observeStdoutChunk(input, stdout);
|
|
382
409
|
return { exitCode: 0, stdout, stderr: "" };
|
|
383
410
|
}
|
|
384
|
-
if (mock === "retryable-failure") return { exitCode: 1, stdout: "", stderr: "rate limit: mock failure" };
|
|
385
|
-
return { exitCode: 1, stdout: "", stderr: `
|
|
411
|
+
if (mock === "retryable-failure") return { exitCode: 1, stdout: "", stderr: "[MOCK] rate limit: mock failure" };
|
|
412
|
+
return { exitCode: 1, stdout: "", stderr: `[MOCK] failure: ${mock}` };
|
|
386
413
|
}
|
|
387
|
-
const built = buildPiWorkerArgs({ task: effectiveTask, agent: input.agent, model: input.model, sessionEnabled: true, maxDepth: input.maxDepth, skillPaths: input.skillPaths });
|
|
414
|
+
const built = buildPiWorkerArgs({ task: effectiveTask, agent: input.agent, model: input.model, sessionEnabled: true, maxDepth: input.maxDepth, skillPaths: input.skillPaths, role: input.role });
|
|
388
415
|
const spawnSpec = getPiSpawnCommand(built.args);
|
|
389
416
|
try {
|
|
390
417
|
return await new Promise<ChildPiRunResult>((resolve) => {
|
|
@@ -393,6 +420,10 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
393
420
|
activeChildProcesses.set(child.pid, child);
|
|
394
421
|
input.onSpawn?.(child.pid);
|
|
395
422
|
input.onLifecycleEvent?.({ type: "spawned", pid: child.pid, ts: new Date().toISOString() });
|
|
423
|
+
// Register with cleanup handler for graceful shutdown
|
|
424
|
+
if (input.runId && input.agentId) {
|
|
425
|
+
registerChildProcess(child.pid, input.runId, input.agentId);
|
|
426
|
+
}
|
|
396
427
|
} else {
|
|
397
428
|
input.onLifecycleEvent?.({ type: "spawn_error", error: "spawn returned no pid", ts: new Date().toISOString() });
|
|
398
429
|
}
|
|
@@ -414,6 +445,36 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
414
445
|
let hardKilled = false;
|
|
415
446
|
const cleanupErrors: string[] = [];
|
|
416
447
|
let turnCount = 0;
|
|
448
|
+
// Track in-flight operations so they can be logged (with type and elapsed time) and cleared if the child errors or exits unexpectedly
|
|
449
|
+
interface PendingOperation {
|
|
450
|
+
id: string;
|
|
451
|
+
type: "prompt" | "steer" | "json_event";
|
|
452
|
+
startedAt: number;
|
|
453
|
+
}
|
|
454
|
+
const pendingOperations = new Map<string, PendingOperation>();
|
|
455
|
+
let operationIdCounter = 0;
|
|
456
|
+
|
|
457
|
+
const startOperation = (type: PendingOperation["type"]): string => {
|
|
458
|
+
const id = `op-${++operationIdCounter}`;
|
|
459
|
+
pendingOperations.set(id, { id, type, startedAt: Date.now() });
|
|
460
|
+
return id;
|
|
461
|
+
};
|
|
462
|
+
|
|
463
|
+
const completeOperation = (id: string): void => {
|
|
464
|
+
pendingOperations.delete(id);
|
|
465
|
+
};
|
|
466
|
+
|
|
467
|
+
const rejectPendingOperations = (error: Error): void => {
|
|
468
|
+
pendingOperations.forEach((op, id) => {
|
|
469
|
+
logInternalError(
|
|
470
|
+
"child-pi.pending-operation-rejected",
|
|
471
|
+
error,
|
|
472
|
+
`opId=${id} type=${op.type} elapsed=${Date.now() - op.startedAt}ms`,
|
|
473
|
+
);
|
|
474
|
+
});
|
|
475
|
+
pendingOperations.clear();
|
|
476
|
+
};
|
|
477
|
+
|
|
417
478
|
let softLimitReached = false;
|
|
418
479
|
const maxTurns = input.maxTurns;
|
|
419
480
|
const graceTurns = input.graceTurns;
|
|
@@ -450,20 +511,27 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
450
511
|
},
|
|
451
512
|
onJsonEvent: (event) => {
|
|
452
513
|
restartNoResponseTimer();
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
if (
|
|
457
|
-
|
|
458
|
-
if (
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
514
|
+
const eventOpId = startOperation("json_event");
|
|
515
|
+
try {
|
|
516
|
+
// Turn-count-based steering: soft limit steer + hard abort after graceTurns
|
|
517
|
+
if (event && typeof event === "object" && !Array.isArray(event)) {
|
|
518
|
+
const obj = event as Record<string, unknown>;
|
|
519
|
+
if (obj.type === "turn_end") {
|
|
520
|
+
turnCount += 1;
|
|
521
|
+
if (maxTurns !== undefined && !softLimitReached && turnCount >= maxTurns) {
|
|
522
|
+
softLimitReached = true;
|
|
523
|
+
// Inject steer via stdin to tell child to wrap up
|
|
524
|
+
child.stdin?.write(JSON.stringify({ type: "steer", message: "You have reached your turn limit. Wrap up immediately — provide your final answer now." }) + "\n");
|
|
525
|
+
} else if (maxTurns !== undefined && softLimitReached && turnCount >= maxTurns + (graceTurns ?? 5)) {
|
|
526
|
+
// Hard abort — terminate after grace turns
|
|
527
|
+
try { child.kill(process.platform === "win32" ? undefined : "SIGTERM"); } catch { /* best-effort */ }
|
|
528
|
+
}
|
|
465
529
|
}
|
|
466
530
|
}
|
|
531
|
+
completeOperation(eventOpId);
|
|
532
|
+
} catch (err) {
|
|
533
|
+
completeOperation(eventOpId);
|
|
534
|
+
throw err;
|
|
467
535
|
}
|
|
468
536
|
input.onJsonEvent?.(event);
|
|
469
537
|
if (!isFinalAssistantEvent(event) || childExited || settled || finalDrainTimer) return;
|
|
@@ -587,20 +655,38 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
587
655
|
stderr = appendBoundedTail(stderr, chunk.toString("utf-8"));
|
|
588
656
|
});
|
|
589
657
|
child.on("error", (error) => {
|
|
658
|
+
// Log and clear any pending operations, attaching the process error message and a stderr tail for context
|
|
659
|
+
const processError = new Error(
|
|
660
|
+
`Child Pi process error: ${error.message}. Stderr: ${stderr.slice(-500) || "(none)"}`,
|
|
661
|
+
);
|
|
662
|
+
rejectPendingOperations(processError);
|
|
590
663
|
try {
|
|
591
|
-
input.onLifecycleEvent?.({ type: "spawn_error", pid: child.pid, error:
|
|
664
|
+
input.onLifecycleEvent?.({ type: "spawn_error", pid: child.pid, error: processError.message, ts: new Date().toISOString(), stderrExcerpt: stderr.slice(-500) || undefined });
|
|
592
665
|
} catch (err) {
|
|
593
666
|
logInternalError("child-pi.on-lifecycle-event", err, `event=error, pid=${child.pid}`);
|
|
594
667
|
}
|
|
595
|
-
settle({ exitCode: null, stdout, stderr, error:
|
|
668
|
+
settle({ exitCode: null, stdout, stderr, error: processError.message });
|
|
596
669
|
});
|
|
597
|
-
child.on("exit", (code) => {
|
|
670
|
+
child.on("exit", (code, signal) => {
|
|
598
671
|
if (child.pid) {
|
|
599
672
|
activeChildProcesses.delete(child.pid);
|
|
600
673
|
clearHardKillTimer(child.pid);
|
|
674
|
+
// Unregister from cleanup handler
|
|
675
|
+
unregisterChildProcess(child.pid);
|
|
676
|
+
}
|
|
677
|
+
// Build comprehensive exit error for unexpected exits
|
|
678
|
+
const isUnexpectedExit = !childExited && !settled && !responseTimeoutHit && !abortRequested;
|
|
679
|
+
const exitError = isUnexpectedExit
|
|
680
|
+
? new Error(
|
|
681
|
+
`Child Pi process exited unexpectedly (code=${code ?? "null"} signal=${signal ?? "null"}). `
|
|
682
|
+
+ `Stderr: ${stderr.slice(-1000) || "(none)"}`,
|
|
683
|
+
)
|
|
684
|
+
: null;
|
|
685
|
+
if (exitError) {
|
|
686
|
+
rejectPendingOperations(exitError);
|
|
601
687
|
}
|
|
602
688
|
try {
|
|
603
|
-
input.onLifecycleEvent?.({ type: "exit", pid: child.pid, exitCode: code, ts: new Date().toISOString() });
|
|
689
|
+
input.onLifecycleEvent?.({ type: "exit", pid: child.pid, exitCode: code, ts: new Date().toISOString(), error: exitError?.message, stderrExcerpt: isUnexpectedExit ? stderr.slice(-1000) || undefined : undefined });
|
|
604
690
|
} catch (err) {
|
|
605
691
|
logInternalError("child-pi.on-lifecycle-event", err, `event=exit, pid=${child.pid}`);
|
|
606
692
|
}
|
|
@@ -618,6 +704,8 @@ export async function runChildPi(input: ChildPiRunInput): Promise<ChildPiRunResu
|
|
|
618
704
|
if (child.pid) {
|
|
619
705
|
activeChildProcesses.delete(child.pid);
|
|
620
706
|
clearHardKillTimer(child.pid);
|
|
707
|
+
// Unregister from cleanup handler
|
|
708
|
+
unregisterChildProcess(child.pid);
|
|
621
709
|
}
|
|
622
710
|
try {
|
|
623
711
|
input.onLifecycleEvent?.({ type: "close", pid: child.pid, exitCode, ts: new Date().toISOString() });
|
|
@@ -15,6 +15,7 @@ import { activeRunEntries, unregisterActiveRun, readActiveRunRegistry } from "..
|
|
|
15
15
|
import { resolveRealContainedPath } from "../utils/safe-paths.ts";
|
|
16
16
|
import { projectCrewRoot, userCrewRoot } from "../utils/paths.ts";
|
|
17
17
|
import { terminateLiveAgentsForRun } from "./live-agent-manager.ts";
|
|
18
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
18
19
|
|
|
19
20
|
export interface RecoveryPlan {
|
|
20
21
|
runId: string;
|
|
@@ -159,7 +160,7 @@ export function cancelOrphanedRuns(
|
|
|
159
160
|
cancelled.push(manifest.runId);
|
|
160
161
|
cancelledRun = true;
|
|
161
162
|
});
|
|
162
|
-
if (cancelledRun) void terminateLiveAgentsForRun(manifest.runId, "cancelled", appendEvent, loaded.manifest.eventsPath).catch(() => {});
|
|
163
|
+
if (cancelledRun) void terminateLiveAgentsForRun(manifest.runId, "cancelled", appendEvent, loaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.orphan.terminate", error, `runId=${manifest.runId}`));
|
|
163
164
|
}
|
|
164
165
|
|
|
165
166
|
return { cancelled, skipped };
|
|
@@ -268,7 +269,7 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
|
|
|
268
269
|
saveRunTasks(fullLoaded.manifest, repairedTasks);
|
|
269
270
|
for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
|
|
270
271
|
updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: worker process dead and no recent activity");
|
|
271
|
-
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch(() => {});
|
|
272
|
+
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
|
|
272
273
|
}
|
|
273
274
|
} catch {
|
|
274
275
|
// Best-effort manifest cleanup
|
|
@@ -299,7 +300,7 @@ export function purgeStaleActiveRunIndex(staleThresholdMs = 300_000, now = Date.
|
|
|
299
300
|
saveRunTasks(fullLoaded.manifest, repairedTasks);
|
|
300
301
|
for (const task of repairedTasks) { try { upsertCrewAgent(fullLoaded.manifest, recordFromTask(fullLoaded.manifest, task, "scaffold")); } catch { /* non-critical */ } }
|
|
301
302
|
updateRunStatus(fullLoaded.manifest, "cancelled", "Orphaned run: no async worker and no manifest update in over " + Math.round(staleThresholdMs / 60000) + " minutes");
|
|
302
|
-
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch(() => {});
|
|
303
|
+
void terminateLiveAgentsForRun(fullLoaded.manifest.runId, "cancelled", appendEvent, fullLoaded.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.pid-dead.terminate", error, `runId=${fullLoaded.manifest.runId}`));
|
|
303
304
|
}
|
|
304
305
|
} catch {
|
|
305
306
|
// Best-effort
|
|
@@ -335,7 +336,7 @@ export function reconcileAllStaleRuns(cwd: string, manifestCache: ManifestCache,
|
|
|
335
336
|
for (const task of result.repairedTasks) { try { upsertCrewAgent(fresh.manifest, recordFromTask(fresh.manifest, task, "scaffold")); } catch { /* non-critical */ } }
|
|
336
337
|
}
|
|
337
338
|
updateRunStatus(fresh.manifest, "failed", `Stale run reconciled: ${result.detail}`);
|
|
338
|
-
void terminateLiveAgentsForRun(fresh.manifest.runId, "failed", appendEvent, fresh.manifest.eventsPath).catch(() => {});
|
|
339
|
+
void terminateLiveAgentsForRun(fresh.manifest.runId, "failed", appendEvent, fresh.manifest.eventsPath).catch((error) => logInternalError("crash-recovery.reconcile.terminate", error, `runId=${fresh.manifest.runId}`));
|
|
339
340
|
appendEvent(fresh.manifest.eventsPath, { type: "crew.run.reconciled_stale", runId: manifest.runId, message: result.detail, data: { verdict: result.verdict } });
|
|
340
341
|
}
|
|
341
342
|
if (result.verdict !== "healthy") {
|
|
@@ -44,6 +44,19 @@ const IrcParams = Type.Object({
|
|
|
44
44
|
|
|
45
45
|
type IrcParams = Static<typeof IrcParams>;
|
|
46
46
|
|
|
47
|
+
/**
|
|
48
|
+
* Output schema for the irc tool's `details` field.
|
|
49
|
+
* All fields except `op` are optional — each is present only when relevant to the operation.
|
|
50
|
+
*
|
|
51
|
+
* Schema:
|
|
52
|
+
* op — Always present. "send" | "list"
|
|
53
|
+
* from — Sender agent ID. Present on all responses.
|
|
54
|
+
* to — Recipient agent ID. Present on send responses.
|
|
55
|
+
* delivered — Array of agent IDs that received the message. Present on send.
|
|
56
|
+
* notFound — Array of agent IDs that were unknown or unavailable. Present on send.
|
|
57
|
+
* peers — Array of { id, status } for list operation.
|
|
58
|
+
* error — Human-readable error description. Present when the operation failed.
|
|
59
|
+
*/
|
|
47
60
|
interface IrcDetails {
|
|
48
61
|
op: "send" | "list";
|
|
49
62
|
from?: string;
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
import { defineTool, type ToolDefinition } from "@earendil-works/pi-coding-agent";
|
|
13
13
|
import { Type, type Static } from "@sinclair/typebox";
|
|
14
14
|
import type { YieldResult } from "../yield-handler.ts";
|
|
15
|
+
import { logInternalError } from "../../utils/internal-error.ts";
|
|
15
16
|
|
|
16
17
|
const SubmitResultParams = Type.Object({
|
|
17
18
|
summary: Type.String({ description: "Summary of completed work." }),
|
|
@@ -81,8 +82,8 @@ export function createSubmitResultTool(
|
|
|
81
82
|
};
|
|
82
83
|
try {
|
|
83
84
|
onYield(result);
|
|
84
|
-
} catch {
|
|
85
|
-
|
|
85
|
+
} catch (error) {
|
|
86
|
+
logInternalError("submit-result-tool.yield", error, toolCallId);
|
|
86
87
|
}
|
|
87
88
|
return response;
|
|
88
89
|
},
|
|
@@ -28,11 +28,10 @@ export class DeliveryCoordinator {
|
|
|
28
28
|
private flushing = false;
|
|
29
29
|
private readonly deps: DeliveryCoordinatorDeps;
|
|
30
30
|
private ttlTimer: ReturnType<typeof setInterval> | undefined;
|
|
31
|
+
private timerStarted = false;
|
|
31
32
|
|
|
32
33
|
constructor(deps: DeliveryCoordinatorDeps) {
|
|
33
34
|
this.deps = deps;
|
|
34
|
-
this.ttlTimer = setInterval(() => this.evictExpired(), 60_000);
|
|
35
|
-
this.ttlTimer.unref();
|
|
36
35
|
}
|
|
37
36
|
|
|
38
37
|
activate(sessionId: string): void {
|
|
@@ -102,9 +101,11 @@ export class DeliveryCoordinator {
|
|
|
102
101
|
|
|
103
102
|
flushQueuedResults(): void {
|
|
104
103
|
if (!this.active || this.pending.length === 0) return;
|
|
105
|
-
//
|
|
104
|
+
// HIGH-16/MEDIUM-16: Set `flushing` BEFORE the splice so a re-entrant call cannot drain a second batch
|
|
106
105
|
if (this.flushing) return;
|
|
107
106
|
this.flushing = true;
|
|
107
|
+
// Note: this.flushing is now set, so any re-entrant call made while this flush is in progress exits early at the check above
|
|
108
|
+
// This serves as a simple lock to prevent race conditions
|
|
108
109
|
const batch = this.pending.splice(0);
|
|
109
110
|
try {
|
|
110
111
|
const retryLater: PendingDelivery[] = [];
|
|
@@ -162,6 +163,12 @@ export class DeliveryCoordinator {
|
|
|
162
163
|
}
|
|
163
164
|
|
|
164
165
|
private enqueue(delivery: PendingDelivery): void {
|
|
166
|
+
// Lazily start the TTL timer on first enqueue (only if never started)
|
|
167
|
+
if (!this.timerStarted) {
|
|
168
|
+
this.timerStarted = true;
|
|
169
|
+
this.ttlTimer = setInterval(() => this.evictExpired(), 60_000);
|
|
170
|
+
this.ttlTimer.unref();
|
|
171
|
+
}
|
|
165
172
|
this.pending.push({ ...delivery, generation: this.generation });
|
|
166
173
|
}
|
|
167
174
|
|