pi-crew 0.1.49 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +74 -1
- package/README.md +176 -781
- package/agents/analyst.md +11 -11
- package/agents/critic.md +11 -11
- package/agents/executor.md +11 -11
- package/agents/explorer.md +11 -11
- package/agents/planner.md +11 -11
- package/agents/reviewer.md +11 -11
- package/agents/security-reviewer.md +11 -11
- package/agents/test-engineer.md +11 -11
- package/agents/verifier.md +70 -11
- package/agents/writer.md +11 -11
- package/docs/actions-reference.md +595 -0
- package/docs/commands-reference.md +347 -0
- package/docs/runtime-flow.md +148 -148
- package/index.ts +6 -6
- package/package.json +99 -99
- package/skills/async-worker-recovery/SKILL.md +42 -42
- package/skills/context-artifact-hygiene/SKILL.md +52 -52
- package/skills/delegation-patterns/SKILL.md +54 -54
- package/skills/mailbox-interactive/SKILL.md +40 -40
- package/skills/model-routing-context/SKILL.md +39 -39
- package/skills/multi-perspective-review/SKILL.md +58 -58
- package/skills/observability-reliability/SKILL.md +41 -41
- package/skills/orchestration/SKILL.md +157 -157
- package/skills/ownership-session-security/SKILL.md +41 -41
- package/skills/pi-extension-lifecycle/SKILL.md +39 -39
- package/skills/requirements-to-task-packet/SKILL.md +63 -63
- package/skills/resource-discovery-config/SKILL.md +41 -41
- package/skills/runtime-state-reader/SKILL.md +44 -44
- package/skills/secure-agent-orchestration-review/SKILL.md +45 -45
- package/skills/state-mutation-locking/SKILL.md +42 -42
- package/skills/systematic-debugging/SKILL.md +67 -67
- package/skills/ui-render-performance/SKILL.md +39 -39
- package/skills/verification-before-done/SKILL.md +57 -57
- package/skills/worktree-isolation/SKILL.md +39 -39
- package/src/adapters/claude-adapter.ts +25 -0
- package/src/adapters/codex-adapter.ts +21 -0
- package/src/adapters/cursor-adapter.ts +17 -0
- package/src/adapters/export-util.ts +137 -0
- package/src/adapters/index.ts +15 -0
- package/src/adapters/registry.ts +18 -0
- package/src/adapters/types.ts +23 -0
- package/src/agents/agent-config.ts +2 -0
- package/src/agents/agent-search.ts +98 -98
- package/src/agents/discover-agents.ts +2 -1
- package/src/config/config.ts +14 -1
- package/src/config/defaults.ts +5 -5
- package/src/config/drift-detector.ts +211 -0
- package/src/config/markers.ts +327 -0
- package/src/config/resilient-parser.ts +108 -0
- package/src/config/suggestions.ts +74 -0
- package/src/extension/cross-extension-rpc.ts +103 -82
- package/src/extension/project-init.ts +36 -4
- package/src/extension/register.ts +67 -22
- package/src/extension/registration/commands.ts +77 -8
- package/src/extension/registration/subagent-tools.ts +10 -1
- package/src/extension/registration/team-tool.ts +10 -1
- package/src/extension/registration/viewers.ts +48 -34
- package/src/extension/run-bundle-schema.ts +89 -89
- package/src/extension/run-export.ts +26 -12
- package/src/extension/run-import.ts +25 -1
- package/src/extension/run-index.ts +5 -1
- package/src/extension/run-maintenance.ts +142 -68
- package/src/extension/team-manager-command.ts +10 -1
- package/src/extension/team-tool/context.ts +1 -1
- package/src/extension/team-tool/doctor.ts +28 -3
- package/src/extension/team-tool/handle-settings.ts +195 -188
- package/src/extension/team-tool/inspect.ts +41 -41
- package/src/extension/team-tool/intent-policy.ts +42 -42
- package/src/extension/team-tool/lifecycle-actions.ts +27 -8
- package/src/extension/team-tool/plan.ts +19 -19
- package/src/extension/team-tool/run.ts +12 -1
- package/src/extension/team-tool.ts +14 -3
- package/src/i18n.ts +184 -184
- package/src/observability/exporters/otlp-exporter.ts +92 -77
- package/src/prompt/prompt-runtime.ts +72 -72
- package/src/runtime/agent-memory.ts +72 -72
- package/src/runtime/agent-observability.ts +114 -114
- package/src/runtime/async-marker.ts +26 -26
- package/src/runtime/attention-events.ts +28 -28
- package/src/runtime/auto-resume.ts +100 -0
- package/src/runtime/background-runner.ts +11 -1
- package/src/runtime/cancellation-token.ts +89 -89
- package/src/runtime/cancellation.ts +61 -61
- package/src/runtime/capability-inventory.ts +116 -116
- package/src/runtime/child-pi.ts +7 -2
- package/src/runtime/compaction-summary.ts +271 -0
- package/src/runtime/completion-guard.ts +190 -190
- package/src/runtime/concurrency.ts +3 -1
- package/src/runtime/crash-recovery.ts +33 -0
- package/src/runtime/delta-conflict.ts +360 -0
- package/src/runtime/diagnostic-export.ts +3 -1
- package/src/runtime/direct-run.ts +35 -35
- package/src/runtime/event-stream-bridge.ts +3 -1
- package/src/runtime/foreground-control.ts +82 -82
- package/src/runtime/green-contract.ts +46 -46
- package/src/runtime/group-join.ts +106 -106
- package/src/runtime/heartbeat-gradient.ts +28 -28
- package/src/runtime/heartbeat-watcher.ts +124 -124
- package/src/runtime/iteration-hooks.ts +262 -0
- package/src/runtime/live-agent-control.ts +88 -88
- package/src/runtime/live-control-realtime.ts +36 -36
- package/src/runtime/live-extension-bridge.ts +150 -150
- package/src/runtime/live-irc.ts +92 -92
- package/src/runtime/live-session-health.ts +100 -100
- package/src/runtime/loop-gates.ts +129 -0
- package/src/runtime/metric-parser.ts +40 -0
- package/src/runtime/notebook-helpers.ts +90 -90
- package/src/runtime/orphan-sentinel.ts +7 -7
- package/src/runtime/parallel-research.ts +44 -44
- package/src/runtime/phase-progress.ts +217 -0
- package/src/runtime/pi-args.ts +38 -2
- package/src/runtime/pi-json-output.ts +111 -111
- package/src/runtime/pi-spawn.ts +74 -6
- package/src/runtime/policy-engine.ts +79 -79
- package/src/runtime/post-checks.ts +122 -0
- package/src/runtime/process-status.ts +14 -1
- package/src/runtime/progress-event-coalescer.ts +43 -43
- package/src/runtime/prose-compressor.ts +164 -164
- package/src/runtime/recovery-recipes.ts +74 -74
- package/src/runtime/result-extractor.ts +121 -121
- package/src/runtime/role-permission.ts +39 -39
- package/src/runtime/sensitive-paths.ts +3 -3
- package/src/runtime/session-resources.ts +25 -25
- package/src/runtime/session-snapshot.ts +59 -59
- package/src/runtime/session-usage.ts +79 -79
- package/src/runtime/sidechain-output.ts +29 -29
- package/src/runtime/stream-preview.ts +177 -177
- package/src/runtime/supervisor-contact.ts +59 -59
- package/src/runtime/task-display.ts +38 -38
- package/src/runtime/task-graph.ts +207 -0
- package/src/runtime/task-quality.ts +207 -0
- package/src/runtime/task-runner/capabilities.ts +78 -78
- package/src/runtime/task-runner/live-executor.ts +7 -1
- package/src/runtime/task-runner/progress.ts +119 -119
- package/src/runtime/task-runner/prompt-builder.ts +1 -1
- package/src/runtime/task-runner/prompt-pipeline.ts +64 -64
- package/src/runtime/task-runner/result-utils.ts +14 -14
- package/src/runtime/task-runner/run-projection.ts +103 -103
- package/src/runtime/task-runner/state-helpers.ts +22 -22
- package/src/runtime/team-runner.ts +126 -7
- package/src/runtime/worker-heartbeat.ts +21 -21
- package/src/runtime/worker-startup.ts +57 -57
- package/src/runtime/workflow-state.ts +187 -0
- package/src/runtime/workspace-tree.ts +298 -298
- package/src/schema/config-schema.ts +12 -0
- package/src/schema/validation-types.ts +148 -0
- package/src/skills/skill-templates.ts +374 -0
- package/src/state/active-run-registry.ts +35 -11
- package/src/state/atomic-write.ts +33 -26
- package/src/state/contracts.ts +1 -0
- package/src/state/event-reconstructor.ts +217 -0
- package/src/state/locks.ts +2 -11
- package/src/state/mailbox.ts +4 -3
- package/src/state/state-store.ts +32 -14
- package/src/state/task-claims.ts +44 -44
- package/src/state/types.ts +9 -0
- package/src/state/usage.ts +29 -29
- package/src/subagents/async-entry.ts +1 -1
- package/src/subagents/index.ts +3 -3
- package/src/subagents/live/control.ts +1 -1
- package/src/subagents/live/manager.ts +1 -1
- package/src/subagents/live/realtime.ts +1 -1
- package/src/subagents/live/session-runtime.ts +1 -1
- package/src/subagents/manager.ts +1 -1
- package/src/subagents/spawn.ts +1 -1
- package/src/teams/team-serializer.ts +38 -38
- package/src/types/diff.d.ts +18 -18
- package/src/ui/crew-footer.ts +101 -101
- package/src/ui/crew-select-list.ts +111 -111
- package/src/ui/crew-widget.ts +9 -4
- package/src/ui/dashboard-panes/cancellation-pane.ts +42 -42
- package/src/ui/dashboard-panes/capability-pane.ts +59 -59
- package/src/ui/dashboard-panes/mailbox-pane.ts +35 -35
- package/src/ui/dashboard-panes/metrics-pane.ts +34 -34
- package/src/ui/dashboard-panes/progress-pane.ts +11 -0
- package/src/ui/dynamic-border.ts +25 -25
- package/src/ui/layout-primitives.ts +106 -106
- package/src/ui/loaders.ts +158 -158
- package/src/ui/powerbar-publisher.ts +6 -0
- package/src/ui/render-coalescer.ts +51 -51
- package/src/ui/render-diff.ts +119 -119
- package/src/ui/render-scheduler.ts +143 -143
- package/src/ui/run-action-dispatcher.ts +10 -1
- package/src/ui/spinner.ts +17 -17
- package/src/ui/status-colors.ts +58 -58
- package/src/ui/syntax-highlight.ts +116 -116
- package/src/ui/transcript-entries.ts +258 -258
- package/src/utils/completion-dedupe.ts +63 -63
- package/src/utils/frontmatter.ts +68 -68
- package/src/utils/git.ts +262 -262
- package/src/utils/ids.ts +17 -17
- package/src/utils/incremental-reader.ts +104 -104
- package/src/utils/names.ts +27 -27
- package/src/utils/redaction.ts +44 -44
- package/src/utils/safe-paths.ts +47 -47
- package/src/utils/scan-cache.ts +136 -136
- package/src/utils/sleep.ts +40 -26
- package/src/utils/task-name-generator.ts +337 -337
- package/src/workflows/validate-workflow.ts +40 -40
- package/src/worktree/branch-freshness.ts +45 -45
- package/src/worktree/worktree-manager.ts +11 -3
- package/teams/default.team.md +12 -12
- package/teams/fast-fix.team.md +11 -11
- package/teams/implementation.team.md +18 -18
- package/teams/parallel-research.team.md +14 -14
- package/teams/research.team.md +11 -11
- package/teams/review.team.md +12 -12
- package/workflows/default.workflow.md +30 -29
- package/workflows/fast-fix.workflow.md +23 -22
- package/workflows/implementation.workflow.md +43 -38
- package/workflows/parallel-research.workflow.md +46 -46
- package/workflows/research.workflow.md +22 -22
- package/workflows/review.workflow.md +30 -30
- package/docs/refactor-tasks-phase3.md +0 -394
- package/docs/refactor-tasks-phase4.md +0 -564
- package/docs/refactor-tasks-phase5.md +0 -402
- package/docs/refactor-tasks-phase6.md +0 -662
- package/docs/refactor-tasks.md +0 -1484
- package/docs/research/AGENT-EXECUTION-ARCHITECTURE.md +0 -261
- package/docs/research/AGENT-LIFECYCLE-COMPARISON.md +0 -111
- package/docs/research/AUDIT_OH_MY_PI.md +0 -261
- package/docs/research/AUDIT_PI_CREW.md +0 -457
- package/docs/research/CAVEMAN-DEEP-RESEARCH.md +0 -281
- package/docs/research/COMPARISON_OH_MY_PI_VS_PI_CREW.md +0 -264
- package/docs/research/DEEP-RESEARCH-PI-POWERBAR.md +0 -343
- package/docs/research/DEEP_RESEARCH_SUBAGENT_ARCHITECTURE.md +0 -480
- package/docs/research/GAP_CLOSURE_IMPLEMENTATION_PLAN.md +0 -354
- package/docs/research/IMPLEMENTATION_PLAN.md +0 -385
- package/docs/research/LIVE-SESSION-PRODUCTION-READY-PLAN.md +0 -502
- package/docs/research/OH-MY-PI-DEEP-RESEARCH-v14.7.6.md +0 -266
- package/docs/research/REMAINING-GAPS-PLAN.md +0 -363
- package/docs/research/SESSION-SUMMARY-2026-05-08.md +0 -146
- package/docs/research/UI-RESPONSIVENESS-AUDIT.md +0 -173
- package/docs/research-awesome-agent-skills-distillation.md +0 -100
- package/docs/research-extension-examples.md +0 -297
- package/docs/research-extension-system.md +0 -324
- package/docs/research-oh-my-pi-distillation.md +0 -369
- package/docs/research-optimization-plan.md +0 -548
- package/docs/research-phase10-distillation.md +0 -199
- package/docs/research-phase11-distillation.md +0 -201
- package/docs/research-phase8-operator-experience-plan.md +0 -819
- package/docs/research-phase9-observability-reliability-plan.md +0 -1190
- package/docs/research-pi-coding-agent.md +0 -357
- package/docs/research-source-pi-crew-reference.md +0 -174
- package/docs/research-ui-optimization-plan.md +0 -480
- package/docs/source-runtime-refactor-map.md +0 -107
- package/src/utils/atomic-write.ts +0 -33
|
@@ -1,124 +1,124 @@
|
|
|
1
|
-
import type { NotificationDescriptor } from "../extension/notification-router.ts";
|
|
2
|
-
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
3
|
-
import { appendEvent } from "../state/event-log.ts";
|
|
4
|
-
import { loadRunManifestById } from "../state/state-store.ts";
|
|
5
|
-
import type { TeamRunManifest } from "../state/types.ts";
|
|
6
|
-
import { logInternalError } from "../utils/internal-error.ts";
|
|
7
|
-
import type { ManifestCache } from "./manifest-cache.ts";
|
|
8
|
-
import { classifyHeartbeat, DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
|
|
9
|
-
|
|
10
|
-
export interface HeartbeatWatcherRouter {
|
|
11
|
-
enqueue(notification: NotificationDescriptor): boolean;
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
export interface HeartbeatWatcherOptions {
|
|
15
|
-
cwd: string;
|
|
16
|
-
pollIntervalMs?: number;
|
|
17
|
-
thresholds?: GradientThresholds;
|
|
18
|
-
manifestCache: ManifestCache;
|
|
19
|
-
registry: MetricRegistry;
|
|
20
|
-
router: HeartbeatWatcherRouter;
|
|
21
|
-
deadletterTickThreshold?: number;
|
|
22
|
-
onDead?: (runId: string, taskId: string, elapsed: number) => void;
|
|
23
|
-
onDeadletterTrigger?: (manifest: TeamRunManifest, taskId: string) => void;
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
/**
|
|
27
|
-
* Polls running runs for heartbeat staleness.
|
|
28
|
-
*
|
|
29
|
-
* Uses recursive setTimeout to avoid timer storms.
|
|
30
|
-
* Cleanup is done in the same pass — no second scan over manifests.
|
|
31
|
-
* Keys for runs that disappear from the cache are cleaned via staleness-age policy
|
|
32
|
-
* rather than being leaked forever.
|
|
33
|
-
*/
|
|
34
|
-
export class HeartbeatWatcher {
|
|
35
|
-
private timer?: ReturnType<typeof setTimeout>;
|
|
36
|
-
private lastLevel = new Map<string, HeartbeatLevel>();
|
|
37
|
-
private consecutiveDead = new Map<string, number>();
|
|
38
|
-
private lastSeen = new Map<string, number>(); // key → last time it was active
|
|
39
|
-
/** Max age (ms) to retain a stale key before garbage-collecting it. */
|
|
40
|
-
private readonly maxKeyAgeMs = 600_000; // 10 minutes
|
|
41
|
-
private readonly opts: HeartbeatWatcherOptions;
|
|
42
|
-
|
|
43
|
-
constructor(opts: HeartbeatWatcherOptions) {
|
|
44
|
-
this.opts = opts;
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
start(): void {
|
|
48
|
-
this.dispose();
|
|
49
|
-
this.scheduleTick();
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
private scheduleTick(): void {
|
|
53
|
-
this.timer = setTimeout(() => this.tick(), this.opts.pollIntervalMs ?? 5000);
|
|
54
|
-
this.timer.unref();
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
tick(now = Date.now()): void {
|
|
58
|
-
try {
|
|
59
|
-
this.tickUnsafe(now);
|
|
60
|
-
} catch (error) {
|
|
61
|
-
logInternalError("heartbeat-watcher.tick", error);
|
|
62
|
-
} finally {
|
|
63
|
-
this.scheduleTick();
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
private tickUnsafe(now: number): void {
|
|
68
|
-
const thresholds = this.opts.thresholds ?? DEFAULT_GRADIENT_THRESHOLDS;
|
|
69
|
-
const tickThreshold = this.opts.deadletterTickThreshold ?? 3;
|
|
70
|
-
const activeKeys = new Set<string>();
|
|
71
|
-
|
|
72
|
-
for (const run of this.opts.manifestCache.list(50)) {
|
|
73
|
-
if (run.status !== "running") continue;
|
|
74
|
-
const loaded = loadRunManifestById(this.opts.cwd, run.runId);
|
|
75
|
-
if (!loaded) continue;
|
|
76
|
-
for (const task of loaded.tasks) {
|
|
77
|
-
if (task.status !== "running") continue;
|
|
78
|
-
const key = `${run.runId}:${task.id}`;
|
|
79
|
-
activeKeys.add(key);
|
|
80
|
-
this.lastSeen.set(key, now);
|
|
81
|
-
|
|
82
|
-
const elapsed = heartbeatAgeMs(task.heartbeat, now);
|
|
83
|
-
const level = classifyHeartbeat(task.heartbeat, thresholds, now);
|
|
84
|
-
this.opts.registry.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds").set({ runId: run.runId, taskId: task.id }, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
|
|
85
|
-
this.opts.registry.counter("crew.heartbeat.level_total", "Heartbeat classifications by level").inc({ runId: run.runId, level });
|
|
86
|
-
const previous = this.lastLevel.get(key);
|
|
87
|
-
this.lastLevel.set(key, level);
|
|
88
|
-
if (level === "dead" && previous !== "dead") {
|
|
89
|
-
this.opts.registry.counter("crew.heartbeat.dead_total", "Dead heartbeat detections").inc({ runId: run.runId });
|
|
90
|
-
appendEvent(loaded.manifest.eventsPath, { type: "crew.task.heartbeat_dead", runId: run.runId, taskId: task.id, message: `Task ${task.id} heartbeat dead.`, data: { elapsedMs: Number.isFinite(elapsed) ? elapsed : undefined } });
|
|
91
|
-
this.opts.router.enqueue({ id: `dead_${run.runId}_${task.id}`, severity: "warning", source: "heartbeat-watcher", runId: run.runId, title: `Task ${task.id} heartbeat dead`, body: "Background watcher detected a stuck worker." });
|
|
92
|
-
this.opts.onDead?.(run.runId, task.id, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
|
|
93
|
-
}
|
|
94
|
-
if (level === "dead") {
|
|
95
|
-
const count = (this.consecutiveDead.get(key) ?? 0) + 1;
|
|
96
|
-
this.consecutiveDead.set(key, count);
|
|
97
|
-
if (count === tickThreshold) this.opts.onDeadletterTrigger?.(loaded.manifest, task.id);
|
|
98
|
-
} else {
|
|
99
|
-
this.consecutiveDead.delete(key);
|
|
100
|
-
}
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
// Cleanup: drop keys that were NOT in this tick's active set AND
|
|
105
|
-
// haven't been seen for > maxKeyAgeMs. This covers runs that
|
|
106
|
-
// completed or fell out of the manifest cache's top-50 window.
|
|
107
|
-
const cutoff = now - this.maxKeyAgeMs;
|
|
108
|
-
for (const [key, ts] of this.lastSeen) {
|
|
109
|
-
if (!activeKeys.has(key) && ts < cutoff) {
|
|
110
|
-
this.lastLevel.delete(key);
|
|
111
|
-
this.consecutiveDead.delete(key);
|
|
112
|
-
this.lastSeen.delete(key);
|
|
113
|
-
}
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
dispose(): void {
|
|
118
|
-
if (this.timer) clearTimeout(this.timer);
|
|
119
|
-
this.timer = undefined;
|
|
120
|
-
this.lastLevel.clear();
|
|
121
|
-
this.consecutiveDead.clear();
|
|
122
|
-
this.lastSeen.clear();
|
|
123
|
-
}
|
|
124
|
-
}
|
|
1
|
+
import type { NotificationDescriptor } from "../extension/notification-router.ts";
|
|
2
|
+
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
3
|
+
import { appendEvent } from "../state/event-log.ts";
|
|
4
|
+
import { loadRunManifestById } from "../state/state-store.ts";
|
|
5
|
+
import type { TeamRunManifest } from "../state/types.ts";
|
|
6
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
7
|
+
import type { ManifestCache } from "./manifest-cache.ts";
|
|
8
|
+
import { classifyHeartbeat, DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
|
|
9
|
+
|
|
10
|
+
/**
 * Notification sink used by the heartbeat watcher.
 *
 * The watcher only ever calls `enqueue` and does not inspect the returned boolean.
 */
export interface HeartbeatWatcherRouter {
	/** Hand a notification to the router. */
	enqueue(notification: NotificationDescriptor): boolean;
}
|
|
13
|
+
|
|
14
|
+
/** Construction options for {@link HeartbeatWatcher}. */
export interface HeartbeatWatcherOptions {
	/** Directory passed to `loadRunManifestById` when loading each running run. */
	cwd: string;
	/** Delay between polls in milliseconds; the watcher falls back to 5000 when omitted. */
	pollIntervalMs?: number;
	/** Classification thresholds; the watcher falls back to `DEFAULT_GRADIENT_THRESHOLDS` when omitted. */
	thresholds?: GradientThresholds;
	/** Source of run summaries; the watcher reads `list(50)` on every tick. */
	manifestCache: ManifestCache;
	/** Registry that receives the `crew.heartbeat.*` gauge and counters. */
	registry: MetricRegistry;
	/** Receives a warning notification when a task's heartbeat first turns dead. */
	router: HeartbeatWatcherRouter;
	/** Number of consecutive dead ticks that fires `onDeadletterTrigger`; defaults to 3. */
	deadletterTickThreshold?: number;
	/**
	 * Called when a task transitions to the dead level. `elapsed` is the heartbeat age in
	 * milliseconds, or `thresholds.deadMs` when that age is not a finite number.
	 */
	onDead?: (runId: string, taskId: string, elapsed: number) => void;
	/** Called once when a task's consecutive dead-tick count equals the threshold. */
	onDeadletterTrigger?: (manifest: TeamRunManifest, taskId: string) => void;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Polls running runs for heartbeat staleness.
 *
 * Scheduling uses a self-rearming `setTimeout` rather than `setInterval`, and at most one
 * timer is ever pending: arming always clears the previous timer first, and a tick that was
 * interrupted by `dispose()` / `start()` (for example from inside an `onDead` callback) does
 * not re-arm on its way out.
 *
 * Per-task bookkeeping is cleaned in the same pass as the scan. Keys that were not seen in
 * the current tick are dropped once they have been inactive for longer than `maxKeyAgeMs`,
 * which covers runs that completed or fell out of the manifest cache window.
 */
export class HeartbeatWatcher {
	private timer?: ReturnType<typeof setTimeout>;
	/** Last classified level per `runId:taskId`, used to detect the transition into "dead". */
	private lastLevel = new Map<string, HeartbeatLevel>();
	/** Number of consecutive ticks a key has been classified "dead". */
	private consecutiveDead = new Map<string, number>();
	/** key → timestamp of the last tick in which the task was seen running. */
	private lastSeen = new Map<string, number>();
	/** Max age (ms) to retain a stale key before garbage-collecting it (10 minutes). */
	private readonly maxKeyAgeMs = 600_000;
	private readonly opts: HeartbeatWatcherOptions;
	/**
	 * Incremented by every `dispose()`. A tick remembers the value it started with and only
	 * re-arms the timer if it is unchanged, so a dispose/start issued mid-tick wins.
	 */
	private generation = 0;

	constructor(opts: HeartbeatWatcherOptions) {
		this.opts = opts;
	}

	/** Reset all state and begin polling; the first tick runs after one poll interval. */
	start(): void {
		this.dispose();
		this.scheduleTick();
	}

	/** Arm the next tick, replacing any timer that is still pending. */
	private scheduleTick(): void {
		// Clearing first keeps a manual tick() from starting a second, parallel timer chain.
		if (this.timer) clearTimeout(this.timer);
		this.timer = setTimeout(() => this.tick(), this.opts.pollIntervalMs ?? 5000);
		// Never keep the process alive just for heartbeat polling.
		this.timer.unref();
	}

	/**
	 * Run one poll pass and schedule the next one.
	 *
	 * Errors from the pass are logged, never thrown. The next tick is scheduled unless
	 * `dispose()` (directly or via `start()`) was called while this pass was running.
	 *
	 * @param now - Timestamp in milliseconds used for age calculations; defaults to `Date.now()`.
	 */
	tick(now = Date.now()): void {
		const generationAtStart = this.generation;
		try {
			this.tickUnsafe(now);
		} catch (error) {
			logInternalError("heartbeat-watcher.tick", error);
		} finally {
			if (generationAtStart === this.generation) this.scheduleTick();
		}
	}

	/** One poll pass: classify every running task, emit metrics/events, then GC stale keys. */
	private tickUnsafe(now: number): void {
		const thresholds = this.opts.thresholds ?? DEFAULT_GRADIENT_THRESHOLDS;
		const tickThreshold = this.opts.deadletterTickThreshold ?? 3;
		const activeKeys = new Set<string>();

		for (const run of this.opts.manifestCache.list(50)) {
			if (run.status !== "running") continue;
			const loaded = loadRunManifestById(this.opts.cwd, run.runId);
			if (!loaded) continue;

			for (const task of loaded.tasks) {
				if (task.status !== "running") continue;
				const key = `${run.runId}:${task.id}`;
				activeKeys.add(key);
				this.lastSeen.set(key, now);

				const elapsed = heartbeatAgeMs(task.heartbeat, now);
				const level = classifyHeartbeat(task.heartbeat, thresholds, now);
				this.opts.registry
					.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds")
					.set({ runId: run.runId, taskId: task.id }, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
				this.opts.registry
					.counter("crew.heartbeat.level_total", "Heartbeat classifications by level")
					.inc({ runId: run.runId, level });

				const previous = this.lastLevel.get(key);
				this.lastLevel.set(key, level);

				// Event, notification and onDead fire only on the transition into "dead".
				if (level === "dead" && previous !== "dead") {
					this.opts.registry.counter("crew.heartbeat.dead_total", "Dead heartbeat detections").inc({ runId: run.runId });
					appendEvent(loaded.manifest.eventsPath, {
						type: "crew.task.heartbeat_dead",
						runId: run.runId,
						taskId: task.id,
						message: `Task ${task.id} heartbeat dead.`,
						data: { elapsedMs: Number.isFinite(elapsed) ? elapsed : undefined },
					});
					this.opts.router.enqueue({
						id: `dead_${run.runId}_${task.id}`,
						severity: "warning",
						source: "heartbeat-watcher",
						runId: run.runId,
						title: `Task ${task.id} heartbeat dead`,
						body: "Background watcher detected a stuck worker.",
					});
					this.opts.onDead?.(run.runId, task.id, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
				}

				if (level === "dead") {
					const count = (this.consecutiveDead.get(key) ?? 0) + 1;
					this.consecutiveDead.set(key, count);
					// Strict equality: the deadletter trigger fires once per dead streak.
					if (count === tickThreshold) this.opts.onDeadletterTrigger?.(loaded.manifest, task.id);
				} else {
					this.consecutiveDead.delete(key);
				}
			}
		}

		// Cleanup: drop keys that were NOT in this tick's active set AND
		// haven't been seen for > maxKeyAgeMs. This covers runs that
		// completed or fell out of the manifest cache's top-50 window.
		const cutoff = now - this.maxKeyAgeMs;
		for (const [key, ts] of this.lastSeen) {
			if (!activeKeys.has(key) && ts < cutoff) {
				this.lastLevel.delete(key);
				this.consecutiveDead.delete(key);
				this.lastSeen.delete(key);
			}
		}
	}

	/** Stop polling and forget all per-task state. Safe to call repeatedly and from callbacks. */
	dispose(): void {
		// Invalidate any tick currently on the stack so its `finally` does not re-arm.
		this.generation++;
		if (this.timer) clearTimeout(this.timer);
		this.timer = undefined;
		this.lastLevel.clear();
		this.consecutiveDead.clear();
		this.lastSeen.clear();
	}
}
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Transparent iteration hooks — runs user-supplied before/after task scripts
|
|
3
|
+
* with structured JSON payload on stdin.
|
|
4
|
+
*
|
|
5
|
+
* Distilled from pi-autoresearch's iteration hook pattern.
|
|
6
|
+
*/
|
|
7
|
+
import { spawn } from "node:child_process";
|
|
8
|
+
import * as fs from "node:fs";
|
|
9
|
+
import { DENIED_METRIC_NAMES } from "./metric-parser.ts";
|
|
10
|
+
|
|
11
|
+
/** Which side of a task the hook runs on. */
export type HookStage = "before" | "after";

/** JSON document written to the hook script's stdin. */
export interface HookPayload {
	/** Stage this invocation belongs to. */
	event: HookStage;
	/** Working directory the hook process is spawned in. */
	cwd: string;
	taskId: string;
	runId: string;
	taskRole: string;
	/** Optional result block; may be omitted or explicitly `null`. */
	lastResult?: {
		status: string;
		description: string;
		diagnostics?: Record<string, unknown>;
	} | null;
	/** Run-level context forwarded to the hook. */
	session: {
		teamName: string;
		workflowName: string;
		goal: string;
		completedTasks: number;
		totalTasks: number;
	};
}
|
|
34
|
+
|
|
35
|
+
/** Outcome of one iteration-hook invocation. */
export interface HookResult {
	/** `true` when the hook script was actually launched; `false` when it was skipped. */
	fired: boolean;
	/** Hook stdout, truncated to 8KB. */
	stdout: string;
	/** Hook stderr. */
	stderr: string;
	/** Process exit code, or `null` when none is available. */
	exitCode: number | null;
	/** `true` when the hook was killed for exceeding its time limit. */
	timedOut: boolean;
	/** Wall-clock run time in milliseconds. */
	durationMs: number;
}
|
|
50
|
+
|
|
51
|
+
/** Upper bound on captured hook stdout, in bytes (8 KB). */
const MAX_STDOUT_BYTES = 8 * 1024;

/** How long a hook may run before it is killed, in milliseconds (30 seconds). */
const HOOK_TIMEOUT_MS = 30 * 1000;
|
|
56
|
+
|
|
57
|
+
/**
 * Build the result reported when no hook was launched (script missing or not runnable):
 * nothing captured, no exit code, zero duration.
 */
function notFiredResult(): HookResult {
	const skipped: HookResult = {
		fired: false,
		stdout: "",
		stderr: "",
		exitCode: null,
		timedOut: false,
		durationMs: 0,
	};
	return skipped;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Truncate a buffer to at most `limit` bytes without splitting a UTF-8 character.
 *
 * - Buffers that already fit are returned unchanged.
 * - Otherwise the result ends just before the last newline found within the first
 *   `limit` bytes, so only whole lines are kept.
 * - If that window contains no newline, the cut is moved back (by at most three bytes,
 *   the maximum number of UTF-8 continuation bytes) so it lands on a character boundary
 *   instead of in the middle of a multi-byte sequence.
 *
 * @param buf - Raw bytes to truncate
 * @param limit - Maximum number of bytes to keep
 * @returns A view onto `buf` no longer than `limit` bytes
 */
function truncateToLimit(buf: Buffer, limit: number): Buffer {
	if (buf.byteLength <= limit) return buf;

	const slice = buf.subarray(0, limit);
	const lastNewline = slice.lastIndexOf("\n");
	if (lastNewline >= 0) {
		return slice.subarray(0, lastNewline);
	}

	// No newline in the window. `buf[end]` is the first byte that would be dropped; while it
	// is a continuation byte (10xxxxxx) the cut sits inside a character, so step back.
	let end = limit;
	for (let steps = 0; steps < 3 && end > 0; steps++) {
		const firstDropped = buf[end];
		if (firstDropped === undefined || (firstDropped & 0xc0) !== 0x80) break;
		end--;
	}
	return buf.subarray(0, end);
}
|
|
87
|
+
|
|
88
|
+
/**
 * Check whether a hook script can be launched.
 *
 * The path must resolve to a regular file; directories and other non-file entries are
 * rejected, since they would otherwise pass an existence/execute check and only fail once
 * handed to the shell. On Unix the file must also be executable by the current process.
 * On Windows X_OK is unreliable, so only existence is checked there.
 *
 * @param scriptPath - Absolute or relative path to the candidate script
 * @returns `true` when the script exists and may be run; `false` on any failure
 */
function isScriptRunnable(scriptPath: string): boolean {
	try {
		// statSync throws for missing paths, which the catch below maps to `false`.
		if (!fs.statSync(scriptPath).isFile()) return false;

		const mode =
			process.platform === "win32"
				? fs.constants.F_OK
				: fs.constants.F_OK | fs.constants.X_OK;
		fs.accessSync(scriptPath, mode);
		return true;
	} catch {
		return false;
	}
}
|
|
107
|
+
|
|
108
|
+
/**
 * Run an iteration hook script with JSON payload on stdin.
 *
 * Spawns `bash <script>` in `payload.cwd` with the hook payload as JSON on stdin.
 * Captures stdout (capped at 8KB) and stderr (capped at 64KB), and kills the process
 * with SIGKILL after a 30-second timeout. The returned promise never rejects: spawn
 * failures are reported through `stderr` with a `null` exit code.
 *
 * Output is buffered only up to the caps, so a chatty hook cannot grow memory without
 * bound, but the pipes keep being drained so the child is never blocked on a full pipe.
 *
 * **Security note:** The script path is user-configurable and executed with
 * minimal environment (PATH, HOME, USER, LANG). Only use with trusted script paths from
 * workspace-owned configuration. No path containment validation is performed.
 *
 * @param payload - Structured hook payload
 * @param hookScriptPath - Absolute or relative path to the hook script
 * @returns HookResult indicating whether the hook fired and its output; a not-fired
 *   result when the script is missing or not runnable
 */
export async function runIterationHook(
	payload: HookPayload,
	hookScriptPath: string,
): Promise<HookResult> {
	if (!isScriptRunnable(hookScriptPath)) {
		return notFiredResult();
	}

	// Cap on retained stderr; only short previews of it are used for steering and logging.
	const maxStderrBytes = 64 * 1024;
	const startTime = Date.now();
	const stdinJson = JSON.stringify(payload);
	const stdoutChunks: Buffer[] = [];
	const stderrChunks: Buffer[] = [];
	let stdoutBytes = 0;
	let stderrBytes = 0;

	return new Promise<HookResult>((resolve) => {
		const child = spawn("bash", [hookScriptPath], {
			cwd: payload.cwd,
			env: { PATH: process.env.PATH ?? "/usr/bin:/bin", HOME: process.env.HOME ?? "/tmp", USER: process.env.USER, LANG: process.env.LANG, PI_CREW_HOOK: "1" },
			stdio: ["pipe", "pipe", "pipe"],
		});

		let killed = false;
		let settled = false;
		const timeout = setTimeout(() => {
			killed = true;
			child.kill("SIGKILL");
		}, HOOK_TIMEOUT_MS);

		// Both "error" and "close" may fire for one child; only the first one reports.
		const settle = (outcome: Omit<HookResult, "durationMs">): void => {
			if (settled) return;
			settled = true;
			clearTimeout(timeout);
			resolve({ ...outcome, durationMs: Date.now() - startTime });
		};

		// Keep chunks until just past the cap: truncation only looks at the first
		// `limit` bytes (plus the byte after), so later data cannot change the result.
		child.stdout.on("data", (chunk: Buffer) => {
			if (stdoutBytes > MAX_STDOUT_BYTES) return;
			stdoutChunks.push(chunk);
			stdoutBytes += chunk.byteLength;
		});

		child.stderr.on("data", (chunk: Buffer) => {
			if (stderrBytes > maxStderrBytes) return;
			stderrChunks.push(chunk);
			stderrBytes += chunk.byteLength;
		});

		child.on("close", (code: number | null) => {
			const stdout = truncateToLimit(Buffer.concat(stdoutChunks), MAX_STDOUT_BYTES);
			const stderr = truncateToLimit(Buffer.concat(stderrChunks), maxStderrBytes);
			settle({
				fired: true,
				stdout: stdout.toString("utf-8"),
				stderr: stderr.toString("utf-8"),
				exitCode: code,
				timedOut: killed,
			});
		});

		child.on("error", (err: Error) => {
			settle({
				fired: true,
				stdout: "",
				stderr: err.message,
				exitCode: null,
				timedOut: false,
			});
		});

		// A hook that exits (or closes stdin) before reading the payload makes the write
		// fail with EPIPE. Without a listener that stream error would be unhandled and
		// crash the host process; the hook's own exit status is reported via "close".
		child.stdin.on("error", () => {});

		// Write payload to stdin and close it
		child.stdin.end(stdinJson, "utf-8");
	});
}
|
|
193
|
+
|
|
194
|
+
/**
 * Derive a steer message from the hook result.
 *
 * - Hook not fired → null
 * - Timeout → timeout steer message
 * - Non-zero exit → error steer message (with up to 200 characters of stderr)
 * - Empty stdout → null (no steer)
 * - Otherwise → trimmed stdout with `CREW_METRIC <name>=` lines removed when `<name>` is in
 *   `DENIED_METRIC_NAMES`; null when nothing but such lines (or whitespace) remains
 *
 * @param stage - Hook stage, used as the message prefix
 * @param result - Result of the hook execution
 * @returns The message to steer with, or null when there is nothing to say
 */
export function steerMessageFromHook(
	stage: HookStage,
	result: HookResult,
): string | null {
	if (!result.fired) return null;

	if (result.timedOut) {
		return `[${stage}-hook] Hook timed out after ${result.durationMs}ms`;
	}

	if (result.exitCode !== null && result.exitCode !== 0) {
		const stderrSnippet = result.stderr.trim().slice(0, 200);
		return `[${stage}-hook] Hook exited with code ${result.exitCode}${stderrSnippet ? `: ${stderrSnippet}` : ""}`;
	}

	const trimmed = result.stdout.trim();
	if (trimmed.length === 0) return null;

	// Filter out prototype-polluting metric names from hook output
	const metricLine = /^CREW_METRIC\s+(\w+)=/;
	const safeOutput = trimmed
		.split("\n")
		.filter((line) => {
			const match = metricLine.exec(line);
			return match === null || !DENIED_METRIC_NAMES.has(match[1]);
		})
		.join("\n");

	// If every line was filtered out there is nothing left to steer with; report that the
	// same way as empty stdout rather than returning an empty message.
	return safeOutput.trim().length === 0 ? null : safeOutput;
}
|
|
233
|
+
|
|
234
|
+
/**
 * Build the record describing one hook execution for events.jsonl.
 *
 * Every entry carries `type`, `stage`, `fired` and `durationMs`. When the hook fired it
 * also carries `exitCode` and `timedOut`, plus `stdoutPreview` / `stderrPreview` (first
 * 512 characters) for whichever stream produced output.
 */
export function hookLogEntry(
	stage: HookStage,
	result: HookResult,
): Record<string, unknown> {
	const { fired, durationMs, exitCode, timedOut, stdout, stderr } = result;
	const entry: Record<string, unknown> = { type: "iteration-hook", stage, fired, durationMs };

	// A hook that never ran has nothing further to report.
	if (!fired) return entry;

	entry.exitCode = exitCode;
	entry.timedOut = timedOut;

	// Previews are only attached for streams that actually produced output.
	if (stdout.length > 0) entry.stdoutPreview = stdout.slice(0, 512);
	if (stderr.length > 0) entry.stderrPreview = stderr.slice(0, 512);

	return entry;
}
|