pi-crew 0.1.44 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. package/CHANGELOG.md +27 -0
  2. package/agents/analyst.md +11 -11
  3. package/agents/critic.md +11 -11
  4. package/agents/executor.md +11 -11
  5. package/agents/explorer.md +11 -11
  6. package/agents/planner.md +11 -11
  7. package/agents/reviewer.md +11 -11
  8. package/agents/security-reviewer.md +11 -11
  9. package/agents/test-engineer.md +11 -11
  10. package/agents/verifier.md +11 -11
  11. package/agents/writer.md +11 -11
  12. package/docs/refactor-tasks-phase3.md +394 -394
  13. package/docs/refactor-tasks-phase4.md +564 -564
  14. package/docs/refactor-tasks-phase5.md +402 -402
  15. package/docs/refactor-tasks-phase6.md +662 -662
  16. package/docs/research-extension-examples.md +297 -297
  17. package/docs/research-extension-system.md +324 -324
  18. package/docs/research-optimization-plan.md +548 -548
  19. package/docs/research-phase10-distillation.md +198 -198
  20. package/docs/research-phase11-distillation.md +201 -201
  21. package/docs/research-pi-coding-agent.md +357 -357
  22. package/docs/research-source-pi-crew-reference.md +174 -174
  23. package/docs/runtime-flow.md +148 -148
  24. package/docs/source-runtime-refactor-map.md +83 -83
  25. package/index.ts +6 -6
  26. package/package.json +1 -1
  27. package/src/agents/agent-serializer.ts +34 -34
  28. package/src/extension/cross-extension-rpc.ts +82 -82
  29. package/src/extension/register.ts +8 -1
  30. package/src/extension/registration/commands.ts +18 -2
  31. package/src/extension/registration/compaction-guard.ts +125 -125
  32. package/src/extension/registration/subagent-tools.ts +148 -148
  33. package/src/extension/registration/team-tool.ts +26 -8
  34. package/src/extension/run-bundle-schema.ts +89 -89
  35. package/src/extension/run-maintenance.ts +43 -43
  36. package/src/extension/team-tool/cancel.ts +105 -102
  37. package/src/extension/team-tool/context.ts +1 -0
  38. package/src/extension/team-tool/handle-settings.ts +188 -188
  39. package/src/extension/team-tool/inspect.ts +41 -41
  40. package/src/extension/team-tool/lifecycle-actions.ts +79 -79
  41. package/src/extension/team-tool/plan.ts +19 -19
  42. package/src/extension/team-tool/respond.ts +83 -66
  43. package/src/extension/team-tool/run.ts +1 -0
  44. package/src/i18n.ts +184 -184
  45. package/src/observability/exporters/otlp-exporter.ts +77 -77
  46. package/src/prompt/prompt-runtime.ts +72 -72
  47. package/src/runtime/agent-control.ts +63 -63
  48. package/src/runtime/agent-memory.ts +72 -72
  49. package/src/runtime/agent-observability.ts +114 -114
  50. package/src/runtime/async-marker.ts +26 -26
  51. package/src/runtime/attention-events.ts +28 -28
  52. package/src/runtime/background-runner.ts +53 -53
  53. package/src/runtime/child-pi.ts +444 -444
  54. package/src/runtime/completion-guard.ts +190 -190
  55. package/src/runtime/crew-agent-records.ts +8 -0
  56. package/src/runtime/delivery-coordinator.ts +153 -142
  57. package/src/runtime/direct-run.ts +35 -35
  58. package/src/runtime/foreground-control.ts +82 -82
  59. package/src/runtime/green-contract.ts +46 -46
  60. package/src/runtime/group-join.ts +106 -106
  61. package/src/runtime/heartbeat-gradient.ts +28 -28
  62. package/src/runtime/heartbeat-watcher.ts +124 -124
  63. package/src/runtime/live-agent-control.ts +87 -87
  64. package/src/runtime/live-agent-manager.ts +85 -85
  65. package/src/runtime/live-control-realtime.ts +36 -36
  66. package/src/runtime/live-session-runtime.ts +305 -305
  67. package/src/runtime/overflow-recovery.ts +175 -156
  68. package/src/runtime/parallel-research.ts +44 -44
  69. package/src/runtime/pi-json-output.ts +111 -111
  70. package/src/runtime/policy-engine.ts +79 -79
  71. package/src/runtime/progress-event-coalescer.ts +43 -43
  72. package/src/runtime/recovery-recipes.ts +74 -74
  73. package/src/runtime/retry-executor.ts +64 -64
  74. package/src/runtime/role-permission.ts +39 -39
  75. package/src/runtime/session-resources.ts +25 -25
  76. package/src/runtime/session-snapshot.ts +59 -59
  77. package/src/runtime/session-usage.ts +79 -79
  78. package/src/runtime/sidechain-output.ts +29 -29
  79. package/src/runtime/stale-reconciler.ts +199 -179
  80. package/src/runtime/supervisor-contact.ts +59 -59
  81. package/src/runtime/task-display.ts +38 -38
  82. package/src/runtime/task-output-context.ts +127 -127
  83. package/src/runtime/task-runner/live-executor.ts +101 -101
  84. package/src/runtime/task-runner/progress.ts +119 -119
  85. package/src/runtime/task-runner/result-utils.ts +14 -14
  86. package/src/runtime/task-runner/state-helpers.ts +22 -22
  87. package/src/runtime/team-runner.ts +13 -4
  88. package/src/runtime/worker-heartbeat.ts +21 -21
  89. package/src/runtime/worker-startup.ts +57 -57
  90. package/src/state/state-store.ts +43 -0
  91. package/src/state/task-claims.ts +44 -44
  92. package/src/state/types.ts +2 -0
  93. package/src/state/usage.ts +29 -29
  94. package/src/subagents/async-entry.ts +1 -1
  95. package/src/subagents/index.ts +3 -3
  96. package/src/subagents/live/control.ts +1 -1
  97. package/src/subagents/live/manager.ts +1 -1
  98. package/src/subagents/live/realtime.ts +1 -1
  99. package/src/subagents/live/session-runtime.ts +1 -1
  100. package/src/subagents/manager.ts +1 -1
  101. package/src/subagents/spawn.ts +1 -1
  102. package/src/teams/team-serializer.ts +38 -38
  103. package/src/types/diff.d.ts +18 -18
  104. package/src/ui/crew-footer.ts +101 -101
  105. package/src/ui/crew-select-list.ts +111 -111
  106. package/src/ui/crew-widget.ts +5 -1
  107. package/src/ui/dashboard-panes/mailbox-pane.ts +2 -1
  108. package/src/ui/dashboard-panes/metrics-pane.ts +34 -34
  109. package/src/ui/dynamic-border.ts +25 -25
  110. package/src/ui/layout-primitives.ts +106 -106
  111. package/src/ui/loaders.ts +158 -158
  112. package/src/ui/powerbar-publisher.ts +1 -1
  113. package/src/ui/render-diff.ts +119 -119
  114. package/src/ui/render-scheduler.ts +143 -143
  115. package/src/ui/run-snapshot-cache.ts +56 -37
  116. package/src/ui/snapshot-types.ts +5 -0
  117. package/src/ui/spinner.ts +17 -17
  118. package/src/ui/status-colors.ts +58 -58
  119. package/src/ui/syntax-highlight.ts +116 -116
  120. package/src/utils/atomic-write.ts +33 -33
  121. package/src/utils/completion-dedupe.ts +63 -63
  122. package/src/utils/frontmatter.ts +68 -68
  123. package/src/utils/git.ts +262 -262
  124. package/src/utils/ids.ts +12 -12
  125. package/src/utils/names.ts +27 -27
  126. package/src/utils/redaction.ts +44 -44
  127. package/src/utils/safe-paths.ts +47 -47
  128. package/src/utils/sleep.ts +32 -32
  129. package/src/workflows/validate-workflow.ts +40 -40
  130. package/src/worktree/branch-freshness.ts +45 -45
  131. package/teams/default.team.md +12 -12
  132. package/teams/fast-fix.team.md +11 -11
  133. package/teams/implementation.team.md +18 -18
  134. package/teams/parallel-research.team.md +14 -14
  135. package/teams/research.team.md +11 -11
  136. package/teams/review.team.md +12 -12
  137. package/workflows/default.workflow.md +29 -29
  138. package/workflows/fast-fix.workflow.md +22 -22
  139. package/workflows/implementation.workflow.md +38 -38
  140. package/workflows/parallel-research.workflow.md +46 -46
  141. package/workflows/research.workflow.md +22 -22
  142. package/workflows/review.workflow.md +30 -30
@@ -1,106 +1,106 @@
1
- import type { CrewRuntimeConfig } from "../config/config.ts";
2
- import { writeArtifact } from "../state/artifact-store.ts";
3
- import { appendEvent } from "../state/event-log.ts";
4
- import { appendMailboxMessage, findMailboxMessageByRequestId, readDeliveryState } from "../state/mailbox.ts";
5
- import type { ArtifactDescriptor, TeamRunManifest, TeamTaskState } from "../state/types.ts";
6
- import { aggregateTaskOutputs } from "./task-output-context.ts";
7
-
8
- export type CrewGroupJoinMode = "off" | "group" | "smart";
9
-
10
- export interface CrewGroupJoinDelivery {
11
- batchId: string;
12
- mode: CrewGroupJoinMode;
13
- partial: boolean;
14
- taskIds: string[];
15
- completed: string[];
16
- failed: string[];
17
- skipped: string[];
18
- remaining: string[];
19
- artifact?: ArtifactDescriptor;
20
- messageId?: string;
21
- requestId?: string;
22
- ackRequired?: boolean;
23
- ackStatus?: "pending" | "acknowledged";
24
- }
25
-
26
- export function resolveGroupJoinMode(runtime?: CrewRuntimeConfig): CrewGroupJoinMode {
27
- return runtime?.groupJoin ?? "smart";
28
- }
29
-
30
- export function shouldGroupJoin(mode: CrewGroupJoinMode, batch: TeamTaskState[]): boolean {
31
- if (mode === "off") return false;
32
- if (mode === "group") return batch.length > 0;
33
- return batch.length > 1;
34
- }
35
-
36
- function batchIdFor(runId: string, taskIds: string[]): string {
37
- return `${runId}_${taskIds.join("+").replace(/[^a-zA-Z0-9_+-]/g, "_")}`;
38
- }
39
-
40
- function requestIdFor(runId: string, batchId: string, partial: boolean): string {
41
- return `${runId}:group-join:${partial ? "partial" : "completed"}:${batchId}`;
42
- }
43
-
44
- function statusList(tasks: TeamTaskState[], status: TeamTaskState["status"]): string[] {
45
- return tasks.filter((task) => task.status === status).map((task) => task.id);
46
- }
47
-
48
- export function deliverGroupJoin(input: {
49
- manifest: TeamRunManifest;
50
- mode: CrewGroupJoinMode;
51
- batch: TeamTaskState[];
52
- allTasks: TeamTaskState[];
53
- partial?: boolean;
54
- }): CrewGroupJoinDelivery | undefined {
55
- if (!shouldGroupJoin(input.mode, input.batch)) return undefined;
56
- const taskIds = input.batch.map((task) => task.id);
57
- const latest = taskIds.map((id) => input.allTasks.find((task) => task.id === id)).filter((task): task is TeamTaskState => Boolean(task));
58
- const completed = statusList(latest, "completed");
59
- const failed = statusList(latest, "failed");
60
- const skipped = statusList(latest, "skipped");
61
- const remaining = latest.filter((task) => task.status === "queued" || task.status === "running").map((task) => task.id);
62
- const partial = input.partial ?? remaining.length > 0;
63
- const batchId = batchIdFor(input.manifest.runId, taskIds);
64
- const summary = aggregateTaskOutputs(latest, input.manifest);
65
- const requestId = requestIdFor(input.manifest.runId, batchId, partial);
66
- const existingMailbox = findMailboxMessageByRequestId(input.manifest, requestId);
67
- const existingStatus = existingMailbox ? readDeliveryState(input.manifest).messages[existingMailbox.id] ?? existingMailbox.status : undefined;
68
- const delivery: CrewGroupJoinDelivery = { batchId, mode: input.mode, partial, taskIds, completed, failed, skipped, remaining, requestId, ackRequired: true, ackStatus: existingStatus === "acknowledged" ? "acknowledged" : "pending" };
69
- const content = `${JSON.stringify({ ...delivery, createdAt: new Date().toISOString() }, null, 2)}\n`;
70
- const artifact = writeArtifact(input.manifest.artifactsRoot, {
71
- kind: "metadata",
72
- relativePath: `metadata/group-joins/${batchId}.json`,
73
- producer: "group-join",
74
- content,
75
- });
76
- const mailbox = existingMailbox ?? appendMailboxMessage(input.manifest, {
77
- direction: "outbox",
78
- from: "group-join",
79
- to: "leader",
80
- body: [
81
- `Group join ${partial ? "partial" : "completed"}: ${taskIds.join(", ")}`,
82
- `Request: ${requestId}`,
83
- `Completed: ${completed.join(", ") || "none"}`,
84
- `Failed: ${failed.join(", ") || "none"}`,
85
- `Skipped: ${skipped.join(", ") || "none"}`,
86
- `Remaining: ${remaining.join(", ") || "none"}`,
87
- "",
88
- summary,
89
- ].join("\n"),
90
- status: "delivered",
91
- data: { kind: "group_join", requestId, batchId, partial, ackRequired: true, taskIds, completed, failed, skipped, remaining },
92
- });
93
- appendEvent(input.manifest.eventsPath, {
94
- type: partial ? "agent.group_join.partial" : "agent.group_join.completed",
95
- runId: input.manifest.runId,
96
- message: `Group join ${partial ? "partial" : "completed"} for ${taskIds.length} task(s).`,
97
- data: { ...delivery, artifactPath: artifact.path, messageId: mailbox.id, fallback: "mailbox-delivered", reused: Boolean(existingMailbox) },
98
- });
99
- if (existingMailbox) appendEvent(input.manifest.eventsPath, {
100
- type: "agent.group_join.delivery_reused",
101
- runId: input.manifest.runId,
102
- message: `Reused group join mailbox delivery for ${taskIds.length} task(s).`,
103
- data: { requestId, messageId: mailbox.id, batchId, partial },
104
- });
105
- return { ...delivery, artifact, messageId: mailbox.id };
106
- }
1
+ import type { CrewRuntimeConfig } from "../config/config.ts";
2
+ import { writeArtifact } from "../state/artifact-store.ts";
3
+ import { appendEvent } from "../state/event-log.ts";
4
+ import { appendMailboxMessage, findMailboxMessageByRequestId, readDeliveryState } from "../state/mailbox.ts";
5
+ import type { ArtifactDescriptor, TeamRunManifest, TeamTaskState } from "../state/types.ts";
6
+ import { aggregateTaskOutputs } from "./task-output-context.ts";
7
+
8
+ export type CrewGroupJoinMode = "off" | "group" | "smart";
9
+
10
+ export interface CrewGroupJoinDelivery {
11
+ batchId: string;
12
+ mode: CrewGroupJoinMode;
13
+ partial: boolean;
14
+ taskIds: string[];
15
+ completed: string[];
16
+ failed: string[];
17
+ skipped: string[];
18
+ remaining: string[];
19
+ artifact?: ArtifactDescriptor;
20
+ messageId?: string;
21
+ requestId?: string;
22
+ ackRequired?: boolean;
23
+ ackStatus?: "pending" | "acknowledged";
24
+ }
25
+
26
+ export function resolveGroupJoinMode(runtime?: CrewRuntimeConfig): CrewGroupJoinMode {
27
+ return runtime?.groupJoin ?? "smart";
28
+ }
29
+
30
+ export function shouldGroupJoin(mode: CrewGroupJoinMode, batch: TeamTaskState[]): boolean {
31
+ if (mode === "off") return false;
32
+ if (mode === "group") return batch.length > 0;
33
+ return batch.length > 1;
34
+ }
35
+
36
+ function batchIdFor(runId: string, taskIds: string[]): string {
37
+ return `${runId}_${taskIds.join("+").replace(/[^a-zA-Z0-9_+-]/g, "_")}`;
38
+ }
39
+
40
+ function requestIdFor(runId: string, batchId: string, partial: boolean): string {
41
+ return `${runId}:group-join:${partial ? "partial" : "completed"}:${batchId}`;
42
+ }
43
+
44
+ function statusList(tasks: TeamTaskState[], status: TeamTaskState["status"]): string[] {
45
+ return tasks.filter((task) => task.status === status).map((task) => task.id);
46
+ }
47
+
48
+ export function deliverGroupJoin(input: {
49
+ manifest: TeamRunManifest;
50
+ mode: CrewGroupJoinMode;
51
+ batch: TeamTaskState[];
52
+ allTasks: TeamTaskState[];
53
+ partial?: boolean;
54
+ }): CrewGroupJoinDelivery | undefined {
55
+ if (!shouldGroupJoin(input.mode, input.batch)) return undefined;
56
+ const taskIds = input.batch.map((task) => task.id);
57
+ const latest = taskIds.map((id) => input.allTasks.find((task) => task.id === id)).filter((task): task is TeamTaskState => Boolean(task));
58
+ const completed = statusList(latest, "completed");
59
+ const failed = statusList(latest, "failed");
60
+ const skipped = statusList(latest, "skipped");
61
+ const remaining = latest.filter((task) => task.status === "queued" || task.status === "running").map((task) => task.id);
62
+ const partial = input.partial ?? remaining.length > 0;
63
+ const batchId = batchIdFor(input.manifest.runId, taskIds);
64
+ const summary = aggregateTaskOutputs(latest, input.manifest);
65
+ const requestId = requestIdFor(input.manifest.runId, batchId, partial);
66
+ const existingMailbox = findMailboxMessageByRequestId(input.manifest, requestId);
67
+ const existingStatus = existingMailbox ? readDeliveryState(input.manifest).messages[existingMailbox.id] ?? existingMailbox.status : undefined;
68
+ const delivery: CrewGroupJoinDelivery = { batchId, mode: input.mode, partial, taskIds, completed, failed, skipped, remaining, requestId, ackRequired: true, ackStatus: existingStatus === "acknowledged" ? "acknowledged" : "pending" };
69
+ const content = `${JSON.stringify({ ...delivery, createdAt: new Date().toISOString() }, null, 2)}\n`;
70
+ const artifact = writeArtifact(input.manifest.artifactsRoot, {
71
+ kind: "metadata",
72
+ relativePath: `metadata/group-joins/${batchId}.json`,
73
+ producer: "group-join",
74
+ content,
75
+ });
76
+ const mailbox = existingMailbox ?? appendMailboxMessage(input.manifest, {
77
+ direction: "outbox",
78
+ from: "group-join",
79
+ to: "leader",
80
+ body: [
81
+ `Group join ${partial ? "partial" : "completed"}: ${taskIds.join(", ")}`,
82
+ `Request: ${requestId}`,
83
+ `Completed: ${completed.join(", ") || "none"}`,
84
+ `Failed: ${failed.join(", ") || "none"}`,
85
+ `Skipped: ${skipped.join(", ") || "none"}`,
86
+ `Remaining: ${remaining.join(", ") || "none"}`,
87
+ "",
88
+ summary,
89
+ ].join("\n"),
90
+ status: "delivered",
91
+ data: { kind: "group_join", requestId, batchId, partial, ackRequired: true, taskIds, completed, failed, skipped, remaining },
92
+ });
93
+ appendEvent(input.manifest.eventsPath, {
94
+ type: partial ? "agent.group_join.partial" : "agent.group_join.completed",
95
+ runId: input.manifest.runId,
96
+ message: `Group join ${partial ? "partial" : "completed"} for ${taskIds.length} task(s).`,
97
+ data: { ...delivery, artifactPath: artifact.path, messageId: mailbox.id, fallback: "mailbox-delivered", reused: Boolean(existingMailbox) },
98
+ });
99
+ if (existingMailbox) appendEvent(input.manifest.eventsPath, {
100
+ type: "agent.group_join.delivery_reused",
101
+ runId: input.manifest.runId,
102
+ message: `Reused group join mailbox delivery for ${taskIds.length} task(s).`,
103
+ data: { requestId, messageId: mailbox.id, batchId, partial },
104
+ });
105
+ return { ...delivery, artifact, messageId: mailbox.id };
106
+ }
@@ -1,28 +1,28 @@
1
- import type { WorkerHeartbeatState } from "./worker-heartbeat.ts";
2
-
3
- export type HeartbeatLevel = "healthy" | "warn" | "stale" | "dead";
4
-
5
- export interface GradientThresholds {
6
- warnMs: number;
7
- staleMs: number;
8
- deadMs: number;
9
- }
10
-
11
- export const DEFAULT_GRADIENT_THRESHOLDS: GradientThresholds = { warnMs: 30_000, staleMs: 60_000, deadMs: 300_000 };
12
-
13
- export function heartbeatAgeMs(heartbeat: WorkerHeartbeatState | undefined, now = Date.now()): number {
14
- if (!heartbeat) return Number.POSITIVE_INFINITY;
15
- const lastSeen = Date.parse(heartbeat.lastSeenAt);
16
- return Number.isFinite(lastSeen) ? Math.max(0, now - lastSeen) : Number.POSITIVE_INFINITY;
17
- }
18
-
19
- export function classifyHeartbeat(heartbeat: WorkerHeartbeatState | undefined, thresholds: GradientThresholds = DEFAULT_GRADIENT_THRESHOLDS, now = Date.now()): HeartbeatLevel {
20
- if (!heartbeat) return "dead";
21
- if (heartbeat.alive === false) return "dead";
22
- const elapsed = heartbeatAgeMs(heartbeat, now);
23
- if (!Number.isFinite(elapsed)) return "dead";
24
- if (elapsed > thresholds.deadMs) return "dead";
25
- if (elapsed > thresholds.staleMs) return "stale";
26
- if (elapsed > thresholds.warnMs) return "warn";
27
- return "healthy";
28
- }
1
+ import type { WorkerHeartbeatState } from "./worker-heartbeat.ts";
2
+
3
+ export type HeartbeatLevel = "healthy" | "warn" | "stale" | "dead";
4
+
5
+ export interface GradientThresholds {
6
+ warnMs: number;
7
+ staleMs: number;
8
+ deadMs: number;
9
+ }
10
+
11
+ export const DEFAULT_GRADIENT_THRESHOLDS: GradientThresholds = { warnMs: 30_000, staleMs: 60_000, deadMs: 300_000 };
12
+
13
+ export function heartbeatAgeMs(heartbeat: WorkerHeartbeatState | undefined, now = Date.now()): number {
14
+ if (!heartbeat) return Number.POSITIVE_INFINITY;
15
+ const lastSeen = Date.parse(heartbeat.lastSeenAt);
16
+ return Number.isFinite(lastSeen) ? Math.max(0, now - lastSeen) : Number.POSITIVE_INFINITY;
17
+ }
18
+
19
+ export function classifyHeartbeat(heartbeat: WorkerHeartbeatState | undefined, thresholds: GradientThresholds = DEFAULT_GRADIENT_THRESHOLDS, now = Date.now()): HeartbeatLevel {
20
+ if (!heartbeat) return "dead";
21
+ if (heartbeat.alive === false) return "dead";
22
+ const elapsed = heartbeatAgeMs(heartbeat, now);
23
+ if (!Number.isFinite(elapsed)) return "dead";
24
+ if (elapsed > thresholds.deadMs) return "dead";
25
+ if (elapsed > thresholds.staleMs) return "stale";
26
+ if (elapsed > thresholds.warnMs) return "warn";
27
+ return "healthy";
28
+ }
@@ -1,124 +1,124 @@
1
- import type { NotificationDescriptor } from "../extension/notification-router.ts";
2
- import type { MetricRegistry } from "../observability/metric-registry.ts";
3
- import { appendEvent } from "../state/event-log.ts";
4
- import { loadRunManifestById } from "../state/state-store.ts";
5
- import type { TeamRunManifest } from "../state/types.ts";
6
- import { logInternalError } from "../utils/internal-error.ts";
7
- import type { ManifestCache } from "./manifest-cache.ts";
8
- import { classifyHeartbeat, DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
9
-
10
- export interface HeartbeatWatcherRouter {
11
- enqueue(notification: NotificationDescriptor): boolean;
12
- }
13
-
14
- export interface HeartbeatWatcherOptions {
15
- cwd: string;
16
- pollIntervalMs?: number;
17
- thresholds?: GradientThresholds;
18
- manifestCache: ManifestCache;
19
- registry: MetricRegistry;
20
- router: HeartbeatWatcherRouter;
21
- deadletterTickThreshold?: number;
22
- onDead?: (runId: string, taskId: string, elapsed: number) => void;
23
- onDeadletterTrigger?: (manifest: TeamRunManifest, taskId: string) => void;
24
- }
25
-
26
- /**
27
- * Polls running runs for heartbeat staleness.
28
- *
29
- * Uses recursive setTimeout to avoid timer storms.
30
- * Cleanup is done in the same pass — no second scan over manifests.
31
- * Keys for runs that disappear from the cache are cleaned via staleness-age policy
32
- * rather than being leaked forever.
33
- */
34
- export class HeartbeatWatcher {
35
- private timer?: ReturnType<typeof setTimeout>;
36
- private lastLevel = new Map<string, HeartbeatLevel>();
37
- private consecutiveDead = new Map<string, number>();
38
- private lastSeen = new Map<string, number>(); // key → last time it was active
39
- /** Max age (ms) to retain a stale key before garbage-collecting it. */
40
- private readonly maxKeyAgeMs = 600_000; // 10 minutes
41
- private readonly opts: HeartbeatWatcherOptions;
42
-
43
- constructor(opts: HeartbeatWatcherOptions) {
44
- this.opts = opts;
45
- }
46
-
47
- start(): void {
48
- this.dispose();
49
- this.scheduleTick();
50
- }
51
-
52
- private scheduleTick(): void {
53
- this.timer = setTimeout(() => this.tick(), this.opts.pollIntervalMs ?? 5000);
54
- this.timer.unref();
55
- }
56
-
57
- tick(now = Date.now()): void {
58
- try {
59
- this.tickUnsafe(now);
60
- } catch (error) {
61
- logInternalError("heartbeat-watcher.tick", error);
62
- } finally {
63
- this.scheduleTick();
64
- }
65
- }
66
-
67
- private tickUnsafe(now: number): void {
68
- const thresholds = this.opts.thresholds ?? DEFAULT_GRADIENT_THRESHOLDS;
69
- const tickThreshold = this.opts.deadletterTickThreshold ?? 3;
70
- const activeKeys = new Set<string>();
71
-
72
- for (const run of this.opts.manifestCache.list(50)) {
73
- if (run.status !== "running") continue;
74
- const loaded = loadRunManifestById(this.opts.cwd, run.runId);
75
- if (!loaded) continue;
76
- for (const task of loaded.tasks) {
77
- if (task.status !== "running") continue;
78
- const key = `${run.runId}:${task.id}`;
79
- activeKeys.add(key);
80
- this.lastSeen.set(key, now);
81
-
82
- const elapsed = heartbeatAgeMs(task.heartbeat, now);
83
- const level = classifyHeartbeat(task.heartbeat, thresholds, now);
84
- this.opts.registry.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds").set({ runId: run.runId, taskId: task.id }, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
85
- this.opts.registry.counter("crew.heartbeat.level_total", "Heartbeat classifications by level").inc({ runId: run.runId, level });
86
- const previous = this.lastLevel.get(key);
87
- this.lastLevel.set(key, level);
88
- if (level === "dead" && previous !== "dead") {
89
- this.opts.registry.counter("crew.heartbeat.dead_total", "Dead heartbeat detections").inc({ runId: run.runId });
90
- appendEvent(loaded.manifest.eventsPath, { type: "crew.task.heartbeat_dead", runId: run.runId, taskId: task.id, message: `Task ${task.id} heartbeat dead.`, data: { elapsedMs: Number.isFinite(elapsed) ? elapsed : undefined } });
91
- this.opts.router.enqueue({ id: `dead_${run.runId}_${task.id}`, severity: "warning", source: "heartbeat-watcher", runId: run.runId, title: `Task ${task.id} heartbeat dead`, body: "Background watcher detected a stuck worker." });
92
- this.opts.onDead?.(run.runId, task.id, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
93
- }
94
- if (level === "dead") {
95
- const count = (this.consecutiveDead.get(key) ?? 0) + 1;
96
- this.consecutiveDead.set(key, count);
97
- if (count === tickThreshold) this.opts.onDeadletterTrigger?.(loaded.manifest, task.id);
98
- } else {
99
- this.consecutiveDead.delete(key);
100
- }
101
- }
102
- }
103
-
104
- // Cleanup: drop keys that were NOT in this tick's active set AND
105
- // haven't been seen for > maxKeyAgeMs. This covers runs that
106
- // completed or fell out of the manifest cache's top-50 window.
107
- const cutoff = now - this.maxKeyAgeMs;
108
- for (const [key, ts] of this.lastSeen) {
109
- if (!activeKeys.has(key) && ts < cutoff) {
110
- this.lastLevel.delete(key);
111
- this.consecutiveDead.delete(key);
112
- this.lastSeen.delete(key);
113
- }
114
- }
115
- }
116
-
117
- dispose(): void {
118
- if (this.timer) clearTimeout(this.timer);
119
- this.timer = undefined;
120
- this.lastLevel.clear();
121
- this.consecutiveDead.clear();
122
- this.lastSeen.clear();
123
- }
124
- }
1
+ import type { NotificationDescriptor } from "../extension/notification-router.ts";
2
+ import type { MetricRegistry } from "../observability/metric-registry.ts";
3
+ import { appendEvent } from "../state/event-log.ts";
4
+ import { loadRunManifestById } from "../state/state-store.ts";
5
+ import type { TeamRunManifest } from "../state/types.ts";
6
+ import { logInternalError } from "../utils/internal-error.ts";
7
+ import type { ManifestCache } from "./manifest-cache.ts";
8
+ import { classifyHeartbeat, DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
9
+
10
+ export interface HeartbeatWatcherRouter {
11
+ enqueue(notification: NotificationDescriptor): boolean;
12
+ }
13
+
14
+ export interface HeartbeatWatcherOptions {
15
+ cwd: string;
16
+ pollIntervalMs?: number;
17
+ thresholds?: GradientThresholds;
18
+ manifestCache: ManifestCache;
19
+ registry: MetricRegistry;
20
+ router: HeartbeatWatcherRouter;
21
+ deadletterTickThreshold?: number;
22
+ onDead?: (runId: string, taskId: string, elapsed: number) => void;
23
+ onDeadletterTrigger?: (manifest: TeamRunManifest, taskId: string) => void;
24
+ }
25
+
26
+ /**
27
+ * Polls running runs for heartbeat staleness.
28
+ *
29
+ * Uses recursive setTimeout to avoid timer storms.
30
+ * Cleanup is done in the same pass — no second scan over manifests.
31
+ * Keys for runs that disappear from the cache are cleaned via staleness-age policy
32
+ * rather than being leaked forever.
33
+ */
34
+ export class HeartbeatWatcher {
35
+ private timer?: ReturnType<typeof setTimeout>;
36
+ private lastLevel = new Map<string, HeartbeatLevel>();
37
+ private consecutiveDead = new Map<string, number>();
38
+ private lastSeen = new Map<string, number>(); // key → last time it was active
39
+ /** Max age (ms) to retain a stale key before garbage-collecting it. */
40
+ private readonly maxKeyAgeMs = 600_000; // 10 minutes
41
+ private readonly opts: HeartbeatWatcherOptions;
42
+
43
+ constructor(opts: HeartbeatWatcherOptions) {
44
+ this.opts = opts;
45
+ }
46
+
47
+ start(): void {
48
+ this.dispose();
49
+ this.scheduleTick();
50
+ }
51
+
52
+ private scheduleTick(): void {
53
+ this.timer = setTimeout(() => this.tick(), this.opts.pollIntervalMs ?? 5000);
54
+ this.timer.unref();
55
+ }
56
+
57
+ tick(now = Date.now()): void {
58
+ try {
59
+ this.tickUnsafe(now);
60
+ } catch (error) {
61
+ logInternalError("heartbeat-watcher.tick", error);
62
+ } finally {
63
+ this.scheduleTick();
64
+ }
65
+ }
66
+
67
+ private tickUnsafe(now: number): void {
68
+ const thresholds = this.opts.thresholds ?? DEFAULT_GRADIENT_THRESHOLDS;
69
+ const tickThreshold = this.opts.deadletterTickThreshold ?? 3;
70
+ const activeKeys = new Set<string>();
71
+
72
+ for (const run of this.opts.manifestCache.list(50)) {
73
+ if (run.status !== "running") continue;
74
+ const loaded = loadRunManifestById(this.opts.cwd, run.runId);
75
+ if (!loaded) continue;
76
+ for (const task of loaded.tasks) {
77
+ if (task.status !== "running") continue;
78
+ const key = `${run.runId}:${task.id}`;
79
+ activeKeys.add(key);
80
+ this.lastSeen.set(key, now);
81
+
82
+ const elapsed = heartbeatAgeMs(task.heartbeat, now);
83
+ const level = classifyHeartbeat(task.heartbeat, thresholds, now);
84
+ this.opts.registry.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds").set({ runId: run.runId, taskId: task.id }, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
85
+ this.opts.registry.counter("crew.heartbeat.level_total", "Heartbeat classifications by level").inc({ runId: run.runId, level });
86
+ const previous = this.lastLevel.get(key);
87
+ this.lastLevel.set(key, level);
88
+ if (level === "dead" && previous !== "dead") {
89
+ this.opts.registry.counter("crew.heartbeat.dead_total", "Dead heartbeat detections").inc({ runId: run.runId });
90
+ appendEvent(loaded.manifest.eventsPath, { type: "crew.task.heartbeat_dead", runId: run.runId, taskId: task.id, message: `Task ${task.id} heartbeat dead.`, data: { elapsedMs: Number.isFinite(elapsed) ? elapsed : undefined } });
91
+ this.opts.router.enqueue({ id: `dead_${run.runId}_${task.id}`, severity: "warning", source: "heartbeat-watcher", runId: run.runId, title: `Task ${task.id} heartbeat dead`, body: "Background watcher detected a stuck worker." });
92
+ this.opts.onDead?.(run.runId, task.id, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
93
+ }
94
+ if (level === "dead") {
95
+ const count = (this.consecutiveDead.get(key) ?? 0) + 1;
96
+ this.consecutiveDead.set(key, count);
97
+ if (count === tickThreshold) this.opts.onDeadletterTrigger?.(loaded.manifest, task.id);
98
+ } else {
99
+ this.consecutiveDead.delete(key);
100
+ }
101
+ }
102
+ }
103
+
104
+ // Cleanup: drop keys that were NOT in this tick's active set AND
105
+ // haven't been seen for > maxKeyAgeMs. This covers runs that
106
+ // completed or fell out of the manifest cache's top-50 window.
107
+ const cutoff = now - this.maxKeyAgeMs;
108
+ for (const [key, ts] of this.lastSeen) {
109
+ if (!activeKeys.has(key) && ts < cutoff) {
110
+ this.lastLevel.delete(key);
111
+ this.consecutiveDead.delete(key);
112
+ this.lastSeen.delete(key);
113
+ }
114
+ }
115
+ }
116
+
117
+ dispose(): void {
118
+ if (this.timer) clearTimeout(this.timer);
119
+ this.timer = undefined;
120
+ this.lastLevel.clear();
121
+ this.consecutiveDead.clear();
122
+ this.lastSeen.clear();
123
+ }
124
+ }