pi-crew 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +57 -32
- package/CHANGELOG.md +466 -413
- package/LICENSE +21 -21
- package/NOTICE.md +16 -16
- package/README.md +323 -323
- package/docs/FEATURE_INTAKE.md +126 -0
- package/docs/HARNESS.md +86 -0
- package/docs/HARNESS_BACKLOG.md +41 -0
- package/docs/TEST_MATRIX.md +49 -0
- package/docs/actions-reference.md +595 -595
- package/docs/architecture.md +180 -180
- package/docs/code-review-2026-05-11.md +592 -0
- package/docs/commands-reference.md +347 -347
- package/docs/comparison-pi-subagents-vs-pi-crew.md +303 -0
- package/docs/decisions/0001-durable-state.md +41 -0
- package/docs/decisions/0002-child-process-for-async.md +42 -0
- package/docs/decisions/0003-depth-guard.md +36 -0
- package/docs/decisions/0004-execfile-over-exec.md +34 -0
- package/docs/decisions/0005-no-parameter-properties.md +49 -0
- package/docs/decisions/0006-publish-bundled-esm.md +63 -0
- package/docs/decisions/0007-active-run-binary-index.md +54 -0
- package/docs/decisions/0008-child-pi-warm-pool.md +61 -0
- package/docs/decisions/README.md +23 -0
- package/docs/followup-plan-2026-05-12.md +463 -0
- package/docs/followup-review-2026-05-12.md +297 -0
- package/docs/followup-review-round3-2026-05-12.md +342 -0
- package/docs/followup-review-round4-2026-05-13.md +107 -0
- package/docs/implementation-plan-top3.md +333 -0
- package/docs/live-mailbox-runtime.md +36 -36
- package/docs/next-upgrade-roadmap.md +808 -808
- package/docs/oh-my-pi-research.md +509 -0
- package/docs/perf/baseline-2026-05.md +113 -0
- package/docs/perf/final-report-2026-05.md +206 -0
- package/docs/perf/sprint-1-report.md +71 -0
- package/docs/perf/sprint-2-report.md +81 -0
- package/docs/perf/sprint-2.5-report.md +53 -0
- package/docs/perf/sprint-3-report.md +36 -0
- package/docs/perf/sprint-4-report.md +47 -0
- package/docs/perf/sprint-5-report.md +51 -0
- package/docs/perf/sprint-6-report.md +94 -0
- package/docs/perf/sprint-7-report.md +74 -0
- package/docs/perf/upgrade-plan-2026-05.md +147 -0
- package/docs/pi-subagents3-deep-analysis.md +508 -0
- package/docs/product/README.md +31 -0
- package/docs/product/platform.md +27 -0
- package/docs/product/runtime-safety.md +37 -0
- package/docs/product/team-run.md +39 -0
- package/docs/product/team-tool.md +37 -0
- package/docs/publishing.md +65 -65
- package/docs/resource-formats.md +134 -134
- package/docs/runtime-analysis-child-vs-live.md +171 -0
- package/docs/runtime-flow.md +148 -148
- package/docs/runtime-migration-in-process-analysis.md +250 -0
- package/docs/stories/README.md +30 -0
- package/docs/stories/backlog.md +36 -0
- package/docs/templates/decision.md +27 -0
- package/docs/templates/story.md +44 -0
- package/docs/templates/validation-report.md +32 -0
- package/docs/usage.md +238 -238
- package/index.ts +7 -6
- package/install.mjs +65 -65
- package/package.json +107 -99
- package/schema.json +222 -222
- package/skills/child-pi-spawning/SKILL.md +213 -0
- package/skills/context-artifact-hygiene/SKILL.md +32 -0
- package/skills/event-log-tracing/SKILL.md +299 -0
- package/skills/git-master/SKILL.md +225 -24
- package/skills/live-agent-lifecycle/SKILL.md +192 -0
- package/skills/mailbox-interactive/SKILL.md +300 -19
- package/skills/model-routing-context/SKILL.md +94 -0
- package/skills/multi-perspective-review/SKILL.md +88 -0
- package/skills/read-only-explorer/SKILL.md +250 -26
- package/skills/safe-bash/SKILL.md +307 -21
- package/skills/verification-before-done/SKILL.md +11 -2
- package/skills/widget-rendering/SKILL.md +258 -0
- package/skills/workspace-isolation/SKILL.md +202 -0
- package/skills/worktree-isolation/SKILL.md +202 -18
- package/src/adapters/claude-adapter.ts +25 -25
- package/src/adapters/codex-adapter.ts +21 -21
- package/src/adapters/cursor-adapter.ts +17 -17
- package/src/adapters/export-util.ts +137 -137
- package/src/adapters/index.ts +15 -15
- package/src/adapters/registry.ts +18 -18
- package/src/adapters/types.ts +23 -23
- package/src/agents/agent-config.ts +38 -38
- package/src/agents/agent-serializer.ts +38 -38
- package/src/agents/discover-agents.ts +121 -118
- package/src/config/config.ts +740 -858
- package/src/config/defaults.ts +96 -96
- package/src/config/drift-detector.ts +211 -211
- package/src/config/markers.ts +327 -327
- package/src/config/resilient-parser.ts +109 -108
- package/src/config/suggestions.ts +74 -74
- package/src/config/types.ts +199 -0
- package/src/extension/async-notifier.ts +123 -89
- package/src/extension/autonomous-policy.ts +169 -169
- package/src/extension/cross-extension-rpc.ts +104 -103
- package/src/extension/help.ts +47 -47
- package/src/extension/import-index.ts +69 -69
- package/src/extension/management.ts +395 -382
- package/src/extension/notification-router.ts +116 -116
- package/src/extension/notification-sink.ts +51 -51
- package/src/extension/project-init.ts +168 -168
- package/src/extension/register.ts +859 -668
- package/src/extension/registration/artifact-cleanup.ts +15 -15
- package/src/extension/registration/command-utils.ts +54 -54
- package/src/extension/registration/commands.ts +559 -452
- package/src/extension/registration/compaction-guard.ts +125 -125
- package/src/extension/registration/subagent-helpers.ts +102 -102
- package/src/extension/registration/subagent-tools.ts +220 -158
- package/src/extension/registration/team-tool.ts +159 -98
- package/src/extension/registration/viewers.ts +29 -0
- package/src/extension/result-watcher.ts +128 -128
- package/src/extension/run-bundle-schema.ts +89 -89
- package/src/extension/run-export.ts +73 -73
- package/src/extension/run-import.ts +84 -84
- package/src/extension/run-index.ts +94 -94
- package/src/extension/run-maintenance.ts +142 -142
- package/src/extension/session-summary.ts +8 -8
- package/src/extension/team-manager-command.ts +96 -95
- package/src/extension/team-recommendation.ts +188 -188
- package/src/extension/team-tool/api.ts +5 -2
- package/src/extension/team-tool/cancel.ts +224 -209
- package/src/extension/team-tool/config-patch.ts +36 -36
- package/src/extension/team-tool/context.ts +60 -60
- package/src/extension/team-tool/doctor.ts +242 -242
- package/src/extension/team-tool/handle-settings.ts +421 -195
- package/src/extension/team-tool/inspect.ts +41 -41
- package/src/extension/team-tool/lifecycle-actions.ts +139 -139
- package/src/extension/team-tool/parallel-dispatch.ts +156 -156
- package/src/extension/team-tool/plan.ts +19 -19
- package/src/extension/team-tool/respond.ts +112 -111
- package/src/extension/team-tool/run.ts +246 -228
- package/src/extension/team-tool/status.ts +110 -110
- package/src/extension/team-tool-types.ts +13 -13
- package/src/extension/team-tool.ts +16 -4
- package/src/extension/tool-result.ts +16 -16
- package/src/extension/validate-resources.ts +77 -77
- package/src/hooks/registry.ts +61 -61
- package/src/hooks/types.ts +40 -40
- package/src/i18n.ts +184 -184
- package/src/observability/correlation.ts +35 -35
- package/src/observability/event-to-metric.ts +68 -68
- package/src/observability/exporters/adapter.ts +30 -30
- package/src/observability/exporters/otlp-exporter.ts +106 -92
- package/src/observability/exporters/prometheus-exporter.ts +54 -54
- package/src/observability/metric-registry.ts +87 -87
- package/src/observability/metric-retention.ts +54 -54
- package/src/observability/metric-sink.ts +81 -56
- package/src/observability/metrics-primitives.ts +167 -167
- package/src/prompt/prompt-runtime.ts +72 -72
- package/src/runtime/adaptive-plan.ts +338 -0
- package/src/runtime/agent-control.ts +169 -169
- package/src/runtime/agent-memory.ts +72 -72
- package/src/runtime/agent-observability.ts +114 -114
- package/src/runtime/async-marker.ts +26 -26
- package/src/runtime/async-runner.ts +153 -79
- package/src/runtime/attention-events.ts +28 -28
- package/src/runtime/auto-resume.ts +100 -100
- package/src/runtime/background-runner.ts +122 -88
- package/src/runtime/cancellation.ts +61 -61
- package/src/runtime/capability-inventory.ts +116 -116
- package/src/runtime/child-pi-pool.ts +68 -0
- package/src/runtime/child-pi.ts +541 -463
- package/src/runtime/code-summary.ts +247 -247
- package/src/runtime/compaction-summary.ts +271 -271
- package/src/runtime/concurrency.ts +58 -58
- package/src/runtime/crash-recovery.ts +317 -301
- package/src/runtime/crew-agent-records.ts +379 -281
- package/src/runtime/crew-agent-runtime.ts +60 -60
- package/src/runtime/cross-extension-rpc.ts +72 -0
- package/src/runtime/custom-tools/irc-tool.ts +201 -201
- package/src/runtime/custom-tools/submit-result-tool.ts +90 -90
- package/src/runtime/deadletter.ts +47 -47
- package/src/runtime/delivery-coordinator.ts +176 -176
- package/src/runtime/delta-conflict.ts +360 -360
- package/src/runtime/diagnostic-export.ts +102 -102
- package/src/runtime/direct-run.ts +35 -35
- package/src/runtime/effectiveness.ts +82 -81
- package/src/runtime/errors/crew-errors.ts +166 -0
- package/src/runtime/event-stream-bridge.ts +92 -92
- package/src/runtime/foreground-control.ts +82 -82
- package/src/runtime/green-contract.ts +46 -46
- package/src/runtime/group-join.ts +234 -106
- package/src/runtime/heartbeat-watcher.ts +145 -124
- package/src/runtime/iteration-hooks.ts +267 -264
- package/src/runtime/live-agent-control.ts +88 -88
- package/src/runtime/live-agent-manager.ts +377 -179
- package/src/runtime/live-control-realtime.ts +36 -36
- package/src/runtime/live-session-runtime.ts +676 -599
- package/src/runtime/loop-gates.ts +129 -129
- package/src/runtime/manifest-cache.ts +263 -263
- package/src/runtime/mcp-proxy.ts +113 -113
- package/src/runtime/metric-parser.ts +40 -40
- package/src/runtime/model-fallback.ts +282 -274
- package/src/runtime/model-resolver.ts +118 -0
- package/src/runtime/output-validator.ts +187 -187
- package/src/runtime/overflow-recovery.ts +175 -175
- package/src/runtime/parallel-research.ts +44 -44
- package/src/runtime/parallel-utils.ts +156 -156
- package/src/runtime/parent-guard.ts +80 -80
- package/src/runtime/phase-progress.ts +217 -217
- package/src/runtime/pi-args.ts +165 -165
- package/src/runtime/pi-json-output.ts +111 -111
- package/src/runtime/pi-spawn.ts +167 -167
- package/src/runtime/policy-engine.ts +79 -79
- package/src/runtime/post-checks.ts +125 -122
- package/src/runtime/post-exit-stdio-guard.ts +86 -86
- package/src/runtime/process-status.ts +97 -73
- package/src/runtime/progress-event-coalescer.ts +43 -43
- package/src/runtime/recovery-recipes.ts +74 -74
- package/src/runtime/retry-executor.ts +81 -81
- package/src/runtime/role-permission.ts +39 -39
- package/src/runtime/run-tracker.ts +99 -0
- package/src/runtime/runtime-policy.ts +21 -0
- package/src/runtime/runtime-resolver.ts +94 -90
- package/src/runtime/scheduler.ts +294 -0
- package/src/runtime/semaphore.ts +131 -131
- package/src/runtime/sensitive-paths.ts +92 -92
- package/src/runtime/session-usage.ts +79 -79
- package/src/runtime/settings-store.ts +103 -0
- package/src/runtime/sidechain-output.ts +29 -29
- package/src/runtime/skill-instructions.ts +222 -222
- package/src/runtime/stale-reconciler.ts +198 -189
- package/src/runtime/streaming-output.ts +47 -0
- package/src/runtime/subagent-manager.ts +404 -395
- package/src/runtime/subprocess-tool-registry.ts +67 -67
- package/src/runtime/task-display.ts +38 -38
- package/src/runtime/task-graph-scheduler.ts +122 -122
- package/src/runtime/task-graph.ts +207 -207
- package/src/runtime/task-output-context.ts +177 -177
- package/src/runtime/task-packet.ts +93 -93
- package/src/runtime/task-quality.ts +207 -207
- package/src/runtime/task-runner/capabilities.ts +78 -78
- package/src/runtime/task-runner/live-executor.ts +131 -113
- package/src/runtime/task-runner/progress.ts +119 -119
- package/src/runtime/task-runner/prompt-builder.ts +139 -139
- package/src/runtime/task-runner/prompt-pipeline.ts +64 -64
- package/src/runtime/task-runner/result-utils.ts +14 -14
- package/src/runtime/task-runner/run-projection.ts +103 -103
- package/src/runtime/task-runner/state-helpers.ts +22 -22
- package/src/runtime/task-runner.ts +469 -458
- package/src/runtime/team-runner.ts +693 -945
- package/src/runtime/usage-tracker.ts +71 -0
- package/src/runtime/worker-heartbeat.ts +21 -21
- package/src/runtime/worker-startup.ts +57 -57
- package/src/runtime/workflow-state.ts +187 -187
- package/src/runtime/yield-handler.ts +190 -189
- package/src/schema/config-schema.ts +172 -168
- package/src/schema/team-tool-schema.ts +126 -125
- package/src/schema/validation-types.ts +151 -148
- package/src/skills/discover-skills.ts +67 -67
- package/src/skills/skill-templates.ts +374 -374
- package/src/state/active-run-registry.ts +227 -191
- package/src/state/artifact-store.ts +130 -129
- package/src/state/atomic-write.ts +262 -178
- package/src/state/blob-store.ts +116 -116
- package/src/state/contracts.ts +111 -111
- package/src/state/event-log-rotation.ts +161 -158
- package/src/state/event-log.ts +383 -240
- package/src/state/event-reconstructor.ts +217 -217
- package/src/state/jsonl-writer.ts +82 -82
- package/src/state/locks.ts +146 -148
- package/src/state/mailbox.ts +446 -405
- package/src/state/state-store.ts +364 -351
- package/src/state/task-claims.ts +44 -44
- package/src/state/types.ts +285 -285
- package/src/state/usage.ts +29 -29
- package/src/subagents/async-entry.ts +1 -1
- package/src/subagents/index.ts +3 -3
- package/src/subagents/live/control.ts +1 -1
- package/src/subagents/live/manager.ts +1 -1
- package/src/subagents/live/realtime.ts +1 -1
- package/src/subagents/live/session-runtime.ts +1 -1
- package/src/subagents/manager.ts +1 -1
- package/src/subagents/spawn.ts +1 -1
- package/src/teams/discover-teams.ts +116 -116
- package/src/teams/team-config.ts +27 -27
- package/src/teams/team-serializer.ts +38 -38
- package/src/types/diff.d.ts +18 -18
- package/src/ui/agent-management-overlay.ts +144 -144
- package/src/ui/crew-widget.ts +487 -370
- package/src/ui/dashboard-panes/agents-pane.ts +109 -28
- package/src/ui/dashboard-panes/cancellation-pane.ts +42 -42
- package/src/ui/dashboard-panes/capability-pane.ts +59 -59
- package/src/ui/dashboard-panes/health-pane.ts +30 -30
- package/src/ui/dashboard-panes/mailbox-pane.ts +35 -35
- package/src/ui/dashboard-panes/progress-pane.ts +30 -30
- package/src/ui/dashboard-panes/transcript-pane.ts +10 -10
- package/src/ui/heartbeat-aggregator.ts +63 -63
- package/src/ui/keybinding-map.ts +97 -94
- package/src/ui/live-conversation-overlay.ts +152 -0
- package/src/ui/live-run-sidebar.ts +180 -180
- package/src/ui/mascot.ts +442 -442
- package/src/ui/overlays/agent-picker-overlay.ts +57 -57
- package/src/ui/overlays/confirm-overlay.ts +58 -58
- package/src/ui/overlays/mailbox-compose-overlay.ts +144 -144
- package/src/ui/overlays/mailbox-compose-preview.ts +63 -63
- package/src/ui/overlays/mailbox-detail-overlay.ts +122 -122
- package/src/ui/pi-ui-compat.ts +57 -57
- package/src/ui/powerbar-publisher.ts +221 -197
- package/src/ui/render-scheduler.ts +216 -143
- package/src/ui/run-action-dispatcher.ts +118 -117
- package/src/ui/run-dashboard.ts +526 -464
- package/src/ui/run-event-bus.ts +208 -208
- package/src/ui/run-snapshot-cache.ts +826 -777
- package/src/ui/settings-overlay.ts +721 -0
- package/src/ui/snapshot-types.ts +86 -70
- package/src/ui/theme-adapter.ts +190 -190
- package/src/ui/tool-progress-formatter.ts +89 -0
- package/src/ui/transcript-cache.ts +94 -94
- package/src/ui/transcript-viewer.ts +335 -335
- package/src/utils/conflict-detect.ts +662 -0
- package/src/utils/env-filter.ts +30 -0
- package/src/utils/file-coalescer.ts +86 -86
- package/src/utils/frontmatter.ts +68 -68
- package/src/utils/fs-watch.ts +88 -31
- package/src/utils/gh-protocol.ts +479 -0
- package/src/utils/ids.ts +17 -17
- package/src/utils/incremental-reader.ts +104 -104
- package/src/utils/internal-error.ts +6 -6
- package/src/utils/names.ts +27 -27
- package/src/utils/paths.ts +102 -63
- package/src/utils/redaction.ts +44 -44
- package/src/utils/resolve-shell.ts +34 -0
- package/src/utils/safe-paths.ts +47 -47
- package/src/utils/scan-cache.ts +136 -136
- package/src/utils/sleep.ts +2 -1
- package/src/utils/sse-parser.ts +134 -134
- package/src/utils/task-name-generator.ts +337 -337
- package/src/utils/timings.ts +33 -33
- package/src/utils/visual.ts +243 -198
- package/src/workflows/discover-workflows.ts +139 -139
- package/src/workflows/validate-workflow.ts +40 -40
- package/src/workflows/workflow-config.ts +26 -26
- package/src/workflows/workflow-serializer.ts +32 -32
- package/src/worktree/branch-freshness.ts +45 -45
- package/src/worktree/cleanup.ts +75 -72
- package/src/worktree/worktree-manager.ts +188 -146
- package/teams/default.team.md +12 -12
- package/teams/fast-fix.team.md +11 -11
- package/teams/implementation.team.md +18 -18
- package/teams/parallel-research.team.md +14 -14
- package/teams/research.team.md +11 -11
- package/teams/review.team.md +12 -12
- package/tsconfig.json +19 -19
- package/workflows/default.workflow.md +30 -30
- package/workflows/fast-fix.workflow.md +23 -23
- package/workflows/implementation.workflow.md +43 -43
- package/workflows/parallel-research.workflow.md +46 -46
- package/workflows/research.workflow.md +22 -22
- package/workflows/review.workflow.md +30 -30
- package/skills/task-packet/SKILL.md +0 -28
- package/skills/verify-evidence/SKILL.md +0 -27
|
@@ -1,106 +1,234 @@
|
|
|
1
|
-
import type { CrewRuntimeConfig } from "../config/config.ts";
|
|
2
|
-
import { writeArtifact } from "../state/artifact-store.ts";
|
|
3
|
-
import { appendEvent } from "../state/event-log.ts";
|
|
4
|
-
import { appendMailboxMessage, findMailboxMessageByRequestId, readDeliveryState } from "../state/mailbox.ts";
|
|
5
|
-
import type { ArtifactDescriptor, TeamRunManifest, TeamTaskState } from "../state/types.ts";
|
|
6
|
-
import { aggregateTaskOutputs } from "./task-output-context.ts";
|
|
7
|
-
|
|
8
|
-
export type CrewGroupJoinMode = "off" | "group" | "smart";
|
|
9
|
-
|
|
10
|
-
export interface CrewGroupJoinDelivery {
|
|
11
|
-
batchId: string;
|
|
12
|
-
mode: CrewGroupJoinMode;
|
|
13
|
-
partial: boolean;
|
|
14
|
-
taskIds: string[];
|
|
15
|
-
completed: string[];
|
|
16
|
-
failed: string[];
|
|
17
|
-
skipped: string[];
|
|
18
|
-
remaining: string[];
|
|
19
|
-
artifact?: ArtifactDescriptor;
|
|
20
|
-
messageId?: string;
|
|
21
|
-
requestId?: string;
|
|
22
|
-
ackRequired?: boolean;
|
|
23
|
-
ackStatus?: "pending" | "acknowledged";
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
export function resolveGroupJoinMode(runtime?: CrewRuntimeConfig): CrewGroupJoinMode {
|
|
27
|
-
return runtime?.groupJoin ?? "smart";
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
export function shouldGroupJoin(mode: CrewGroupJoinMode, batch: TeamTaskState[]): boolean {
|
|
31
|
-
if (mode === "off") return false;
|
|
32
|
-
if (mode === "group") return batch.length > 0;
|
|
33
|
-
return batch.length > 1;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
function batchIdFor(runId: string, taskIds: string[]): string {
|
|
37
|
-
return `${runId}_${taskIds.join("+").replace(/[^a-zA-Z0-9_+-]/g, "_")}`;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
function requestIdFor(runId: string, batchId: string, partial: boolean): string {
|
|
41
|
-
return `${runId}:group-join:${partial ? "partial" : "completed"}:${batchId}`;
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
function statusList(tasks: TeamTaskState[], status: TeamTaskState["status"]): string[] {
|
|
45
|
-
return tasks.filter((task) => task.status === status).map((task) => task.id);
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
export function deliverGroupJoin(input: {
|
|
49
|
-
manifest: TeamRunManifest;
|
|
50
|
-
mode: CrewGroupJoinMode;
|
|
51
|
-
batch: TeamTaskState[];
|
|
52
|
-
allTasks: TeamTaskState[];
|
|
53
|
-
partial?: boolean;
|
|
54
|
-
}): CrewGroupJoinDelivery | undefined {
|
|
55
|
-
if (!shouldGroupJoin(input.mode, input.batch)) return undefined;
|
|
56
|
-
const taskIds = input.batch.map((task) => task.id);
|
|
57
|
-
const latest = taskIds.map((id) => input.allTasks.find((task) => task.id === id)).filter((task): task is TeamTaskState => Boolean(task));
|
|
58
|
-
const completed = statusList(latest, "completed");
|
|
59
|
-
const failed = statusList(latest, "failed");
|
|
60
|
-
const skipped = statusList(latest, "skipped");
|
|
61
|
-
const remaining = latest.filter((task) => task.status === "queued" || task.status === "running").map((task) => task.id);
|
|
62
|
-
const partial = input.partial ?? remaining.length > 0;
|
|
63
|
-
const batchId = batchIdFor(input.manifest.runId, taskIds);
|
|
64
|
-
const summary = aggregateTaskOutputs(latest, input.manifest);
|
|
65
|
-
const requestId = requestIdFor(input.manifest.runId, batchId, partial);
|
|
66
|
-
const existingMailbox = findMailboxMessageByRequestId(input.manifest, requestId);
|
|
67
|
-
const existingStatus = existingMailbox ? readDeliveryState(input.manifest).messages[existingMailbox.id] ?? existingMailbox.status : undefined;
|
|
68
|
-
const delivery: CrewGroupJoinDelivery = { batchId, mode: input.mode, partial, taskIds, completed, failed, skipped, remaining, requestId, ackRequired: true, ackStatus: existingStatus === "acknowledged" ? "acknowledged" : "pending" };
|
|
69
|
-
const content = `${JSON.stringify({ ...delivery, createdAt: new Date().toISOString() }, null, 2)}\n`;
|
|
70
|
-
const artifact = writeArtifact(input.manifest.artifactsRoot, {
|
|
71
|
-
kind: "metadata",
|
|
72
|
-
relativePath: `metadata/group-joins/${batchId}.json`,
|
|
73
|
-
producer: "group-join",
|
|
74
|
-
content,
|
|
75
|
-
});
|
|
76
|
-
const mailbox = existingMailbox ?? appendMailboxMessage(input.manifest, {
|
|
77
|
-
direction: "outbox",
|
|
78
|
-
from: "group-join",
|
|
79
|
-
to: "leader",
|
|
80
|
-
body: [
|
|
81
|
-
`Group join ${partial ? "partial" : "completed"}: ${taskIds.join(", ")}`,
|
|
82
|
-
`Request: ${requestId}`,
|
|
83
|
-
`Completed: ${completed.join(", ") || "none"}`,
|
|
84
|
-
`Failed: ${failed.join(", ") || "none"}`,
|
|
85
|
-
`Skipped: ${skipped.join(", ") || "none"}`,
|
|
86
|
-
`Remaining: ${remaining.join(", ") || "none"}`,
|
|
87
|
-
"",
|
|
88
|
-
summary,
|
|
89
|
-
].join("\n"),
|
|
90
|
-
status: "delivered",
|
|
91
|
-
data: { kind: "group_join", requestId, batchId, partial, ackRequired: true, taskIds, completed, failed, skipped, remaining },
|
|
92
|
-
});
|
|
93
|
-
appendEvent(input.manifest.eventsPath, {
|
|
94
|
-
type: partial ? "agent.group_join.partial" : "agent.group_join.completed",
|
|
95
|
-
runId: input.manifest.runId,
|
|
96
|
-
message: `Group join ${partial ? "partial" : "completed"} for ${taskIds.length} task(s).`,
|
|
97
|
-
data: { ...delivery, artifactPath: artifact.path, messageId: mailbox.id, fallback: "mailbox-delivered", reused: Boolean(existingMailbox) },
|
|
98
|
-
});
|
|
99
|
-
if (existingMailbox) appendEvent(input.manifest.eventsPath, {
|
|
100
|
-
type: "agent.group_join.delivery_reused",
|
|
101
|
-
runId: input.manifest.runId,
|
|
102
|
-
message: `Reused group join mailbox delivery for ${taskIds.length} task(s).`,
|
|
103
|
-
data: { requestId, messageId: mailbox.id, batchId, partial },
|
|
104
|
-
});
|
|
105
|
-
return { ...delivery, artifact, messageId: mailbox.id };
|
|
106
|
-
}
|
|
1
|
+
import type { CrewRuntimeConfig } from "../config/config.ts";
|
|
2
|
+
import { writeArtifact } from "../state/artifact-store.ts";
|
|
3
|
+
import { appendEvent } from "../state/event-log.ts";
|
|
4
|
+
import { appendMailboxMessage, findMailboxMessageByRequestId, readDeliveryState } from "../state/mailbox.ts";
|
|
5
|
+
import type { ArtifactDescriptor, TeamRunManifest, TeamTaskState } from "../state/types.ts";
|
|
6
|
+
import { aggregateTaskOutputs } from "./task-output-context.ts";
|
|
7
|
+
|
|
8
|
+
/**
 * Group-join policy for a batch of tasks.
 * As interpreted by shouldGroupJoin: "off" never joins, "group" joins any
 * non-empty batch, and "smart" joins only batches with more than one task.
 * resolveGroupJoinMode falls back to "smart" when nothing is configured.
 */
export type CrewGroupJoinMode = "off" | "group" | "smart";
|
|
9
|
+
|
|
10
|
+
/**
 * Outcome of a single group-join delivery, as assembled and returned by
 * deliverGroupJoin.
 */
export interface CrewGroupJoinDelivery {
	/** Identifier built from the run id and the batch's task ids. */
	batchId: string;
	/** Join mode the delivery was produced under. */
	mode: CrewGroupJoinMode;
	/** True when the delivery was made while some tasks were still outstanding (or the caller forced it). */
	partial: boolean;
	/** Ids of every task in the batch, in batch order. */
	taskIds: string[];
	/** Ids of batch tasks whose latest status is "completed". */
	completed: string[];
	/** Ids of batch tasks whose latest status is "failed". */
	failed: string[];
	/** Ids of batch tasks whose latest status is "skipped". */
	skipped: string[];
	/** Ids of batch tasks whose latest status is "queued" or "running". */
	remaining: string[];
	/** Descriptor of the metadata artifact written for this delivery. */
	artifact?: ArtifactDescriptor;
	/** Id of the mailbox message carrying the delivery (new or reused). */
	messageId?: string;
	/** Request id used to look up a previously sent mailbox message. */
	requestId?: string;
	/** Whether an acknowledgement is expected; deliverGroupJoin always sets this to true. */
	ackRequired?: boolean;
	/** "acknowledged" only when a previously sent message for the same request already has that status. */
	ackStatus?: "pending" | "acknowledged";
}
|
|
25
|
+
|
|
26
|
+
/**
 * Reads the group-join mode from the runtime configuration.
 *
 * @param runtime Optional runtime configuration.
 * @returns The configured `groupJoin` value, or "smart" when the config or the field is null/undefined.
 */
export function resolveGroupJoinMode(runtime?: CrewRuntimeConfig): CrewGroupJoinMode {
	const configured = runtime?.groupJoin;
	// `== null` matches both null and undefined, mirroring nullish coalescing.
	return configured == null ? "smart" : configured;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Decides whether a batch should be delivered as a group join.
 *
 * @param mode Active group-join mode.
 * @param batch Tasks that make up the batch.
 * @returns false for "off"; for "group", true when the batch is non-empty;
 *          for any other mode ("smart"), true when the batch has more than one task.
 */
export function shouldGroupJoin(mode: CrewGroupJoinMode, batch: TeamTaskState[]): boolean {
	switch (mode) {
		case "off":
			return false;
		case "group":
			return batch.length > 0;
		default:
			return batch.length > 1;
	}
}
|
|
35
|
+
|
|
36
|
+
/**
 * Builds the batch identifier for a set of tasks.
 *
 * The task ids are joined with "+", and every character outside
 * `[a-zA-Z0-9_+-]` in that joined part is replaced by "_". The run id
 * prefix is used as given.
 */
function batchIdFor(runId: string, taskIds: string[]): string {
	const joined = taskIds.join("+");
	const sanitized = joined.replace(/[^a-zA-Z0-9_+-]/g, "_");
	return `${runId}_${sanitized}`;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Builds the mailbox request id for a group-join delivery:
 * `<runId>:group-join:<partial|completed>:<batchId>`.
 */
function requestIdFor(runId: string, batchId: string, partial: boolean): string {
	const phase = partial ? "partial" : "completed";
	return `${runId}:group-join:${phase}:${batchId}`;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Collects the ids of the tasks that currently have the given status,
 * preserving the order of `tasks`.
 */
function statusList(tasks: TeamTaskState[], status: TeamTaskState["status"]): string[] {
	const ids: string[] = [];
	tasks.forEach((task) => {
		if (task.status === status) ids.push(task.id);
	});
	return ids;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Publishes a group-join summary for a batch of tasks.
 *
 * Steps, in order:
 *  1. Return undefined when shouldGroupJoin rejects the mode/batch.
 *  2. Resolve each batch task to its entry in `allTasks` and bucket the ids by status.
 *  3. Write a metadata artifact at `metadata/group-joins/<batchId>.json`.
 *  4. Reuse the mailbox message already recorded for the request id, or append a new one.
 *  5. Append a partial/completed event, plus a "delivery_reused" event when a message was reused.
 *
 * @param input.manifest Run manifest supplying the run id, artifacts root and events path.
 * @param input.mode Group-join mode to apply.
 * @param input.batch Tasks forming the batch; only their ids are used.
 * @param input.allTasks Task list the batch ids are looked up in.
 * @param input.partial Optional override; defaults to "some task is still queued or running".
 * @returns The delivery record including the artifact and message id, or undefined when no join applies.
 */
export function deliverGroupJoin(input: {
	manifest: TeamRunManifest;
	mode: CrewGroupJoinMode;
	batch: TeamTaskState[];
	allTasks: TeamTaskState[];
	partial?: boolean;
}): CrewGroupJoinDelivery | undefined {
	const { manifest, mode, batch, allTasks } = input;
	if (!shouldGroupJoin(mode, batch)) return undefined;

	// Resolve batch members against allTasks; ids with no match are dropped.
	const taskIds = batch.map((task) => task.id);
	const lookedUp = taskIds.map((id) => allTasks.find((task) => task.id === id));
	const latest = lookedUp.filter((task): task is TeamTaskState => Boolean(task));

	const completed = statusList(latest, "completed");
	const failed = statusList(latest, "failed");
	const skipped = statusList(latest, "skipped");
	const stillPending = (task: TeamTaskState): boolean => task.status === "queued" || task.status === "running";
	const remaining = latest.filter(stillPending).map((task) => task.id);

	const partial = input.partial ?? remaining.length > 0;
	const label = partial ? "partial" : "completed";
	const batchId = batchIdFor(manifest.runId, taskIds);
	const summary = aggregateTaskOutputs(latest, manifest);
	const requestId = requestIdFor(manifest.runId, batchId, partial);

	// A message with the same request id means this delivery was already sent once.
	const existingMailbox = findMailboxMessageByRequestId(manifest, requestId);
	let existingStatus;
	if (existingMailbox) {
		existingStatus = readDeliveryState(manifest).messages[existingMailbox.id] ?? existingMailbox.status;
	}

	const delivery: CrewGroupJoinDelivery = {
		batchId,
		mode,
		partial,
		taskIds,
		completed,
		failed,
		skipped,
		remaining,
		requestId,
		ackRequired: true,
		ackStatus: existingStatus === "acknowledged" ? "acknowledged" : "pending",
	};

	// The artifact is (re)written on every call, including when the mailbox message is reused.
	const stamped = { ...delivery, createdAt: new Date().toISOString() };
	const content = `${JSON.stringify(stamped, null, 2)}\n`;
	const artifact = writeArtifact(manifest.artifactsRoot, {
		kind: "metadata",
		relativePath: `metadata/group-joins/${batchId}.json`,
		producer: "group-join",
		content,
	});

	const mailbox =
		existingMailbox ??
		appendMailboxMessage(manifest, {
			direction: "outbox",
			from: "group-join",
			to: "leader",
			body: [
				`Group join ${label}: ${taskIds.join(", ")}`,
				`Request: ${requestId}`,
				`Completed: ${completed.join(", ") || "none"}`,
				`Failed: ${failed.join(", ") || "none"}`,
				`Skipped: ${skipped.join(", ") || "none"}`,
				`Remaining: ${remaining.join(", ") || "none"}`,
				"",
				summary,
			].join("\n"),
			status: "delivered",
			data: { kind: "group_join", requestId, batchId, partial, ackRequired: true, taskIds, completed, failed, skipped, remaining },
		});

	const reused = Boolean(existingMailbox);
	appendEvent(manifest.eventsPath, {
		type: partial ? "agent.group_join.partial" : "agent.group_join.completed",
		runId: manifest.runId,
		message: `Group join ${label} for ${taskIds.length} task(s).`,
		data: { ...delivery, artifactPath: artifact.path, messageId: mailbox.id, fallback: "mailbox-delivered", reused },
	});

	if (existingMailbox) {
		appendEvent(manifest.eventsPath, {
			type: "agent.group_join.delivery_reused",
			runId: manifest.runId,
			message: `Reused group join mailbox delivery for ${taskIds.length} task(s).`,
			data: { requestId, messageId: mailbox.id, batchId, partial },
		});
	}

	return { ...delivery, artifact, messageId: mailbox.id };
}
|
|
107
|
+
|
|
108
|
+
import type { CrewAgentRecord } from "./crew-agent-runtime.ts";
|
|
109
|
+
|
|
110
|
+
/**
 * Join mode names exported alongside GroupJoinManager.
 * NOTE(review): nothing in the visible part of this module references this
 * type; how "async" relates to CrewGroupJoinMode's "off" should be confirmed
 * against the callers.
 */
export type JoinMode = "async" | "group" | "smart";
|
|
111
|
+
/**
 * Callback GroupJoinManager invokes to hand over held completion records.
 * `partial` is false when every agent of the group has completed and true
 * when the delivery is triggered by the group's timeout instead.
 */
export type DeliveryCallback = (records: CrewAgentRecord[], partial: boolean) => void;
|
|
112
|
+
|
|
113
|
+
/** Bookkeeping GroupJoinManager keeps for one registered group. */
interface AgentGroup {
	/** Key under which the group is stored in the manager. */
	groupId: string;
	/** Ids the group is still waiting on; replaced by the not-yet-completed ids after a timeout. */
	agentIds: Set<string>;
	/** Completion records held for the next delivery, keyed by the record's taskId. */
	completedRecords: Map<string, CrewAgentRecord>;
	/** Pending timeout timer, if one is armed. */
	timeoutHandle?: ReturnType<typeof setTimeout>;
	/** Set once the final (non-partial) delivery has been made. */
	delivered: boolean;
	/** Set after the group's first timeout; selects the shorter straggler timeout. */
	isStraggler: boolean;
}
|
|
121
|
+
|
|
122
|
+
/** Default wait in milliseconds (handed to setTimeout) between a group's first completion and a partial delivery. */
const DEFAULT_TIMEOUT = 30_000;
/** Wait in milliseconds used once a group has already timed out and only stragglers remain. */
const STRAGGLER_TIMEOUT = 15_000;

/**
 * Batches completion records of agents that were registered as a group.
 *
 * Records are held until every member of the group has completed (one
 * non-partial delivery) or until a timeout fires (partial delivery of what
 * has arrived so far; the group then keeps waiting for the stragglers).
 *
 * NOTE(review): groups are registered with "agent ids" but completions are
 * matched through `record.taskId`; callers presumably pass the same id space
 * for both — confirm.
 */
export class GroupJoinManager {
	private groups = new Map<string, AgentGroup>();
	private agentToGroup = new Map<string, string>();

	private deliverCb: DeliveryCallback;
	private groupTimeout: number;

	/**
	 * @param deliverCb Receives the held records and whether the delivery is partial.
	 * @param groupTimeout Milliseconds to wait after a group's first completion before delivering partially.
	 */
	constructor(
		deliverCb: DeliveryCallback,
		groupTimeout = DEFAULT_TIMEOUT,
	) {
		this.deliverCb = deliverCb;
		this.groupTimeout = groupTimeout;
	}

	/**
	 * Registers a group and maps each of its ids to it.
	 *
	 * NOTE(review): registering an already-known groupId replaces the stored
	 * group without clearing the previous group's timer.
	 */
	registerGroup(groupId: string, agentIds: string[]): void {
		const group: AgentGroup = {
			groupId,
			agentIds: new Set(agentIds),
			completedRecords: new Map(),
			delivered: false,
			isStraggler: false,
		};
		this.groups.set(groupId, group);
		for (const id of agentIds) {
			this.agentToGroup.set(id, groupId);
		}
	}

	/**
	 * Records a completion.
	 *
	 * @returns "pass" when the record belongs to no pending group (the caller
	 *          should handle it itself), "delivered" when this completion
	 *          finished the group and triggered the delivery, "held" when the
	 *          record is being kept for a later delivery.
	 */
	onAgentComplete(record: CrewAgentRecord): "delivered" | "held" | "pass" {
		const groupId = this.agentToGroup.get(record.taskId);
		if (!groupId) return "pass";

		const group = this.groups.get(groupId);
		if (!group || group.delivered) return "pass";

		group.completedRecords.set(record.taskId, record);

		if (group.completedRecords.size >= group.agentIds.size) {
			this.deliver(group, false);
			return "delivered";
		}

		if (!group.timeoutHandle) {
			const timeout = group.isStraggler ? STRAGGLER_TIMEOUT : this.groupTimeout;
			// NOTE(review): unlike the re-armed straggler timer, this timer is not
			// unref()'d, so it keeps the process alive until it fires. Left as-is
			// because unref'ing it could drop held records at process exit.
			group.timeoutHandle = setTimeout(() => {
				this.onTimeout(group);
			}, timeout);
		}

		return "held";
	}

	/**
	 * Timeout handler: delivers whatever has completed so far as a partial
	 * result and keeps the group alive for the ids that have not completed.
	 */
	private onTimeout(group: AgentGroup): void {
		if (group.delivered) return;
		group.timeoutHandle = undefined;

		const remaining = new Set<string>();
		for (const id of group.agentIds) {
			if (!group.completedRecords.has(id)) remaining.add(id);
		}

		// Ids being delivered now no longer belong to the group.
		for (const id of group.completedRecords.keys()) {
			this.agentToGroup.delete(id);
		}

		// Snapshot the records and bring the group into its next state BEFORE
		// invoking the callback, so a throwing callback cannot leave stale state
		// behind and a completion reported from inside the callback is not wiped.
		const records = [...group.completedRecords.values()];
		group.completedRecords.clear();
		group.agentIds = remaining;
		group.isStraggler = true;

		// Re-arm timer for remaining stragglers so they aren't silently abandoned
		if (remaining.size > 0) {
			group.timeoutHandle = setTimeout(() => this.onTimeout(group), STRAGGLER_TIMEOUT);
			group.timeoutHandle.unref();
		}

		// A straggler tick with nothing new must not produce an empty delivery.
		if (records.length > 0) {
			this.deliverCb(records, true);
		}
	}

	/** Final delivery: stops the timer, marks the group delivered, hands over the records and forgets the group. */
	private deliver(group: AgentGroup, partial: boolean): void {
		if (group.timeoutHandle) {
			clearTimeout(group.timeoutHandle);
			group.timeoutHandle = undefined;
		}
		group.delivered = true;
		try {
			this.deliverCb([...group.completedRecords.values()], partial);
		} finally {
			// Always release the group, even when the callback throws.
			this.cleanupGroup(group.groupId);
		}
	}

	/** Removes the group and the id mappings of its current members. */
	private cleanupGroup(groupId: string): void {
		const group = this.groups.get(groupId);
		if (!group) return;
		for (const id of group.agentIds) {
			this.agentToGroup.delete(id);
		}
		this.groups.delete(groupId);
	}

	/** Whether the id is currently mapped to a registered group. */
	isGrouped(agentId: string): boolean {
		return this.agentToGroup.has(agentId);
	}

	/** Cancels all pending timers and drops every group; held records are discarded without delivery. */
	dispose(): void {
		for (const group of this.groups.values()) {
			if (group.timeoutHandle) clearTimeout(group.timeoutHandle);
		}
		this.groups.clear();
		this.agentToGroup.clear();
	}
}
|
|
@@ -1,124 +1,145 @@
|
|
|
1
|
-
import type { NotificationDescriptor } from "../extension/notification-router.ts";
|
|
2
|
-
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
3
|
-
import { appendEvent } from "../state/event-log.ts";
|
|
4
|
-
import { loadRunManifestById } from "../state/state-store.ts";
|
|
5
|
-
import type { TeamRunManifest } from "../state/types.ts";
|
|
6
|
-
import { logInternalError } from "../utils/internal-error.ts";
|
|
7
|
-
import type { ManifestCache } from "./manifest-cache.ts";
|
|
8
|
-
import { classifyHeartbeat, DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
|
|
9
|
-
|
|
10
|
-
export interface HeartbeatWatcherRouter {
|
|
11
|
-
enqueue(notification: NotificationDescriptor): boolean;
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
export interface HeartbeatWatcherOptions {
|
|
15
|
-
cwd: string;
|
|
16
|
-
pollIntervalMs?: number;
|
|
17
|
-
thresholds?: GradientThresholds;
|
|
18
|
-
manifestCache: ManifestCache;
|
|
19
|
-
registry: MetricRegistry;
|
|
20
|
-
router: HeartbeatWatcherRouter;
|
|
21
|
-
deadletterTickThreshold?: number;
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
1
|
+
import type { NotificationDescriptor } from "../extension/notification-router.ts";
|
|
2
|
+
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
3
|
+
import { appendEvent } from "../state/event-log.ts";
|
|
4
|
+
import { loadRunManifestById } from "../state/state-store.ts";
|
|
5
|
+
import type { TeamRunManifest } from "../state/types.ts";
|
|
6
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
7
|
+
import type { ManifestCache } from "./manifest-cache.ts";
|
|
8
|
+
import { classifyHeartbeat, DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
|
|
9
|
+
|
|
10
|
+
/**
 * The slice of the notification router that the heartbeat watcher depends on:
 * something that accepts a notification descriptor.
 */
export interface HeartbeatWatcherRouter {
	/**
	 * Offers a notification to the router.
	 *
	 * @param notification Descriptor of the notification to enqueue.
	 * @returns A boolean status; the watcher in this file does not inspect it.
	 */
	enqueue(notification: NotificationDescriptor): boolean;
}
|
|
13
|
+
|
|
14
|
+
/** Construction options for {@link HeartbeatWatcher}. */
export interface HeartbeatWatcherOptions {
	// --- required collaborators -------------------------------------------

	/** Working directory handed to `loadRunManifestById` when a run is loaded. */
	cwd: string;
	/** Source of the runs inspected on every tick. */
	manifestCache: ManifestCache;
	/** Registry receiving the heartbeat gauge and counters. */
	registry: MetricRegistry;
	/** Receives a warning notification when a task's heartbeat turns dead. */
	router: HeartbeatWatcherRouter;

	// --- tuning knobs (all optional) --------------------------------------

	/**
	 * Delay between ticks in milliseconds. Default 5000. While any task is in
	 * a dead streak the watcher polls at `min(1000, pollIntervalMs)` instead.
	 */
	pollIntervalMs?: number;
	/** Heartbeat classification thresholds. Default `DEFAULT_GRADIENT_THRESHOLDS`. */
	thresholds?: GradientThresholds;
	/**
	 * Number of consecutive dead ticks at which `onDeadletterTrigger` is
	 * considered for a task. Default 3.
	 */
	deadletterTickThreshold?: number;
	/**
	 * 3.6 — minimum interval between repeated deadletter triggers for the same
	 * runId+taskId. Without this, a flaky worker (dead → alive → dead) can
	 * fire deadletter entries faster than the operator can respond. Default
	 * 60_000 ms.
	 */
	deadletterCooldownMs?: number;

	// --- hooks --------------------------------------------------------------

	/**
	 * Called when a task's heartbeat level changes to dead. `elapsed` is the
	 * heartbeat age in ms, or `thresholds.deadMs` when the age is not finite.
	 */
	onDead?: (runId: string, taskId: string, elapsed: number) => void;
	/**
	 * Called when a task reaches `deadletterTickThreshold` consecutive dead
	 * ticks and the cooldown for that task allows another trigger.
	 */
	onDeadletterTrigger?: (manifest: TeamRunManifest, taskId: string) => void;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Polls running runs for heartbeat staleness.
 *
 * Uses recursive setTimeout to avoid timer storms; at most one timer is
 * pending at any time, so `dispose()` always stops the polling.
 * Cleanup is done in the same pass — no second scan over manifests.
 * Keys for runs that disappear from the cache are cleaned via staleness-age policy
 * rather than being leaked forever; deadletter cooldown stamps are dropped as
 * soon as their cooldown has elapsed.
 */
export class HeartbeatWatcher {
	private timer?: ReturnType<typeof setTimeout>;
	private lastLevel = new Map<string, HeartbeatLevel>();
	private consecutiveDead = new Map<string, number>();
	private lastSeen = new Map<string, number>(); // key → last time it was active
	private lastDeadletterTriggerAt = new Map<string, number>(); // 3.6 cooldown gate
	/** Max age (ms) to retain a stale key before garbage-collecting it. */
	private readonly maxKeyAgeMs = 600_000; // 10 minutes
	private readonly opts: HeartbeatWatcherOptions;

	constructor(opts: HeartbeatWatcherOptions) {
		this.opts = opts;
	}

	/** (Re)starts polling: resets per-task state, then schedules the first tick. */
	start(): void {
		this.dispose();
		this.scheduleTick();
	}

	/** Schedules the next tick, replacing any timer that is still pending. */
	private scheduleTick(): void {
		// 3.2 — when at least one run has a dead-streak in progress, poll faster
		// (1s) so operators get notified quickly. Healthy state stays at the
		// configured interval (default 5s) to keep idle CPU near zero.
		const baseInterval = this.opts.pollIntervalMs ?? 5000;
		const interval = this.consecutiveDead.size > 0 ? Math.min(1000, baseInterval) : baseInterval;
		// tick() is public and always reschedules. Cancel the pending timer
		// first, otherwise a manual tick() forks a second timer chain whose
		// handle is overwritten and can no longer be cancelled by dispose().
		if (this.timer) clearTimeout(this.timer);
		this.timer = setTimeout(() => this.tick(), interval);
		this.timer.unref();
	}

	/**
	 * Runs one polling pass and schedules the next one. Errors are logged via
	 * `logInternalError` and never propagate to the caller.
	 *
	 * @param now Timestamp (ms) used for every age computation in this pass.
	 */
	tick(now = Date.now()): void {
		try {
			this.tickUnsafe(now);
		} catch (error) {
			logInternalError("heartbeat-watcher.tick", error);
		} finally {
			this.scheduleTick();
		}
	}

	/** One polling pass over the running tasks; may throw (see `tick`). */
	private tickUnsafe(now: number): void {
		const thresholds = this.opts.thresholds ?? DEFAULT_GRADIENT_THRESHOLDS;
		const tickThreshold = this.opts.deadletterTickThreshold ?? 3;
		const cooldown = this.opts.deadletterCooldownMs ?? 60_000;
		const activeKeys = new Set<string>();

		for (const run of this.opts.manifestCache.list(50)) {
			if (run.status !== "running") continue;
			const loaded = loadRunManifestById(this.opts.cwd, run.runId);
			if (!loaded) continue;
			for (const task of loaded.tasks) {
				if (task.status !== "running") continue;
				const key = `${run.runId}:${task.id}`;
				activeKeys.add(key);
				this.lastSeen.set(key, now);

				const elapsed = heartbeatAgeMs(task.heartbeat, now);
				const level = classifyHeartbeat(task.heartbeat, thresholds, now);
				this.opts.registry.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds").set({ runId: run.runId, taskId: task.id }, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
				this.opts.registry.counter("crew.heartbeat.level_total", "Heartbeat classifications by level").inc({ runId: run.runId, level });
				const previous = this.lastLevel.get(key);
				this.lastLevel.set(key, level);
				// Announce only the transition into "dead", not every dead tick.
				if (level === "dead" && previous !== "dead") {
					this.opts.registry.counter("crew.heartbeat.dead_total", "Dead heartbeat detections").inc({ runId: run.runId });
					appendEvent(loaded.manifest.eventsPath, { type: "crew.task.heartbeat_dead", runId: run.runId, taskId: task.id, message: `Task ${task.id} heartbeat dead.`, data: { elapsedMs: Number.isFinite(elapsed) ? elapsed : undefined } });
					this.opts.router.enqueue({ id: `dead_${run.runId}_${task.id}`, severity: "warning", source: "heartbeat-watcher", runId: run.runId, title: `Task ${task.id} heartbeat dead`, body: "Background watcher detected a stuck worker." });
					this.opts.onDead?.(run.runId, task.id, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
				}
				if (level === "dead") {
					const count = (this.consecutiveDead.get(key) ?? 0) + 1;
					this.consecutiveDead.set(key, count);
					if (count === tickThreshold) {
						// 3.6 cooldown gate. "Never triggered" is represented by a
						// missing entry rather than timestamp 0, so the first
						// trigger is not suppressed when `now` is smaller than the
						// cooldown (e.g. an injected clock).
						const lastTrigger = this.lastDeadletterTriggerAt.get(key);
						if (lastTrigger === undefined || now - lastTrigger >= cooldown) {
							this.lastDeadletterTriggerAt.set(key, now);
							this.opts.onDeadletterTrigger?.(loaded.manifest, task.id);
						}
					}
				} else {
					this.consecutiveDead.delete(key);
				}
			}
		}

		// Cleanup: drop keys that were NOT in this tick's active set AND
		// haven't been seen for > maxKeyAgeMs. This covers runs that
		// completed or fell out of the manifest cache's top-50 window.
		const cutoff = now - this.maxKeyAgeMs;
		for (const [key, ts] of this.lastSeen) {
			if (!activeKeys.has(key) && ts < cutoff) {
				this.lastLevel.delete(key);
				this.consecutiveDead.delete(key);
				this.lastSeen.delete(key);
			}
		}

		// A cooldown stamp stops gating anything once the cooldown has elapsed,
		// so it can be dropped; this keeps the map from growing with every
		// run/task that ever triggered.
		for (const [key, triggeredAt] of this.lastDeadletterTriggerAt) {
			if (now - triggeredAt >= cooldown) this.lastDeadletterTriggerAt.delete(key);
		}
	}

	/**
	 * Stops polling and clears the per-task level/streak state.
	 *
	 * Cooldown stamps are kept on purpose of caution: `start()` calls
	 * `dispose()`, and clearing them would let a restart re-fire a deadletter
	 * trigger for a task that is still inside its cooldown window. They expire
	 * on their own during later ticks.
	 */
	dispose(): void {
		if (this.timer) clearTimeout(this.timer);
		this.timer = undefined;
		this.lastLevel.clear();
		this.consecutiveDead.clear();
		this.lastSeen.clear();
	}
}
|