pi-crew 0.1.35 → 0.1.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -0
- package/docs/architecture.md +8 -1
- package/docs/research-phase9-observability-reliability-plan.md +42 -42
- package/package.json +1 -1
- package/schema.json +42 -0
- package/src/config/config.ts +101 -0
- package/src/extension/register.ts +65 -2
- package/src/extension/registration/commands.ts +14 -3
- package/src/extension/registration/team-tool.ts +3 -1
- package/src/extension/team-tool/api.ts +27 -2
- package/src/extension/team-tool/context.ts +2 -0
- package/src/extension/team-tool/run.ts +2 -2
- package/src/extension/team-tool.ts +1 -1
- package/src/observability/correlation.ts +35 -0
- package/src/observability/event-to-metric.ts +54 -0
- package/src/observability/exporters/adapter.ts +24 -0
- package/src/observability/exporters/otlp-exporter.ts +65 -0
- package/src/observability/exporters/prometheus-exporter.ts +47 -0
- package/src/observability/metric-registry.ts +72 -0
- package/src/observability/metric-retention.ts +46 -0
- package/src/observability/metric-sink.ts +51 -0
- package/src/observability/metrics-primitives.ts +166 -0
- package/src/runtime/crash-recovery.ts +56 -0
- package/src/runtime/deadletter.ts +36 -0
- package/src/runtime/diagnostic-export.ts +8 -1
- package/src/runtime/heartbeat-gradient.ts +28 -0
- package/src/runtime/heartbeat-watcher.ts +80 -0
- package/src/runtime/retry-executor.ts +59 -0
- package/src/runtime/task-runner.ts +14 -1
- package/src/runtime/team-runner.ts +57 -5
- package/src/schema/config-schema.ts +29 -0
- package/src/state/event-log.ts +3 -2
- package/src/state/types.ts +7 -0
- package/src/ui/dashboard-panes/metrics-pane.ts +34 -0
- package/src/ui/heartbeat-aggregator.ts +15 -5
- package/src/ui/keybinding-map.ts +4 -2
- package/src/ui/run-action-dispatcher.ts +3 -2
- package/src/ui/run-dashboard.ts +11 -4
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
|
|
2
|
+
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
3
|
+
import { appendEvent, scanSequence } from "../state/event-log.ts";
|
|
4
|
+
import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
|
|
5
|
+
import type { TeamTaskState } from "../state/types.ts";
|
|
6
|
+
import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
|
|
7
|
+
import type { ManifestCache } from "./manifest-cache.ts";
|
|
8
|
+
import { checkProcessLiveness } from "./process-status.ts";
|
|
9
|
+
|
|
10
|
+
/** Describes how a single interrupted run can be resumed. */
export interface RecoveryPlan {
  /** Id of the run the plan applies to. */
  runId: string;
  /** Ids of tasks that were "running" with a missing, dead, or stale heartbeat. */
  resumableTasks: string[];
  /** Ids of tasks already completed, failed, cancelled, or skipped. */
  preservedTasks: string[];
  /** Event-log sequence number read when the plan was built. */
  lastEventSeq: number;
}
|
|
16
|
+
|
|
17
|
+
/** Returns true when the task status is completed, failed, cancelled, or skipped. */
function isTerminalTask(task: TeamTaskState): boolean {
  switch (task.status) {
    case "completed":
    case "failed":
    case "cancelled":
    case "skipped":
      return true;
    default:
      return false;
  }
}
|
|
20
|
+
|
|
21
|
+
/**
 * Decides whether a task should be reset during crash recovery.
 *
 * Only "running" tasks qualify, and only when they have no heartbeat, a
 * heartbeat flagged not alive, or one that isWorkerHeartbeatStale reports
 * as stale for `deadMs`.
 */
function shouldRecoverTask(task: TeamTaskState, deadMs: number): boolean {
  if (task.status !== "running") return false;
  const beat = task.heartbeat;
  if (!beat) return true;
  if (beat.alive === false) return true;
  return isWorkerHeartbeatStale(beat, deadMs);
}
|
|
26
|
+
|
|
27
|
+
/**
 * Scans up to 50 cached manifests for runs that are marked "running" but
 * look interrupted, and builds a recovery plan for each.
 *
 * A run is skipped when it is not "running", when its recorded async pid is
 * still alive, when it cannot be loaded, or when none of its tasks qualifies
 * for recovery.
 *
 * @param cwd Working directory used to load run state.
 * @param manifestCache Source of the candidate manifests.
 * @param deadMs Heartbeat age passed to the staleness check (default 300000).
 * @returns One plan per interrupted run, in cache order.
 */
export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache, deadMs = 300_000): RecoveryPlan[] {
  const plans: RecoveryPlan[] = [];
  for (const manifest of manifestCache.list(50)) {
    if (manifest.status !== "running") continue;

    // A background run whose process is still alive is not interrupted.
    const pid = manifest.async?.pid;
    if (pid !== undefined && checkProcessLiveness(pid).alive) continue;

    const loaded = loadRunManifestById(cwd, manifest.runId);
    if (!loaded) continue;

    const resumableTasks: string[] = [];
    for (const task of loaded.tasks) {
      if (shouldRecoverTask(task, deadMs)) resumableTasks.push(task.id);
    }
    if (resumableTasks.length === 0) continue;

    const preservedTasks = loaded.tasks.filter(isTerminalTask).map((task) => task.id);
    const lastEventSeq = scanSequence(loaded.manifest.eventsPath);
    plans.push({ runId: manifest.runId, resumableTasks, preservedTasks, lastEventSeq });
  }
  return plans;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Applies a recovery plan: re-queues the planned tasks, records a
 * `crew.run.resumed` event, and bumps the resumed-run counter.
 *
 * The run is reloaded first and a planned task is reset only if it is still
 * "running". A task that reached another status after the plan was built is
 * left untouched, so durable results are never regressed by a stale plan.
 * The event reports the tasks that were actually reset.
 *
 * @param plan Plan produced by detectInterruptedRuns.
 * @param ctx Provides the working directory used to load run state.
 * @param registry Optional metric registry for the resumed-run counter.
 * @throws Error when the run can no longer be loaded.
 */
export async function applyRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">, registry?: MetricRegistry): Promise<void> {
  const loaded = loadRunManifestById(ctx.cwd, plan.runId);
  if (!loaded) throw new Error(`Run '${plan.runId}' not found.`);

  const planned = new Set(plan.resumableTasks);
  const recovered: string[] = [];
  const tasks = loaded.tasks.map((task) => {
    // Re-check the live status: the plan may be older than the task state on disk.
    if (!planned.has(task.id) || task.status !== "running") return task;
    recovered.push(task.id);
    return { ...task, status: "queued" as const, startedAt: undefined, finishedAt: undefined, error: undefined, heartbeat: undefined };
  });

  saveRunTasks(loaded.manifest, tasks);
  appendEvent(loaded.manifest.eventsPath, {
    type: "crew.run.resumed",
    runId: plan.runId,
    message: `Recovered ${recovered.length} interrupted task(s).`,
    data: { recoveredFromSeq: plan.lastEventSeq, resumableTasks: recovered },
  });
  registry?.counter("crew.run.count", "Total runs by status").inc({ status: "resumed" });
}
|
|
50
|
+
|
|
51
|
+
/**
 * Records that an interrupted run will not be resumed: marks the run
 * "cancelled" and appends a `crew.run.recovery_declined` event.
 *
 * @param plan Plan that is being declined.
 * @param ctx Provides the working directory used to load run state.
 * @throws Error when the run cannot be loaded.
 */
export function declineRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">): void {
  const { runId, lastEventSeq } = plan;
  const loaded = loadRunManifestById(ctx.cwd, runId);
  if (!loaded) {
    throw new Error(`Run '${runId}' not found.`);
  }
  const { manifest } = loaded;
  updateRunStatus(manifest, "cancelled", "interrupted-not-resumed");
  appendEvent(manifest.eventsPath, {
    type: "crew.run.recovery_declined",
    runId,
    message: "Interrupted run was not resumed.",
    data: { recoveredFromSeq: lastEventSeq },
  });
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import * as fs from "node:fs";
|
|
2
|
+
import * as path from "node:path";
|
|
3
|
+
import type { TeamRunManifest } from "../state/types.ts";
|
|
4
|
+
|
|
5
|
+
/** Why a task was written to the deadletter file. */
export type DeadletterReason =
  | "max-retries"
  | "heartbeat-dead"
  | "manual";
|
|
6
|
+
|
|
7
|
+
/** One record in a run's `deadletter.jsonl` file. */
export interface DeadletterEntry {
  /** Id of the task that was given up on. */
  taskId: string;
  /** Id of the run the task belongs to. */
  runId: string;
  /** Why the task was dead-lettered. */
  reason: DeadletterReason;
  /** Number of attempts made before giving up. */
  attempts: number;
  /** Message of the last error, when one is known. */
  lastError?: string;
  /** When the entry was recorded (writers in this package use ISO-8601 strings). */
  timestamp: string;
}
|
|
15
|
+
|
|
16
|
+
/** Returns the path of the run's deadletter file: `<stateRoot>/deadletter.jsonl`. */
export function deadletterPath(manifest: TeamRunManifest): string {
  const { stateRoot } = manifest;
  return path.join(stateRoot, "deadletter.jsonl");
}
|
|
19
|
+
|
|
20
|
+
/**
 * Appends one entry to the run's deadletter file as a single JSON line,
 * creating the state directory first if needed.
 */
export function appendDeadletter(manifest: TeamRunManifest, entry: DeadletterEntry): void {
  fs.mkdirSync(manifest.stateRoot, { recursive: true });
  const target = deadletterPath(manifest);
  const line = `${JSON.stringify(entry)}\n`;
  fs.appendFileSync(target, line, "utf-8");
}
|
|
24
|
+
|
|
25
|
+
/** Reasons accepted when validating entries read back from disk. */
const KNOWN_DEADLETTER_REASONS: ReadonlySet<string> = new Set(["max-retries", "heartbeat-dead", "manual"]);

/** Runtime check that a parsed JSON value has every required DeadletterEntry field. */
function isDeadletterEntry(value: unknown): value is DeadletterEntry {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.taskId === "string" &&
    typeof candidate.runId === "string" &&
    typeof candidate.reason === "string" &&
    KNOWN_DEADLETTER_REASONS.has(candidate.reason) &&
    typeof candidate.attempts === "number" &&
    Number.isFinite(candidate.attempts) &&
    typeof candidate.timestamp === "string" &&
    (candidate.lastError === undefined || typeof candidate.lastError === "string")
  );
}

/**
 * Reads every valid entry from the run's deadletter file.
 *
 * Lines that are not valid JSON, or that lack a required field, are skipped
 * so callers never receive a partially shaped entry. A missing file yields
 * an empty list; any other read error is rethrown.
 *
 * @param manifest Run whose deadletter file is read.
 * @returns Valid entries in file order.
 */
export function readDeadletter(manifest: TeamRunManifest): DeadletterEntry[] {
  const filePath = deadletterPath(manifest);
  let raw: string;
  try {
    // Read directly instead of existsSync + read, which races with deletion.
    raw = fs.readFileSync(filePath, "utf-8");
  } catch (error) {
    if ((error as { code?: unknown } | null)?.code === "ENOENT") return [];
    throw error;
  }

  const entries: DeadletterEntry[] = [];
  for (const line of raw.split(/\r?\n/)) {
    if (!line) continue;
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // A corrupt or truncated line must not hide the remaining entries.
      continue;
    }
    if (isDeadletterEntry(parsed)) entries.push(parsed);
  }
  return entries;
}
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
|
|
2
|
+
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
3
|
+
import type { MetricSnapshot } from "../observability/metrics-primitives.ts";
|
|
2
4
|
import * as fs from "node:fs";
|
|
3
5
|
import * as path from "node:path";
|
|
4
6
|
import { readCrewAgents } from "./crew-agent-records.ts";
|
|
@@ -9,6 +11,7 @@ import { summarizeHeartbeats, type HeartbeatSummary } from "../ui/heartbeat-aggr
|
|
|
9
11
|
import type { RunUiSnapshot } from "../ui/snapshot-types.ts";
|
|
10
12
|
|
|
11
13
|
export interface DiagnosticReport {
|
|
14
|
+
schemaVersion?: number;
|
|
12
15
|
runId: string;
|
|
13
16
|
exportedAt: string;
|
|
14
17
|
manifest: TeamRunManifest;
|
|
@@ -17,6 +20,7 @@ export interface DiagnosticReport {
|
|
|
17
20
|
heartbeat: HeartbeatSummary;
|
|
18
21
|
agents: unknown[];
|
|
19
22
|
envRedacted: Record<string, string>;
|
|
23
|
+
metricsSnapshot?: MetricSnapshot[];
|
|
20
24
|
}
|
|
21
25
|
|
|
22
26
|
const SECRET_KEY_PATTERN = /(token|key|password|secret|credential|auth)/i;
|
|
@@ -70,13 +74,15 @@ function buildSnapshot(manifest: TeamRunManifest, tasks: TeamTaskState[]): RunUi
|
|
|
70
74
|
};
|
|
71
75
|
}
|
|
72
76
|
|
|
73
|
-
export async function exportDiagnostic(ctx: Pick<ExtensionContext, "cwd">, runId: string): Promise<{ path: string; report: DiagnosticReport }> {
|
|
77
|
+
export async function exportDiagnostic(ctx: Pick<ExtensionContext, "cwd">, runId: string, options: { registry?: MetricRegistry } = {}): Promise<{ path: string; report: DiagnosticReport }> {
|
|
74
78
|
const loaded = loadRunManifestById(ctx.cwd, runId);
|
|
75
79
|
if (!loaded) throw new Error(`Run '${runId}' not found.`);
|
|
76
80
|
const exportedAt = new Date().toISOString();
|
|
77
81
|
const safeTimestamp = exportedAt.replace(/[:.]/g, "-");
|
|
78
82
|
const recentEvents = readEvents(loaded.manifest.eventsPath).slice(-200);
|
|
83
|
+
const metricsSnapshot = options.registry?.snapshot();
|
|
79
84
|
const report: DiagnosticReport = {
|
|
85
|
+
...(metricsSnapshot ? { schemaVersion: 2 } : {}),
|
|
80
86
|
runId,
|
|
81
87
|
exportedAt,
|
|
82
88
|
manifest: redactSecrets(loaded.manifest) as TeamRunManifest,
|
|
@@ -85,6 +91,7 @@ export async function exportDiagnostic(ctx: Pick<ExtensionContext, "cwd">, runId
|
|
|
85
91
|
heartbeat: summarizeHeartbeats(buildSnapshot(loaded.manifest, loaded.tasks)),
|
|
86
92
|
agents: redactSecrets(readCrewAgents(loaded.manifest)) as unknown[],
|
|
87
93
|
envRedacted: envRedacted(),
|
|
94
|
+
...(metricsSnapshot ? { metricsSnapshot: redactSecrets(metricsSnapshot) as MetricSnapshot[] } : {}),
|
|
88
95
|
};
|
|
89
96
|
const dir = path.join(loaded.manifest.artifactsRoot, "diagnostic");
|
|
90
97
|
fs.mkdirSync(dir, { recursive: true });
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { WorkerHeartbeatState } from "./worker-heartbeat.ts";
|
|
2
|
+
|
|
3
|
+
/** Heartbeat health classification, from freshest to least fresh. */
export type HeartbeatLevel =
  | "healthy"
  | "warn"
  | "stale"
  | "dead";
|
|
4
|
+
|
|
5
|
+
/** Heartbeat ages, in milliseconds, above which each level applies. */
export interface GradientThresholds {
  /** Ages above this are classified "warn". */
  warnMs: number;
  /** Ages above this are classified "stale". */
  staleMs: number;
  /** Ages above this are classified "dead". */
  deadMs: number;
}
|
|
10
|
+
|
|
11
|
+
/** Default thresholds: warn after 30 s, stale after 60 s, dead after 5 min. */
export const DEFAULT_GRADIENT_THRESHOLDS: GradientThresholds = {
  warnMs: 30_000,
  staleMs: 60_000,
  deadMs: 300_000,
};
|
|
12
|
+
|
|
13
|
+
/**
 * Milliseconds elapsed since the heartbeat's `lastSeenAt`.
 *
 * @param heartbeat Heartbeat to measure; may be undefined.
 * @param now Reference time in epoch milliseconds (defaults to the current time).
 * @returns The non-negative age, or positive infinity when the heartbeat is
 *   missing or its `lastSeenAt` cannot be parsed as a date.
 */
export function heartbeatAgeMs(heartbeat: WorkerHeartbeatState | undefined, now = Date.now()): number {
  if (!heartbeat) {
    return Number.POSITIVE_INFINITY;
  }
  const seenAt = Date.parse(heartbeat.lastSeenAt);
  if (!Number.isFinite(seenAt)) {
    return Number.POSITIVE_INFINITY;
  }
  // Clamp so a timestamp later than `now` reads as age 0.
  return Math.max(0, now - seenAt);
}
|
|
18
|
+
|
|
19
|
+
/**
 * Classifies a heartbeat by age against the given thresholds.
 *
 * A missing heartbeat, one flagged `alive === false`, or one whose age is
 * not finite is "dead". Otherwise the age is compared with strict
 * greater-than against deadMs, staleMs, and warnMs in that order.
 *
 * @param heartbeat Heartbeat to classify; may be undefined.
 * @param thresholds Age limits (defaults to DEFAULT_GRADIENT_THRESHOLDS).
 * @param now Reference time in epoch milliseconds (defaults to the current time).
 */
export function classifyHeartbeat(heartbeat: WorkerHeartbeatState | undefined, thresholds: GradientThresholds = DEFAULT_GRADIENT_THRESHOLDS, now = Date.now()): HeartbeatLevel {
  if (!heartbeat || heartbeat.alive === false) return "dead";

  const ageMs = heartbeatAgeMs(heartbeat, now);
  if (!Number.isFinite(ageMs) || ageMs > thresholds.deadMs) return "dead";
  if (ageMs > thresholds.staleMs) return "stale";
  return ageMs > thresholds.warnMs ? "warn" : "healthy";
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import type { NotificationDescriptor } from "../extension/notification-router.ts";
|
|
2
|
+
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
3
|
+
import { appendEvent } from "../state/event-log.ts";
|
|
4
|
+
import { loadRunManifestById } from "../state/state-store.ts";
|
|
5
|
+
import type { TeamRunManifest } from "../state/types.ts";
|
|
6
|
+
import type { ManifestCache } from "./manifest-cache.ts";
|
|
7
|
+
import { classifyHeartbeat, DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
|
|
8
|
+
|
|
9
|
+
/** The part of the notification router that the heartbeat watcher needs. */
export interface HeartbeatWatcherRouter {
  /** Queues a notification; the boolean result is not used by the watcher. */
  enqueue(notification: NotificationDescriptor): boolean;
}
|
|
12
|
+
|
|
13
|
+
/** Construction options for HeartbeatWatcher. */
export interface HeartbeatWatcherOptions {
  /** Working directory used to load run state. */
  cwd: string;
  /** Polling interval in milliseconds; the watcher uses 5000 when omitted. */
  pollIntervalMs?: number;
  /** Classification thresholds; DEFAULT_GRADIENT_THRESHOLDS when omitted. */
  thresholds?: GradientThresholds;
  /** Source of the runs inspected on each tick. */
  manifestCache: ManifestCache;
  /** Registry that receives heartbeat gauges and counters. */
  registry: MetricRegistry;
  /** Receives a warning notification when a task's heartbeat turns dead. */
  router: HeartbeatWatcherRouter;
  /** Consecutive dead ticks before onDeadletterTrigger fires; 3 when omitted. */
  deadletterTickThreshold?: number;
  /** Called once when a task's classification changes to "dead". */
  onDead?: (runId: string, taskId: string, elapsed: number) => void;
  /** Called when a task's consecutive dead ticks reach the threshold. */
  onDeadletterTrigger?: (manifest: TeamRunManifest, taskId: string) => void;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Background poller that classifies the heartbeat of every running task,
 * publishes heartbeat metrics, and reports tasks whose heartbeat turns dead.
 *
 * Per-task state is keyed by `${runId}:${taskId}`. Keys for tasks that were
 * not observed as running in the latest tick are pruned, so the maps stay
 * bounded in a long-lived watcher and a task that runs again later starts
 * from a clean state.
 */
export class HeartbeatWatcher {
  private timer?: ReturnType<typeof setInterval>;
  /** Last classification per task, used to detect the change to "dead". */
  private lastLevel = new Map<string, HeartbeatLevel>();
  /** Number of consecutive ticks each task has been classified "dead". */
  private consecutiveDead = new Map<string, number>();
  private readonly opts: HeartbeatWatcherOptions;

  constructor(opts: HeartbeatWatcherOptions) {
    this.opts = opts;
  }

  /** Starts (or restarts) polling; previous timer and tracked state are dropped first. */
  start(): void {
    this.dispose();
    this.timer = setInterval(() => this.tick(), this.opts.pollIntervalMs ?? 5000);
    // Where supported, unref so this timer alone does not keep the process alive.
    this.timer.unref?.();
  }

  /**
   * Runs one polling pass over up to 50 cached runs.
   *
   * @param now Reference time in epoch milliseconds (defaults to the current time).
   */
  tick(now = Date.now()): void {
    const thresholds = this.opts.thresholds ?? DEFAULT_GRADIENT_THRESHOLDS;
    const tickThreshold = this.opts.deadletterTickThreshold ?? 3;
    const seen = new Set<string>();
    // Runs listed as running that failed to load keep their state, so a
    // transient load failure does not re-trigger dead notifications.
    const keepPrefixes: string[] = [];

    for (const run of this.opts.manifestCache.list(50)) {
      if (run.status !== "running") continue;
      const loaded = loadRunManifestById(this.opts.cwd, run.runId);
      if (!loaded) {
        keepPrefixes.push(`${run.runId}:`);
        continue;
      }
      for (const task of loaded.tasks) {
        if (task.status !== "running") continue;
        const key = `${run.runId}:${task.id}`;
        seen.add(key);
        const elapsed = heartbeatAgeMs(task.heartbeat, now);
        const level = classifyHeartbeat(task.heartbeat, thresholds, now);
        // An unknown (infinite) age is reported as deadMs so the gauge stays finite.
        this.opts.registry.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds").set({ runId: run.runId, taskId: task.id }, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
        this.opts.registry.counter("crew.heartbeat.level_total", "Heartbeat classifications by level").inc({ runId: run.runId, level });
        const previous = this.lastLevel.get(key);
        this.lastLevel.set(key, level);
        if (level === "dead" && previous !== "dead") {
          // Report only the change to dead, not every tick the task stays dead.
          this.opts.registry.counter("crew.heartbeat.dead_total", "Dead heartbeat detections").inc({ runId: run.runId });
          appendEvent(loaded.manifest.eventsPath, { type: "crew.task.heartbeat_dead", runId: run.runId, taskId: task.id, message: `Task ${task.id} heartbeat dead.`, data: { elapsedMs: Number.isFinite(elapsed) ? elapsed : undefined } });
          this.opts.router.enqueue({ id: `dead_${run.runId}_${task.id}`, severity: "warning", source: "heartbeat-watcher", runId: run.runId, title: `Task ${task.id} heartbeat dead`, body: "Background watcher detected a stuck worker." });
          this.opts.onDead?.(run.runId, task.id, Number.isFinite(elapsed) ? elapsed : thresholds.deadMs);
        }
        if (level === "dead") {
          const count = (this.consecutiveDead.get(key) ?? 0) + 1;
          this.consecutiveDead.set(key, count);
          // Strict equality fires the trigger once, on the tick the threshold is reached.
          if (count === tickThreshold) this.opts.onDeadletterTrigger?.(loaded.manifest, task.id);
        } else {
          this.consecutiveDead.delete(key);
        }
      }
    }

    this.pruneUnseen(this.lastLevel, seen, keepPrefixes);
    this.pruneUnseen(this.consecutiveDead, seen, keepPrefixes);
  }

  /** Stops polling and clears all tracked state. Safe to call repeatedly. */
  dispose(): void {
    if (this.timer) clearInterval(this.timer);
    this.timer = undefined;
    this.lastLevel.clear();
    this.consecutiveDead.clear();
  }

  /** Removes keys that were not observed this tick and are not under a retained run prefix. */
  private pruneUnseen<V>(state: Map<string, V>, seen: ReadonlySet<string>, keepPrefixes: readonly string[]): void {
    for (const key of Array.from(state.keys())) {
      if (seen.has(key)) continue;
      if (keepPrefixes.some((prefix) => key.startsWith(prefix))) continue;
      state.delete(key);
    }
  }
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { sleep } from "../utils/sleep.ts";
|
|
2
|
+
|
|
3
|
+
/** Controls how executeWithRetry repeats a failing operation. */
export interface RetryPolicy {
  /** Total number of attempts, including the first one. */
  maxAttempts: number;
  /** Base delay in milliseconds before the second attempt. */
  backoffMs: number;
  /** Fraction of the computed delay that may be randomly added or subtracted. */
  jitterRatio: number;
  /** Multiplier applied to the delay for each further attempt. */
  exponentialFactor: number;
  /**
   * Patterns (with `*` as wildcard) matched case-insensitively against the
   * error message. When absent or empty, every error is retryable.
   */
  retryableErrors?: string[];
}
|
|
10
|
+
|
|
11
|
+
/** Optional observers and cancellation for executeWithRetry. */
export interface RetryHooks {
  /** Called after a failed attempt that will be retried, before the backoff wait. */
  onAttemptFailed?: (attempt: number, error: Error, nextDelayMs: number) => void;
  /** Called when no further attempt will be made, just before the error is thrown. */
  onRetryGivenUp?: (attempts: number, error: Error) => void;
  /** Checked before each attempt and passed to the backoff sleep. */
  signal?: AbortSignal;
}
|
|
16
|
+
|
|
17
|
+
/** Default policy: 3 attempts, 1 s base delay doubling per attempt, ±30 % jitter. */
export const DEFAULT_RETRY_POLICY: RetryPolicy = {
  maxAttempts: 3,
  backoffMs: 1000,
  jitterRatio: 0.3,
  exponentialFactor: 2,
};
|
|
18
|
+
|
|
19
|
+
/**
 * Normalizes any thrown value to an Error.
 *
 * Error instances are returned unchanged. For other objects the `message`
 * property is used when it is a non-empty string, otherwise a JSON
 * rendering. This keeps the failure reason, which a plain `String(value)`
 * would reduce to "[object Object]". Everything else falls back to
 * `String(value)`.
 */
function asError(error: unknown): Error {
  if (error instanceof Error) return error;
  if (typeof error === "object" && error !== null) {
    const message = (error as { message?: unknown }).message;
    if (typeof message === "string" && message.length > 0) return new Error(message);
    try {
      const rendered: unknown = JSON.stringify(error);
      if (typeof rendered === "string") return new Error(rendered);
    } catch {
      // Circular or otherwise unserializable: fall through to String().
    }
  }
  return new Error(String(error));
}
|
|
22
|
+
|
|
23
|
+
/**
 * Compiles a simple wildcard pattern into an anchored, case-insensitive RegExp.
 *
 * `*` matches any run of characters, including newlines, so patterns work
 * on multi-line error text. Every other character is matched literally.
 * That includes `?`, which must be escaped: unescaped it would act as a
 * regex quantifier, or make the pattern fail to compile.
 */
function globToRegex(pattern: string): RegExp {
  const escaped = pattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, "[\\s\\S]*");
  return new RegExp(`^${escaped}$`, "i");
}
|
|
27
|
+
|
|
28
|
+
/**
 * Decides whether an error may be retried under the policy.
 *
 * With no `retryableErrors` patterns every error is retryable; otherwise
 * the error message must match at least one pattern.
 */
function isRetryable(error: Error, policy: RetryPolicy): boolean {
  const patterns = policy.retryableErrors;
  if (patterns == null || patterns.length === 0) return true;
  for (const glob of patterns) {
    if (globToRegex(glob).test(error.message)) return true;
  }
  return false;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Computes the backoff delay to wait after the given failed attempt.
 *
 * The base delay is `backoffMs * exponentialFactor^(attempt - 1)`, with the
 * exponent floored at 0. A jitter in the range ±`jitterRatio * base` is
 * added, and the result is clamped to be non-negative.
 *
 * @param attempt 1-based number of the attempt that just failed.
 * @param policy Retry policy supplying the backoff parameters.
 * @param random Source of numbers in [0, 1); injectable for tests.
 * @returns Delay in milliseconds.
 */
export function calculateRetryDelay(attempt: number, policy: RetryPolicy = DEFAULT_RETRY_POLICY, random = Math.random): number {
  const exponent = Math.max(0, attempt - 1);
  const base = policy.backoffMs * policy.exponentialFactor ** exponent;
  // Map [0, 1) onto [-1, 1) so the jitter can shorten or lengthen the delay.
  const direction = random() * 2 - 1;
  const jitter = direction * policy.jitterRatio * base;
  const delay = base + jitter;
  return Math.max(0, delay);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Runs `fn` until it succeeds, the attempts are exhausted, or a
 * non-retryable error occurs.
 *
 * `maxAttempts` is normalized to a whole number of at least 1. NaN falls
 * back to the default, and a fractional value is floored so the final
 * attempt always goes through `onRetryGivenUp` rather than leaving the loop
 * silently.
 *
 * @param fn Operation to run; receives the 1-based attempt number.
 * @param policy Retry policy; missing fields fall back to DEFAULT_RETRY_POLICY.
 * @param hooks Optional callbacks and abort signal.
 * @returns The value produced by the first successful attempt.
 * @throws The last error (normalized to Error) once retrying stops, or
 *   `Error("Retry aborted.")` if the signal is aborted before an attempt.
 */
export async function executeWithRetry<T>(fn: (attempt: number) => Promise<T>, policy: RetryPolicy = DEFAULT_RETRY_POLICY, hooks: RetryHooks = {}): Promise<T> {
  const requestedAttempts = policy.maxAttempts ?? DEFAULT_RETRY_POLICY.maxAttempts;
  const maxAttempts = Number.isNaN(requestedAttempts)
    ? DEFAULT_RETRY_POLICY.maxAttempts
    : Math.max(1, Math.floor(requestedAttempts));
  const normalized: RetryPolicy = { ...DEFAULT_RETRY_POLICY, ...policy, maxAttempts };

  let lastError: Error | undefined;
  for (let attempt = 1; attempt <= normalized.maxAttempts; attempt += 1) {
    if (hooks.signal?.aborted) throw new Error("Retry aborted.");
    try {
      return await fn(attempt);
    } catch (error) {
      lastError = asError(error);
      const exhausted = attempt >= normalized.maxAttempts;
      if (exhausted || !isRetryable(lastError, normalized)) {
        hooks.onRetryGivenUp?.(attempt, lastError);
        throw lastError;
      }
      const delay = calculateRetryDelay(attempt, normalized);
      hooks.onAttemptFailed?.(attempt, lastError, delay);
      await sleep(delay, hooks.signal);
    }
  }
  // Defensive: the loop always returns or throws, because maxAttempts >= 1.
  throw lastError ?? new Error("Retry failed without error.");
}
|
|
@@ -106,8 +106,16 @@ export async function runTeamTask(input: TaskRunnerInput): Promise<{ manifest: T
|
|
|
106
106
|
const transcriptPath = `${manifest.artifactsRoot}/transcripts/${task.id}.jsonl`;
|
|
107
107
|
let finalCheckpointWritten = false;
|
|
108
108
|
let lastAgentRecordPersistedAt = 0;
|
|
109
|
+
let lastHeartbeatPersistedAt = 0;
|
|
109
110
|
let lastRunProgressPersistedAt = 0;
|
|
110
111
|
let lastRunProgressSummary: ProgressEventSummary | undefined;
|
|
112
|
+
const persistHeartbeat = (force = false): void => {
|
|
113
|
+
const now = Date.now();
|
|
114
|
+
if (!force && now - lastHeartbeatPersistedAt < 1000) return;
|
|
115
|
+
lastHeartbeatPersistedAt = now;
|
|
116
|
+
task = { ...task, heartbeat: touchWorkerHeartbeat(task.heartbeat ?? createWorkerHeartbeat(task.id)) };
|
|
117
|
+
tasks = persistSingleTaskUpdate(manifest, tasks, task);
|
|
118
|
+
};
|
|
111
119
|
const persistChildProgress = (event: unknown, force = false): void => {
|
|
112
120
|
const now = Date.now();
|
|
113
121
|
if (force || shouldFlushProgressEvent(event) || now - lastAgentRecordPersistedAt >= 500) {
|
|
@@ -140,9 +148,13 @@ export async function runTeamTask(input: TaskRunnerInput): Promise<{ manifest: T
|
|
|
140
148
|
onSpawn: (pid) => {
|
|
141
149
|
({ task, tasks } = checkpointTask(manifest, tasks, task, "child-spawned", pid));
|
|
142
150
|
},
|
|
143
|
-
onStdoutLine: (line) =>
|
|
151
|
+
onStdoutLine: (line) => {
|
|
152
|
+
appendCrewAgentOutput(manifest, task.id, line);
|
|
153
|
+
persistHeartbeat();
|
|
154
|
+
},
|
|
144
155
|
onJsonEvent: (event) => {
|
|
145
156
|
appendCrewAgentEvent(manifest, task.id, event);
|
|
157
|
+
persistHeartbeat();
|
|
146
158
|
task = { ...task, agentProgress: applyAgentProgressEvent(task.agentProgress ?? emptyCrewAgentProgress(), event, task.startedAt) };
|
|
147
159
|
tasks = updateTask(tasks, task);
|
|
148
160
|
if (!finalCheckpointWritten && isFinalChildEvent(event)) {
|
|
@@ -158,6 +170,7 @@ export async function runTeamTask(input: TaskRunnerInput): Promise<{ manifest: T
|
|
|
158
170
|
finalStderr = childResult.stderr;
|
|
159
171
|
parsedOutput = parsePiJsonOutput(fs.existsSync(transcriptPath) ? fs.readFileSync(transcriptPath, "utf-8") : childResult.stdout);
|
|
160
172
|
error = childResult.error || (childResult.exitCode && childResult.exitCode !== 0 ? childResult.stderr || `Child Pi exited with ${childResult.exitCode}` : undefined);
|
|
173
|
+
persistHeartbeat(true);
|
|
161
174
|
persistChildProgress({ type: "attempt_finished" }, true);
|
|
162
175
|
const attempt: ModelAttemptSummary = { model: model ?? "default", success: !error, exitCode, error };
|
|
163
176
|
modelAttempts.push(attempt);
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
2
|
import type { AgentConfig } from "../agents/agent-config.ts";
|
|
3
|
-
import type { CrewLimitsConfig, CrewRuntimeConfig } from "../config/config.ts";
|
|
3
|
+
import type { CrewLimitsConfig, CrewRuntimeConfig, CrewReliabilityConfig } from "../config/config.ts";
|
|
4
4
|
import type { CrewRuntimeCapabilities } from "./runtime-resolver.ts";
|
|
5
5
|
import { writeArtifact } from "../state/artifact-store.ts";
|
|
6
6
|
import { appendEvent } from "../state/event-log.ts";
|
|
7
7
|
import type { TeamConfig } from "../teams/team-config.ts";
|
|
8
|
-
import type { ArtifactDescriptor, PolicyDecision, TeamRunManifest, TeamTaskState } from "../state/types.ts";
|
|
8
|
+
import type { ArtifactDescriptor, PolicyDecision, TeamRunManifest, TaskAttemptState, TeamTaskState } from "../state/types.ts";
|
|
9
9
|
import { saveRunManifest, saveRunManifestAsync, saveRunTasksAsync, updateRunStatus } from "../state/state-store.ts";
|
|
10
10
|
import { aggregateUsage, formatUsage } from "../state/usage.ts";
|
|
11
11
|
import type { WorkflowConfig, WorkflowStep } from "../workflows/workflow-config.ts";
|
|
@@ -18,6 +18,10 @@ import { saveCrewAgents } from "./crew-agent-records.ts";
|
|
|
18
18
|
import { recordsForMaterializedTasks } from "./task-display.ts";
|
|
19
19
|
import { deliverGroupJoin, resolveGroupJoinMode } from "./group-join.ts";
|
|
20
20
|
import { runTeamTask } from "./task-runner.ts";
|
|
21
|
+
import { executeWithRetry, DEFAULT_RETRY_POLICY, type RetryPolicy } from "./retry-executor.ts";
|
|
22
|
+
import { appendDeadletter } from "./deadletter.ts";
|
|
23
|
+
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
24
|
+
import { childCorrelation, withCorrelation } from "../observability/correlation.ts";
|
|
21
25
|
import { resolveBatchConcurrency } from "./concurrency.ts";
|
|
22
26
|
import { mapConcurrent } from "./parallel-utils.ts";
|
|
23
27
|
|
|
@@ -36,6 +40,8 @@ export interface ExecuteTeamRunInput {
|
|
|
36
40
|
modelRegistry?: unknown;
|
|
37
41
|
modelOverride?: string;
|
|
38
42
|
signal?: AbortSignal;
|
|
43
|
+
reliability?: CrewReliabilityConfig;
|
|
44
|
+
metricRegistry?: MetricRegistry;
|
|
39
45
|
}
|
|
40
46
|
|
|
41
47
|
function findReadyTask(tasks: TeamTaskState[]): TeamTaskState | undefined {
|
|
@@ -73,7 +79,7 @@ function shouldMergeTaskUpdate(current: TeamTaskState, updated: TeamTaskState):
|
|
|
73
79
|
// contain stale queued/running copies of tasks that another worker already
|
|
74
80
|
// completed. Never let those stale snapshots regress durable task state.
|
|
75
81
|
if (!isNonTerminalTaskStatus(current.status) && isNonTerminalTaskStatus(updated.status)) return false;
|
|
76
|
-
return updated.status !== current.status || updated.finishedAt !== current.finishedAt || updated.startedAt !== current.startedAt || Boolean(updated.resultArtifact) || Boolean(updated.error) || Boolean(updated.modelAttempts?.length) || Boolean(updated.usage);
|
|
82
|
+
return updated.status !== current.status || updated.finishedAt !== current.finishedAt || updated.startedAt !== current.startedAt || Boolean(updated.resultArtifact) || Boolean(updated.error) || Boolean(updated.modelAttempts?.length) || Boolean(updated.usage) || Boolean(updated.attempts?.length);
|
|
77
83
|
}
|
|
78
84
|
|
|
79
85
|
export function __test__mergeTaskUpdates(base: TeamTaskState[], results: Array<{ tasks: TeamTaskState[] }>): TeamTaskState[] {
|
|
@@ -384,6 +390,14 @@ function applyPolicy(manifest: TeamRunManifest, tasks: TeamTaskState[], limits?:
|
|
|
384
390
|
return { ...manifest, updatedAt: new Date().toISOString(), policyDecisions: decisions, artifacts: [...manifest.artifacts.filter((artifact) => !(artifact.kind === "metadata" && (artifact.path.endsWith("policy-decisions.json") || artifact.path.endsWith("recovery-ledger.json") || artifact.path.endsWith("branch-freshness.json")))), branchArtifact, policyArtifact, recoveryArtifact] };
|
|
385
391
|
}
|
|
386
392
|
|
|
393
|
+
/** Builds the effective retry policy: the defaults overlaid with any configured `retryPolicy` fields. */
function retryPolicyFromConfig(config: CrewReliabilityConfig | undefined): RetryPolicy {
  const overrides = config?.retryPolicy ?? {};
  return { ...DEFAULT_RETRY_POLICY, ...overrides };
}
|
|
396
|
+
|
|
397
|
+
/** Returns the first task in the result with the given id and status "failed", or undefined. */
function failedTaskFrom(result: { tasks: TeamTaskState[] }, taskId: string): TeamTaskState | undefined {
  for (const candidate of result.tasks) {
    if (candidate.id === taskId && candidate.status === "failed") return candidate;
  }
  return undefined;
}
|
|
400
|
+
|
|
387
401
|
export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ manifest: TeamRunManifest; tasks: TeamTaskState[] }> {
|
|
388
402
|
let workflow = input.workflow;
|
|
389
403
|
let manifest = updateRunStatus(input.manifest, "running", input.executeWorkers ? "Executing team workflow." : "Creating workflow prompts and placeholder results.");
|
|
@@ -450,10 +464,48 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
|
|
|
450
464
|
const results = await mapConcurrent(
|
|
451
465
|
readyBatch,
|
|
452
466
|
concurrency.selectedCount,
|
|
453
|
-
(task) => {
|
|
467
|
+
async (task) => {
|
|
454
468
|
const step = findStep(workflow, task);
|
|
455
469
|
const agent = findAgent(input.agents, task);
|
|
456
|
-
|
|
470
|
+
const baseInput = { manifest, tasks, task, step, agent, signal: input.signal, executeWorkers: input.executeWorkers, runtimeKind: input.runtime?.kind, runtimeConfig: input.runtimeConfig, parentContext: input.parentContext, parentModel: input.parentModel, modelRegistry: input.modelRegistry, modelOverride: input.modelOverride, limits: input.limits };
|
|
471
|
+
if (input.reliability?.autoRetry !== true) return withCorrelation(childCorrelation(manifest.runId, task.id), () => runTeamTask(baseInput));
|
|
472
|
+
let lastFailed: { manifest: TeamRunManifest; tasks: TeamTaskState[] } | undefined;
|
|
473
|
+
const attemptsSoFar: TaskAttemptState[] = [...(task.attempts ?? [])];
|
|
474
|
+
const policy = retryPolicyFromConfig(input.reliability);
|
|
475
|
+
try {
|
|
476
|
+
return await executeWithRetry(async (attempt) => {
|
|
477
|
+
const startedAt = new Date().toISOString();
|
|
478
|
+
const inFlightAttempts: TaskAttemptState[] = [...attemptsSoFar, { startedAt }];
|
|
479
|
+
input.metricRegistry?.counter("crew.task.retry_attempt_total", "Retry attempts by run and task").inc({ runId: manifest.runId, taskId: task.id });
|
|
480
|
+
const taskWithAttempt: TeamTaskState = { ...task, attempts: inFlightAttempts };
|
|
481
|
+
const result = await withCorrelation(childCorrelation(manifest.runId, task.id), () => runTeamTask({ ...baseInput, task: taskWithAttempt }));
|
|
482
|
+
const failed = failedTaskFrom(result, task.id);
|
|
483
|
+
const endedAt = new Date().toISOString();
|
|
484
|
+
const finishedAttempt: TaskAttemptState = { startedAt, endedAt, ...(failed?.error ? { error: failed.error } : {}) };
|
|
485
|
+
attemptsSoFar.push(finishedAttempt);
|
|
486
|
+
const withAttempt = result.tasks.map((item) => item.id === task.id ? { ...item, attempts: [...attemptsSoFar] } : item);
|
|
487
|
+
const enriched = { manifest: result.manifest, tasks: withAttempt };
|
|
488
|
+
if (failed) {
|
|
489
|
+
lastFailed = enriched;
|
|
490
|
+
throw new Error(failed.error ?? `Task ${task.id} failed.`);
|
|
491
|
+
}
|
|
492
|
+
input.metricRegistry?.histogram("crew.task.retry_count", "Retries per task", [0, 1, 2, 3, 5, 10]).observe({ runId: manifest.runId, team: input.team.name }, Math.max(0, attempt - 1));
|
|
493
|
+
return enriched;
|
|
494
|
+
}, policy, {
|
|
495
|
+
signal: input.signal,
|
|
496
|
+
onAttemptFailed: (attempt, error, delayMs) => {
|
|
497
|
+
appendEvent(manifest.eventsPath, { type: "crew.task.retry_attempt", runId: manifest.runId, taskId: task.id, message: error.message, data: { attempt, delayMs } });
|
|
498
|
+
input.metricRegistry?.histogram("crew.task.retry_delay_ms", "Retry backoff delay, milliseconds").observe({ runId: manifest.runId, taskId: task.id }, delayMs);
|
|
499
|
+
},
|
|
500
|
+
onRetryGivenUp: (attempts, error) => {
|
|
501
|
+
appendDeadletter(manifest, { runId: manifest.runId, taskId: task.id, reason: "max-retries", attempts, lastError: error.message, timestamp: new Date().toISOString() });
|
|
502
|
+
input.metricRegistry?.counter("crew.task.deadletter_total", "Deadletter triggers by reason").inc({ reason: "max-retries" });
|
|
503
|
+
input.metricRegistry?.histogram("crew.task.retry_count", "Retries per task", [0, 1, 2, 3, 5, 10]).observe({ runId: manifest.runId, team: input.team.name }, Math.max(0, attempts - 1));
|
|
504
|
+
},
|
|
505
|
+
});
|
|
506
|
+
} catch {
|
|
507
|
+
return lastFailed ?? withCorrelation(childCorrelation(manifest.runId, task.id), () => runTeamTask(baseInput));
|
|
508
|
+
}
|
|
457
509
|
},
|
|
458
510
|
);
|
|
459
511
|
manifest = { ...results.at(-1)!.manifest, artifacts: mergeArtifacts([manifest.artifacts, ...results.map((item) => item.manifest.artifacts)].flat()) };
|
|
@@ -81,6 +81,32 @@ export const PiTeamsNotificationsConfigSchema = Type.Object({
|
|
|
81
81
|
sinkRetentionDays: Type.Optional(Type.Integer({ minimum: 1, maximum: 90 })),
|
|
82
82
|
});
|
|
83
83
|
|
|
84
|
+
/** Schema for the optional `observability` config section; every field is optional. */
export const PiTeamsObservabilityConfigSchema = Type.Object({
  enabled: Type.Optional(Type.Boolean()),
  // Integer from 1000 to 60000.
  pollIntervalMs: Type.Optional(
    Type.Integer({ minimum: 1000, maximum: 60000 }),
  ),
  // Integer from 1 to 365.
  metricRetentionDays: Type.Optional(
    Type.Integer({ minimum: 1, maximum: 365 }),
  ),
});
|
|
89
|
+
|
|
90
|
+
/** Schema for the optional `reliability` config section; every field is optional. */
export const PiTeamsReliabilityConfigSchema = Type.Object({
  autoRetry: Type.Optional(Type.Boolean()),
  // Partial retry policy; each field carries its own numeric bounds.
  retryPolicy: Type.Optional(
    Type.Object({
      maxAttempts: Type.Optional(Type.Integer({ minimum: 1, maximum: 10 })),
      backoffMs: Type.Optional(Type.Integer({ minimum: 100, maximum: 60000 })),
      jitterRatio: Type.Optional(Type.Number({ minimum: 0, maximum: 1 })),
      exponentialFactor: Type.Optional(Type.Number({ minimum: 1, maximum: 5 })),
      // Non-empty pattern strings.
      retryableErrors: Type.Optional(
        Type.Array(Type.String({ minLength: 1 })),
      ),
    }),
  ),
  autoRecover: Type.Optional(Type.Boolean()),
  // Integer of at least 1; no upper bound is declared.
  deadletterThreshold: Type.Optional(Type.Integer({ minimum: 1 })),
});
|
|
102
|
+
|
|
103
|
+
/**
 * Schema for the optional `otlp` section of the top-level config.
 * Every field is optional.
 */
export const PiTeamsOtlpConfigSchema = Type.Object({
  enabled: Type.Optional(Type.Boolean()),
  // Non-empty string; no URL format is enforced by this schema.
  endpoint: Type.Optional(Type.String({ minLength: 1 })),
  // Map from non-empty string keys to string values.
  headers: Type.Optional(
    Type.Record(Type.String({ minLength: 1 }), Type.String()),
  ),
  // Integer >= 5000; no upper bound is declared.
  intervalMs: Type.Optional(Type.Integer({ minimum: 5_000 })),
});
|
|
109
|
+
|
|
84
110
|
export const PiTeamsUiConfigSchema = Type.Object({
|
|
85
111
|
widgetPlacement: Type.Optional(Type.Union([Type.Literal("aboveEditor"), Type.Literal("belowEditor")])),
|
|
86
112
|
widgetMaxLines: Type.Optional(Type.Integer({ minimum: 1 })),
|
|
@@ -112,5 +138,8 @@ export const PiTeamsConfigSchema = Type.Object({
|
|
|
112
138
|
tools: Type.Optional(PiTeamsToolsConfigSchema),
|
|
113
139
|
telemetry: Type.Optional(PiTeamsTelemetryConfigSchema),
|
|
114
140
|
notifications: Type.Optional(PiTeamsNotificationsConfigSchema),
|
|
141
|
+
observability: Type.Optional(PiTeamsObservabilityConfigSchema),
|
|
142
|
+
reliability: Type.Optional(PiTeamsReliabilityConfigSchema),
|
|
143
|
+
otlp: Type.Optional(PiTeamsOtlpConfigSchema),
|
|
115
144
|
ui: Type.Optional(PiTeamsUiConfigSchema),
|
|
116
145
|
});
|
package/src/state/event-log.ts
CHANGED
|
@@ -48,7 +48,7 @@ const MAX_EVENTS_BYTES = 50 * 1024 * 1024;
|
|
|
48
48
|
|
|
49
49
|
const sequenceCache = new Map<string, { size: number; mtimeMs: number; seq: number }>();
|
|
50
50
|
|
|
51
|
-
function sequencePath(eventsPath: string): string {
|
|
51
|
+
/**
 * Derives the companion `.seq` path for an events file.
 *
 * @param eventsPath Path of the events file.
 * @returns `eventsPath` with the literal suffix `.seq` appended.
 */
export function sequencePath(eventsPath: string): string {
  const suffix = ".seq";
  return eventsPath + suffix;
}
|
|
54
54
|
|
|
@@ -57,7 +57,8 @@ function parseSequence(raw: string): number | undefined {
|
|
|
57
57
|
return Number.isInteger(value) && value >= 0 ? value : undefined;
|
|
58
58
|
}
|
|
59
59
|
|
|
60
|
-
function scanSequence(eventsPath: string): number {
|
|
60
|
+
export function scanSequence(eventsPath: string): number {
|
|
61
|
+
if (!fs.existsSync(eventsPath)) return 0;
|
|
61
62
|
let max = 0;
|
|
62
63
|
for (const line of fs.readFileSync(eventsPath, "utf-8").split("\n")) {
|
|
63
64
|
if (!line.trim()) continue;
|
package/src/state/types.ts
CHANGED
|
@@ -138,6 +138,12 @@ export interface TaskCheckpointState {
|
|
|
138
138
|
childPid?: number;
|
|
139
139
|
}
|
|
140
140
|
|
|
141
|
+
/**
 * One entry of the optional `attempts` list carried by `TeamTaskState`.
 */
export interface TaskAttemptState {
  /** Start marker of the attempt (presumably a timestamp string; confirm format with the writer). */
  startedAt: string;
  /** End marker of the attempt; optional. */
  endedAt?: string;
  /** Error text associated with the attempt; optional. */
  error?: string;
}
|
|
146
|
+
|
|
141
147
|
export interface TeamTaskState {
|
|
142
148
|
id: string;
|
|
143
149
|
runId: string;
|
|
@@ -166,6 +172,7 @@ export interface TeamTaskState {
|
|
|
166
172
|
claim?: TaskClaimState;
|
|
167
173
|
heartbeat?: WorkerHeartbeatState;
|
|
168
174
|
checkpoint?: TaskCheckpointState;
|
|
175
|
+
attempts?: TaskAttemptState[];
|
|
169
176
|
taskPacket?: TaskPacket;
|
|
170
177
|
verification?: VerificationEvidence;
|
|
171
178
|
graph?: TaskGraphNode;
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import type { MetricRegistry } from "../../observability/metric-registry.ts";
|
|
2
|
+
import type { HistogramPoint, MetricLabels, MetricPoint } from "../../observability/metrics-primitives.ts";
|
|
3
|
+
import type { RunUiSnapshot } from "../snapshot-types.ts";
|
|
4
|
+
|
|
5
|
+
/** Options accepted by `renderMetricsPane`. */
export interface MetricsPaneOptions {
  /** Registry whose `snapshot()` supplies the metrics; when absent the pane reports it as unavailable. */
  registry?: MetricRegistry;
  /** Upper bound on the number of metric snapshots rendered; the renderer falls back to 10. */
  maxCounters?: number;
}
|
|
9
|
+
|
|
10
|
+
/**
 * Formats a label set as `{k1=v1,k2=v2}`.
 *
 * @param labels Label key/value pairs.
 * @returns The brace-wrapped, comma-joined pairs, or an empty string when
 *   there are no labels.
 */
function labelsText(labels: MetricLabels): string {
  const pairs: string[] = [];
  for (const [name, val] of Object.entries(labels)) {
    pairs.push(`${name}=${val}`);
  }
  if (pairs.length === 0) return "";
  return "{" + pairs.join(",") + "}";
}
|
|
14
|
+
|
|
15
|
+
/**
 * Type guard distinguishing histogram points from plain metric points.
 * A point is treated as a histogram point when it has a `quantiles` property.
 */
function isHistogramPoint(point: MetricPoint | HistogramPoint): point is HistogramPoint {
  const hasQuantiles = "quantiles" in point;
  return hasQuantiles;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Renders the metrics pane as a list of text lines.
 *
 * Only the first value of each metric snapshot is shown. Histogram points
 * render as `count=… p95=…` (p95 rounded, or `n/a` when not finite); other
 * points render their `value`.
 *
 * @param _snapshot Unused by this renderer.
 * @param opts Registry to read from and the maximum number of metrics to list
 *   (defaults to 10).
 * @returns A single status line when the registry is missing or empty,
 *   otherwise a header followed by one line per listed metric.
 */
export function renderMetricsPane(_snapshot: RunUiSnapshot | undefined, opts: MetricsPaneOptions = {}): string[] {
  const registry = opts.registry;
  if (!registry) return ["Metrics pane: registry unavailable"];

  const all = registry.snapshot();
  if (!all.length) return ["Metrics pane: no metrics recorded"];

  const limit = opts.maxCounters ?? 10;
  const rows = all.slice(0, limit).map((metric) => {
    const head = metric.values[0];
    if (!head) return `${metric.name}: empty`;

    const prefix = `${metric.name}${labelsText(head.labels)}`;
    if (!isHistogramPoint(head)) return `${prefix} ${head.value}`;

    const p95 = head.quantiles.p95;
    const shown = Number.isFinite(p95) ? Math.round(p95) : "n/a";
    return `${prefix} count=${head.count} p95=${shown}`;
  });

  return ["Metrics pane: top metrics", ...rows];
}
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import type { TeamTaskState } from "../state/types.ts";
|
|
2
|
+
import { classifyHeartbeat, heartbeatAgeMs } from "../runtime/heartbeat-gradient.ts";
|
|
3
|
+
import type { MetricRegistry } from "../observability/metric-registry.ts";
|
|
2
4
|
import type { RunUiSnapshot } from "./snapshot-types.ts";
|
|
3
5
|
|
|
4
6
|
export interface HeartbeatSummary {
|
|
@@ -9,12 +11,14 @@ export interface HeartbeatSummary {
|
|
|
9
11
|
dead: number;
|
|
10
12
|
missing: number;
|
|
11
13
|
worstStaleMs: number;
|
|
14
|
+
gradient: { healthy: number; warn: number; stale: number; dead: number };
|
|
12
15
|
}
|
|
13
16
|
|
|
14
17
|
/** Tuning knobs for `summarizeHeartbeats`. */
export interface HeartbeatSummaryOptions {
  /** Stale threshold in milliseconds; the summarizer falls back to 60_000. */
  staleMs?: number;
  /** Dead threshold in milliseconds; the summarizer falls back to 5 * 60_000. */
  deadMs?: number;
  /** Reference time used when computing heartbeat age, as epoch milliseconds or a `Date`. */
  now?: number | Date;
  /**
   * Optional registry; when present the summarizer records the
   * `crew.heartbeat.staleness_ms` gauge and `crew.heartbeat.level_total` counter.
   */
  registry?: MetricRegistry;
}
|
|
19
23
|
|
|
20
24
|
function nowMs(now: number | Date | undefined): number {
|
|
@@ -24,29 +28,35 @@ function nowMs(now: number | Date | undefined): number {
|
|
|
24
28
|
}
|
|
25
29
|
|
|
26
30
|
/**
 * Reports whether a task counts as active for heartbeat purposes.
 * Only tasks whose status is exactly `"running"` qualify.
 */
function isActiveTask(task: TeamTaskState): boolean {
  const { status } = task;
  return status === "running";
}
|
|
29
33
|
|
|
30
34
|
export function summarizeHeartbeats(snapshot: RunUiSnapshot, opts: HeartbeatSummaryOptions = {}): HeartbeatSummary {
|
|
31
35
|
const staleMs = opts.staleMs ?? 60_000;
|
|
32
36
|
const deadMs = opts.deadMs ?? 5 * 60_000;
|
|
33
37
|
const current = nowMs(opts.now);
|
|
34
|
-
const summary: HeartbeatSummary = { runId: snapshot.runId, totalTasks: snapshot.tasks.length, healthy: 0, stale: 0, dead: 0, missing: 0, worstStaleMs: 0 };
|
|
38
|
+
const summary: HeartbeatSummary = { runId: snapshot.runId, totalTasks: snapshot.tasks.length, healthy: 0, stale: 0, dead: 0, missing: 0, worstStaleMs: 0, gradient: { healthy: 0, warn: 0, stale: 0, dead: 0 } };
|
|
35
39
|
for (const task of snapshot.tasks) {
|
|
36
40
|
if (!isActiveTask(task)) continue;
|
|
37
41
|
const heartbeat = task.heartbeat;
|
|
38
42
|
if (!heartbeat) {
|
|
39
43
|
summary.missing += 1;
|
|
44
|
+
summary.gradient.dead += 1;
|
|
40
45
|
continue;
|
|
41
46
|
}
|
|
42
|
-
const age =
|
|
47
|
+
const age = heartbeatAgeMs(heartbeat, current);
|
|
43
48
|
if (!Number.isFinite(age)) {
|
|
44
49
|
summary.missing += 1;
|
|
50
|
+
summary.gradient.dead += 1;
|
|
45
51
|
continue;
|
|
46
52
|
}
|
|
47
53
|
summary.worstStaleMs = Math.max(summary.worstStaleMs, age);
|
|
48
|
-
|
|
49
|
-
|
|
54
|
+
const level = classifyHeartbeat(heartbeat, { warnMs: Math.max(1, Math.floor(staleMs / 2)), staleMs, deadMs }, current);
|
|
55
|
+
summary.gradient[level] += 1;
|
|
56
|
+
opts.registry?.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds").set({ runId: snapshot.runId, taskId: task.id }, age);
|
|
57
|
+
opts.registry?.counter("crew.heartbeat.level_total", "Heartbeat classifications by level").inc({ runId: snapshot.runId, level });
|
|
58
|
+
if (level === "dead") summary.dead += 1;
|
|
59
|
+
else if (level === "stale") summary.stale += 1;
|
|
50
60
|
else summary.healthy += 1;
|
|
51
61
|
}
|
|
52
62
|
return summary;
|