pi-crew 0.1.35 → 0.1.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +36 -0
  2. package/docs/architecture.md +8 -1
  3. package/docs/research-phase9-observability-reliability-plan.md +42 -42
  4. package/package.json +1 -1
  5. package/schema.json +42 -0
  6. package/src/config/config.ts +101 -0
  7. package/src/extension/register.ts +65 -2
  8. package/src/extension/registration/commands.ts +14 -3
  9. package/src/extension/registration/team-tool.ts +3 -1
  10. package/src/extension/team-tool/api.ts +27 -2
  11. package/src/extension/team-tool/context.ts +2 -0
  12. package/src/extension/team-tool/run.ts +2 -2
  13. package/src/extension/team-tool.ts +1 -1
  14. package/src/observability/correlation.ts +35 -0
  15. package/src/observability/event-to-metric.ts +54 -0
  16. package/src/observability/exporters/adapter.ts +24 -0
  17. package/src/observability/exporters/otlp-exporter.ts +65 -0
  18. package/src/observability/exporters/prometheus-exporter.ts +47 -0
  19. package/src/observability/metric-registry.ts +72 -0
  20. package/src/observability/metric-retention.ts +46 -0
  21. package/src/observability/metric-sink.ts +51 -0
  22. package/src/observability/metrics-primitives.ts +166 -0
  23. package/src/runtime/crash-recovery.ts +56 -0
  24. package/src/runtime/deadletter.ts +36 -0
  25. package/src/runtime/diagnostic-export.ts +8 -1
  26. package/src/runtime/heartbeat-gradient.ts +28 -0
  27. package/src/runtime/heartbeat-watcher.ts +80 -0
  28. package/src/runtime/retry-executor.ts +59 -0
  29. package/src/runtime/team-runner.ts +57 -5
  30. package/src/schema/config-schema.ts +29 -0
  31. package/src/state/event-log.ts +3 -2
  32. package/src/state/types.ts +7 -0
  33. package/src/ui/dashboard-panes/metrics-pane.ts +34 -0
  34. package/src/ui/heartbeat-aggregator.ts +14 -4
  35. package/src/ui/keybinding-map.ts +4 -2
  36. package/src/ui/run-action-dispatcher.ts +3 -2
  37. package/src/ui/run-dashboard.ts +11 -4
@@ -0,0 +1,56 @@
1
+ import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
2
+ import type { MetricRegistry } from "../observability/metric-registry.ts";
3
+ import { appendEvent, scanSequence } from "../state/event-log.ts";
4
+ import { loadRunManifestById, saveRunTasks, updateRunStatus } from "../state/state-store.ts";
5
+ import type { TeamTaskState } from "../state/types.ts";
6
+ import { isWorkerHeartbeatStale } from "./worker-heartbeat.ts";
7
+ import type { ManifestCache } from "./manifest-cache.ts";
8
+ import { checkProcessLiveness } from "./process-status.ts";
9
+
10
// Recovery proposal for one interrupted run; built by detectInterruptedRuns and consumed by applyRecoveryPlan / declineRecoveryPlan.
+ export interface RecoveryPlan {
11
// Id of the run the plan refers to.
+ runId: string;
12
// Ids of the tasks that applyRecoveryPlan resets to "queued".
+ resumableTasks: string[];
13
// Ids of the tasks that were already completed, failed, cancelled or skipped when the plan was built.
+ preservedTasks: string[];
14
// Value returned by scanSequence for the run's events file when the plan was built.
+ lastEventSeq: number;
15
+ }
16
+
17
/** Tells whether a task's status is one of the four final states (completed, failed, cancelled, skipped). */
function isTerminalTask(task: TeamTaskState): boolean {
  switch (task.status) {
    case "completed":
    case "failed":
    case "cancelled":
    case "skipped":
      return true;
    default:
      return false;
  }
}
20
+
21
/**
 * Decides whether a task should be offered for recovery.
 *
 * Only tasks whose status is "running" qualify. Such a task is recoverable when it has no
 * heartbeat at all, when its heartbeat is flagged `alive === false`, or when
 * isWorkerHeartbeatStale reports it stale for the given `deadMs`.
 */
function shouldRecoverTask(task: TeamTaskState, deadMs: number): boolean {
  if (task.status !== "running") return false;
  const beat = task.heartbeat;
  if (!beat) return true;
  if (beat.alive === false) return true;
  return isWorkerHeartbeatStale(beat, deadMs);
}
26
+
27
/**
 * Scans up to 50 cached manifests and builds a recovery plan for every run that looks interrupted.
 *
 * A run is considered when its cached status is "running" and, if it records an async pid,
 * checkProcessLiveness does not report that process as alive. Runs that cannot be loaded or that
 * have no recoverable task produce no plan.
 *
 * @param cwd Directory passed to loadRunManifestById.
 * @param manifestCache Source of the manifests to inspect.
 * @param deadMs Staleness limit forwarded to shouldRecoverTask (defaults to 300 000 ms).
 * @returns One plan per interrupted run, in the order the cache listed them.
 */
export function detectInterruptedRuns(cwd: string, manifestCache: ManifestCache, deadMs = 300_000): RecoveryPlan[] {
  const plans: RecoveryPlan[] = [];
  for (const manifest of manifestCache.list(50)) {
    const markedRunning = manifest.status === "running";
    if (!markedRunning) continue;

    // A recorded pid that is still alive means the run is not interrupted.
    const pid = manifest.async?.pid;
    const ownerAlive = pid !== undefined && checkProcessLiveness(pid).alive;
    if (ownerAlive) continue;

    const loaded = loadRunManifestById(cwd, manifest.runId);
    if (!loaded) continue;

    const resumableTasks: string[] = [];
    for (const task of loaded.tasks) {
      if (shouldRecoverTask(task, deadMs)) resumableTasks.push(task.id);
    }
    if (resumableTasks.length === 0) continue;

    const preservedTasks: string[] = [];
    for (const task of loaded.tasks) {
      if (isTerminalTask(task)) preservedTasks.push(task.id);
    }

    const lastEventSeq = scanSequence(loaded.manifest.eventsPath);
    plans.push({ runId: manifest.runId, resumableTasks, preservedTasks, lastEventSeq });
  }
  return plans;
}
40
+
41
/**
 * Applies a recovery plan: re-queues the plan's resumable tasks and records the resumption.
 *
 * Every task listed in `plan.resumableTasks` is reset to status "queued" with its start/finish
 * timestamps, error and heartbeat cleared; other tasks are kept as loaded. The task list is
 * saved, a "crew.run.resumed" event is appended, and, when a registry is given, the
 * "crew.run.count" counter is incremented with status "resumed".
 *
 * @throws Error when the run cannot be loaded (surfaces as a rejected promise).
 */
export async function applyRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">, registry?: MetricRegistry): Promise<void> {
  const loaded = loadRunManifestById(ctx.cwd, plan.runId);
  if (!loaded) throw new Error(`Run '${plan.runId}' not found.`);

  const resumable = new Set(plan.resumableTasks);
  const tasks = loaded.tasks.map((task) => {
    if (!resumable.has(task.id)) return task;
    return { ...task, status: "queued" as const, startedAt: undefined, finishedAt: undefined, error: undefined, heartbeat: undefined };
  });
  saveRunTasks(loaded.manifest, tasks);

  const message = `Recovered ${plan.resumableTasks.length} interrupted task(s).`;
  const data = { recoveredFromSeq: plan.lastEventSeq, resumableTasks: plan.resumableTasks };
  appendEvent(loaded.manifest.eventsPath, { type: "crew.run.resumed", runId: plan.runId, message, data });

  if (registry) {
    registry.counter("crew.run.count", "Total runs by status").inc({ status: "resumed" });
  }
}
50
+
51
/**
 * Records that an interrupted run will not be resumed.
 *
 * Marks the run "cancelled" with the reason "interrupted-not-resumed" and appends a
 * "crew.run.recovery_declined" event carrying the plan's last event sequence.
 *
 * @throws Error when the run cannot be loaded.
 */
export function declineRecoveryPlan(plan: RecoveryPlan, ctx: Pick<ExtensionContext, "cwd">): void {
  const { runId, lastEventSeq } = plan;
  const loaded = loadRunManifestById(ctx.cwd, runId);
  if (!loaded) {
    throw new Error(`Run '${plan.runId}' not found.`);
  }
  const { manifest } = loaded;
  updateRunStatus(manifest, "cancelled", "interrupted-not-resumed");
  appendEvent(manifest.eventsPath, {
    type: "crew.run.recovery_declined",
    runId,
    message: "Interrupted run was not resumed.",
    data: { recoveredFromSeq: lastEventSeq },
  });
}
@@ -0,0 +1,36 @@
1
+ import * as fs from "node:fs";
2
+ import * as path from "node:path";
3
+ import type { TeamRunManifest } from "../state/types.ts";
4
+
5
+ export type DeadletterReason = "max-retries" | "heartbeat-dead" | "manual";
6
+
7
// One record of the deadletter file; appendDeadletter writes each entry as a single JSON line.
+ export interface DeadletterEntry {
8
// Id of the task that was given up on.
+ taskId: string;
9
// Id of the run the task belongs to.
+ runId: string;
10
// Why the entry was written; one of the DeadletterReason values.
+ reason: DeadletterReason;
11
// Number of attempts made before the entry was written.
+ attempts: number;
12
// Message of the last error, when one is available.
+ lastError?: string;
13
// Time the entry was recorded; the writer visible in this diff (team-runner.ts) passes an ISO-8601 string.
+ timestamp: string;
14
+ }
15
+
16
/** Returns the location of the run's deadletter file: `deadletter.jsonl` inside the manifest's state root. */
export function deadletterPath(manifest: TeamRunManifest): string {
  const { stateRoot } = manifest;
  return path.join(stateRoot, "deadletter.jsonl");
}
19
+
20
/**
 * Appends one entry to the run's deadletter file as a single JSON line.
 * The state root directory is created first (recursively) if it does not exist.
 */
export function appendDeadletter(manifest: TeamRunManifest, entry: DeadletterEntry): void {
  const line = `${JSON.stringify(entry)}\n`;
  fs.mkdirSync(manifest.stateRoot, { recursive: true });
  const target = deadletterPath(manifest);
  fs.appendFileSync(target, line, "utf-8");
}
24
+
25
/**
 * Reads every valid entry from the run's deadletter file.
 *
 * Returns an empty list when the file does not exist. Blank lines, lines that are not valid
 * JSON, and values lacking a string `taskId` or string `runId` are skipped.
 */
export function readDeadletter(manifest: TeamRunManifest): DeadletterEntry[] {
  const filePath = deadletterPath(manifest);
  if (!fs.existsSync(filePath)) return [];

  const entries: DeadletterEntry[] = [];
  const lines = fs.readFileSync(filePath, "utf-8").split(/\r?\n/);
  for (const line of lines) {
    if (!line) continue;
    let parsed: DeadletterEntry;
    try {
      parsed = JSON.parse(line) as DeadletterEntry;
    } catch {
      // Malformed line: ignore it and keep reading.
      continue;
    }
    const hasIds = parsed && typeof parsed.taskId === "string" && typeof parsed.runId === "string";
    if (hasIds) entries.push(parsed);
  }
  return entries;
}
@@ -1,4 +1,6 @@
1
1
  import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
2
+ import type { MetricRegistry } from "../observability/metric-registry.ts";
3
+ import type { MetricSnapshot } from "../observability/metrics-primitives.ts";
2
4
  import * as fs from "node:fs";
3
5
  import * as path from "node:path";
4
6
  import { readCrewAgents } from "./crew-agent-records.ts";
@@ -9,6 +11,7 @@ import { summarizeHeartbeats, type HeartbeatSummary } from "../ui/heartbeat-aggr
9
11
  import type { RunUiSnapshot } from "../ui/snapshot-types.ts";
10
12
 
11
13
  export interface DiagnosticReport {
14
+ schemaVersion?: number;
12
15
  runId: string;
13
16
  exportedAt: string;
14
17
  manifest: TeamRunManifest;
@@ -17,6 +20,7 @@ export interface DiagnosticReport {
17
20
  heartbeat: HeartbeatSummary;
18
21
  agents: unknown[];
19
22
  envRedacted: Record<string, string>;
23
+ metricsSnapshot?: MetricSnapshot[];
20
24
  }
21
25
 
22
26
  const SECRET_KEY_PATTERN = /(token|key|password|secret|credential|auth)/i;
@@ -70,13 +74,15 @@ function buildSnapshot(manifest: TeamRunManifest, tasks: TeamTaskState[]): RunUi
70
74
  };
71
75
  }
72
76
 
73
- export async function exportDiagnostic(ctx: Pick<ExtensionContext, "cwd">, runId: string): Promise<{ path: string; report: DiagnosticReport }> {
77
+ export async function exportDiagnostic(ctx: Pick<ExtensionContext, "cwd">, runId: string, options: { registry?: MetricRegistry } = {}): Promise<{ path: string; report: DiagnosticReport }> {
74
78
  const loaded = loadRunManifestById(ctx.cwd, runId);
75
79
  if (!loaded) throw new Error(`Run '${runId}' not found.`);
76
80
  const exportedAt = new Date().toISOString();
77
81
  const safeTimestamp = exportedAt.replace(/[:.]/g, "-");
78
82
  const recentEvents = readEvents(loaded.manifest.eventsPath).slice(-200);
83
+ const metricsSnapshot = options.registry?.snapshot();
79
84
  const report: DiagnosticReport = {
85
+ ...(metricsSnapshot ? { schemaVersion: 2 } : {}),
80
86
  runId,
81
87
  exportedAt,
82
88
  manifest: redactSecrets(loaded.manifest) as TeamRunManifest,
@@ -85,6 +91,7 @@ export async function exportDiagnostic(ctx: Pick<ExtensionContext, "cwd">, runId
85
91
  heartbeat: summarizeHeartbeats(buildSnapshot(loaded.manifest, loaded.tasks)),
86
92
  agents: redactSecrets(readCrewAgents(loaded.manifest)) as unknown[],
87
93
  envRedacted: envRedacted(),
94
+ ...(metricsSnapshot ? { metricsSnapshot: redactSecrets(metricsSnapshot) as MetricSnapshot[] } : {}),
88
95
  };
89
96
  const dir = path.join(loaded.manifest.artifactsRoot, "diagnostic");
90
97
  fs.mkdirSync(dir, { recursive: true });
@@ -0,0 +1,28 @@
1
+ import type { WorkerHeartbeatState } from "./worker-heartbeat.ts";
2
+
3
+ export type HeartbeatLevel = "healthy" | "warn" | "stale" | "dead";
4
+
5
// Age limits, in milliseconds, that classifyHeartbeat compares a heartbeat's age against.
+ export interface GradientThresholds {
6
// An age strictly greater than this (and not above staleMs) is classified "warn".
+ warnMs: number;
7
// An age strictly greater than this (and not above deadMs) is classified "stale".
+ staleMs: number;
8
// An age strictly greater than this is classified "dead".
+ deadMs: number;
9
+ }
10
+
11
// Thresholds classifyHeartbeat uses when the caller passes none: warn above 30 s, stale above 60 s, dead above 300 s.
+ export const DEFAULT_GRADIENT_THRESHOLDS: GradientThresholds = { warnMs: 30_000, staleMs: 60_000, deadMs: 300_000 };
12
+
13
/**
 * Computes how long ago a heartbeat was last seen, in milliseconds.
 *
 * @param heartbeat Heartbeat to measure; may be absent.
 * @param now Reference time in epoch milliseconds (defaults to the current time).
 * @returns The non-negative age, or positive infinity when there is no heartbeat or its
 *          `lastSeenAt` cannot be parsed as a date.
 */
export function heartbeatAgeMs(heartbeat: WorkerHeartbeatState | undefined, now = Date.now()): number {
  const seenAt = heartbeat ? Date.parse(heartbeat.lastSeenAt) : Number.NaN;
  if (!Number.isFinite(seenAt)) return Number.POSITIVE_INFINITY;
  // Timestamps in the future are clamped to an age of zero.
  return Math.max(0, now - seenAt);
}
18
+
19
/**
 * Maps a heartbeat to a health level.
 *
 * A missing heartbeat, a heartbeat flagged `alive === false`, a non-finite age, or an age above
 * `deadMs` is "dead". Otherwise an age above `staleMs` is "stale", above `warnMs` is "warn",
 * and anything else is "healthy". All comparisons are strict.
 *
 * @param heartbeat Heartbeat to classify; may be absent.
 * @param thresholds Age limits in milliseconds (defaults to DEFAULT_GRADIENT_THRESHOLDS).
 * @param now Reference time in epoch milliseconds (defaults to the current time).
 */
export function classifyHeartbeat(heartbeat: WorkerHeartbeatState | undefined, thresholds: GradientThresholds = DEFAULT_GRADIENT_THRESHOLDS, now = Date.now()): HeartbeatLevel {
  if (!heartbeat || heartbeat.alive === false) return "dead";
  const age = heartbeatAgeMs(heartbeat, now);
  if (!Number.isFinite(age) || age > thresholds.deadMs) return "dead";
  if (age > thresholds.staleMs) return "stale";
  return age > thresholds.warnMs ? "warn" : "healthy";
}
@@ -0,0 +1,80 @@
1
+ import type { NotificationDescriptor } from "../extension/notification-router.ts";
2
+ import type { MetricRegistry } from "../observability/metric-registry.ts";
3
+ import { appendEvent } from "../state/event-log.ts";
4
+ import { loadRunManifestById } from "../state/state-store.ts";
5
+ import type { TeamRunManifest } from "../state/types.ts";
6
+ import type { ManifestCache } from "./manifest-cache.ts";
7
+ import { classifyHeartbeat, DEFAULT_GRADIENT_THRESHOLDS, heartbeatAgeMs, type GradientThresholds, type HeartbeatLevel } from "./heartbeat-gradient.ts";
8
+
9
+ export interface HeartbeatWatcherRouter {
10
+ enqueue(notification: NotificationDescriptor): boolean;
11
+ }
12
+
13
// Configuration accepted by the HeartbeatWatcher constructor.
+ export interface HeartbeatWatcherOptions {
14
// Directory passed to loadRunManifestById when a run is loaded.
+ cwd: string;
15
// Delay between ticks once start() is called; 5000 ms when omitted.
+ pollIntervalMs?: number;
16
// Age limits used for classification; DEFAULT_GRADIENT_THRESHOLDS when omitted.
+ thresholds?: GradientThresholds;
17
// Source of the manifests inspected on every tick (the watcher asks for up to 50).
+ manifestCache: ManifestCache;
18
// Registry that receives the heartbeat gauge and counters.
+ registry: MetricRegistry;
19
// Receives a warning notification when a task is first classified dead.
+ router: HeartbeatWatcherRouter;
20
// Number of consecutive dead ticks at which onDeadletterTrigger fires; 3 when omitted.
+ deadletterTickThreshold?: number;
21
// Called when a task transitions to dead; `elapsed` falls back to thresholds.deadMs when the age is not finite.
+ onDead?: (runId: string, taskId: string, elapsed: number) => void;
22
// Called once when a task's consecutive dead count equals the tick threshold.
+ onDeadletterTrigger?: (manifest: TeamRunManifest, taskId: string) => void;
23
+ }
24
+
25
/**
 * Polls running runs and reports tasks whose heartbeat is classified as dead.
 *
 * On every tick each running or queued task of each running run is classified; the age gauge and
 * level counter are updated, the first transition to "dead" is announced (metric, event,
 * notification, `onDead`), and `onDeadletterTrigger` fires once the task has been dead for the
 * configured number of consecutive ticks.
 */
export class HeartbeatWatcher {
  private timer?: ReturnType<typeof setInterval>;
  private lastLevel = new Map<string, HeartbeatLevel>();
  private consecutiveDead = new Map<string, number>();
  private readonly opts: HeartbeatWatcherOptions;

  constructor(opts: HeartbeatWatcherOptions) {
    this.opts = opts;
  }

  /** Restarts polling: any previous timer and tracked state are discarded first. */
  start(): void {
    this.dispose();
    const handle = setInterval(() => this.tick(), this.opts.pollIntervalMs ?? 5000);
    // Where the timer supports it, unref it.
    handle.unref?.();
    this.timer = handle;
  }

  /**
   * Runs one inspection pass.
   *
   * @param now Reference time in epoch milliseconds (defaults to the current time).
   */
  tick(now = Date.now()): void {
    const thresholds = this.opts.thresholds ?? DEFAULT_GRADIENT_THRESHOLDS;
    const tickThreshold = this.opts.deadletterTickThreshold ?? 3;
    for (const run of this.opts.manifestCache.list(50)) {
      if (run.status !== "running") continue;
      const loaded = loadRunManifestById(this.opts.cwd, run.runId);
      if (!loaded) continue;
      for (const task of loaded.tasks) {
        const watched = task.status === "running" || task.status === "queued";
        if (!watched) continue;

        const key = `${run.runId}:${task.id}`;
        const elapsed = heartbeatAgeMs(task.heartbeat, now);
        const level = classifyHeartbeat(task.heartbeat, thresholds, now);
        // A non-finite age is reported as the dead threshold.
        const reportedMs = Number.isFinite(elapsed) ? elapsed : thresholds.deadMs;

        this.opts.registry.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds").set({ runId: run.runId, taskId: task.id }, reportedMs);
        this.opts.registry.counter("crew.heartbeat.level_total", "Heartbeat classifications by level").inc({ runId: run.runId, level });

        const previous = this.lastLevel.get(key);
        this.lastLevel.set(key, level);

        if (level !== "dead") {
          this.consecutiveDead.delete(key);
          continue;
        }
        if (previous !== "dead") {
          this.announceDead(run.runId, loaded.manifest.eventsPath, task.id, elapsed, reportedMs);
        }
        this.bumpDeadStreak(key, tickThreshold, loaded.manifest, task.id);
      }
    }
  }

  /** Stops polling and forgets all per-task state. */
  dispose(): void {
    if (this.timer) clearInterval(this.timer);
    this.timer = undefined;
    this.lastLevel.clear();
    this.consecutiveDead.clear();
  }

  /** Emits the metric, event, notification and callback for a task that has just turned dead. */
  private announceDead(runId: string, eventsPath: string, taskId: string, elapsed: number, reportedMs: number): void {
    this.opts.registry.counter("crew.heartbeat.dead_total", "Dead heartbeat detections").inc({ runId });
    appendEvent(eventsPath, { type: "crew.task.heartbeat_dead", runId, taskId, message: `Task ${taskId} heartbeat dead.`, data: { elapsedMs: Number.isFinite(elapsed) ? elapsed : undefined } });
    this.opts.router.enqueue({ id: `dead_${runId}_${taskId}`, severity: "warning", source: "heartbeat-watcher", runId, title: `Task ${taskId} heartbeat dead`, body: "Background watcher detected a stuck worker." });
    this.opts.onDead?.(runId, taskId, reportedMs);
  }

  /** Counts one more consecutive dead tick and fires the deadletter callback exactly at the threshold. */
  private bumpDeadStreak(key: string, tickThreshold: number, manifest: TeamRunManifest, taskId: string): void {
    const streak = (this.consecutiveDead.get(key) ?? 0) + 1;
    this.consecutiveDead.set(key, streak);
    if (streak === tickThreshold) this.opts.onDeadletterTrigger?.(manifest, taskId);
  }
}
@@ -0,0 +1,59 @@
1
+ import { sleep } from "../utils/sleep.ts";
2
+
3
// Settings that control how executeWithRetry retries and how calculateRetryDelay spaces attempts.
+ export interface RetryPolicy {
4
// Maximum number of calls to the wrapped function; executeWithRetry raises values below 1 to 1.
+ maxAttempts: number;
5
// Base delay in milliseconds after the first failed attempt.
+ backoffMs: number;
6
// Fraction of the base delay by which the actual delay may be shifted up or down.
+ jitterRatio: number;
7
// Multiplier applied to the base delay for each further attempt.
+ exponentialFactor: number;
8
// Glob patterns tested case-insensitively against the error message; when absent or empty every error is retryable.
+ retryableErrors?: string[];
9
+ }
10
+
11
// Optional callbacks and cancellation signal for executeWithRetry.
+ export interface RetryHooks {
12
// Called after a failed attempt that will be retried, before the backoff sleep, with the computed delay.
+ onAttemptFailed?: (attempt: number, error: Error, nextDelayMs: number) => void;
13
// Called just before the last error is rethrown, when attempts are exhausted or the error is not retryable.
+ onRetryGivenUp?: (attempts: number, error: Error) => void;
14
// Checked before every attempt (an aborted signal makes executeWithRetry throw) and passed to sleep.
+ signal?: AbortSignal;
15
+ }
16
+
17
// Policy used when none is supplied: 3 attempts, 1000 ms base backoff doubling per attempt, jitter ratio 0.3.
+ export const DEFAULT_RETRY_POLICY: RetryPolicy = { maxAttempts: 3, backoffMs: 1000, jitterRatio: 0.3, exponentialFactor: 2 };
18
+
19
/** Returns the value unchanged when it is already an Error; otherwise wraps its string form in a new Error. */
function asError(error: unknown): Error {
  if (error instanceof Error) {
    return error;
  }
  const message = String(error);
  return new Error(message);
}
22
+
23
/**
 * Converts a glob-style pattern into a case-insensitive regular expression that must match
 * the whole input.
 *
 * `*` matches any run of characters and `?` matches exactly one character; both also match
 * line breaks, so multi-line error messages can be matched. Every other character is literal.
 *
 * @param pattern Glob pattern, e.g. `*timeout*` or `ECONN*`.
 * @returns Anchored, case-insensitive RegExp equivalent to the pattern.
 */
function globToRegex(pattern: string): RegExp {
  const source = pattern
    // Neutralise regex metacharacters; `*` and `?` are handled separately below.
    .replace(/[.+^${}()|[\]\\]/g, "\\$&")
    // `[\s\S]` instead of `.` so wildcards can span newlines.
    .replace(/\*/g, "[\\s\\S]*")
    // Left as-is, `?` would act as a regex quantifier (or a syntax error at the start).
    .replace(/\?/g, "[\\s\\S]");
  return new RegExp(`^${source}$`, "i");
}
27
+
28
/**
 * Decides whether an error may be retried under the given policy.
 *
 * With no `retryableErrors` patterns every error is retryable; otherwise the error message
 * must match at least one pattern (as compiled by globToRegex).
 */
function isRetryable(error: Error, policy: RetryPolicy): boolean {
  const patterns = policy.retryableErrors ?? [];
  if (patterns.length === 0) return true;
  for (const pattern of patterns) {
    if (globToRegex(pattern).test(error.message)) return true;
  }
  return false;
}
33
+
34
/**
 * Computes the backoff delay, in milliseconds, to wait after a failed attempt.
 *
 * The base delay is `backoffMs * exponentialFactor^(attempt - 1)` (the exponent never goes below
 * zero). A jitter of up to `jitterRatio` times the base, in either direction, is then added.
 *
 * @param attempt 1-based number of the attempt that just failed.
 * @param policy Retry settings (defaults to DEFAULT_RETRY_POLICY).
 * @param random Source of numbers in [0, 1); injectable for deterministic tests.
 * @returns The delay, never negative.
 */
export function calculateRetryDelay(attempt: number, policy: RetryPolicy = DEFAULT_RETRY_POLICY, random = Math.random): number {
  const exponent = Math.max(0, attempt - 1);
  const base = policy.backoffMs * policy.exponentialFactor ** exponent;
  // Map the random sample from [0, 1) onto [-1, 1) before scaling.
  const direction = random() * 2 - 1;
  const jitter = direction * policy.jitterRatio * base;
  return Math.max(0, base + jitter);
}
39
+
40
/**
 * Runs `fn`, retrying with backoff according to the policy.
 *
 * The policy is merged over DEFAULT_RETRY_POLICY and `maxAttempts` is raised to at least 1.
 * Before each attempt an aborted `hooks.signal` makes the call throw "Retry aborted.". When an
 * attempt fails and attempts remain and the error is retryable, `onAttemptFailed` is called and
 * the function sleeps for the computed delay; otherwise `onRetryGivenUp` is called and the
 * error is rethrown.
 *
 * @param fn Operation to run; receives the 1-based attempt number.
 * @param policy Retry settings (defaults to DEFAULT_RETRY_POLICY).
 * @param hooks Optional callbacks and abort signal.
 * @returns The value of the first successful attempt.
 * @throws The last error (converted to an Error) once retrying stops, or an abort error.
 */
export async function executeWithRetry<T>(fn: (attempt: number) => Promise<T>, policy: RetryPolicy = DEFAULT_RETRY_POLICY, hooks: RetryHooks = {}): Promise<T> {
  const attemptLimit = Math.max(1, policy.maxAttempts ?? DEFAULT_RETRY_POLICY.maxAttempts);
  const normalized: RetryPolicy = { ...DEFAULT_RETRY_POLICY, ...policy, maxAttempts: attemptLimit };
  let lastError: Error | undefined;
  let attempt = 1;
  while (attempt <= normalized.maxAttempts) {
    if (hooks.signal?.aborted) throw new Error("Retry aborted.");
    try {
      return await fn(attempt);
    } catch (error) {
      lastError = asError(error);
      const exhausted = attempt >= normalized.maxAttempts;
      if (exhausted || !isRetryable(lastError, normalized)) {
        hooks.onRetryGivenUp?.(attempt, lastError);
        throw lastError;
      }
      const delay = calculateRetryDelay(attempt, normalized);
      hooks.onAttemptFailed?.(attempt, lastError, delay);
      await sleep(delay, hooks.signal);
    }
    attempt += 1;
  }
  // Only reachable when the loop body never ran.
  throw lastError ?? new Error("Retry failed without error.");
}
@@ -1,11 +1,11 @@
1
1
  import * as fs from "node:fs";
2
2
  import type { AgentConfig } from "../agents/agent-config.ts";
3
- import type { CrewLimitsConfig, CrewRuntimeConfig } from "../config/config.ts";
3
+ import type { CrewLimitsConfig, CrewRuntimeConfig, CrewReliabilityConfig } from "../config/config.ts";
4
4
  import type { CrewRuntimeCapabilities } from "./runtime-resolver.ts";
5
5
  import { writeArtifact } from "../state/artifact-store.ts";
6
6
  import { appendEvent } from "../state/event-log.ts";
7
7
  import type { TeamConfig } from "../teams/team-config.ts";
8
- import type { ArtifactDescriptor, PolicyDecision, TeamRunManifest, TeamTaskState } from "../state/types.ts";
8
+ import type { ArtifactDescriptor, PolicyDecision, TeamRunManifest, TaskAttemptState, TeamTaskState } from "../state/types.ts";
9
9
  import { saveRunManifest, saveRunManifestAsync, saveRunTasksAsync, updateRunStatus } from "../state/state-store.ts";
10
10
  import { aggregateUsage, formatUsage } from "../state/usage.ts";
11
11
  import type { WorkflowConfig, WorkflowStep } from "../workflows/workflow-config.ts";
@@ -18,6 +18,10 @@ import { saveCrewAgents } from "./crew-agent-records.ts";
18
18
  import { recordsForMaterializedTasks } from "./task-display.ts";
19
19
  import { deliverGroupJoin, resolveGroupJoinMode } from "./group-join.ts";
20
20
  import { runTeamTask } from "./task-runner.ts";
21
+ import { executeWithRetry, DEFAULT_RETRY_POLICY, type RetryPolicy } from "./retry-executor.ts";
22
+ import { appendDeadletter } from "./deadletter.ts";
23
+ import type { MetricRegistry } from "../observability/metric-registry.ts";
24
+ import { childCorrelation, withCorrelation } from "../observability/correlation.ts";
21
25
  import { resolveBatchConcurrency } from "./concurrency.ts";
22
26
  import { mapConcurrent } from "./parallel-utils.ts";
23
27
 
@@ -36,6 +40,8 @@ export interface ExecuteTeamRunInput {
36
40
  modelRegistry?: unknown;
37
41
  modelOverride?: string;
38
42
  signal?: AbortSignal;
43
+ reliability?: CrewReliabilityConfig;
44
+ metricRegistry?: MetricRegistry;
39
45
  }
40
46
 
41
47
  function findReadyTask(tasks: TeamTaskState[]): TeamTaskState | undefined {
@@ -73,7 +79,7 @@ function shouldMergeTaskUpdate(current: TeamTaskState, updated: TeamTaskState):
73
79
  // contain stale queued/running copies of tasks that another worker already
74
80
  // completed. Never let those stale snapshots regress durable task state.
75
81
  if (!isNonTerminalTaskStatus(current.status) && isNonTerminalTaskStatus(updated.status)) return false;
76
- return updated.status !== current.status || updated.finishedAt !== current.finishedAt || updated.startedAt !== current.startedAt || Boolean(updated.resultArtifact) || Boolean(updated.error) || Boolean(updated.modelAttempts?.length) || Boolean(updated.usage);
82
+ return updated.status !== current.status || updated.finishedAt !== current.finishedAt || updated.startedAt !== current.startedAt || Boolean(updated.resultArtifact) || Boolean(updated.error) || Boolean(updated.modelAttempts?.length) || Boolean(updated.usage) || Boolean(updated.attempts?.length);
77
83
  }
78
84
 
79
85
  export function __test__mergeTaskUpdates(base: TeamTaskState[], results: Array<{ tasks: TeamTaskState[] }>): TeamTaskState[] {
@@ -384,6 +390,14 @@ function applyPolicy(manifest: TeamRunManifest, tasks: TeamTaskState[], limits?:
384
390
  return { ...manifest, updatedAt: new Date().toISOString(), policyDecisions: decisions, artifacts: [...manifest.artifacts.filter((artifact) => !(artifact.kind === "metadata" && (artifact.path.endsWith("policy-decisions.json") || artifact.path.endsWith("recovery-ledger.json") || artifact.path.endsWith("branch-freshness.json")))), branchArtifact, policyArtifact, recoveryArtifact] };
385
391
  }
386
392
 
393
/** Builds the effective retry policy: DEFAULT_RETRY_POLICY overlaid with the config's `retryPolicy`, if any. */
function retryPolicyFromConfig(config: CrewReliabilityConfig | undefined): RetryPolicy {
  const overrides = config?.retryPolicy ?? {};
  return { ...DEFAULT_RETRY_POLICY, ...overrides };
}
396
+
397
/** Returns the first task in the result with the given id and status "failed", or undefined when there is none. */
function failedTaskFrom(result: { tasks: TeamTaskState[] }, taskId: string): TeamTaskState | undefined {
  for (const candidate of result.tasks) {
    if (candidate.id === taskId && candidate.status === "failed") return candidate;
  }
  return undefined;
}
400
+
387
401
  export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ manifest: TeamRunManifest; tasks: TeamTaskState[] }> {
388
402
  let workflow = input.workflow;
389
403
  let manifest = updateRunStatus(input.manifest, "running", input.executeWorkers ? "Executing team workflow." : "Creating workflow prompts and placeholder results.");
@@ -450,10 +464,48 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
450
464
  const results = await mapConcurrent(
451
465
  readyBatch,
452
466
  concurrency.selectedCount,
453
- (task) => {
467
+ async (task) => {
454
468
  const step = findStep(workflow, task);
455
469
  const agent = findAgent(input.agents, task);
456
- return runTeamTask({ manifest, tasks, task, step, agent, signal: input.signal, executeWorkers: input.executeWorkers, runtimeKind: input.runtime?.kind, runtimeConfig: input.runtimeConfig, parentContext: input.parentContext, parentModel: input.parentModel, modelRegistry: input.modelRegistry, modelOverride: input.modelOverride, limits: input.limits });
470
+ const baseInput = { manifest, tasks, task, step, agent, signal: input.signal, executeWorkers: input.executeWorkers, runtimeKind: input.runtime?.kind, runtimeConfig: input.runtimeConfig, parentContext: input.parentContext, parentModel: input.parentModel, modelRegistry: input.modelRegistry, modelOverride: input.modelOverride, limits: input.limits };
471
+ if (input.reliability?.autoRetry !== true) return withCorrelation(childCorrelation(manifest.runId, task.id), () => runTeamTask(baseInput));
472
+ let lastFailed: { manifest: TeamRunManifest; tasks: TeamTaskState[] } | undefined;
473
+ const attemptsSoFar: TaskAttemptState[] = [...(task.attempts ?? [])];
474
+ const policy = retryPolicyFromConfig(input.reliability);
475
+ try {
476
+ return await executeWithRetry(async (attempt) => {
477
+ const startedAt = new Date().toISOString();
478
+ const inFlightAttempts: TaskAttemptState[] = [...attemptsSoFar, { startedAt }];
479
+ input.metricRegistry?.counter("crew.task.retry_attempt_total", "Retry attempts by run and task").inc({ runId: manifest.runId, taskId: task.id });
480
+ const taskWithAttempt: TeamTaskState = { ...task, attempts: inFlightAttempts };
481
+ const result = await withCorrelation(childCorrelation(manifest.runId, task.id), () => runTeamTask({ ...baseInput, task: taskWithAttempt }));
482
+ const failed = failedTaskFrom(result, task.id);
483
+ const endedAt = new Date().toISOString();
484
+ const finishedAttempt: TaskAttemptState = { startedAt, endedAt, ...(failed?.error ? { error: failed.error } : {}) };
485
+ attemptsSoFar.push(finishedAttempt);
486
+ const withAttempt = result.tasks.map((item) => item.id === task.id ? { ...item, attempts: [...attemptsSoFar] } : item);
487
+ const enriched = { manifest: result.manifest, tasks: withAttempt };
488
+ if (failed) {
489
+ lastFailed = enriched;
490
+ throw new Error(failed.error ?? `Task ${task.id} failed.`);
491
+ }
492
+ input.metricRegistry?.histogram("crew.task.retry_count", "Retries per task", [0, 1, 2, 3, 5, 10]).observe({ runId: manifest.runId, team: input.team.name }, Math.max(0, attempt - 1));
493
+ return enriched;
494
+ }, policy, {
495
+ signal: input.signal,
496
+ onAttemptFailed: (attempt, error, delayMs) => {
497
+ appendEvent(manifest.eventsPath, { type: "crew.task.retry_attempt", runId: manifest.runId, taskId: task.id, message: error.message, data: { attempt, delayMs } });
498
+ input.metricRegistry?.histogram("crew.task.retry_delay_ms", "Retry backoff delay, milliseconds").observe({ runId: manifest.runId, taskId: task.id }, delayMs);
499
+ },
500
+ onRetryGivenUp: (attempts, error) => {
501
+ appendDeadletter(manifest, { runId: manifest.runId, taskId: task.id, reason: "max-retries", attempts, lastError: error.message, timestamp: new Date().toISOString() });
502
+ input.metricRegistry?.counter("crew.task.deadletter_total", "Deadletter triggers by reason").inc({ reason: "max-retries" });
503
+ input.metricRegistry?.histogram("crew.task.retry_count", "Retries per task", [0, 1, 2, 3, 5, 10]).observe({ runId: manifest.runId, team: input.team.name }, Math.max(0, attempts - 1));
504
+ },
505
+ });
506
+ } catch {
507
+ return lastFailed ?? withCorrelation(childCorrelation(manifest.runId, task.id), () => runTeamTask(baseInput));
508
+ }
457
509
  },
458
510
  );
459
511
  manifest = { ...results.at(-1)!.manifest, artifacts: mergeArtifacts([manifest.artifacts, ...results.map((item) => item.manifest.artifacts)].flat()) };
@@ -81,6 +81,32 @@ export const PiTeamsNotificationsConfigSchema = Type.Object({
81
81
  sinkRetentionDays: Type.Optional(Type.Integer({ minimum: 1, maximum: 90 })),
82
82
  });
83
83
 
84
// Schema of the optional `observability` section of the config; every field is optional.
+ export const PiTeamsObservabilityConfigSchema = Type.Object({
85
+ enabled: Type.Optional(Type.Boolean()),
86
// Integer between 1000 and 60000 (presumably milliseconds, per the name).
+ pollIntervalMs: Type.Optional(Type.Integer({ minimum: 1000, maximum: 60000 })),
87
// Integer between 1 and 365.
+ metricRetentionDays: Type.Optional(Type.Integer({ minimum: 1, maximum: 365 })),
88
+ });
89
+
90
// Schema of the optional `reliability` section of the config; every field is optional.
+ export const PiTeamsReliabilityConfigSchema = Type.Object({
91
+ autoRetry: Type.Optional(Type.Boolean()),
92
// Partial retry policy; each key is optional and range-checked.
+ retryPolicy: Type.Optional(Type.Object({
93
+ maxAttempts: Type.Optional(Type.Integer({ minimum: 1, maximum: 10 })),
94
+ backoffMs: Type.Optional(Type.Integer({ minimum: 100, maximum: 60000 })),
95
+ jitterRatio: Type.Optional(Type.Number({ minimum: 0, maximum: 1 })),
96
+ exponentialFactor: Type.Optional(Type.Number({ minimum: 1, maximum: 5 })),
97
// Non-empty strings only.
+ retryableErrors: Type.Optional(Type.Array(Type.String({ minLength: 1 }))),
98
+ })),
99
+ autoRecover: Type.Optional(Type.Boolean()),
100
// Integer of at least 1; no upper bound is enforced.
+ deadletterThreshold: Type.Optional(Type.Integer({ minimum: 1 })),
101
+ });
102
+
103
// Schema of the optional `otlp` section of the config; every field is optional.
+ export const PiTeamsOtlpConfigSchema = Type.Object({
104
+ enabled: Type.Optional(Type.Boolean()),
105
// Non-empty string.
+ endpoint: Type.Optional(Type.String({ minLength: 1 })),
106
// String-to-string map whose keys must be non-empty.
+ headers: Type.Optional(Type.Record(Type.String({ minLength: 1 }), Type.String())),
107
// Integer of at least 5000 (presumably milliseconds, per the name).
+ intervalMs: Type.Optional(Type.Integer({ minimum: 5000 })),
108
+ });
109
+
84
110
  export const PiTeamsUiConfigSchema = Type.Object({
85
111
  widgetPlacement: Type.Optional(Type.Union([Type.Literal("aboveEditor"), Type.Literal("belowEditor")])),
86
112
  widgetMaxLines: Type.Optional(Type.Integer({ minimum: 1 })),
@@ -112,5 +138,8 @@ export const PiTeamsConfigSchema = Type.Object({
112
138
  tools: Type.Optional(PiTeamsToolsConfigSchema),
113
139
  telemetry: Type.Optional(PiTeamsTelemetryConfigSchema),
114
140
  notifications: Type.Optional(PiTeamsNotificationsConfigSchema),
141
+ observability: Type.Optional(PiTeamsObservabilityConfigSchema),
142
+ reliability: Type.Optional(PiTeamsReliabilityConfigSchema),
143
+ otlp: Type.Optional(PiTeamsOtlpConfigSchema),
115
144
  ui: Type.Optional(PiTeamsUiConfigSchema),
116
145
  });
@@ -48,7 +48,7 @@ const MAX_EVENTS_BYTES = 50 * 1024 * 1024;
48
48
 
49
49
  const sequenceCache = new Map<string, { size: number; mtimeMs: number; seq: number }>();
50
50
 
51
- function sequencePath(eventsPath: string): string {
51
+ export function sequencePath(eventsPath: string): string {
52
52
  return `${eventsPath}.seq`;
53
53
  }
54
54
 
@@ -57,7 +57,8 @@ function parseSequence(raw: string): number | undefined {
57
57
  return Number.isInteger(value) && value >= 0 ? value : undefined;
58
58
  }
59
59
 
60
- function scanSequence(eventsPath: string): number {
60
+ export function scanSequence(eventsPath: string): number {
61
+ if (!fs.existsSync(eventsPath)) return 0;
61
62
  let max = 0;
62
63
  for (const line of fs.readFileSync(eventsPath, "utf-8").split("\n")) {
63
64
  if (!line.trim()) continue;
@@ -138,6 +138,12 @@ export interface TaskCheckpointState {
138
138
  childPid?: number;
139
139
  }
140
140
 
141
// Record of one execution attempt of a task; stored in TeamTaskState.attempts.
+ export interface TaskAttemptState {
142
// When the attempt started; the writer visible in this diff (team-runner.ts) stores an ISO-8601 string.
+ startedAt: string;
143
// When the attempt ended; absent while the attempt is still in flight.
+ endedAt?: string;
144
// Error text of the attempt, set when the attempt ended with a failed task that carried an error.
+ error?: string;
145
+ }
146
+
141
147
  export interface TeamTaskState {
142
148
  id: string;
143
149
  runId: string;
@@ -166,6 +172,7 @@ export interface TeamTaskState {
166
172
  claim?: TaskClaimState;
167
173
  heartbeat?: WorkerHeartbeatState;
168
174
  checkpoint?: TaskCheckpointState;
175
+ attempts?: TaskAttemptState[];
169
176
  taskPacket?: TaskPacket;
170
177
  verification?: VerificationEvidence;
171
178
  graph?: TaskGraphNode;
@@ -0,0 +1,34 @@
1
+ import type { MetricRegistry } from "../../observability/metric-registry.ts";
2
+ import type { HistogramPoint, MetricLabels, MetricPoint } from "../../observability/metrics-primitives.ts";
3
+ import type { RunUiSnapshot } from "../snapshot-types.ts";
4
+
5
// Options accepted by renderMetricsPane.
+ export interface MetricsPaneOptions {
6
// Registry whose snapshot is rendered; without it the pane reports that the registry is unavailable.
+ registry?: MetricRegistry;
7
// Maximum number of metric snapshots rendered; 10 when omitted.
+ maxCounters?: number;
8
+ }
9
+
10
/** Formats labels as `{key=value,key=value}` in entry order, or as an empty string when there are none. */
function labelsText(labels: MetricLabels): string {
  const pairs = Object.entries(labels).map(([key, value]) => `${key}=${value}`);
  if (pairs.length === 0) return "";
  return `{${pairs.join(",")}}`;
}
14
+
15
/** Type guard: a point is treated as a histogram point when it has a `quantiles` property. */
function isHistogramPoint(point: MetricPoint | HistogramPoint): point is HistogramPoint {
  return Reflect.has(point, "quantiles");
}
18
+
19
/**
 * Renders the metrics pane as a list of text lines.
 *
 * Without a registry, or with an empty snapshot, a single explanatory line is returned.
 * Otherwise a heading is followed by one line per metric (at most `opts.maxCounters`, 10 by
 * default), showing only the metric's first value: `count` and rounded `p95` for histogram
 * points (`n/a` when p95 is not finite), the plain value otherwise, or `empty` when the metric
 * has no values.
 *
 * @param _snapshot Unused.
 * @param opts Registry and display limit.
 */
export function renderMetricsPane(_snapshot: RunUiSnapshot | undefined, opts: MetricsPaneOptions = {}): string[] {
  const registry = opts.registry;
  if (!registry) return ["Metrics pane: registry unavailable"];

  const snapshots = registry.snapshot();
  if (!snapshots.length) return ["Metrics pane: no metrics recorded"];

  const limit = opts.maxCounters ?? 10;
  const rows = snapshots.slice(0, limit).map((snapshot) => {
    const first = snapshot.values[0];
    if (!first) return `${snapshot.name}: empty`;
    if (isHistogramPoint(first)) {
      const p95 = Number.isFinite(first.quantiles.p95) ? Math.round(first.quantiles.p95) : "n/a";
      return `${snapshot.name}${labelsText(first.labels)} count=${first.count} p95=${p95}`;
    }
    return `${snapshot.name}${labelsText(first.labels)} ${first.value}`;
  });
  return ["Metrics pane: top metrics", ...rows];
}
@@ -1,4 +1,6 @@
1
1
  import type { TeamTaskState } from "../state/types.ts";
2
+ import { classifyHeartbeat, heartbeatAgeMs } from "../runtime/heartbeat-gradient.ts";
3
+ import type { MetricRegistry } from "../observability/metric-registry.ts";
2
4
  import type { RunUiSnapshot } from "./snapshot-types.ts";
3
5
 
4
6
  export interface HeartbeatSummary {
@@ -9,12 +11,14 @@ export interface HeartbeatSummary {
9
11
  dead: number;
10
12
  missing: number;
11
13
  worstStaleMs: number;
14
+ gradient: { healthy: number; warn: number; stale: number; dead: number };
12
15
  }
13
16
 
14
17
  export interface HeartbeatSummaryOptions {
15
18
  staleMs?: number;
16
19
  deadMs?: number;
17
20
  now?: number | Date;
21
+ registry?: MetricRegistry;
18
22
  }
19
23
 
20
24
  function nowMs(now: number | Date | undefined): number {
@@ -31,22 +35,28 @@ export function summarizeHeartbeats(snapshot: RunUiSnapshot, opts: HeartbeatSumm
31
35
  const staleMs = opts.staleMs ?? 60_000;
32
36
  const deadMs = opts.deadMs ?? 5 * 60_000;
33
37
  const current = nowMs(opts.now);
34
- const summary: HeartbeatSummary = { runId: snapshot.runId, totalTasks: snapshot.tasks.length, healthy: 0, stale: 0, dead: 0, missing: 0, worstStaleMs: 0 };
38
+ const summary: HeartbeatSummary = { runId: snapshot.runId, totalTasks: snapshot.tasks.length, healthy: 0, stale: 0, dead: 0, missing: 0, worstStaleMs: 0, gradient: { healthy: 0, warn: 0, stale: 0, dead: 0 } };
35
39
  for (const task of snapshot.tasks) {
36
40
  if (!isActiveTask(task)) continue;
37
41
  const heartbeat = task.heartbeat;
38
42
  if (!heartbeat) {
39
43
  summary.missing += 1;
44
+ summary.gradient.dead += 1;
40
45
  continue;
41
46
  }
42
- const age = Math.max(0, current - Date.parse(heartbeat.lastSeenAt));
47
+ const age = heartbeatAgeMs(heartbeat, current);
43
48
  if (!Number.isFinite(age)) {
44
49
  summary.missing += 1;
50
+ summary.gradient.dead += 1;
45
51
  continue;
46
52
  }
47
53
  summary.worstStaleMs = Math.max(summary.worstStaleMs, age);
48
- if (heartbeat.alive === false || age > deadMs) summary.dead += 1;
49
- else if (age > staleMs) summary.stale += 1;
54
+ const level = classifyHeartbeat(heartbeat, { warnMs: Math.max(1, Math.floor(staleMs / 2)), staleMs, deadMs }, current);
55
+ summary.gradient[level] += 1;
56
+ opts.registry?.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds").set({ runId: snapshot.runId, taskId: task.id }, age);
57
+ opts.registry?.counter("crew.heartbeat.level_total", "Heartbeat classifications by level").inc({ runId: snapshot.runId, level });
58
+ if (level === "dead") summary.dead += 1;
59
+ else if (level === "stale") summary.stale += 1;
50
60
  else summary.healthy += 1;
51
61
  }
52
62
  return summary;
@@ -13,7 +13,7 @@ export const DASHBOARD_KEYS = {
13
13
  reload: ["r"],
14
14
  progressToggle: ["p"],
15
15
  },
16
- pane: { agents: ["1"], progress: ["2"], mailbox: ["3"], output: ["4"], health: ["5"] },
16
+ pane: { agents: ["1"], progress: ["2"], mailbox: ["3"], output: ["4"], health: ["5"], metrics: ["6"] },
17
17
  navigation: { up: ["k", "\u001b[A"], down: ["j", "\u001b[B"] },
18
18
  mailbox: { ack: ["A"], nudge: ["N"], compose: ["C"], preview: ["P"], ackAll: ["X"], openDetail: ["\r", "\n"] },
19
19
  health: { recovery: ["R"], killStale: ["K"], diagnosticExport: ["D"] },
@@ -53,6 +53,7 @@ export type DashboardKeyAction =
53
53
  | "pane-mailbox"
54
54
  | "pane-output"
55
55
  | "pane-health"
56
+ | "pane-metrics"
56
57
  | "up"
57
58
  | "down"
58
59
  | "mailbox-detail"
@@ -61,7 +62,7 @@ export type DashboardKeyAction =
61
62
  | "health-diagnostic-export"
62
63
  | "notifications-dismiss";
63
64
 
64
- export function dashboardActionForKey(data: string, activePane?: "agents" | "progress" | "mailbox" | "output" | "health"): DashboardKeyAction | undefined {
65
+ export function dashboardActionForKey(data: string, activePane?: "agents" | "progress" | "mailbox" | "output" | "health" | "metrics"): DashboardKeyAction | undefined {
65
66
  if (includes(DASHBOARD_KEYS.close, data)) return "close";
66
67
  if (activePane === "mailbox" && includes(DASHBOARD_KEYS.mailbox.openDetail, data)) return "mailbox-detail";
67
68
  if (activePane === "health") {
@@ -86,6 +87,7 @@ export function dashboardActionForKey(data: string, activePane?: "agents" | "pro
86
87
  if (includes(DASHBOARD_KEYS.pane.mailbox, data)) return "pane-mailbox";
87
88
  if (includes(DASHBOARD_KEYS.pane.output, data)) return "pane-output";
88
89
  if (includes(DASHBOARD_KEYS.pane.health, data)) return "pane-health";
90
+ if (includes(DASHBOARD_KEYS.pane.metrics, data)) return "pane-metrics";
89
91
  if (includes(DASHBOARD_KEYS.navigation.up, data)) return "up";
90
92
  if (includes(DASHBOARD_KEYS.navigation.down, data)) return "down";
91
93
  return undefined;
@@ -1,4 +1,5 @@
1
1
  import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
2
+ import type { MetricRegistry } from "../observability/metric-registry.ts";
2
3
  import { handleTeamTool } from "../extension/team-tool.ts";
3
4
  import { isToolError, textFromToolResult } from "../extension/tool-result.ts";
4
5
  import { loadRunManifestById, saveRunTasks } from "../state/state-store.ts";
@@ -91,9 +92,9 @@ export async function dispatchKillStaleWorkers(ctx: ExtensionContext, runId: str
91
92
  }
92
93
  }
93
94
 
94
- export async function dispatchDiagnosticExport(ctx: ExtensionContext, runId: string): Promise<RunActionResult> {
95
+ export async function dispatchDiagnosticExport(ctx: ExtensionContext, runId: string, options: { registry?: MetricRegistry } = {}): Promise<RunActionResult> {
95
96
  try {
96
- const exported = await exportDiagnostic(ctx, runId);
97
+ const exported = await exportDiagnostic(ctx, runId, options);
97
98
  return { ok: true, message: `Diagnostic exported to ${exported.path}`, data: exported.path };
98
99
  } catch (error) {
99
100
  return err(error);