pi-crew 0.1.34 → 0.1.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/README.md +36 -0
  2. package/docs/architecture.md +8 -1
  3. package/docs/research-phase9-observability-reliability-plan.md +42 -42
  4. package/docs/research-source-pi-crew-reference.md +174 -0
  5. package/package.json +1 -1
  6. package/schema.json +42 -0
  7. package/src/config/config.ts +101 -0
  8. package/src/extension/register.ts +66 -3
  9. package/src/extension/registration/commands.ts +14 -3
  10. package/src/extension/registration/team-tool.ts +3 -1
  11. package/src/extension/team-tool/api.ts +27 -2
  12. package/src/extension/team-tool/context.ts +2 -0
  13. package/src/extension/team-tool/run.ts +2 -2
  14. package/src/extension/team-tool.ts +1 -1
  15. package/src/observability/correlation.ts +35 -0
  16. package/src/observability/event-to-metric.ts +54 -0
  17. package/src/observability/exporters/adapter.ts +24 -0
  18. package/src/observability/exporters/otlp-exporter.ts +65 -0
  19. package/src/observability/exporters/prometheus-exporter.ts +47 -0
  20. package/src/observability/metric-registry.ts +72 -0
  21. package/src/observability/metric-retention.ts +46 -0
  22. package/src/observability/metric-sink.ts +51 -0
  23. package/src/observability/metrics-primitives.ts +166 -0
  24. package/src/runtime/child-pi.ts +5 -1
  25. package/src/runtime/crash-recovery.ts +56 -0
  26. package/src/runtime/deadletter.ts +36 -0
  27. package/src/runtime/diagnostic-export.ts +8 -1
  28. package/src/runtime/heartbeat-gradient.ts +28 -0
  29. package/src/runtime/heartbeat-watcher.ts +80 -0
  30. package/src/runtime/retry-executor.ts +59 -0
  31. package/src/runtime/team-runner.ts +57 -5
  32. package/src/schema/config-schema.ts +29 -0
  33. package/src/state/event-log.ts +3 -2
  34. package/src/state/types.ts +7 -0
  35. package/src/ui/dashboard-panes/agents-pane.ts +4 -1
  36. package/src/ui/dashboard-panes/metrics-pane.ts +34 -0
  37. package/src/ui/heartbeat-aggregator.ts +14 -4
  38. package/src/ui/keybinding-map.ts +4 -2
  39. package/src/ui/live-run-sidebar.ts +5 -4
  40. package/src/ui/run-action-dispatcher.ts +3 -2
  41. package/src/ui/run-dashboard.ts +17 -6
  42. package/src/ui/spinner.ts +17 -0
@@ -27,6 +27,13 @@ import { NotificationRouter, type NotificationDescriptor } from "./notification-
27
27
  import { createJsonlSink, type NotificationSink } from "./notification-sink.ts";
28
28
  import { projectCrewRoot } from "../utils/paths.ts";
29
29
  import { summarizeHeartbeats } from "../ui/heartbeat-aggregator.ts";
30
+ import { createMetricRegistry, type MetricRegistry } from "../observability/metric-registry.ts";
31
+ import { wireEventToMetrics, type EventToMetricSubscription } from "../observability/event-to-metric.ts";
32
+ import { createMetricFileSink, type MetricSink } from "../observability/metric-sink.ts";
33
+ import { OTLPExporter } from "../observability/exporters/otlp-exporter.ts";
34
+ import { HeartbeatWatcher } from "../runtime/heartbeat-watcher.ts";
35
+ import { appendDeadletter } from "../runtime/deadletter.ts";
36
+ import { detectInterruptedRuns } from "../runtime/crash-recovery.ts";
30
37
 
31
38
  export { __test__subagentSpawnParams };
32
39
 
@@ -68,6 +75,11 @@ export function registerPiTeams(pi: ExtensionAPI): void {
68
75
  const widgetState: CrewWidgetState = { frame: 0 };
69
76
  let notificationSink: NotificationSink | undefined;
70
77
  let notificationRouter: NotificationRouter | undefined;
78
+ let metricRegistry: MetricRegistry | undefined;
79
+ let eventMetricSub: EventToMetricSubscription | undefined;
80
+ let metricSink: MetricSink | undefined;
81
+ let heartbeatWatcher: HeartbeatWatcher | undefined;
82
+ let otlpExporter: OTLPExporter | undefined;
71
83
  const configureNotifications = (ctx: ExtensionContext): void => {
72
84
  notificationRouter?.dispose();
73
85
  notificationSink?.dispose();
@@ -92,6 +104,46 @@ export function registerPiTeams(pi: ExtensionAPI): void {
92
104
  }
93
105
  });
94
106
  };
107
/**
 * Tears down any previously configured observability components and, unless
 * `observability.enabled` is explicitly `false`, rebuilds them from the
 * configuration loaded for `ctx.cwd`: the metric registry, the event-to-metric
 * bridge, the optional metric file sink, the optional OTLP exporter and the
 * heartbeat watcher. When `reliability.autoRecover` is `true`, one warning
 * notification is raised per interrupted run that is detected.
 *
 * @param ctx Extension context whose `cwd` selects the configuration and crew root.
 */
const configureObservability = (ctx: ExtensionContext): void => {
	// Dispose the previous generation first so timers and subscriptions never overlap.
	heartbeatWatcher?.dispose();
	metricSink?.dispose();
	eventMetricSub?.dispose();
	otlpExporter?.dispose();
	metricRegistry?.dispose();
	heartbeatWatcher = undefined;
	metricSink = undefined;
	eventMetricSub = undefined;
	otlpExporter = undefined;
	metricRegistry = undefined;

	const config = loadConfig(ctx.cwd).config;
	if (config.observability?.enabled === false) return;

	metricRegistry = createMetricRegistry();
	eventMetricSub = wireEventToMetrics(pi.events, metricRegistry);

	// The event bridge subscribes to "crew.task.deadletter" and increments
	// crew.task.deadletter_total itself. It can only do so when the bus offers both
	// `on` (to subscribe) and `emit` (to deliver); in that case a direct increment
	// here would count every trigger twice.
	const busCountsDeadletters = typeof pi.events?.on === "function" && typeof pi.events?.emit === "function";

	if (config.telemetry?.enabled !== false) {
		metricSink = createMetricFileSink({ crewRoot: projectCrewRoot(ctx.cwd), registry: metricRegistry, retentionDays: config.observability?.metricRetentionDays ?? 7 });
	}

	// OTLP export is opt-in and needs an endpoint.
	if (config.otlp?.enabled === true && config.otlp.endpoint) {
		otlpExporter = new OTLPExporter({ endpoint: config.otlp.endpoint, headers: config.otlp.headers, intervalMs: config.otlp.intervalMs }, metricRegistry);
		otlpExporter.start();
	}

	heartbeatWatcher = new HeartbeatWatcher({
		cwd: ctx.cwd,
		pollIntervalMs: config.observability?.pollIntervalMs ?? 5000,
		manifestCache: getManifestCache(ctx.cwd),
		registry: metricRegistry,
		router: { enqueue: (notification) => { notifyOperator(notification); return true; } },
		deadletterTickThreshold: config.reliability?.deadletterThreshold ?? 3,
		onDeadletterTrigger: (manifest, taskId) => {
			appendDeadletter(manifest, { taskId, runId: manifest.runId, reason: "heartbeat-dead", attempts: 0, timestamp: new Date().toISOString() });
			// Fall back to a direct increment only when the event bridge cannot observe the event.
			if (!busCountsDeadletters) {
				metricRegistry?.counter("crew.task.deadletter_total", "Deadletter triggers by reason").inc({ reason: "heartbeat-dead" });
			}
			pi.events?.emit?.("crew.task.deadletter", { runId: manifest.runId, taskId, reason: "heartbeat-dead" });
		},
	});
	heartbeatWatcher.start();

	// Auto-recovery only informs the operator here; nothing is resumed by this function.
	if (config.reliability?.autoRecover === true) {
		for (const plan of detectInterruptedRuns(ctx.cwd, getManifestCache(ctx.cwd))) {
			notifyOperator({ id: `recovery_prompt_${plan.runId}`, severity: "warning", source: "crash-recovery", runId: plan.runId, title: `Run ${plan.runId} was interrupted`, body: `${plan.resumableTasks.length} tasks pending recovery. Open dashboard to inspect before resuming.` });
		}
	}
};
95
147
  const autoRecoveryLast = new Map<string, number>();
96
148
  const notifyOperator = (notification: NotificationDescriptor): void => {
97
149
  try {
@@ -245,6 +297,16 @@ export function registerPiTeams(pi: ExtensionAPI): void {
245
297
  stopAsyncRunNotifier(notifierState);
246
298
  stopCrewWidget(currentCtx, widgetState, currentCtx ? loadConfig(currentCtx.cwd).config.ui : undefined);
247
299
  clearPiCrewPowerbar(pi.events, currentCtx);
300
+ heartbeatWatcher?.dispose();
301
+ metricSink?.dispose();
302
+ eventMetricSub?.dispose();
303
+ otlpExporter?.dispose();
304
+ metricRegistry?.dispose();
305
+ heartbeatWatcher = undefined;
306
+ metricSink = undefined;
307
+ eventMetricSub = undefined;
308
+ otlpExporter = undefined;
309
+ metricRegistry = undefined;
248
310
  manifestCache.dispose();
249
311
  runSnapshotCache.dispose?.();
250
312
  renderScheduler?.dispose();
@@ -272,6 +334,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
272
334
  const loadedConfig = loadConfig(ctx.cwd);
273
335
  autoRecoveryLast.clear();
274
336
  configureNotifications(ctx);
337
+ configureObservability(ctx);
275
338
  registerPiCrewPowerbarSegments(pi.events, loadedConfig.config.ui);
276
339
  startAsyncRunNotifier(ctx, notifierState, loadedConfig.config.notifierIntervalMs ?? DEFAULT_UI.notifierIntervalMs);
277
340
  const cache = getManifestCache(ctx.cwd);
@@ -318,7 +381,7 @@ export function registerPiTeams(pi: ExtensionAPI): void {
318
381
  }
319
382
  };
320
383
  renderScheduler = new RenderScheduler(pi.events, renderTick, {
321
- fallbackMs: loadedConfig.config.ui?.dashboardLiveRefreshMs ?? 750,
384
+ fallbackMs: loadedConfig.config.ui?.dashboardLiveRefreshMs ?? 250,
322
385
  onInvalidate: () => getRunSnapshotCache(ctx.cwd).invalidate(),
323
386
  });
324
387
  });
@@ -343,11 +406,11 @@ export function registerPiTeams(pi: ExtensionAPI): void {
343
406
  };
344
407
  });
345
408
 
346
- registerTeamTool(pi, { foregroundControllers, startForegroundRun, openLiveSidebar, getManifestCache, getRunSnapshotCache, widgetState });
409
+ registerTeamTool(pi, { foregroundControllers, startForegroundRun, openLiveSidebar, getManifestCache, getRunSnapshotCache, getMetricRegistry: () => metricRegistry, widgetState });
347
410
  registerSubagentTools(pi, subagentManager);
348
411
  time("register.tools");
349
412
 
350
- registerTeamCommands(pi, { startForegroundRun, openLiveSidebar, getManifestCache, getRunSnapshotCache, dismissNotifications: () => {
413
+ registerTeamCommands(pi, { startForegroundRun, openLiveSidebar, getManifestCache, getRunSnapshotCache, getMetricRegistry: () => metricRegistry, dismissNotifications: () => {
351
414
  widgetState.notificationCount = 0;
352
415
  if (currentCtx) {
353
416
  const uiConfig = loadConfig(currentCtx.cwd).config.ui;
@@ -21,12 +21,14 @@ import { openTranscriptViewer, selectAgentTask } from "./viewers.ts";
21
21
  import { printTimings, time } from "../../utils/timings.ts";
22
22
  import { requestRenderTarget } from "../../ui/pi-ui-compat.ts";
23
23
  import type { createRunSnapshotCache } from "../../ui/run-snapshot-cache.ts";
24
+ import type { MetricRegistry } from "../../observability/metric-registry.ts";
24
25
 
25
26
  export interface RegisterTeamCommandsDeps {
26
27
  startForegroundRun: (ctx: ExtensionContext, runner: (signal?: AbortSignal) => Promise<void>, runId?: string) => void;
27
28
  openLiveSidebar: (ctx: ExtensionContext, runId: string) => void;
28
29
  getManifestCache: (cwd: string) => { list(max?: number): TeamRunManifest[] };
29
30
  getRunSnapshotCache?: (cwd: string) => ReturnType<typeof createRunSnapshotCache>;
31
+ getMetricRegistry?: () => MetricRegistry | undefined;
30
32
  dismissNotifications?: () => void;
31
33
  }
32
34
 
@@ -106,12 +108,15 @@ async function handleHealthDashboardAction(ctx: ExtensionCommandContext, selecti
106
108
  const confirmed = await openConfirm(ctx, { title: "Recent diagnostic exists", body: `File ${recent} was created <1min ago. Export another diagnostic?`, defaultAction: "cancel" });
107
109
  if (!confirmed) return;
108
110
  }
109
- const result = await dispatchDiagnosticExport(ctx as ExtensionContext, selection.runId);
111
+ const result = await dispatchDiagnosticExport(ctx as ExtensionContext, selection.runId, { registry: depsRef?.getMetricRegistry?.() });
110
112
  depsNotify(ctx, result.message, result.ok ? "info" : "error");
111
113
  }
112
114
  }
113
115
 
116
+ let depsRef: RegisterTeamCommandsDeps | undefined;
117
+
114
118
  export function registerTeamCommands(pi: ExtensionAPI, deps: RegisterTeamCommandsDeps): void {
119
+ depsRef = deps;
115
120
  pi.registerCommand("teams", {
116
121
  description: "List pi-crew teams, workflows, and agents",
117
122
  handler: async (_args: string, ctx: ExtensionCommandContext) => {
@@ -123,7 +128,7 @@ export function registerTeamCommands(pi: ExtensionAPI, deps: RegisterTeamCommand
123
128
  pi.registerCommand("team-run", {
124
129
  description: "Manually start a pi-crew run (agent may also use the team tool autonomously)",
125
130
  handler: async (args: string, ctx: ExtensionCommandContext) => {
126
- const result = await handleTeamTool(parseRunArgs(args), { ...ctx, startForegroundRun: (runner, runId) => deps.startForegroundRun(ctx as ExtensionContext, runner, runId), onRunStarted: (runId) => deps.openLiveSidebar(ctx as ExtensionContext, runId) });
131
+ const result = await handleTeamTool(parseRunArgs(args), { ...ctx, metricRegistry: deps.getMetricRegistry?.(), startForegroundRun: (runner, runId) => deps.startForegroundRun(ctx as ExtensionContext, runner, runId), onRunStarted: (runId) => deps.openLiveSidebar(ctx as ExtensionContext, runId) });
127
132
  await notifyCommandResult(ctx, commandText(result));
128
133
  },
129
134
  });
@@ -161,6 +166,12 @@ export function registerTeamCommands(pi: ExtensionAPI, deps: RegisterTeamCommand
161
166
  },
162
167
  });
163
168
 
169
+ pi.registerCommand("team-metrics", { description: "Show pi-crew metrics snapshot: [filter]", handler: async (args: string, ctx: ExtensionCommandContext) => {
170
+ const filter = args.trim() || undefined;
171
+ const result = await handleTeamTool({ action: "api", config: { operation: "metrics-snapshot", filter } }, { ...ctx, metricRegistry: deps.getMetricRegistry?.() });
172
+ await notifyCommandResult(ctx, commandText(result));
173
+ } });
174
+
164
175
  pi.registerCommand("team-imports", { description: "List imported pi-crew run bundles", handler: async (_args: string, ctx: ExtensionCommandContext) => {
165
176
  const result = await handleTeamTool({ action: "imports" }, ctx);
166
177
  await notifyCommandResult(ctx, commandText(result));
@@ -225,7 +236,7 @@ export function registerTeamCommands(pi: ExtensionAPI, deps: RegisterTeamCommand
225
236
  const uiConfig = loadConfig(ctx.cwd).config.ui;
226
237
  const rightPanel = uiConfig?.dashboardPlacement !== "center";
227
238
  const width = rightPanel ? Math.min(90, Math.max(40, uiConfig?.dashboardWidth ?? 56)) : "90%";
228
- const selection = await ctx.ui.custom<RunDashboardSelection | undefined>((_tui, theme, _keybindings, done) => new RunDashboard(runs, done, theme, { placement: rightPanel ? "right" : "center", showModel: uiConfig?.showModel, showTokens: uiConfig?.showTokens, showTools: uiConfig?.showTools, snapshotCache: deps.getRunSnapshotCache?.(ctx.cwd), runProvider: () => deps.getManifestCache(ctx.cwd).list(50) }), { overlay: true, overlayOptions: rightPanel ? { width, minWidth: 40, maxHeight: "100%", anchor: "top-right", offsetX: 0, offsetY: 0, margin: { top: 0, right: 0, bottom: 0, left: 0 } } : { width, maxHeight: "90%", anchor: "center", margin: 2 } });
239
+ const selection = await ctx.ui.custom<RunDashboardSelection | undefined>((_tui, theme, _keybindings, done) => new RunDashboard(runs, done, theme, { placement: rightPanel ? "right" : "center", showModel: uiConfig?.showModel, showTokens: uiConfig?.showTokens, showTools: uiConfig?.showTools, snapshotCache: deps.getRunSnapshotCache?.(ctx.cwd), runProvider: () => deps.getManifestCache(ctx.cwd).list(50), registry: deps.getMetricRegistry?.() }), { overlay: true, overlayOptions: rightPanel ? { width, minWidth: 40, maxHeight: "100%", anchor: "top-right", offsetX: 0, offsetY: 0, margin: { top: 0, right: 0, bottom: 0, left: 0 } } : { width, maxHeight: "90%", anchor: "center", margin: 2 } });
229
240
  if (!selection) return;
230
241
  if (selection.action === "reload") continue;
231
242
  if (selection.action === "notifications-dismiss") {
@@ -6,6 +6,7 @@ import { updateCrewWidget } from "../../ui/crew-widget.ts";
6
6
  import { updatePiCrewPowerbar } from "../../ui/powerbar-publisher.ts";
7
7
  import type { createManifestCache } from "../../runtime/manifest-cache.ts";
8
8
  import type { createRunSnapshotCache } from "../../ui/run-snapshot-cache.ts";
9
+ import type { MetricRegistry } from "../../observability/metric-registry.ts";
9
10
  import { handleTeamTool } from "../team-tool.ts";
10
11
 
11
12
  export interface RegisterTeamToolDeps {
@@ -14,6 +15,7 @@ export interface RegisterTeamToolDeps {
14
15
  openLiveSidebar: (ctx: ExtensionContext, runId: string) => void;
15
16
  getManifestCache: (cwd: string) => ReturnType<typeof createManifestCache>;
16
17
  getRunSnapshotCache?: (cwd: string) => ReturnType<typeof createRunSnapshotCache>;
18
+ getMetricRegistry?: () => MetricRegistry | undefined;
17
19
  widgetState: CrewWidgetState;
18
20
  }
19
21
 
@@ -36,7 +38,7 @@ export function registerTeamTool(pi: ExtensionAPI, deps: RegisterTeamToolDeps):
36
38
  const runLabel = resolved.team ?? resolved.agent ?? "direct";
37
39
  pi.setSessionName(`pi-crew: ${runLabel}/${resolved.workflow ?? "default"} — ${resolved.goal.slice(0, 60)}`);
38
40
  }
39
- const output = await handleTeamTool(resolved, { ...ctx, signal: controller.signal, startForegroundRun: (runner, runId) => deps.startForegroundRun(ctx, runner, runId), onRunStarted: (runId) => deps.openLiveSidebar(ctx, runId) });
41
+ const output = await handleTeamTool(resolved, { ...ctx, signal: controller.signal, metricRegistry: deps.getMetricRegistry?.(), startForegroundRun: (runner, runId) => deps.startForegroundRun(ctx, runner, runId), onRunStarted: (runId) => deps.openLiveSidebar(ctx, runId) });
40
42
  if (resolved.action === "run") {
41
43
  pi.appendEntry("crew:run-started", {
42
44
  runId: output.details?.runId,
@@ -19,12 +19,37 @@ import { liveControlRealtimeMessage, publishLiveControlRealtime } from "../../su
19
19
  import type { PiTeamsToolResult } from "../tool-result.ts";
20
20
  import { configRecord, result, type TeamContext } from "./context.ts";
21
21
 
22
/**
 * Tests `value` against a simple glob `pattern`, anchored at both ends.
 *
 * `*` matches any run of characters (including none) and `?` matches exactly
 * one character; every other character is matched literally.
 *
 * @param value Text to test.
 * @param pattern Glob pattern; may come straight from user input.
 * @returns `true` when the whole of `value` matches `pattern`.
 */
function globMatch(value: string, pattern: string): boolean {
	const source = pattern
		// Neutralise regex metacharacters, leaving the two glob wildcards for the next steps.
		.replace(/[.+^${}()|[\]\\]/g, "\\$&")
		.replace(/\*/g, ".*")
		// Left untranslated, `?` would act as a regex quantifier ("a?c" would match "ac"
		// but not "abc") and a leading `?` would make the RegExp constructor throw.
		.replace(/\?/g, ".");
	return new RegExp(`^${source}$`).test(value);
}
26
+
27
/**
 * Reports whether any entry of `snapshot.values` is a plain object whose
 * `labels` property is a plain object with `runId` strictly equal to the given id.
 * A missing or non-array `values` yields `false`.
 */
function snapshotHasRunId(snapshot: { values?: unknown }, runId: string): boolean {
	if (!Array.isArray(snapshot.values)) return false;
	for (const entry of snapshot.values) {
		// Skip primitives, null and arrays: only plain objects can carry labels.
		if (entry === null || typeof entry !== "object" || Array.isArray(entry)) continue;
		const labels: unknown = (entry as { labels?: unknown }).labels;
		if (labels === null || typeof labels !== "object" || Array.isArray(labels)) continue;
		if ((labels as Record<string, unknown>).runId === runId) return true;
	}
	return false;
}
35
+
22
36
  export async function handleApi(params: TeamToolParamsValue, ctx: TeamContext): Promise<PiTeamsToolResult> {
37
+ const cfg = configRecord(params.config);
38
+ const operation = typeof cfg.operation === "string" ? cfg.operation : "read-manifest";
39
+ if (operation === "metrics-snapshot") {
40
+ const filter = typeof cfg.filter === "string" ? cfg.filter : undefined;
41
+ const runIdFilter = typeof cfg.runId === "string" ? cfg.runId : params.runId;
42
+ const snapshots = ctx.metricRegistry?.snapshot() ?? [];
43
+ const filtered = snapshots.filter((snapshot) => {
44
+ if (filter && !globMatch(snapshot.name, filter)) return false;
45
+ if (runIdFilter && !snapshotHasRunId(snapshot, runIdFilter)) return false;
46
+ return true;
47
+ });
48
+ return result(JSON.stringify(filtered, null, 2), { action: "api", status: "ok", ...(runIdFilter ? { runId: runIdFilter } : {}) });
49
+ }
23
50
  if (!params.runId) return result("API requires runId.", { action: "api", status: "error" }, true);
24
51
  const loaded = loadRunManifestById(ctx.cwd, params.runId);
25
52
  if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "api", status: "error" }, true);
26
- const cfg = configRecord(params.config);
27
- const operation = typeof cfg.operation === "string" ? cfg.operation : "read-manifest";
28
53
  if (operation === "read-manifest") {
29
54
  return result(JSON.stringify(loaded.manifest, null, 2), { action: "api", status: "ok", runId: loaded.manifest.runId, artifactsRoot: loaded.manifest.artifactsRoot });
30
55
  }
@@ -1,4 +1,5 @@
1
1
  import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
2
+ import type { MetricRegistry } from "../../observability/metric-registry.ts";
2
3
  import type { TeamToolDetails } from "../team-tool-types.ts";
3
4
  import { toolResult, type PiTeamsToolResult } from "../tool-result.ts";
4
5
 
@@ -6,6 +7,7 @@ export type TeamContext = Pick<ExtensionContext, "cwd"> & Partial<Pick<Extension
6
7
  modelRegistry?: unknown;
7
8
  sessionManager?: { getBranch?: () => unknown[] };
8
9
  events?: { emit?: (event: string, data: unknown) => void };
10
+ metricRegistry?: MetricRegistry;
9
11
  signal?: AbortSignal;
10
12
  startForegroundRun?: (runner: (signal?: AbortSignal) => Promise<void>, runId?: string) => void;
11
13
  onRunStarted?: (runId: string) => void;
@@ -134,7 +134,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
134
134
  if (executeWorkers && ctx.startForegroundRun) {
135
135
  ctx.onRunStarted?.(updatedManifest.runId);
136
136
  ctx.startForegroundRun(async (signal) => {
137
- await executeTeamRun({ manifest: updatedManifest, tasks, team, workflow, agents, executeWorkers, limits: executedConfig.limits, runtime, runtimeConfig: executedConfig.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal });
137
+ await executeTeamRun({ manifest: updatedManifest, tasks, team, workflow, agents, executeWorkers, limits: executedConfig.limits, runtime, runtimeConfig: executedConfig.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal, reliability: executedConfig.reliability, metricRegistry: ctx.metricRegistry });
138
138
  }, updatedManifest.runId);
139
139
  const text = [
140
140
  `Started foreground pi-crew run ${updatedManifest.runId}.`,
@@ -150,7 +150,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
150
150
  ].join("\n");
151
151
  return result(text, { action: "run", status: "ok", runId: updatedManifest.runId, artifactsRoot: updatedManifest.artifactsRoot });
152
152
  }
153
- const executed = await executeTeamRun({ manifest: updatedManifest, tasks, team, workflow, agents, executeWorkers, limits: executedConfig.limits, runtime, runtimeConfig: executedConfig.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal: ctx.signal });
153
+ const executed = await executeTeamRun({ manifest: updatedManifest, tasks, team, workflow, agents, executeWorkers, limits: executedConfig.limits, runtime, runtimeConfig: executedConfig.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal: ctx.signal, reliability: executedConfig.reliability, metricRegistry: ctx.metricRegistry });
154
154
  const text = [
155
155
  `Created pi-crew run ${executed.manifest.runId}.`,
156
156
  `Team: ${team.name}`,
@@ -185,7 +185,7 @@ export async function handleResume(params: TeamToolParamsValue, ctx: TeamContext
185
185
  const loadedConfig = loadConfig(ctx.cwd);
186
186
  const runtime = await resolveCrewRuntime(loadedConfig.config);
187
187
  const executeWorkers = runtime.kind !== "scaffold";
188
- const executed = await executeTeamRun({ manifest: resumeManifest, tasks: resetTasks, team, workflow, agents, executeWorkers, limits: loadedConfig.config.limits, runtime, runtimeConfig: loadedConfig.config.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal: ctx.signal });
188
+ const executed = await executeTeamRun({ manifest: resumeManifest, tasks: resetTasks, team, workflow, agents, executeWorkers, limits: loadedConfig.config.limits, runtime, runtimeConfig: loadedConfig.config.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal: ctx.signal, reliability: loadedConfig.config.reliability, metricRegistry: ctx.metricRegistry });
189
189
  return result([`Resumed run ${executed.manifest.runId}.`, `Status: ${executed.manifest.status}`, `Tasks: ${executed.tasks.length}`, `Artifacts: ${executed.manifest.artifactsRoot}`].join("\n"), { action: "resume", status: executed.manifest.status === "failed" ? "error" : "ok", runId: executed.manifest.runId, artifactsRoot: executed.manifest.artifactsRoot }, executed.manifest.status === "failed");
190
190
  });
191
191
  }
@@ -0,0 +1,35 @@
1
+ import { AsyncLocalStorage } from "node:async_hooks";
2
+
3
/** Identifiers that place an event inside a trace and its span tree. */
export interface CorrelationContext {
	/** Identifier shared by all spans of one trace. */
	traceId: string;
	/** Span that was current when this one was created, if there was one. */
	parentSpanId?: string;
	/** Identifier of this span. */
	spanId: string;
}

// Carries the current context across asynchronous continuations.
const correlationStore = new AsyncLocalStorage<CorrelationContext>();
// Module-wide sequence number; makes every generated span id distinct within this module instance.
let issuedSpans = 0;

/** Runs `fn` with `ctx` as the current correlation context and returns its result. */
export function withCorrelation<T>(ctx: CorrelationContext, fn: () => T): T {
	return correlationStore.run(ctx, fn);
}

/** Returns the correlation context of the current execution, or `undefined` outside one. */
export function getCurrentContext(): CorrelationContext | undefined {
	return correlationStore.getStore();
}

/** Builds the next span id as `runId:taskId:sequence`; `taskId` defaults to `"main"`. */
export function newSpanId(runId: string, taskId = "main"): string {
	issuedSpans += 1;
	const sequence = issuedSpans;
	return `${runId}:${taskId}:${sequence}`;
}

/**
 * Creates a context for a new span. Inside an active context the new span joins
 * its trace and records it as parent; otherwise the new span id doubles as trace id.
 */
export function childCorrelation(runId: string, taskId: string): CorrelationContext {
	const active = getCurrentContext();
	const spanId = newSpanId(runId, taskId);
	if (active === undefined) {
		return { traceId: spanId, parentSpanId: undefined, spanId };
	}
	return { traceId: active.traceId ?? spanId, parentSpanId: active.spanId, spanId };
}

/**
 * Returns a copy of `event` whose `data` additionally carries `traceId`, `spanId`
 * and `parentSpanId` of the current context. Outside any context the event is
 * returned unchanged (same reference).
 */
export function correlatedEvent<T extends { runId?: string; data?: Record<string, unknown> }>(event: T): T {
	const active = getCurrentContext();
	if (active === undefined) return event;
	const { traceId, spanId, parentSpanId } = active;
	const data = { ...(event.data ?? {}), traceId, spanId, parentSpanId };
	return { ...event, data };
}
@@ -0,0 +1,54 @@
1
+ import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
2
+ import { MetricRegistry } from "./metric-registry.ts";
3
+
4
/** Returns `value` itself when it is a non-null, non-array object; otherwise a fresh empty record. */
function recordValue(value: unknown): Record<string, unknown> {
	if (typeof value !== "object" || value === null || Array.isArray(value)) return {};
	return value as Record<string, unknown>;
}
7
+
8
/** Returns `value` when it is a non-empty string; otherwise `fallback`. */
function stringValue(value: unknown, fallback: string): string {
	if (typeof value !== "string") return fallback;
	return value.length > 0 ? value : fallback;
}
11
+
12
/** Returns `value` when it is a finite number; otherwise `fallback` (0 by default). */
function numberValue(value: unknown, fallback = 0): number {
	if (typeof value !== "number") return fallback;
	return Number.isFinite(value) ? value : fallback;
}
15
+
16
/** Handle returned by {@link wireEventToMetrics}; `dispose` removes the registered listeners. */
export interface EventToMetricSubscription {
	dispose(): void;
}

/**
 * Registers the crew metrics on `registry` and subscribes to crew lifecycle
 * events on `events`, translating each event into counter and histogram updates.
 *
 * A listener is only kept for disposal when `events.on` returns a function.
 * Errors thrown while updating a metric are swallowed so that event delivery
 * is never affected.
 *
 * @param events Event bus to listen on; when absent, metrics are registered but nothing is subscribed.
 * @param registry Registry that owns the metrics.
 * @returns A subscription whose `dispose` is safe to call more than once.
 */
export function wireEventToMetrics(events: ExtensionAPI["events"] | undefined, registry: MetricRegistry): EventToMetricSubscription {
	// Registration order is kept stable; some metrics are registered only so that they exist up front.
	const runCount = registry.counter("crew.run.count", "Total runs by status");
	const taskCount = registry.counter("crew.task.count", "Total tasks by status");
	const subagentCount = registry.counter("crew.subagent.count", "Total subagent records by status");
	const mailboxCount = registry.counter("crew.mailbox.count", "Total mailbox messages by direction");
	registry.counter("crew.task.deadletter_total", "Deadletter triggers by reason");
	registry.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds");
	const runDuration = registry.histogram("crew.run.duration_ms", "Run end-to-end duration, milliseconds");
	const taskDuration = registry.histogram("crew.task.duration_ms", "Task duration, milliseconds");
	registry.histogram("crew.task.retry_count", "Retries per task", [0, 1, 2, 3, 5, 10]);
	const tokenUsage = registry.histogram("crew.task.tokens_total", "Token usage per task");

	const onRunCompleted = (data: unknown): void => {
		const payload = recordValue(data);
		runCount.inc({ status: "completed" });
		runDuration.observe({ team: stringValue(payload.team, "unknown") }, numberValue(payload.durationMs));
	};

	const onTaskCompleted = (data: unknown): void => {
		const payload = recordValue(data);
		const role = stringValue(payload.role, "unknown");
		taskCount.inc({ status: "completed" });
		taskDuration.observe({ role }, numberValue(payload.durationMs));
		tokenUsage.observe({ role }, numberValue(payload.tokens));
	};

	const onTaskRetry = (data: unknown): void => {
		const payload = recordValue(data);
		taskCount.inc({ status: "retry" });
		// Looked up on demand, so this counter first appears with the first retry event.
		const retries = registry.counter("crew.task.retry_attempt_total", "Retry attempts by run and task");
		retries.inc({ runId: stringValue(payload.runId, "unknown"), taskId: stringValue(payload.taskId, "unknown") });
	};

	const onTaskDeadletter = (data: unknown): void => {
		const payload = recordValue(data);
		const deadletters = registry.counter("crew.task.deadletter_total", "Deadletter triggers by reason");
		deadletters.inc({ reason: stringValue(payload.reason, "unknown") });
	};

	const onSubagentCompleted = (data: unknown): void => {
		const payload = recordValue(data);
		subagentCount.inc({ status: stringValue(payload.status, "completed") });
	};

	const onMailboxMessage = (data: unknown): void => {
		const payload = recordValue(data);
		mailboxCount.inc({ direction: stringValue(payload.direction, "unknown") });
	};

	const subscriptions: Array<[string, (data: unknown) => void]> = [
		["crew.run.completed", onRunCompleted],
		["crew.run.failed", () => runCount.inc({ status: "failed" })],
		["crew.run.cancelled", () => runCount.inc({ status: "cancelled" })],
		["crew.task.completed", onTaskCompleted],
		["crew.task.failed", () => taskCount.inc({ status: "failed" })],
		["crew.task.retry_attempt", onTaskRetry],
		["crew.task.deadletter", onTaskDeadletter],
		["crew.subagent.completed", onSubagentCompleted],
		["crew.subagent.failed", () => subagentCount.inc({ status: "failed" })],
		["crew.mailbox.message", onMailboxMessage],
	];

	const teardown: Array<() => void> = [];
	for (const [eventName, update] of subscriptions) {
		const off = events?.on?.(eventName, (data: unknown) => {
			try {
				update(data);
			} catch {
				/* metric handlers must never break event delivery */
			}
		});
		if (typeof off === "function") teardown.push(off);
	}

	let disposed = false;
	return {
		dispose() {
			if (disposed) return;
			disposed = true;
			// splice(0) empties the list while handing back the callbacks to invoke.
			for (const off of teardown.splice(0)) off();
		},
	};
}
@@ -0,0 +1,24 @@
1
+ import type { MetricSnapshot } from "../metrics-primitives.ts";
2
+
3
/** Contract for a destination that metric snapshots can be pushed to. */
export interface MetricExporter {
	/** Short identifier of the exporter. */
	name: string;
	/** Sends the given snapshots to the destination. */
	push(snapshots: MetricSnapshot[]): Promise<void>;
	/** Releases whatever the exporter holds. */
	dispose(): void;
}

/**
 * Exporter that fans every call out to a fixed list of exporters.
 * `push` waits for all of them and ignores individual rejections.
 */
export class CompositeExporter implements MetricExporter {
	name = "composite";
	private readonly exporters: MetricExporter[];

	constructor(exporters: MetricExporter[]) {
		this.exporters = exporters;
	}

	/** Pushes `snapshots` to every exporter; resolves once all pushes have settled. */
	async push(snapshots: MetricSnapshot[]): Promise<void> {
		const pending: Array<Promise<void>> = [];
		for (const exporter of this.exporters) {
			pending.push(exporter.push(snapshots));
		}
		await Promise.allSettled(pending);
	}

	/** Disposes the exporters in list order. */
	dispose(): void {
		this.exporters.forEach((exporter) => {
			exporter.dispose();
		});
	}
}
@@ -0,0 +1,65 @@
1
+ import { logInternalError } from "../../utils/internal-error.ts";
2
+ import type { MetricRegistry } from "../metric-registry.ts";
3
+ import type { MetricSnapshot } from "../metrics-primitives.ts";
4
+ import type { MetricExporter } from "./adapter.ts";
5
+
6
/** Settings accepted by the OTLP exporter. */
export interface OTLPExporterOptions {
	/** URL that metric payloads are POSTed to. */
	endpoint: string;
	/** Extra request headers, merged over the default `content-type`. */
	headers?: Record<string, string>;
	/** Delay between periodic pushes in milliseconds; the exporter uses 60000 when omitted. */
	intervalMs?: number;
	/** Time after which a single request is aborted, in milliseconds; the exporter uses 10000 when omitted. */
	timeoutMs?: number;
}
12
+
13
/**
 * Maps each value of `snapshot` to a data-point object: its labels become
 * string-valued attributes, and `asDouble`, `count` and `sum` are copied from
 * the value's `value`, `count` and `sum` properties when present (otherwise `undefined`).
 */
function pointValues(snapshot: MetricSnapshot): unknown[] {
	const points: unknown[] = [];
	for (const point of snapshot.values) {
		const attributes = Object.entries(point.labels).map(([key, raw]) => ({ key, value: { stringValue: String(raw) } }));
		points.push({
			attributes,
			asDouble: "value" in point ? point.value : undefined,
			count: "count" in point ? point.count : undefined,
			sum: "sum" in point ? point.sum : undefined,
		});
	}
	return points;
}
16
+
17
/**
 * Wraps the snapshots in a single `resourceMetrics` entry for service `pi-crew`.
 * Each snapshot becomes one metric whose data points sit under `histogram`,
 * `gauge` or `sum`, chosen by the snapshot type (anything else maps to `sum`).
 */
export function convertToOTLP(snapshots: MetricSnapshot[]): unknown {
	const metrics = snapshots.map((snapshot) => {
		let dataField = "sum";
		if (snapshot.type === "histogram") dataField = "histogram";
		else if (snapshot.type === "gauge") dataField = "gauge";
		return {
			name: snapshot.name,
			description: snapshot.description,
			[dataField]: { dataPoints: pointValues(snapshot) },
		};
	});
	const serviceName = { key: "service.name", value: { stringValue: "pi-crew" } };
	return {
		resourceMetrics: [
			{
				resource: { attributes: [serviceName] },
				scopeMetrics: [{ scope: { name: "pi-crew" }, metrics }],
			},
		],
	};
}
28
+
29
/**
 * Periodically POSTs the registry's snapshots, converted with `convertToOTLP`,
 * as JSON to the configured endpoint. Export failures are logged through
 * `logInternalError` and never propagate to the caller.
 */
export class OTLPExporter implements MetricExporter {
	name = "otlp";
	private timer?: ReturnType<typeof setInterval>;
	private readonly opts: OTLPExporterOptions;
	private readonly registry: MetricRegistry;

	/**
	 * @param opts Endpoint, headers and timing settings.
	 * @param registry Source of the snapshots sent on every interval tick.
	 */
	constructor(opts: OTLPExporterOptions, registry: MetricRegistry) {
		this.opts = opts;
		this.registry = registry;
	}

	/** (Re)starts the periodic push; the interval defaults to 60 seconds. */
	start(): void {
		this.dispose();
		this.timer = setInterval(() => { void this.push(this.registry.snapshot()); }, this.opts.intervalMs ?? 60_000);
		// Where supported, do not let the export timer keep the process alive.
		this.timer.unref?.();
	}

	/**
	 * Sends `snapshots` to the endpoint. The request is aborted after
	 * `timeoutMs` (default 10 seconds). Network errors, timeouts and non-2xx
	 * responses are logged; the returned promise always resolves.
	 */
	async push(snapshots: MetricSnapshot[]): Promise<void> {
		try {
			const timeoutMs = this.opts.timeoutMs ?? 10_000;
			const controller = new AbortController();
			const timer = setTimeout(() => controller.abort(), timeoutMs);
			try {
				const response = await fetch(this.opts.endpoint, {
					method: "POST",
					headers: { "content-type": "application/json", ...(this.opts.headers ?? {}) },
					body: JSON.stringify(convertToOTLP(snapshots)),
					signal: controller.signal,
				});
				// The reply payload is not used; cancel it so the connection is released.
				await response.body?.cancel();
				// fetch resolves for HTTP error statuses too, so a rejected export must be detected here.
				if (!response.ok) throw new Error(`OTLP export failed: HTTP ${response.status} ${response.statusText}`);
			} finally {
				clearTimeout(timer);
			}
		} catch (error) {
			logInternalError("otlp-export", error);
		}
	}

	/** Stops the periodic push; safe to call when not started. */
	dispose(): void {
		if (this.timer) clearInterval(this.timer);
		this.timer = undefined;
	}
}
@@ -0,0 +1,47 @@
1
+ import type { HistogramPoint, MetricLabels, MetricPoint, MetricSnapshot } from "../metrics-primitives.ts";
2
+
3
/**
 * Converts a metric name into a valid Prometheus metric name.
 *
 * Prometheus names must match `[a-zA-Z_:][a-zA-Z0-9_:]*`. Every character
 * outside that set (dots, dashes, spaces, ...) becomes an underscore, and a
 * leading digit is prefixed with an underscore. Dotted names such as
 * `crew.task.count` map to `crew_task_count`.
 *
 * @param name - Metric name as registered.
 * @returns The sanitized name.
 */
function prometheusName(name: string): string {
  const sanitized = name.replace(/[^a-zA-Z0-9_:]/g, "_");
  return /^[0-9]/.test(sanitized) ? `_${sanitized}` : sanitized;
}
6
+
7
/**
 * Escapes a label value for use inside double quotes: backslash becomes `\\`,
 * line feed becomes `\n` and double quote becomes `\"`.
 *
 * @param value - Raw label value.
 * @returns The escaped value.
 */
function escapeLabel(value: string): string {
  // One pass over the three special characters; a line feed gets its own
  // two-character escape, the other two are simply prefixed with a backslash.
  return value.replace(/[\\\n"]/g, (ch) => (ch === "\n" ? "\\n" : `\\${ch}`));
}
10
+
11
/**
 * Renders a label set as `{key="value",...}`.
 *
 * @param labels - Labels to render; values are stringified and escaped.
 * @returns The rendered label set, or an empty string when there are no labels.
 */
function labelsText(labels: MetricLabels): string {
  const pairs: string[] = [];
  for (const [key, value] of Object.entries(labels)) {
    pairs.push(`${key}="${escapeLabel(String(value))}"`);
  }
  return pairs.length === 0 ? "" : `{${pairs.join(",")}}`;
}
16
+
17
/**
 * Maps a snapshot type to the keyword used on a `# TYPE` line.
 *
 * @param type - Snapshot type.
 * @returns "histogram" or "gauge" for those types, "counter" for anything else.
 */
function metricType(type: MetricSnapshot["type"]): string {
  switch (type) {
    case "histogram":
    case "gauge":
      return type;
    default:
      return "counter";
  }
}
20
+
21
/**
 * Type guard that tells histogram points apart from plain metric points by
 * the presence of both a `buckets` and a `counts` property.
 *
 * @param value - Point to inspect.
 * @returns `true` when both properties are present.
 */
function isHistogramPoint(value: MetricPoint | HistogramPoint): value is HistogramPoint {
  return ["buckets", "counts"].every((field) => field in value);
}
24
+
25
/**
 * Renders metric snapshots in the Prometheus text exposition format.
 *
 * For every snapshot a `# HELP` and a `# TYPE` line are written, followed by
 * one sample line per plain point. Histogram points expand to cumulative
 * `_bucket` lines plus `_sum` and `_count`.
 *
 * @param snapshots - Snapshots to render.
 * @returns The exposition text, always terminated by a newline.
 */
export function formatPrometheus(snapshots: MetricSnapshot[]): string {
  const lines: string[] = [];
  for (const snapshot of snapshots) {
    const name = prometheusName(snapshot.name);
    // HELP text is a single line: backslashes and line feeds must be escaped,
    // otherwise a multi-line description corrupts the exposition.
    const help = `${snapshot.description}`.replace(/\\/g, "\\\\").replace(/\n/g, "\\n");
    lines.push(`# HELP ${name} ${help}`);
    lines.push(`# TYPE ${name} ${metricType(snapshot.type)}`);
    for (const value of snapshot.values) {
      if (!isHistogramPoint(value)) {
        lines.push(`${name}${labelsText(value.labels)} ${value.value}`);
        continue;
      }
      // `counts` holds per-bucket counts; Prometheus expects running totals.
      let cumulative = 0;
      let hasInfBucket = false;
      for (let index = 0; index < value.buckets.length; index += 1) {
        cumulative += value.counts[index] ?? 0;
        const finite = Number.isFinite(value.buckets[index]);
        if (!finite) hasInfBucket = true;
        const le = finite ? String(value.buckets[index]) : "+Inf";
        lines.push(`${name}_bucket${labelsText({ ...value.labels, le })} ${cumulative}`);
      }
      // The format requires a le="+Inf" bucket equal to the total count; add
      // it when the bucket list does not already carry an infinite bound.
      if (!hasInfBucket) {
        lines.push(`${name}_bucket${labelsText({ ...value.labels, le: "+Inf" })} ${value.count}`);
      }
      lines.push(`${name}_sum${labelsText(value.labels)} ${value.sum}`);
      lines.push(`${name}_count${labelsText(value.labels)} ${value.count}`);
    }
  }
  return `${lines.join("\n")}\n`;
}
@@ -0,0 +1,72 @@
1
+ import { Counter, Gauge, Histogram, type Metric, type MetricSnapshot } from "./metrics-primitives.ts";
2
+
3
/** Accepted metric names: `crew.<domain>.<measure>` in lowercase letters (the measure may contain underscores). */
const METRIC_NAME_PATTERN = /^crew\.[a-z]+\.[a-z][a-z_]*$/;

/**
 * Validates a metric name against {@link METRIC_NAME_PATTERN}.
 *
 * @param name - Candidate metric name.
 * @throws Error when the name does not match the pattern.
 */
function assertMetricName(name: string): void {
  if (METRIC_NAME_PATTERN.test(name)) {
    return;
  }
  throw new Error(`Invalid metric name '${name}'. Expected crew.<domain>.<measure>.`);
}
8
+
9
/**
 * Name-keyed collection of counters, gauges and histograms.
 *
 * The `register*` methods create a new metric and fail on invalid or duplicate
 * names. The `counter` / `gauge` / `histogram` methods are get-or-create
 * variants that fail only when the name is taken by a metric of another kind.
 */
export class MetricRegistry {
  private metrics = new Map<string, Metric>();

  /**
   * Creates and stores a counter.
   * @throws Error when the name is invalid or already registered.
   */
  registerCounter(name: string, description: string): Counter {
    return this.registerMetric(name, () => new Counter(name, description));
  }

  /**
   * Creates and stores a gauge.
   * @throws Error when the name is invalid or already registered.
   */
  registerGauge(name: string, description: string): Gauge {
    return this.registerMetric(name, () => new Gauge(name, description));
  }

  /**
   * Creates and stores a histogram, optionally with custom buckets.
   * @throws Error when the name is invalid or already registered.
   */
  registerHistogram(name: string, description: string, buckets?: number[]): Histogram {
    return this.registerMetric(name, () => new Histogram(name, description, buckets));
  }

  /**
   * Returns the counter registered under `name`, creating it when absent.
   * @throws Error when `name` belongs to a metric that is not a counter.
   */
  counter(name: string, description: string): Counter {
    const found = this.metrics.get(name);
    if (found === undefined) return this.registerCounter(name, description);
    if (found instanceof Counter) return found;
    throw new Error(`Metric '${name}' is not a counter.`);
  }

  /**
   * Returns the gauge registered under `name`, creating it when absent.
   * @throws Error when `name` belongs to a metric that is not a gauge.
   */
  gauge(name: string, description: string): Gauge {
    const found = this.metrics.get(name);
    if (found === undefined) return this.registerGauge(name, description);
    if (found instanceof Gauge) return found;
    throw new Error(`Metric '${name}' is not a gauge.`);
  }

  /**
   * Returns the histogram registered under `name`, creating it when absent.
   * `buckets` is only used when a new histogram is created.
   * @throws Error when `name` belongs to a metric that is not a histogram.
   */
  histogram(name: string, description: string, buckets?: number[]): Histogram {
    const found = this.metrics.get(name);
    if (found === undefined) return this.registerHistogram(name, description, buckets);
    if (found instanceof Histogram) return found;
    throw new Error(`Metric '${name}' is not a histogram.`);
  }

  /** Looks up a metric by name; `undefined` when nothing is registered. */
  get(name: string): Metric | undefined {
    return this.metrics.get(name);
  }

  /** Returns one snapshot per registered metric, in registration order. */
  snapshot(): MetricSnapshot[] {
    return Array.from(this.metrics.values(), (metric) => metric.snapshot());
  }

  /** Removes every registered metric. */
  dispose(): void {
    this.metrics.clear();
  }

  /** Shared register path: validate the name, reject duplicates, build, store. */
  private registerMetric<T extends Metric>(name: string, create: () => T): T {
    assertMetricName(name);
    if (this.metrics.has(name)) throw new Error(`Metric '${name}' is already registered.`);
    const metric = create();
    this.metrics.set(name, metric);
    return metric;
  }
}
69
+
70
/**
 * Factory for an empty {@link MetricRegistry}.
 *
 * @returns A new registry with no metrics registered.
 */
export function createMetricRegistry(): MetricRegistry {
  const registry = new MetricRegistry();
  return registry;
}
@@ -0,0 +1,46 @@
1
+ import { labelKey, type MetricLabels } from "./metrics-primitives.ts";
2
+
3
/** One recorded increment: when it happened, its labels, and its size. */
interface WindowEvent {
  timestamp: number;
  labels: MetricLabels;
  delta: number;
}

/**
 * Counter that remembers individual increments and only reports those that
 * fall inside a sliding time window. Events older than the window are dropped
 * whenever the counter is written to or read.
 */
export class TimeWindowedCounter {
  private events: WindowEvent[] = [];
  private readonly windowMs: number;
  private readonly now: () => number;

  /**
   * @param windowMs - Retention window in milliseconds (default one hour).
   * @param now - Clock returning the current time in milliseconds; injectable for tests.
   */
  constructor(windowMs = 3_600_000, now: () => number = () => Date.now()) {
    this.windowMs = windowMs;
    this.now = now;
  }

  /**
   * Records an increment at the current time.
   *
   * @param labels - Labels of the series; copied, so later mutation has no effect.
   * @param delta - Amount to add; non-finite values are ignored.
   */
  inc(labels: MetricLabels = {}, delta = 1): void {
    if (!Number.isFinite(delta)) {
      return;
    }
    const event: WindowEvent = { timestamp: this.now(), labels: { ...labels }, delta };
    this.events.push(event);
    this.prune();
  }

  /**
   * Sums the increments of one label set over the trailing `durationMs`.
   *
   * @param labels - Label set to match (compared through `labelKey`).
   * @param durationMs - Look-back period; defaults to the retention window.
   * @returns The summed deltas of all matching retained events.
   */
  count(labels: MetricLabels = {}, durationMs = this.windowMs): number {
    this.prune();
    const wanted = labelKey(labels);
    const cutoff = this.now() - durationMs;
    let total = 0;
    for (const event of this.events) {
      if (event.timestamp >= cutoff && labelKey(event.labels) === wanted) {
        total += event.delta;
      }
    }
    return total;
  }

  /**
   * Average increments per second over the trailing `durationMs`.
   *
   * @returns `count / seconds`, or 0 when `durationMs` is not positive.
   */
  rate(labels: MetricLabels = {}, durationMs = this.windowMs): number {
    if (durationMs <= 0) {
      return 0;
    }
    const seconds = durationMs / 1000;
    return this.count(labels, durationMs) / seconds;
  }

  /** Number of events currently retained, after dropping expired ones. */
  size(): number {
    this.prune();
    return this.events.length;
  }

  /** Drops every event whose timestamp is older than the retention window. */
  private prune(): void {
    const cutoff = this.now() - this.windowMs;
    const kept: WindowEvent[] = [];
    for (const event of this.events) {
      if (event.timestamp >= cutoff) {
        kept.push(event);
      }
    }
    this.events = kept;
  }
}