pi-crew 0.1.35 → 0.1.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -0
- package/docs/architecture.md +8 -1
- package/docs/research-phase9-observability-reliability-plan.md +42 -42
- package/package.json +1 -1
- package/schema.json +42 -0
- package/src/config/config.ts +101 -0
- package/src/extension/register.ts +65 -2
- package/src/extension/registration/commands.ts +14 -3
- package/src/extension/registration/team-tool.ts +3 -1
- package/src/extension/team-tool/api.ts +27 -2
- package/src/extension/team-tool/context.ts +2 -0
- package/src/extension/team-tool/run.ts +2 -2
- package/src/extension/team-tool.ts +1 -1
- package/src/observability/correlation.ts +35 -0
- package/src/observability/event-to-metric.ts +54 -0
- package/src/observability/exporters/adapter.ts +24 -0
- package/src/observability/exporters/otlp-exporter.ts +65 -0
- package/src/observability/exporters/prometheus-exporter.ts +47 -0
- package/src/observability/metric-registry.ts +72 -0
- package/src/observability/metric-retention.ts +46 -0
- package/src/observability/metric-sink.ts +51 -0
- package/src/observability/metrics-primitives.ts +166 -0
- package/src/runtime/crash-recovery.ts +56 -0
- package/src/runtime/deadletter.ts +36 -0
- package/src/runtime/diagnostic-export.ts +8 -1
- package/src/runtime/heartbeat-gradient.ts +28 -0
- package/src/runtime/heartbeat-watcher.ts +80 -0
- package/src/runtime/retry-executor.ts +59 -0
- package/src/runtime/task-runner.ts +14 -1
- package/src/runtime/team-runner.ts +57 -5
- package/src/schema/config-schema.ts +29 -0
- package/src/state/event-log.ts +3 -2
- package/src/state/types.ts +7 -0
- package/src/ui/dashboard-panes/metrics-pane.ts +34 -0
- package/src/ui/heartbeat-aggregator.ts +15 -5
- package/src/ui/keybinding-map.ts +4 -2
- package/src/ui/run-action-dispatcher.ts +3 -2
- package/src/ui/run-dashboard.ts +11 -4
|
@@ -19,12 +19,37 @@ import { liveControlRealtimeMessage, publishLiveControlRealtime } from "../../su
|
|
|
19
19
|
import type { PiTeamsToolResult } from "../tool-result.ts";
|
|
20
20
|
import { configRecord, result, type TeamContext } from "./context.ts";
|
|
21
21
|
|
|
22
|
+
/**
 * Tests whether `value` matches a simple glob `pattern`.
 *
 * Only `*` is a wildcard (any run of characters, including none). Every other
 * character, including regex metacharacters such as `?`, matches literally.
 * The match is anchored at both ends.
 *
 * @param value - The string to test (a metric name at the call site in this file).
 * @param pattern - The glob pattern.
 * @returns `true` when the whole of `value` matches `pattern`.
 */
function globMatch(value: string, pattern: string): boolean {
	// Escape every regex metacharacter except `*`. `?` has to be in this set:
	// unescaped it is a quantifier, and a pattern such as "?" makes
	// `new RegExp` throw "Nothing to repeat".
	const literal = pattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&");
	// Collapse runs of `*` so "a***b" compiles to one `.*` rather than stacked wildcards.
	const source = literal.replace(/\*+/g, ".*");
	return new RegExp(`^${source}$`).test(value);
}
|
|
26
|
+
|
|
27
|
+
/**
 * Reports whether any point in `snapshot.values` carries a `runId` label
 * strictly equal to `runId`.
 *
 * Anything that is not an array of plain objects with a plain-object `labels`
 * field is treated as "no match" rather than an error.
 */
function snapshotHasRunId(snapshot: { values?: unknown }, runId: string): boolean {
	if (!Array.isArray(snapshot.values)) return false;
	for (const point of snapshot.values) {
		if (!point || typeof point !== "object" || Array.isArray(point)) continue;
		const labels = (point as { labels?: unknown }).labels;
		if (!labels || typeof labels !== "object" || Array.isArray(labels)) continue;
		if ((labels as Record<string, unknown>).runId === runId) return true;
	}
	return false;
}
|
|
35
|
+
|
|
22
36
|
export async function handleApi(params: TeamToolParamsValue, ctx: TeamContext): Promise<PiTeamsToolResult> {
|
|
37
|
+
const cfg = configRecord(params.config);
|
|
38
|
+
const operation = typeof cfg.operation === "string" ? cfg.operation : "read-manifest";
|
|
39
|
+
if (operation === "metrics-snapshot") {
|
|
40
|
+
const filter = typeof cfg.filter === "string" ? cfg.filter : undefined;
|
|
41
|
+
const runIdFilter = typeof cfg.runId === "string" ? cfg.runId : params.runId;
|
|
42
|
+
const snapshots = ctx.metricRegistry?.snapshot() ?? [];
|
|
43
|
+
const filtered = snapshots.filter((snapshot) => {
|
|
44
|
+
if (filter && !globMatch(snapshot.name, filter)) return false;
|
|
45
|
+
if (runIdFilter && !snapshotHasRunId(snapshot, runIdFilter)) return false;
|
|
46
|
+
return true;
|
|
47
|
+
});
|
|
48
|
+
return result(JSON.stringify(filtered, null, 2), { action: "api", status: "ok", ...(runIdFilter ? { runId: runIdFilter } : {}) });
|
|
49
|
+
}
|
|
23
50
|
if (!params.runId) return result("API requires runId.", { action: "api", status: "error" }, true);
|
|
24
51
|
const loaded = loadRunManifestById(ctx.cwd, params.runId);
|
|
25
52
|
if (!loaded) return result(`Run '${params.runId}' not found.`, { action: "api", status: "error" }, true);
|
|
26
|
-
const cfg = configRecord(params.config);
|
|
27
|
-
const operation = typeof cfg.operation === "string" ? cfg.operation : "read-manifest";
|
|
28
53
|
if (operation === "read-manifest") {
|
|
29
54
|
return result(JSON.stringify(loaded.manifest, null, 2), { action: "api", status: "ok", runId: loaded.manifest.runId, artifactsRoot: loaded.manifest.artifactsRoot });
|
|
30
55
|
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { ExtensionContext } from "@mariozechner/pi-coding-agent";
|
|
2
|
+
import type { MetricRegistry } from "../../observability/metric-registry.ts";
|
|
2
3
|
import type { TeamToolDetails } from "../team-tool-types.ts";
|
|
3
4
|
import { toolResult, type PiTeamsToolResult } from "../tool-result.ts";
|
|
4
5
|
|
|
@@ -6,6 +7,7 @@ export type TeamContext = Pick<ExtensionContext, "cwd"> & Partial<Pick<Extension
|
|
|
6
7
|
modelRegistry?: unknown;
|
|
7
8
|
sessionManager?: { getBranch?: () => unknown[] };
|
|
8
9
|
events?: { emit?: (event: string, data: unknown) => void };
|
|
10
|
+
metricRegistry?: MetricRegistry;
|
|
9
11
|
signal?: AbortSignal;
|
|
10
12
|
startForegroundRun?: (runner: (signal?: AbortSignal) => Promise<void>, runId?: string) => void;
|
|
11
13
|
onRunStarted?: (runId: string) => void;
|
|
@@ -134,7 +134,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
|
|
|
134
134
|
if (executeWorkers && ctx.startForegroundRun) {
|
|
135
135
|
ctx.onRunStarted?.(updatedManifest.runId);
|
|
136
136
|
ctx.startForegroundRun(async (signal) => {
|
|
137
|
-
await executeTeamRun({ manifest: updatedManifest, tasks, team, workflow, agents, executeWorkers, limits: executedConfig.limits, runtime, runtimeConfig: executedConfig.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal });
|
|
137
|
+
await executeTeamRun({ manifest: updatedManifest, tasks, team, workflow, agents, executeWorkers, limits: executedConfig.limits, runtime, runtimeConfig: executedConfig.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal, reliability: executedConfig.reliability, metricRegistry: ctx.metricRegistry });
|
|
138
138
|
}, updatedManifest.runId);
|
|
139
139
|
const text = [
|
|
140
140
|
`Started foreground pi-crew run ${updatedManifest.runId}.`,
|
|
@@ -150,7 +150,7 @@ export async function handleRun(params: TeamToolParamsValue, ctx: TeamContext):
|
|
|
150
150
|
].join("\n");
|
|
151
151
|
return result(text, { action: "run", status: "ok", runId: updatedManifest.runId, artifactsRoot: updatedManifest.artifactsRoot });
|
|
152
152
|
}
|
|
153
|
-
const executed = await executeTeamRun({ manifest: updatedManifest, tasks, team, workflow, agents, executeWorkers, limits: executedConfig.limits, runtime, runtimeConfig: executedConfig.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal: ctx.signal });
|
|
153
|
+
const executed = await executeTeamRun({ manifest: updatedManifest, tasks, team, workflow, agents, executeWorkers, limits: executedConfig.limits, runtime, runtimeConfig: executedConfig.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal: ctx.signal, reliability: executedConfig.reliability, metricRegistry: ctx.metricRegistry });
|
|
154
154
|
const text = [
|
|
155
155
|
`Created pi-crew run ${executed.manifest.runId}.`,
|
|
156
156
|
`Team: ${team.name}`,
|
|
@@ -185,7 +185,7 @@ export async function handleResume(params: TeamToolParamsValue, ctx: TeamContext
|
|
|
185
185
|
const loadedConfig = loadConfig(ctx.cwd);
|
|
186
186
|
const runtime = await resolveCrewRuntime(loadedConfig.config);
|
|
187
187
|
const executeWorkers = runtime.kind !== "scaffold";
|
|
188
|
-
const executed = await executeTeamRun({ manifest: resumeManifest, tasks: resetTasks, team, workflow, agents, executeWorkers, limits: loadedConfig.config.limits, runtime, runtimeConfig: loadedConfig.config.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal: ctx.signal });
|
|
188
|
+
const executed = await executeTeamRun({ manifest: resumeManifest, tasks: resetTasks, team, workflow, agents, executeWorkers, limits: loadedConfig.config.limits, runtime, runtimeConfig: loadedConfig.config.runtime, parentContext: buildParentContext(ctx), parentModel: ctx.model, modelRegistry: ctx.modelRegistry, modelOverride: params.model, signal: ctx.signal, reliability: loadedConfig.config.reliability, metricRegistry: ctx.metricRegistry });
|
|
189
189
|
return result([`Resumed run ${executed.manifest.runId}.`, `Status: ${executed.manifest.status}`, `Tasks: ${executed.tasks.length}`, `Artifacts: ${executed.manifest.artifactsRoot}`].join("\n"), { action: "resume", status: executed.manifest.status === "failed" ? "error" : "ok", runId: executed.manifest.runId, artifactsRoot: executed.manifest.artifactsRoot }, executed.manifest.status === "failed");
|
|
190
190
|
});
|
|
191
191
|
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { AsyncLocalStorage } from "node:async_hooks";
|
|
2
|
+
|
|
3
|
+
/** Correlation identifiers that travel with work executed inside a span. */
export interface CorrelationContext {
	/** Identifier shared by every span of one trace. */
	traceId: string;
	/** Identifier of the span that was active when this one was created, if any. */
	parentSpanId?: string;
	/** Identifier of this span. */
	spanId: string;
}

// Async-local holder for the active correlation context.
const correlationStore = new AsyncLocalStorage<CorrelationContext>();
// Module-wide sequence used as the last segment of generated span ids.
let spanSequence = 0;

/** Runs `fn` with `ctx` as the active correlation context and returns its result. */
export function withCorrelation<T>(ctx: CorrelationContext, fn: () => T): T {
	const outcome = correlationStore.run(ctx, fn);
	return outcome;
}

/** Returns the correlation context active for the current async execution, if any. */
export function getCurrentContext(): CorrelationContext | undefined {
	const active = correlationStore.getStore();
	return active;
}

/** Builds the next span id as `<runId>:<taskId>:<sequence>`; `taskId` defaults to "main". */
export function newSpanId(runId: string, taskId = "main"): string {
	spanSequence += 1;
	return [runId, taskId, spanSequence].join(":");
}

/**
 * Creates a context for a new span. It inherits the trace id of the active
 * context and records that context's span as its parent. With no active
 * context, the new span id doubles as the trace id.
 */
export function childCorrelation(runId: string, taskId: string): CorrelationContext {
	const parent = getCurrentContext();
	const spanId = newSpanId(runId, taskId);
	const traceId = parent === undefined ? spanId : parent.traceId ?? spanId;
	const parentSpanId = parent === undefined ? undefined : parent.spanId;
	return { traceId, parentSpanId, spanId };
}

/**
 * Returns a copy of `event` whose `data` also carries the active trace and
 * span identifiers. With no active context the event is returned unchanged.
 */
export function correlatedEvent<T extends { runId?: string; data?: Record<string, unknown> }>(event: T): T {
	const active = getCurrentContext();
	if (!active) return event;
	const { traceId, spanId, parentSpanId } = active;
	return { ...event, data: { ...event.data, traceId, spanId, parentSpanId } };
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
2
|
+
import { MetricRegistry } from "./metric-registry.ts";
|
|
3
|
+
|
|
4
|
+
/** Returns `value` when it is a non-array object; otherwise a fresh empty record. */
function recordValue(value: unknown): Record<string, unknown> {
	if (typeof value !== "object" || value === null || Array.isArray(value)) return {};
	return value as Record<string, unknown>;
}
|
|
7
|
+
|
|
8
|
+
/** Returns `value` when it is a non-empty string; otherwise `fallback`. */
function stringValue(value: unknown, fallback: string): string {
	if (typeof value !== "string") return fallback;
	return value === "" ? fallback : value;
}
|
|
11
|
+
|
|
12
|
+
/** Returns `value` when it is a finite number; otherwise `fallback` (default 0). */
function numberValue(value: unknown, fallback = 0): number {
	// Number.isFinite is false for every non-number, so it also covers the type check.
	return Number.isFinite(value) ? (value as number) : fallback;
}
|
|
15
|
+
|
|
16
|
+
export interface EventToMetricSubscription {
|
|
17
|
+
dispose(): void;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
/**
 * Registers the crew metrics on `registry` and subscribes to the crew
 * lifecycle events on `events`, translating each event into metric updates.
 *
 * When `events` (or its `on` method) is missing, the metrics are still
 * registered but nothing is subscribed.
 *
 * @returns A subscription whose `dispose()` removes every listener that was
 *   attached. Calling it more than once is a no-op.
 */
export function wireEventToMetrics(events: ExtensionAPI["events"] | undefined, registry: MetricRegistry): EventToMetricSubscription {
	// Registration order matters: it is the order metrics appear in registry snapshots.
	const runCount = registry.counter("crew.run.count", "Total runs by status");
	const taskCount = registry.counter("crew.task.count", "Total tasks by status");
	const subagentCount = registry.counter("crew.subagent.count", "Total subagent records by status");
	const mailboxCount = registry.counter("crew.mailbox.count", "Total mailbox messages by direction");
	registry.counter("crew.task.deadletter_total", "Deadletter triggers by reason");
	registry.gauge("crew.heartbeat.staleness_ms", "Heartbeat elapsed since last seen, milliseconds");
	const runDuration = registry.histogram("crew.run.duration_ms", "Run end-to-end duration, milliseconds");
	const taskDuration = registry.histogram("crew.task.duration_ms", "Task duration, milliseconds");
	registry.histogram("crew.task.retry_count", "Retries per task", [0, 1, 2, 3, 5, 10]);
	const tokenUsage = registry.histogram("crew.task.tokens_total", "Token usage per task");

	const handlers: Array<[string, (data: unknown) => void]> = [
		["crew.run.completed", (data) => {
			const payload = recordValue(data);
			runCount.inc({ status: "completed" });
			runDuration.observe({ team: stringValue(payload.team, "unknown") }, numberValue(payload.durationMs));
		}],
		["crew.run.failed", () => runCount.inc({ status: "failed" })],
		["crew.run.cancelled", () => runCount.inc({ status: "cancelled" })],
		["crew.task.completed", (data) => {
			const payload = recordValue(data);
			const role = stringValue(payload.role, "unknown");
			taskCount.inc({ status: "completed" });
			taskDuration.observe({ role }, numberValue(payload.durationMs));
			tokenUsage.observe({ role }, numberValue(payload.tokens));
		}],
		["crew.task.failed", () => taskCount.inc({ status: "failed" })],
		["crew.task.retry_attempt", (data) => {
			const payload = recordValue(data);
			taskCount.inc({ status: "retry" });
			// Looked up per event, so the counter is (re)registered on demand.
			const retries = registry.counter("crew.task.retry_attempt_total", "Retry attempts by run and task");
			retries.inc({ runId: stringValue(payload.runId, "unknown"), taskId: stringValue(payload.taskId, "unknown") });
		}],
		["crew.task.deadletter", (data) => {
			const payload = recordValue(data);
			const deadletters = registry.counter("crew.task.deadletter_total", "Deadletter triggers by reason");
			deadletters.inc({ reason: stringValue(payload.reason, "unknown") });
		}],
		["crew.subagent.completed", (data) => {
			const payload = recordValue(data);
			subagentCount.inc({ status: stringValue(payload.status, "completed") });
		}],
		["crew.subagent.failed", () => subagentCount.inc({ status: "failed" })],
		["crew.mailbox.message", (data) => {
			const payload = recordValue(data);
			mailboxCount.inc({ direction: stringValue(payload.direction, "unknown") });
		}],
	];

	const unsubscribers: Array<() => void> = [];
	handlers.forEach(([event, handler]) => {
		const guarded = (data: unknown): void => {
			try {
				handler(data);
			} catch {
				/* metric handlers must never break event delivery */
			}
		};
		const unsubscribe = events?.on?.(event, guarded);
		if (typeof unsubscribe === "function") unsubscribers.push(unsubscribe);
	});

	let disposed = false;
	return {
		dispose() {
			if (disposed) return;
			disposed = true;
			// splice(0) empties the list, so listeners cannot be removed twice.
			const pending = unsubscribers.splice(0);
			for (const off of pending) off();
		},
	};
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { MetricSnapshot } from "../metrics-primitives.ts";
|
|
2
|
+
|
|
3
|
+
export interface MetricExporter {
|
|
4
|
+
name: string;
|
|
5
|
+
push(snapshots: MetricSnapshot[]): Promise<void>;
|
|
6
|
+
dispose(): void;
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
/**
 * Fans a snapshot push and disposal out to a list of exporters.
 *
 * One failing exporter never prevents the others from being pushed to or
 * disposed.
 */
export class CompositeExporter implements MetricExporter {
	name = "composite";
	private readonly exporters: MetricExporter[];

	/** @param exporters - Exporters to fan out to, in call order. */
	constructor(exporters: MetricExporter[]) {
		this.exporters = exporters;
	}

	/**
	 * Pushes `snapshots` to every exporter and resolves once all have settled.
	 * Rejections, and exceptions thrown synchronously by an exporter's
	 * `push`, are absorbed so the remaining exporters still run.
	 */
	async push(snapshots: MetricSnapshot[]): Promise<void> {
		// The async wrapper turns a synchronous throw into a rejection that
		// allSettled can absorb, instead of aborting the map() midway.
		await Promise.allSettled(this.exporters.map(async (exporter: MetricExporter) => exporter.push(snapshots)));
	}

	/**
	 * Disposes every exporter, even when one of them throws.
	 *
	 * @throws The first error raised by an exporter's `dispose()`, after all
	 *   exporters have been given the chance to dispose.
	 */
	dispose(): void {
		let failed = false;
		let firstError: unknown;
		for (const exporter of this.exporters) {
			try {
				exporter.dispose();
			} catch (error) {
				if (!failed) {
					failed = true;
					firstError = error;
				}
			}
		}
		if (failed) throw firstError;
	}
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import { logInternalError } from "../../utils/internal-error.ts";
|
|
2
|
+
import type { MetricRegistry } from "../metric-registry.ts";
|
|
3
|
+
import type { MetricSnapshot } from "../metrics-primitives.ts";
|
|
4
|
+
import type { MetricExporter } from "./adapter.ts";
|
|
5
|
+
|
|
6
|
+
export interface OTLPExporterOptions {
|
|
7
|
+
endpoint: string;
|
|
8
|
+
headers?: Record<string, string>;
|
|
9
|
+
intervalMs?: number;
|
|
10
|
+
timeoutMs?: number;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
/** Numeric value of OTLP `AGGREGATION_TEMPORALITY_CUMULATIVE`. */
const OTLP_AGGREGATION_TEMPORALITY_CUMULATIVE = 2;

/**
 * Converts the points of one snapshot into OTLP data points.
 *
 * Points that have `buckets` and `counts` become histogram data points with
 * `count`, `sum`, `bucketCounts` and `explicitBounds`. All other points
 * become number data points carrying `asDouble`. Label values are always
 * exported as string attributes.
 *
 * @param snapshot - The metric snapshot whose `values` are converted.
 * @param timeUnixNano - Observation time in nanoseconds since the Unix epoch,
 *   as a decimal string. Defaults to the current time.
 */
function pointValues(snapshot: MetricSnapshot, timeUnixNano = `${Date.now()}000000`): unknown[] {
	return snapshot.values.map((value) => {
		const attributes = Object.entries(value.labels).map(([key, item]) => ({ key, value: { stringValue: String(item) } }));
		if ("buckets" in value && "counts" in value) {
			// OTLP wants only the finite upper bounds, plus one more count than
			// bounds (the last count is the overflow bucket). A missing count is 0.
			const explicitBounds = value.buckets.filter((bound: number) => Number.isFinite(bound));
			const bucketCounts = Array.from({ length: explicitBounds.length + 1 }, (_, index) => value.counts[index] ?? 0);
			return { attributes, timeUnixNano, count: value.count, sum: value.sum, bucketCounts, explicitBounds };
		}
		return { attributes, timeUnixNano, asDouble: value.value };
	});
}

/**
 * Builds an OTLP/JSON `ExportMetricsServiceRequest`-shaped payload from
 * registry snapshots.
 *
 * Counters are exported as cumulative monotonic sums, gauges as gauges, and
 * histograms as cumulative histograms. All points of one call share a single
 * timestamp.
 *
 * @param snapshots - Snapshots to export.
 * @returns A JSON-serialisable object with a single resource ("pi-crew") and scope.
 */
export function convertToOTLP(snapshots: MetricSnapshot[]): unknown {
	const timeUnixNano = `${Date.now()}000000`;
	const metrics = snapshots.map((snapshot) => {
		const header = { name: snapshot.name, description: snapshot.description };
		const dataPoints = pointValues(snapshot, timeUnixNano);
		if (snapshot.type === "histogram") {
			return { ...header, histogram: { dataPoints, aggregationTemporality: OTLP_AGGREGATION_TEMPORALITY_CUMULATIVE } };
		}
		if (snapshot.type === "gauge") {
			return { ...header, gauge: { dataPoints } };
		}
		return { ...header, sum: { dataPoints, aggregationTemporality: OTLP_AGGREGATION_TEMPORALITY_CUMULATIVE, isMonotonic: true } };
	});
	return {
		resourceMetrics: [{
			resource: { attributes: [{ key: "service.name", value: { stringValue: "pi-crew" } }] },
			scopeMetrics: [{
				scope: { name: "pi-crew" },
				metrics,
			}],
		}],
	};
}
|
|
28
|
+
|
|
29
|
+
/**
 * Exports registry snapshots to an OTLP/HTTP endpoint as JSON.
 *
 * `start()` schedules periodic pushes; `push()` can also be called directly.
 * Export failures are logged through `logInternalError` and never thrown.
 */
export class OTLPExporter implements MetricExporter {
	name = "otlp";
	private timer?: ReturnType<typeof setInterval>;
	private readonly opts: OTLPExporterOptions;
	private readonly registry: MetricRegistry;

	/**
	 * @param opts - Endpoint, optional extra headers, push interval and request timeout.
	 * @param registry - Registry whose snapshot is pushed on every interval tick.
	 */
	constructor(opts: OTLPExporterOptions, registry: MetricRegistry) {
		this.opts = opts;
		this.registry = registry;
	}

	/**
	 * (Re)starts the periodic push, every `intervalMs` (default 60 s). Any
	 * previous timer is cleared first. The timer is unref'd where supported,
	 * so it does not keep the process alive.
	 */
	start(): void {
		this.dispose();
		this.timer = setInterval(() => { void this.push(this.registry.snapshot()); }, this.opts.intervalMs ?? 60_000);
		this.timer.unref?.();
	}

	/**
	 * POSTs `snapshots` to the configured endpoint. The request is aborted
	 * after `timeoutMs` (default 10 s).
	 *
	 * Network errors, timeouts and non-2xx responses are logged under
	 * "otlp-export". The returned promise never rejects.
	 */
	async push(snapshots: MetricSnapshot[]): Promise<void> {
		try {
			const timeoutMs = this.opts.timeoutMs ?? 10_000;
			const controller = new AbortController();
			const timer = setTimeout(() => controller.abort(), timeoutMs);
			try {
				const response = await fetch(this.opts.endpoint, { method: "POST", headers: { "content-type": "application/json", ...(this.opts.headers ?? {}) }, body: JSON.stringify(convertToOTLP(snapshots)), signal: controller.signal });
				// The body is not needed; cancel it so the connection is released promptly.
				await response.body?.cancel();
				// fetch only rejects on network failure, so a collector rejection
				// (4xx/5xx) has to be detected from the status.
				if (!response.ok) throw new Error(`OTLP export failed with HTTP ${response.status} ${response.statusText}`.trim());
			} finally {
				clearTimeout(timer);
			}
		} catch (error) {
			logInternalError("otlp-export", error);
		}
	}

	/** Stops the periodic push. Safe to call when no timer is running. */
	dispose(): void {
		if (this.timer) clearInterval(this.timer);
		this.timer = undefined;
	}
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import type { HistogramPoint, MetricLabels, MetricPoint, MetricSnapshot } from "../metrics-primitives.ts";
|
|
2
|
+
|
|
3
|
+
/** Converts a dotted metric name to its Prometheus form by turning every `.` into `_`. */
function prometheusName(name: string): string {
	return name.split(".").join("_");
}
|
|
6
|
+
|
|
7
|
+
/**
 * Escapes a label value for the Prometheus text format: backslash becomes
 * `\\`, a line feed becomes `\n`, and a double quote becomes `\"`.
 */
function escapeLabel(value: string): string {
	// Single pass: each special character maps to its escape independently.
	return value.replace(/[\\\n"]/g, (char) => (char === "\n" ? "\\n" : `\\${char}`));
}
|
|
10
|
+
|
|
11
|
+
/**
 * Renders labels as a Prometheus label set, e.g. `{a="1",b="x"}`, in the
 * object's own key order. Returns an empty string when there are no labels.
 */
function labelsText(labels: MetricLabels): string {
	const pairs: string[] = [];
	for (const [key, value] of Object.entries(labels)) {
		pairs.push(`${key}="${escapeLabel(String(value))}"`);
	}
	return pairs.length === 0 ? "" : `{${pairs.join(",")}}`;
}
|
|
16
|
+
|
|
17
|
+
/** Maps a snapshot type to its Prometheus `# TYPE` keyword; anything that is not a histogram or gauge is a counter. */
function metricType(type: MetricSnapshot["type"]): string {
	switch (type) {
		case "histogram":
		case "gauge":
			return type;
		default:
			return "counter";
	}
}
|
|
20
|
+
|
|
21
|
+
/** Type guard: a point is a histogram point when it has both `buckets` and `counts`. */
function isHistogramPoint(value: MetricPoint | HistogramPoint): value is HistogramPoint {
	const requiredFields = ["buckets", "counts"];
	return requiredFields.every((field) => field in value);
}
|
|
24
|
+
|
|
25
|
+
/**
 * Renders snapshots in the Prometheus text exposition format.
 *
 * Each metric gets a `# HELP` and a `# TYPE` line. Counter and gauge points
 * produce one sample line each. Histogram points produce cumulative
 * `_bucket` lines (always ending with `le="+Inf"`), then `_sum` and `_count`.
 *
 * @param snapshots - Metric snapshots to render.
 * @returns The exposition text, terminated by a newline.
 */
export function formatPrometheus(snapshots: MetricSnapshot[]): string {
	const lines: string[] = [];
	for (const snapshot of snapshots) {
		const name = prometheusName(snapshot.name);
		// HELP text must have backslashes and line feeds escaped, or a
		// multi-line description would break the line-oriented format.
		const help = snapshot.description.replace(/\\/g, "\\\\").replace(/\n/g, "\\n");
		lines.push(`# HELP ${name} ${help}`);
		lines.push(`# TYPE ${name} ${metricType(snapshot.type)}`);
		for (const value of snapshot.values) {
			if (!isHistogramPoint(value)) {
				lines.push(`${name}${labelsText(value.labels)} ${value.value}`);
				continue;
			}
			let cumulative = 0;
			let hasInfBucket = false;
			for (let index = 0; index < value.buckets.length; index += 1) {
				// Stored counts are per bucket; Prometheus buckets are cumulative.
				cumulative += value.counts[index] ?? 0;
				const finite = Number.isFinite(value.buckets[index]);
				if (!finite) hasInfBucket = true;
				const le = finite ? String(value.buckets[index]) : "+Inf";
				lines.push(`${name}_bucket${labelsText({ ...value.labels, le })} ${cumulative}`);
			}
			// The format requires a +Inf bucket equal to the total count. Emit it
			// when the snapshot's bucket list did not contain one.
			if (!hasInfBucket) {
				lines.push(`${name}_bucket${labelsText({ ...value.labels, le: "+Inf" })} ${value.count}`);
			}
			lines.push(`${name}_sum${labelsText(value.labels)} ${value.sum}`);
			lines.push(`${name}_count${labelsText(value.labels)} ${value.count}`);
		}
	}
	return `${lines.join("\n")}\n`;
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import { Counter, Gauge, Histogram, type Metric, type MetricSnapshot } from "./metrics-primitives.ts";
|
|
2
|
+
|
|
3
|
+
// crew.<domain>.<measure>: lowercase letters only, underscores allowed after the first letter of <measure>.
const METRIC_NAME_PATTERN = /^crew\.[a-z]+\.[a-z][a-z_]*$/;

/**
 * Validates a metric name against the `crew.<domain>.<measure>` convention.
 *
 * @throws Error when `name` does not match the convention.
 */
function assertMetricName(name: string): void {
	if (METRIC_NAME_PATTERN.test(name)) return;
	throw new Error(`Invalid metric name '${name}'. Expected crew.<domain>.<measure>.`);
}
|
|
8
|
+
|
|
9
|
+
/**
 * Name-keyed collection of metrics.
 *
 * The `register*` methods create a metric and fail on duplicates. The
 * `counter` / `gauge` / `histogram` accessors return the existing metric of
 * that kind or create it on first use.
 */
export class MetricRegistry {
	private metrics = new Map<string, Metric>();

	/**
	 * Validates `name`, rejects duplicates, then builds and stores the metric.
	 * The factory runs only after both checks have passed.
	 */
	private insert<T extends Metric>(name: string, create: () => T): T {
		assertMetricName(name);
		if (this.metrics.has(name)) throw new Error(`Metric '${name}' is already registered.`);
		const created = create();
		this.metrics.set(name, created);
		return created;
	}

	/** Registers a new counter. Throws on an invalid or already-registered name. */
	registerCounter(name: string, description: string): Counter {
		return this.insert(name, () => new Counter(name, description));
	}

	/** Registers a new gauge. Throws on an invalid or already-registered name. */
	registerGauge(name: string, description: string): Gauge {
		return this.insert(name, () => new Gauge(name, description));
	}

	/** Registers a new histogram with optional bucket bounds. Throws on an invalid or already-registered name. */
	registerHistogram(name: string, description: string, buckets?: number[]): Histogram {
		return this.insert(name, () => new Histogram(name, description, buckets));
	}

	/** Returns the counter called `name`, registering it if absent. Throws if `name` holds another metric kind. */
	counter(name: string, description: string): Counter {
		const found = this.metrics.get(name);
		if (found === undefined) return this.registerCounter(name, description);
		if (found instanceof Counter) return found;
		throw new Error(`Metric '${name}' is not a counter.`);
	}

	/** Returns the gauge called `name`, registering it if absent. Throws if `name` holds another metric kind. */
	gauge(name: string, description: string): Gauge {
		const found = this.metrics.get(name);
		if (found === undefined) return this.registerGauge(name, description);
		if (found instanceof Gauge) return found;
		throw new Error(`Metric '${name}' is not a gauge.`);
	}

	/**
	 * Returns the histogram called `name`, registering it if absent. `buckets`
	 * is only used when the histogram is created here. Throws if `name` holds
	 * another metric kind.
	 */
	histogram(name: string, description: string, buckets?: number[]): Histogram {
		const found = this.metrics.get(name);
		if (found === undefined) return this.registerHistogram(name, description, buckets);
		if (found instanceof Histogram) return found;
		throw new Error(`Metric '${name}' is not a histogram.`);
	}

	/** Looks up a metric by name without creating it. */
	get(name: string): Metric | undefined {
		return this.metrics.get(name);
	}

	/** Snapshots every registered metric, in registration order. */
	snapshot(): MetricSnapshot[] {
		return Array.from(this.metrics.values(), (metric) => metric.snapshot());
	}

	/** Removes every registered metric from the registry. */
	dispose(): void {
		this.metrics.clear();
	}
}
|
|
69
|
+
|
|
70
|
+
/** Creates a new, empty metric registry. */
export function createMetricRegistry(): MetricRegistry {
	const registry = new MetricRegistry();
	return registry;
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { labelKey, type MetricLabels } from "./metrics-primitives.ts";
|
|
2
|
+
|
|
3
|
+
interface WindowEvent {
|
|
4
|
+
timestamp: number;
|
|
5
|
+
labels: MetricLabels;
|
|
6
|
+
delta: number;
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
/**
 * Counter that keeps individual increments for a sliding time window, so
 * totals and rates can be computed over recent history.
 *
 * Increments older than `windowMs` are dropped whenever the counter is
 * written to or read from.
 */
export class TimeWindowedCounter {
	private events: WindowEvent[] = [];
	private readonly windowMs: number;
	private readonly now: () => number;

	/**
	 * @param windowMs - Retention window in milliseconds (default one hour).
	 * @param now - Clock returning milliseconds; defaults to `Date.now`.
	 */
	constructor(windowMs = 3_600_000, now: () => number = () => Date.now()) {
		this.now = now;
		this.windowMs = windowMs;
	}

	/** Records an increment of `delta` for `labels` at the current time. Non-finite deltas are ignored. */
	inc(labels: MetricLabels = {}, delta = 1): void {
		if (!Number.isFinite(delta)) return;
		// Copy the labels so later mutation by the caller cannot alter the record.
		const entry: WindowEvent = { timestamp: this.now(), labels: { ...labels }, delta };
		this.events.push(entry);
		this.prune();
	}

	/**
	 * Sums the deltas recorded for exactly this label set within the last
	 * `durationMs` (default: the whole retention window).
	 */
	count(labels: MetricLabels = {}, durationMs = this.windowMs): number {
		this.prune();
		const wanted = labelKey(labels);
		const oldest = this.now() - durationMs;
		let total = 0;
		for (const event of this.events) {
			if (!(event.timestamp >= oldest)) continue;
			if (labelKey(event.labels) !== wanted) continue;
			total += event.delta;
		}
		return total;
	}

	/** Average increments per second over the last `durationMs`; 0 when `durationMs` is not positive. */
	rate(labels: MetricLabels = {}, durationMs = this.windowMs): number {
		if (durationMs <= 0) return 0;
		const seconds = durationMs / 1000;
		return this.count(labels, durationMs) / seconds;
	}

	/** Number of increments currently retained, across all label sets. */
	size(): number {
		this.prune();
		return this.events.length;
	}

	/** Drops every increment older than the retention window. */
	private prune(): void {
		const oldest = this.now() - this.windowMs;
		const kept: WindowEvent[] = [];
		for (const event of this.events) {
			if (event.timestamp >= oldest) kept.push(event);
		}
		this.events = kept;
	}
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import * as fs from "node:fs";
|
|
2
|
+
import * as path from "node:path";
|
|
3
|
+
import { redactSecrets } from "../runtime/diagnostic-export.ts";
|
|
4
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
5
|
+
import type { MetricRegistry } from "./metric-registry.ts";
|
|
6
|
+
import type { MetricSnapshot } from "./metrics-primitives.ts";
|
|
7
|
+
|
|
8
|
+
export interface MetricSink {
|
|
9
|
+
writeSnapshot(snapshots: MetricSnapshot[]): void;
|
|
10
|
+
dispose(): void;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
export interface MetricFileSinkOptions {
|
|
14
|
+
crewRoot: string;
|
|
15
|
+
registry: MetricRegistry;
|
|
16
|
+
retentionDays?: number;
|
|
17
|
+
intervalMs?: number;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
/**
 * Deletes `.jsonl` files in `dir` whose modification time is more than
 * `retentionDays` days before `now`.
 *
 * Does nothing when `dir` does not exist. A failure to stat or delete one
 * file is logged under "metric-sink.rotate" and does not stop the remaining
 * files from being processed.
 */
function rotateOldFiles(dir: string, retentionDays: number, now = Date.now()): void {
	if (!fs.existsSync(dir)) return;
	const maxAge = retentionDays * 24 * 60 * 60 * 1000;
	const candidates = fs.readdirSync(dir).filter((file) => file.endsWith(".jsonl"));
	for (const file of candidates) {
		const fullPath = path.join(dir, file);
		try {
			const ageMs = now - fs.statSync(fullPath).mtimeMs;
			if (ageMs > maxAge) fs.unlinkSync(fullPath);
		} catch (error) {
			logInternalError("metric-sink.rotate", error, fullPath);
		}
	}
}
|
|
33
|
+
|
|
34
|
+
/**
 * Creates a sink that appends registry snapshots to daily JSONL files under
 * `<crewRoot>/state/metrics/<YYYY-MM-DD>.jsonl`.
 *
 * A timer writes `opts.registry.snapshot()` every `intervalMs` (default
 * 60 s). It is unref'd where supported, so it does not keep the process
 * alive. Before each write, files older than `retentionDays` (default 7) are
 * rotated out. Snapshots pass through `redactSecrets` before being written.
 *
 * @returns A sink with `writeSnapshot` for on-demand writes and `dispose` to
 *   stop the timer. Write failures are logged, never thrown.
 */
export function createMetricFileSink(opts: MetricFileSinkOptions): MetricSink {
	const dir = path.join(opts.crewRoot, "state", "metrics");
	const retentionDays = opts.retentionDays ?? 7;
	const writeSnapshot = (snapshots: MetricSnapshot[]): void => {
		try {
			// One clock read feeds both the file name and `exportedAt`, so a
			// record written around midnight cannot land in the wrong day's file.
			const exportedAt = new Date().toISOString();
			const date = exportedAt.slice(0, 10);
			fs.mkdirSync(dir, { recursive: true });
			try {
				rotateOldFiles(dir, retentionDays);
			} catch (error) {
				// Rotation is housekeeping; failing to list the directory must
				// not cost us the snapshot itself.
				logInternalError("metric-sink.rotate", error, dir);
			}
			const redacted = redactSecrets(snapshots) as MetricSnapshot[];
			fs.appendFileSync(path.join(dir, `${date}.jsonl`), `${JSON.stringify({ exportedAt, snapshots: redacted })}\n`, "utf-8");
		} catch (error) {
			logInternalError("metric-sink.write", error);
		}
	};
	const timer = setInterval(() => writeSnapshot(opts.registry.snapshot()), opts.intervalMs ?? 60_000);
	timer.unref?.();
	return { writeSnapshot, dispose: () => clearInterval(timer) };
}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
export type MetricLabelValue = string | number;
|
|
2
|
+
export type MetricLabels = Record<string, MetricLabelValue>;
|
|
3
|
+
|
|
4
|
+
export interface MetricPoint {
|
|
5
|
+
labels: MetricLabels;
|
|
6
|
+
value: number;
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
export interface HistogramPoint {
|
|
10
|
+
labels: MetricLabels;
|
|
11
|
+
buckets: number[];
|
|
12
|
+
counts: number[];
|
|
13
|
+
sum: number;
|
|
14
|
+
count: number;
|
|
15
|
+
quantiles: Record<string, number>;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export interface MetricSnapshot {
|
|
19
|
+
type: "counter" | "gauge" | "histogram";
|
|
20
|
+
name: string;
|
|
21
|
+
description: string;
|
|
22
|
+
values: MetricPoint[] | HistogramPoint[];
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
interface StoredValue {
|
|
26
|
+
labels: MetricLabels;
|
|
27
|
+
value: number;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
interface StoredHistogram {
|
|
31
|
+
labels: MetricLabels;
|
|
32
|
+
counts: number[];
|
|
33
|
+
sum: number;
|
|
34
|
+
count: number;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
export const DEFAULT_HISTOGRAM_BUCKETS = [1, 2, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000] as const;
|
|
38
|
+
|
|
39
|
+
/** Returns a copy of `labels` whose keys were inserted in `localeCompare` order, so equal label sets have one canonical form. */
function normalizeLabels(labels: MetricLabels = {}): MetricLabels {
	const entries = Object.entries(labels);
	entries.sort(([left], [right]) => left.localeCompare(right));
	const normalized: MetricLabels = {};
	entries.forEach(([key, value]) => {
		normalized[key] = value;
	});
	return normalized;
}
|
|
44
|
+
|
|
45
|
+
/** Builds a string key for a label set: the JSON form of the normalised labels, so key order in the input does not matter. */
export function labelKey(labels: MetricLabels = {}): string {
	const canonical = normalizeLabels(labels);
	return JSON.stringify(canonical);
}
|
|
48
|
+
|
|
49
|
+
/** Returns a shallow copy of `labels`, so callers cannot mutate stored label sets. */
function cloneLabels(labels: MetricLabels): MetricLabels {
	const copy: MetricLabels = { ...labels };
	return copy;
}
|
|
52
|
+
|
|
53
|
+
/** Base class of all metrics: a named, described value source that can be snapshotted. */
export abstract class Metric {
	/** Metric name. */
	readonly name: string;
	/** Human-readable description of what the metric measures. */
	readonly description: string;

	constructor(name: string, description: string) {
		this.description = description;
		this.name = name;
	}

	/** Returns the metric's current values as a snapshot. */
	abstract snapshot(): MetricSnapshot;
}
|
|
64
|
+
|
|
65
|
+
/** Accumulating metric that keeps one running total per distinct label set. */
export class Counter extends Metric {
	private values = new Map<string, StoredValue>();

	/** Adds `delta` (default 1) to the total for `labels`. Non-finite deltas are ignored. */
	inc(labels: MetricLabels = {}, delta = 1): void {
		if (!Number.isFinite(delta)) return;
		const key = labelKey(labels);
		const previous = this.values.get(key);
		// Reuse the stored (already normalised) labels when the series exists.
		const storedLabels = previous ? previous.labels : normalizeLabels(labels);
		const total = (previous ? previous.value : 0) + delta;
		this.values.set(key, { labels: storedLabels, value: total });
	}

	/** Current total for `labels`, or 0 when that label set has never been incremented. */
	value(labels: MetricLabels = {}): number {
		const stored = this.values.get(labelKey(labels));
		return stored?.value ?? 0;
	}

	/** Snapshot of every label set's total; labels are copied so the result can be mutated freely. */
	snapshot(): MetricSnapshot {
		const points: MetricPoint[] = [];
		for (const entry of this.values.values()) {
			points.push({ labels: cloneLabels(entry.labels), value: entry.value });
		}
		return { type: "counter", name: this.name, description: this.description, values: points };
	}
}
|
|
83
|
+
|
|
84
|
+
/**
 * Metric holding one arbitrary finite number per label set; it can be
 * overwritten or adjusted up and down.
 */
export class Gauge extends Metric {
	/** Stored series, keyed by `labelKey(labels)`. */
	private values = new Map<string, StoredValue>();

	/**
	 * Overwrites the series identified by `labels` with `value`.
	 * Non-finite values are ignored and leave the series untouched.
	 */
	set(labels: MetricLabels = {}, value: number): void {
		if (Number.isFinite(value)) {
			const seriesKey = labelKey(labels);
			const stored: StoredValue = { labels: normalizeLabels(labels), value };
			this.values.set(seriesKey, stored);
		}
	}

	/**
	 * Shifts the series identified by `labels` by `delta`, starting from 0 when
	 * the series does not exist. Non-finite deltas are ignored.
	 */
	add(labels: MetricLabels = {}, delta: number): void {
		if (Number.isFinite(delta)) {
			const next = this.value(labels) + delta;
			this.set(labels, next);
		}
	}

	/**
	 * @returns The stored value for `labels`, or 0 when the series was never set.
	 */
	value(labels: MetricLabels = {}): number {
		const stored = this.values.get(labelKey(labels));
		return stored?.value ?? 0;
	}

	/**
	 * @returns A `"gauge"` snapshot listing every stored series with copied label objects.
	 */
	snapshot(): MetricSnapshot {
		const series: StoredValue[] = [];
		for (const stored of this.values.values()) {
			series.push({ labels: cloneLabels(stored.labels), value: stored.value });
		}
		return { type: "gauge", name: this.name, description: this.description, values: series };
	}
}
|
|
105
|
+
|
|
106
|
+
/**
 * Bucketed distribution metric, tracked independently per label set.
 * Each series stores one non-cumulative count per finite bucket plus a final
 * overflow slot, together with the running sum and observation count.
 */
export class Histogram extends Metric {
	/** Finite bucket upper bounds, ascending and de-duplicated. */
	private readonly buckets: number[];
	/** Stored series, keyed by `labelKey(labels)`. */
	private observations = new Map<string, StoredHistogram>();

	/**
	 * @param name Metric name.
	 * @param description Metric description.
	 * @param buckets Optional bucket upper bounds. Non-finite entries are
	 * discarded; when nothing usable remains, `DEFAULT_HISTOGRAM_BUCKETS` is used.
	 */
	constructor(name: string, description: string, buckets?: number[]) {
		super(name, description);
		// Filter before choosing the fallback: a list such as [NaN] or [Infinity]
		// would otherwise leave the histogram with no finite buckets at all.
		const finite = (buckets ?? []).filter((bucket) => Number.isFinite(bucket));
		const source = finite.length > 0 ? finite : [...DEFAULT_HISTOGRAM_BUCKETS];
		this.buckets = [...new Set(source)].sort((left, right) => left - right);
	}

	/**
	 * Records one observation in the series identified by `labels`.
	 *
	 * @param labels Label set selecting the series; defaults to the unlabeled series.
	 * @param value Observed value; non-finite values are ignored.
	 */
	observe(labels: MetricLabels = {}, value: number): void {
		if (!Number.isFinite(value)) return;
		const key = labelKey(labels);
		const current = this.observations.get(key) ?? { labels: normalizeLabels(labels), counts: new Array(this.buckets.length + 1).fill(0) as number[], sum: 0, count: 0 };
		const bucketIndex = this.buckets.findIndex((bucket) => value <= bucket);
		// Values above every finite bound land in the trailing overflow slot.
		const slot = bucketIndex === -1 ? this.buckets.length : bucketIndex;
		current.counts[slot] = (current.counts[slot] ?? 0) + 1;
		current.sum += value;
		current.count += 1;
		this.observations.set(key, current);
	}

	/**
	 * Estimates the `q`-quantile of a series by linear interpolation inside the
	 * bucket that contains the target rank.
	 *
	 * @param labels Label set selecting the series; defaults to the unlabeled series.
	 * @param q Quantile, clamped to the range 0 to 1.
	 * @returns The estimate, or `NaN` when the series has no observations or `q` is not finite.
	 */
	quantile(labels: MetricLabels = {}, q: number): number {
		const obs = this.observations.get(labelKey(labels));
		if (!obs || obs.count === 0 || !Number.isFinite(q)) return Number.NaN;
		const bounded = Math.min(1, Math.max(0, q));
		// Rank of the wanted observation; at least 1 so q = 0 resolves to the first one.
		const target = Math.max(1, bounded * obs.count);
		let cumulative = 0;
		for (let index = 0; index < obs.counts.length; index += 1) {
			const count = obs.counts[index] ?? 0;
			const previous = cumulative;
			cumulative += count;
			if (cumulative < target) continue;
			const isOverflow = index >= this.buckets.length;
			if (index === 0 && !isOverflow) {
				const firstBound = this.buckets[0] ?? 0;
				// The lowest bucket has no known lower edge. Assuming 0 only makes
				// sense for a positive bound; otherwise interpolating from 0 would
				// yield estimates above the bucket that fall as q rises, so
				// report the bound itself.
				if (firstBound <= 0) return firstBound;
			}
			const lower = index === 0 ? 0 : this.buckets[index - 1] ?? this.buckets.at(-1) ?? 0;
			// The overflow slot has no upper edge; the series mean (never below
			// `lower`) stands in for it.
			const upper = isOverflow ? Math.max(lower, obs.sum / obs.count) : this.buckets[index] ?? lower;
			// `count` is at least 1 here: the running total only reaches the target in a non-empty slot.
			const fraction = (target - previous) / count;
			return lower + fraction * (upper - lower);
		}
		return this.buckets.at(-1) ?? Number.NaN;
	}

	/**
	 * @returns Number of observations recorded for `labels`, or 0 when the series does not exist.
	 */
	count(labels: MetricLabels = {}): number {
		return this.observations.get(labelKey(labels))?.count ?? 0;
	}

	/**
	 * @returns A `"histogram"` snapshot. Every series lists the bucket bounds
	 * (with `+Infinity` appended for the overflow slot), the per-bucket counts,
	 * sum, count and the p50/p95/p99 estimates; arrays and labels are copies.
	 */
	snapshot(): MetricSnapshot {
		return {
			type: "histogram",
			name: this.name,
			description: this.description,
			values: [...this.observations.values()].map((entry) => ({
				labels: cloneLabels(entry.labels),
				buckets: [...this.buckets, Number.POSITIVE_INFINITY],
				counts: [...entry.counts],
				sum: entry.sum,
				count: entry.count,
				quantiles: { p50: this.quantile(entry.labels, 0.5), p95: this.quantile(entry.labels, 0.95), p99: this.quantile(entry.labels, 0.99) },
			})),
		};
	}
}
|