@botbotgo/agent-harness 0.0.104 → 0.0.105
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const AGENT_HARNESS_VERSION = "0.0.
|
|
1
|
+
export declare const AGENT_HARNESS_VERSION = "0.0.104";
|
package/dist/package-version.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export const AGENT_HARNESS_VERSION = "0.0.
|
|
1
|
+
export const AGENT_HARNESS_VERSION = "0.0.104";
|
|
@@ -24,18 +24,6 @@ type HealthMonitorConfig = {
|
|
|
24
24
|
degradedAbove: number;
|
|
25
25
|
unhealthyAbove: number;
|
|
26
26
|
};
|
|
27
|
-
checkpointBytes: {
|
|
28
|
-
degradedAbove: number;
|
|
29
|
-
unhealthyAbove: number;
|
|
30
|
-
};
|
|
31
|
-
runtimeDbBytes: {
|
|
32
|
-
degradedAbove: number;
|
|
33
|
-
unhealthyAbove: number;
|
|
34
|
-
};
|
|
35
|
-
artifactBytes: {
|
|
36
|
-
degradedAbove: number;
|
|
37
|
-
unhealthyAbove: number;
|
|
38
|
-
};
|
|
39
27
|
};
|
|
40
28
|
};
|
|
41
29
|
type HealthMonitorOptions = {
|
|
@@ -51,12 +39,9 @@ export declare function readHealthMonitorConfig(workspace: WorkspaceBundle): Hea
|
|
|
51
39
|
export declare class HealthMonitor {
|
|
52
40
|
private readonly options;
|
|
53
41
|
private readonly config;
|
|
54
|
-
private readonly runRoots;
|
|
55
|
-
private readonly checkpointDbPaths;
|
|
56
42
|
private readonly llmSamples;
|
|
57
43
|
private timer;
|
|
58
44
|
private latestSnapshot;
|
|
59
|
-
private runtimeEventSequence;
|
|
60
45
|
constructor(options: HealthMonitorOptions);
|
|
61
46
|
recordLlmSuccess(latencyMs: number, nowMs?: number): void;
|
|
62
47
|
recordLlmFailure(latencyMs: number, nowMs?: number): void;
|
|
@@ -67,15 +52,7 @@ export declare class HealthMonitor {
|
|
|
67
52
|
evaluate(nowMs?: number): Promise<RuntimeHealthSnapshot>;
|
|
68
53
|
private llmStats;
|
|
69
54
|
private evaluateLlmCheck;
|
|
70
|
-
private llmSymptoms;
|
|
71
|
-
private evaluatePersistenceCheck;
|
|
72
|
-
private evaluateCapacityCheck;
|
|
73
|
-
private capacitySymptoms;
|
|
74
55
|
private evaluateWorkloadCheck;
|
|
75
|
-
private workloadSymptoms;
|
|
76
56
|
private countStuckRuns;
|
|
77
|
-
private sumRuntimeDbBytes;
|
|
78
|
-
private sumCheckpointDbBytes;
|
|
79
|
-
private sumArtifactBytes;
|
|
80
57
|
}
|
|
81
58
|
export {};
|
|
@@ -1,7 +1,4 @@
|
|
|
1
|
-
import path from "node:path";
|
|
2
|
-
import { readdir, stat } from "node:fs/promises";
|
|
3
1
|
import { getRuntimeDefaults } from "../../../workspace/support/workspace-ref-utils.js";
|
|
4
|
-
import { discoverCheckpointMaintenanceTargets } from "../../maintenance/checkpoint-maintenance.js";
|
|
5
2
|
const DEFAULT_HEALTH_CONFIG = {
|
|
6
3
|
enabled: false,
|
|
7
4
|
evaluateIntervalSeconds: 30,
|
|
@@ -25,18 +22,6 @@ const DEFAULT_HEALTH_CONFIG = {
|
|
|
25
22
|
degradedAbove: 300,
|
|
26
23
|
unhealthyAbove: 900,
|
|
27
24
|
},
|
|
28
|
-
checkpointBytes: {
|
|
29
|
-
degradedAbove: 512 * 1024 * 1024,
|
|
30
|
-
unhealthyAbove: 2 * 1024 * 1024 * 1024,
|
|
31
|
-
},
|
|
32
|
-
runtimeDbBytes: {
|
|
33
|
-
degradedAbove: 256 * 1024 * 1024,
|
|
34
|
-
unhealthyAbove: 1024 * 1024 * 1024,
|
|
35
|
-
},
|
|
36
|
-
artifactBytes: {
|
|
37
|
-
degradedAbove: 512 * 1024 * 1024,
|
|
38
|
-
unhealthyAbove: 2 * 1024 * 1024 * 1024,
|
|
39
|
-
},
|
|
40
25
|
},
|
|
41
26
|
};
|
|
42
27
|
function asObject(value) {
|
|
@@ -52,23 +37,18 @@ function maxStatus(left, right) {
|
|
|
52
37
|
const rank = { healthy: 0, degraded: 1, unhealthy: 2 };
|
|
53
38
|
return rank[left] >= rank[right] ? left : right;
|
|
54
39
|
}
|
|
55
|
-
function compareStatus(left, right) {
|
|
56
|
-
return left === right;
|
|
57
|
-
}
|
|
58
|
-
function computeP95(values) {
|
|
59
|
-
if (values.length === 0) {
|
|
60
|
-
return undefined;
|
|
61
|
-
}
|
|
62
|
-
const sorted = [...values].sort((a, b) => a - b);
|
|
63
|
-
const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil(sorted.length * 0.95) - 1));
|
|
64
|
-
return sorted[index];
|
|
65
|
-
}
|
|
66
|
-
function isoFromMs(value) {
|
|
67
|
-
return new Date(value).toISOString();
|
|
68
|
-
}
|
|
69
40
|
function buildCheck(status, updatedAt, reason) {
|
|
70
41
|
return reason ? { status, updatedAt, reason } : { status, updatedAt };
|
|
71
42
|
}
|
|
43
|
+
function createSymptom(code, status, message, timestamp) {
|
|
44
|
+
return {
|
|
45
|
+
code,
|
|
46
|
+
severity: status === "unhealthy" ? "error" : status === "degraded" ? "warn" : "info",
|
|
47
|
+
message,
|
|
48
|
+
firstSeenAt: timestamp,
|
|
49
|
+
lastSeenAt: timestamp,
|
|
50
|
+
};
|
|
51
|
+
}
|
|
72
52
|
function describeThresholdStatus(value, thresholds) {
|
|
73
53
|
if (value >= thresholds.unhealthyAbove) {
|
|
74
54
|
return "unhealthy";
|
|
@@ -78,6 +58,14 @@ function describeThresholdStatus(value, thresholds) {
|
|
|
78
58
|
}
|
|
79
59
|
return "healthy";
|
|
80
60
|
}
|
|
61
|
+
function computeP95(values) {
|
|
62
|
+
if (values.length === 0) {
|
|
63
|
+
return undefined;
|
|
64
|
+
}
|
|
65
|
+
const sorted = [...values].sort((a, b) => a - b);
|
|
66
|
+
const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil(sorted.length * 0.95) - 1));
|
|
67
|
+
return sorted[index];
|
|
68
|
+
}
|
|
81
69
|
export function readHealthMonitorConfig(workspace) {
|
|
82
70
|
const runtimeDefaults = getRuntimeDefaults(workspace.refs);
|
|
83
71
|
const observability = asObject(runtimeDefaults?.observability);
|
|
@@ -87,9 +75,6 @@ export function readHealthMonitorConfig(workspace) {
|
|
|
87
75
|
const llmP95LatencyMs = asObject(thresholds?.llmP95LatencyMs);
|
|
88
76
|
const pendingApprovals = asObject(thresholds?.pendingApprovals);
|
|
89
77
|
const stuckRunSeconds = asObject(thresholds?.stuckRunSeconds);
|
|
90
|
-
const checkpointBytes = asObject(thresholds?.checkpointBytes);
|
|
91
|
-
const runtimeDbBytes = asObject(thresholds?.runtimeDbBytes);
|
|
92
|
-
const artifactBytes = asObject(thresholds?.artifactBytes);
|
|
93
78
|
return {
|
|
94
79
|
enabled: health?.enabled === true,
|
|
95
80
|
evaluateIntervalSeconds: readPositiveNumber(health?.evaluateIntervalSeconds, DEFAULT_HEALTH_CONFIG.evaluateIntervalSeconds),
|
|
@@ -113,112 +98,18 @@ export function readHealthMonitorConfig(workspace) {
|
|
|
113
98
|
degradedAbove: readNonNegativeNumber(stuckRunSeconds?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.stuckRunSeconds.degradedAbove),
|
|
114
99
|
unhealthyAbove: readNonNegativeNumber(stuckRunSeconds?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.stuckRunSeconds.unhealthyAbove),
|
|
115
100
|
},
|
|
116
|
-
checkpointBytes: {
|
|
117
|
-
degradedAbove: readNonNegativeNumber(checkpointBytes?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.checkpointBytes.degradedAbove),
|
|
118
|
-
unhealthyAbove: readNonNegativeNumber(checkpointBytes?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.checkpointBytes.unhealthyAbove),
|
|
119
|
-
},
|
|
120
|
-
runtimeDbBytes: {
|
|
121
|
-
degradedAbove: readNonNegativeNumber(runtimeDbBytes?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.runtimeDbBytes.degradedAbove),
|
|
122
|
-
unhealthyAbove: readNonNegativeNumber(runtimeDbBytes?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.runtimeDbBytes.unhealthyAbove),
|
|
123
|
-
},
|
|
124
|
-
artifactBytes: {
|
|
125
|
-
degradedAbove: readNonNegativeNumber(artifactBytes?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.artifactBytes.degradedAbove),
|
|
126
|
-
unhealthyAbove: readNonNegativeNumber(artifactBytes?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.artifactBytes.unhealthyAbove),
|
|
127
|
-
},
|
|
128
101
|
},
|
|
129
102
|
};
|
|
130
103
|
}
|
|
131
|
-
async function safeFileSize(filePath) {
|
|
132
|
-
try {
|
|
133
|
-
const stats = await stat(filePath);
|
|
134
|
-
return stats.size;
|
|
135
|
-
}
|
|
136
|
-
catch {
|
|
137
|
-
return 0;
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
async function directorySize(root) {
|
|
141
|
-
try {
|
|
142
|
-
const entries = await readdir(root, { withFileTypes: true });
|
|
143
|
-
let total = 0;
|
|
144
|
-
for (const entry of entries) {
|
|
145
|
-
const fullPath = path.join(root, entry.name);
|
|
146
|
-
if (entry.isDirectory()) {
|
|
147
|
-
total += await directorySize(fullPath);
|
|
148
|
-
continue;
|
|
149
|
-
}
|
|
150
|
-
if (entry.isFile()) {
|
|
151
|
-
total += await safeFileSize(fullPath);
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
return total;
|
|
155
|
-
}
|
|
156
|
-
catch {
|
|
157
|
-
return 0;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
function evaluateMaintenanceStatus(label, status, updatedAt) {
|
|
161
|
-
if (!status) {
|
|
162
|
-
return {
|
|
163
|
-
check: buildCheck("healthy", updatedAt, `${label} disabled`),
|
|
164
|
-
symptoms: [],
|
|
165
|
-
};
|
|
166
|
-
}
|
|
167
|
-
if (status.consecutiveFailures > 0) {
|
|
168
|
-
return {
|
|
169
|
-
check: buildCheck("degraded", updatedAt, `${label} has ${status.consecutiveFailures} consecutive failure(s)`),
|
|
170
|
-
symptoms: [{
|
|
171
|
-
code: `${label}.loop.failure`,
|
|
172
|
-
severity: "warn",
|
|
173
|
-
message: status.lastError ?? `${label} maintenance loop failed`,
|
|
174
|
-
firstSeenAt: status.lastFailedAt ?? updatedAt,
|
|
175
|
-
lastSeenAt: status.lastFailedAt ?? updatedAt,
|
|
176
|
-
}],
|
|
177
|
-
};
|
|
178
|
-
}
|
|
179
|
-
if (status.lastCompletedAt) {
|
|
180
|
-
return {
|
|
181
|
-
check: buildCheck("healthy", updatedAt, `${label} completed successfully`),
|
|
182
|
-
symptoms: [],
|
|
183
|
-
};
|
|
184
|
-
}
|
|
185
|
-
return {
|
|
186
|
-
check: buildCheck("healthy", updatedAt, `${label} idle`),
|
|
187
|
-
symptoms: [],
|
|
188
|
-
};
|
|
189
|
-
}
|
|
190
|
-
function normalizeSeverity(status) {
|
|
191
|
-
if (status === "unhealthy") {
|
|
192
|
-
return "error";
|
|
193
|
-
}
|
|
194
|
-
if (status === "degraded") {
|
|
195
|
-
return "warn";
|
|
196
|
-
}
|
|
197
|
-
return "info";
|
|
198
|
-
}
|
|
199
|
-
function createSymptom(code, status, message, timestamp) {
|
|
200
|
-
return {
|
|
201
|
-
code,
|
|
202
|
-
severity: normalizeSeverity(status),
|
|
203
|
-
message,
|
|
204
|
-
firstSeenAt: timestamp,
|
|
205
|
-
lastSeenAt: timestamp,
|
|
206
|
-
};
|
|
207
|
-
}
|
|
208
104
|
export class HealthMonitor {
|
|
209
105
|
options;
|
|
210
106
|
config;
|
|
211
|
-
runRoots;
|
|
212
|
-
checkpointDbPaths;
|
|
213
107
|
llmSamples = [];
|
|
214
108
|
timer = null;
|
|
215
109
|
latestSnapshot = null;
|
|
216
|
-
runtimeEventSequence = 0;
|
|
217
110
|
constructor(options) {
|
|
218
111
|
this.options = options;
|
|
219
112
|
this.config = readHealthMonitorConfig(options.workspace);
|
|
220
|
-
this.runRoots = Array.from(new Set(Array.from(options.workspace.bindings.values()).map((binding) => binding.harnessRuntime.runRoot)));
|
|
221
|
-
this.checkpointDbPaths = Array.from(new Set(discoverCheckpointMaintenanceTargets(options.workspace).map((target) => target.dbPath)));
|
|
222
113
|
}
|
|
223
114
|
recordLlmSuccess(latencyMs, nowMs = Date.now()) {
|
|
224
115
|
this.recordLlmSample({ timestampMs: nowMs, latencyMs, success: true });
|
|
@@ -253,172 +144,96 @@ export class HealthMonitor {
|
|
|
253
144
|
return this.evaluate();
|
|
254
145
|
}
|
|
255
146
|
async evaluate(nowMs = Date.now()) {
|
|
256
|
-
const updatedAt =
|
|
257
|
-
const [runs, approvals
|
|
147
|
+
const updatedAt = new Date(nowMs).toISOString();
|
|
148
|
+
const [runs, approvals] = await Promise.all([
|
|
258
149
|
this.options.persistence.listRuns(),
|
|
259
150
|
this.options.persistence.listApprovals(),
|
|
260
|
-
this.sumRuntimeDbBytes(),
|
|
261
|
-
this.sumCheckpointDbBytes(),
|
|
262
|
-
this.sumArtifactBytes(),
|
|
263
151
|
]);
|
|
264
152
|
const pendingApprovals = approvals.filter((approval) => approval.status === "pending").length;
|
|
265
153
|
const stuckRuns = this.countStuckRuns(runs, nowMs);
|
|
266
|
-
const runtimeMaintenance = evaluateMaintenanceStatus("checkpoint", this.options.getCheckpointMaintenanceStatus(), updatedAt);
|
|
267
|
-
const recordMaintenance = evaluateMaintenanceStatus("records", this.options.getRuntimeRecordMaintenanceStatus(), updatedAt);
|
|
268
|
-
let runtimeCheck = buildCheck("healthy", updatedAt, "runtime loops healthy");
|
|
269
|
-
const runtimeSymptoms = [...runtimeMaintenance.symptoms, ...recordMaintenance.symptoms];
|
|
270
|
-
runtimeCheck = buildCheck(maxStatus(runtimeMaintenance.check.status, recordMaintenance.check.status), updatedAt, runtimeSymptoms.length > 0 ? runtimeSymptoms.map((symptom) => symptom.message).join("; ") : "runtime loops healthy");
|
|
271
154
|
const llmCheck = this.evaluateLlmCheck(updatedAt, nowMs);
|
|
272
|
-
const persistenceCheck = this.evaluatePersistenceCheck(updatedAt, runtimeDbBytes, runtimeSymptoms);
|
|
273
|
-
const capacityCheck = this.evaluateCapacityCheck(updatedAt, checkpointBytes, runtimeDbBytes, artifactBytes);
|
|
274
155
|
const workloadCheck = this.evaluateWorkloadCheck(updatedAt, pendingApprovals, stuckRuns);
|
|
275
|
-
const
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
llm
|
|
287
|
-
|
|
288
|
-
capacity: capacityCheck,
|
|
289
|
-
workload: workloadCheck,
|
|
290
|
-
};
|
|
156
|
+
const runtimeCheck = buildCheck("healthy", updatedAt, "runtime health monitoring enabled");
|
|
157
|
+
const persistenceCheck = buildCheck("healthy", updatedAt, "runtime persistence accessible");
|
|
158
|
+
const capacityCheck = buildCheck("healthy", updatedAt, "capacity checks disabled");
|
|
159
|
+
const symptoms = [];
|
|
160
|
+
if (describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals) !== "healthy") {
|
|
161
|
+
symptoms.push(createSymptom("workload.pending-approvals", describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals), `pendingApprovals=${pendingApprovals}`, updatedAt));
|
|
162
|
+
}
|
|
163
|
+
if (stuckRuns > 0) {
|
|
164
|
+
symptoms.push(createSymptom("workload.stuck-runs", workloadCheck.status, `stuckRuns=${stuckRuns}`, updatedAt));
|
|
165
|
+
}
|
|
166
|
+
if (llmCheck.status !== "healthy") {
|
|
167
|
+
symptoms.push(createSymptom("llm.invocations", llmCheck.status, llmCheck.reason ?? "llm health degraded", updatedAt));
|
|
168
|
+
}
|
|
291
169
|
let overallStatus = "healthy";
|
|
292
|
-
for (const check of
|
|
170
|
+
for (const check of [runtimeCheck, llmCheck, persistenceCheck, capacityCheck, workloadCheck]) {
|
|
293
171
|
overallStatus = maxStatus(overallStatus, check.status);
|
|
294
172
|
}
|
|
295
173
|
const snapshot = {
|
|
296
174
|
status: overallStatus,
|
|
297
175
|
updatedAt,
|
|
298
|
-
checks
|
|
176
|
+
checks: {
|
|
177
|
+
runtime: runtimeCheck,
|
|
178
|
+
llm: llmCheck,
|
|
179
|
+
persistence: persistenceCheck,
|
|
180
|
+
capacity: capacityCheck,
|
|
181
|
+
workload: workloadCheck,
|
|
182
|
+
},
|
|
299
183
|
symptoms,
|
|
300
184
|
stats: {
|
|
301
185
|
activeRunSlots: this.options.getActiveRunSlots(),
|
|
302
186
|
pendingRunSlots: this.options.getPendingRunSlots(),
|
|
303
187
|
pendingApprovals,
|
|
304
188
|
stuckRuns,
|
|
305
|
-
checkpointBytes,
|
|
306
|
-
runtimeDbBytes,
|
|
307
|
-
artifactBytes,
|
|
308
189
|
...this.llmStats(nowMs),
|
|
309
190
|
},
|
|
310
191
|
};
|
|
311
|
-
const previous = this.latestSnapshot;
|
|
312
|
-
this.latestSnapshot = snapshot;
|
|
313
192
|
if (this.config.enabled &&
|
|
314
193
|
this.config.emitEvents &&
|
|
315
194
|
this.options.publishEvent &&
|
|
316
|
-
|
|
317
|
-
|
|
195
|
+
this.latestSnapshot &&
|
|
196
|
+
this.latestSnapshot.status !== snapshot.status) {
|
|
318
197
|
await this.options.publishEvent({
|
|
319
|
-
previousStatus:
|
|
198
|
+
previousStatus: this.latestSnapshot.status,
|
|
320
199
|
status: snapshot.status,
|
|
321
200
|
checks: snapshot.checks,
|
|
322
201
|
stats: snapshot.stats,
|
|
323
202
|
});
|
|
324
203
|
}
|
|
204
|
+
this.latestSnapshot = snapshot;
|
|
325
205
|
return snapshot;
|
|
326
206
|
}
|
|
327
207
|
llmStats(nowMs) {
|
|
328
|
-
const
|
|
329
|
-
const recent = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= oneMinuteWindowMs);
|
|
208
|
+
const recent = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= 60 * 1000);
|
|
330
209
|
const successes = recent.filter((sample) => sample.success).length;
|
|
331
|
-
const latencies = recent.map((sample) => sample.latencyMs);
|
|
332
210
|
return {
|
|
333
211
|
llmSuccessRate1m: recent.length > 0 ? successes / recent.length : undefined,
|
|
334
|
-
llmP95LatencyMs1m: computeP95(
|
|
212
|
+
llmP95LatencyMs1m: computeP95(recent.map((sample) => sample.latencyMs)),
|
|
335
213
|
};
|
|
336
214
|
}
|
|
337
215
|
evaluateLlmCheck(updatedAt, nowMs) {
|
|
338
|
-
const
|
|
339
|
-
const
|
|
340
|
-
const errorSamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= errorWindow);
|
|
341
|
-
const latencySamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= latencyWindow);
|
|
216
|
+
const errorSamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= this.config.thresholds.llmErrorRate.windowSeconds * 1000);
|
|
217
|
+
const latencySamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= this.config.thresholds.llmP95LatencyMs.windowSeconds * 1000);
|
|
342
218
|
if (errorSamples.length === 0 && latencySamples.length === 0) {
|
|
343
219
|
return buildCheck("healthy", updatedAt, "no recent llm invocations");
|
|
344
220
|
}
|
|
345
|
-
const
|
|
346
|
-
|
|
221
|
+
const errorRate = errorSamples.length > 0
|
|
222
|
+
? errorSamples.filter((sample) => !sample.success).length / errorSamples.length
|
|
223
|
+
: 0;
|
|
347
224
|
const p95Latency = computeP95(latencySamples.map((sample) => sample.latencyMs));
|
|
348
|
-
let status =
|
|
349
|
-
if (errorRate >= this.config.thresholds.llmErrorRate.unhealthyAbove) {
|
|
350
|
-
status = "unhealthy";
|
|
351
|
-
}
|
|
352
|
-
else if (errorRate >= this.config.thresholds.llmErrorRate.degradedAbove) {
|
|
353
|
-
status = "degraded";
|
|
354
|
-
}
|
|
225
|
+
let status = describeThresholdStatus(errorRate, this.config.thresholds.llmErrorRate);
|
|
355
226
|
if (p95Latency !== undefined) {
|
|
356
|
-
|
|
357
|
-
status = maxStatus(status, latencyStatus);
|
|
227
|
+
status = maxStatus(status, describeThresholdStatus(p95Latency, this.config.thresholds.llmP95LatencyMs));
|
|
358
228
|
}
|
|
359
229
|
return buildCheck(status, updatedAt, `errorRate=${errorRate.toFixed(2)}${p95Latency !== undefined ? ` p95LatencyMs=${Math.round(p95Latency)}` : ""}`);
|
|
360
230
|
}
|
|
361
|
-
llmSymptoms(check, updatedAt) {
|
|
362
|
-
if (check.status === "healthy") {
|
|
363
|
-
return [];
|
|
364
|
-
}
|
|
365
|
-
return [createSymptom("llm.invocations", check.status, check.reason ?? "llm health degraded", updatedAt)];
|
|
366
|
-
}
|
|
367
|
-
evaluatePersistenceCheck(updatedAt, runtimeDbBytes, runtimeSymptoms) {
|
|
368
|
-
if (runtimeSymptoms.length > 0) {
|
|
369
|
-
return buildCheck("degraded", updatedAt, runtimeSymptoms.map((symptom) => symptom.message).join("; "));
|
|
370
|
-
}
|
|
371
|
-
if (runtimeDbBytes === 0) {
|
|
372
|
-
return buildCheck("healthy", updatedAt, "runtime sqlite not materialized yet");
|
|
373
|
-
}
|
|
374
|
-
return buildCheck("healthy", updatedAt, "runtime sqlite accessible");
|
|
375
|
-
}
|
|
376
|
-
evaluateCapacityCheck(updatedAt, checkpointBytes, runtimeDbBytes, artifactBytes) {
|
|
377
|
-
let status = "healthy";
|
|
378
|
-
status = maxStatus(status, describeThresholdStatus(checkpointBytes, this.config.thresholds.checkpointBytes));
|
|
379
|
-
status = maxStatus(status, describeThresholdStatus(runtimeDbBytes, this.config.thresholds.runtimeDbBytes));
|
|
380
|
-
status = maxStatus(status, describeThresholdStatus(artifactBytes, this.config.thresholds.artifactBytes));
|
|
381
|
-
return buildCheck(status, updatedAt, `checkpointBytes=${checkpointBytes} runtimeDbBytes=${runtimeDbBytes} artifactBytes=${artifactBytes}`);
|
|
382
|
-
}
|
|
383
|
-
capacitySymptoms(check, checkpointBytes, runtimeDbBytes, artifactBytes, updatedAt) {
|
|
384
|
-
if (check.status === "healthy") {
|
|
385
|
-
return [];
|
|
386
|
-
}
|
|
387
|
-
const symptoms = [];
|
|
388
|
-
if (describeThresholdStatus(checkpointBytes, this.config.thresholds.checkpointBytes) !== "healthy") {
|
|
389
|
-
symptoms.push(createSymptom("capacity.checkpoints", describeThresholdStatus(checkpointBytes, this.config.thresholds.checkpointBytes), `checkpointBytes=${checkpointBytes}`, updatedAt));
|
|
390
|
-
}
|
|
391
|
-
if (describeThresholdStatus(runtimeDbBytes, this.config.thresholds.runtimeDbBytes) !== "healthy") {
|
|
392
|
-
symptoms.push(createSymptom("capacity.runtime-db", describeThresholdStatus(runtimeDbBytes, this.config.thresholds.runtimeDbBytes), `runtimeDbBytes=${runtimeDbBytes}`, updatedAt));
|
|
393
|
-
}
|
|
394
|
-
if (describeThresholdStatus(artifactBytes, this.config.thresholds.artifactBytes) !== "healthy") {
|
|
395
|
-
symptoms.push(createSymptom("capacity.artifacts", describeThresholdStatus(artifactBytes, this.config.thresholds.artifactBytes), `artifactBytes=${artifactBytes}`, updatedAt));
|
|
396
|
-
}
|
|
397
|
-
return symptoms;
|
|
398
|
-
}
|
|
399
231
|
evaluateWorkloadCheck(updatedAt, pendingApprovals, stuckRuns) {
|
|
400
232
|
let status = describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals);
|
|
401
|
-
status = maxStatus(status, describeThresholdStatus(stuckRuns, {
|
|
402
|
-
degradedAbove: this.config.thresholds.stuckRunSeconds.degradedAbove > 0 ? 1 : Number.MAX_SAFE_INTEGER,
|
|
403
|
-
unhealthyAbove: this.config.thresholds.stuckRunSeconds.unhealthyAbove > 0 ? 1 : Number.MAX_SAFE_INTEGER,
|
|
404
|
-
}));
|
|
405
|
-
return buildCheck(status, updatedAt, `pendingApprovals=${pendingApprovals} stuckRuns=${stuckRuns}`);
|
|
406
|
-
}
|
|
407
|
-
workloadSymptoms(check, pendingApprovals, stuckRuns, updatedAt) {
|
|
408
|
-
if (check.status === "healthy") {
|
|
409
|
-
return [];
|
|
410
|
-
}
|
|
411
|
-
const symptoms = [];
|
|
412
|
-
if (describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals) !== "healthy") {
|
|
413
|
-
symptoms.push(createSymptom("workload.pending-approvals", describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals), `pendingApprovals=${pendingApprovals}`, updatedAt));
|
|
414
|
-
}
|
|
415
233
|
if (stuckRuns > 0) {
|
|
416
|
-
|
|
417
|
-
? "unhealthy"
|
|
418
|
-
: "degraded";
|
|
419
|
-
symptoms.push(createSymptom("workload.stuck-runs", stuckStatus, `stuckRuns=${stuckRuns}`, updatedAt));
|
|
234
|
+
status = maxStatus(status, stuckRuns >= this.config.thresholds.stuckRunSeconds.unhealthyAbove ? "unhealthy" : "degraded");
|
|
420
235
|
}
|
|
421
|
-
return
|
|
236
|
+
return buildCheck(status, updatedAt, `pendingApprovals=${pendingApprovals} stuckRuns=${stuckRuns}`);
|
|
422
237
|
}
|
|
423
238
|
countStuckRuns(runs, nowMs) {
|
|
424
239
|
return runs.filter((run) => {
|
|
@@ -429,20 +244,7 @@ export class HealthMonitor {
|
|
|
429
244
|
if (!Number.isFinite(updatedAtMs)) {
|
|
430
245
|
return false;
|
|
431
246
|
}
|
|
432
|
-
|
|
433
|
-
return ageSeconds >= this.config.thresholds.stuckRunSeconds.degradedAbove;
|
|
247
|
+
return (nowMs - updatedAtMs) / 1000 >= this.config.thresholds.stuckRunSeconds.degradedAbove;
|
|
434
248
|
}).length;
|
|
435
249
|
}
|
|
436
|
-
async sumRuntimeDbBytes() {
|
|
437
|
-
const sizes = await Promise.all(this.runRoots.map((runRoot) => safeFileSize(path.join(runRoot, "runtime.sqlite"))));
|
|
438
|
-
return sizes.reduce((sum, value) => sum + value, 0);
|
|
439
|
-
}
|
|
440
|
-
async sumCheckpointDbBytes() {
|
|
441
|
-
const sizes = await Promise.all(this.checkpointDbPaths.map((dbPath) => safeFileSize(dbPath)));
|
|
442
|
-
return sizes.reduce((sum, value) => sum + value, 0);
|
|
443
|
-
}
|
|
444
|
-
async sumArtifactBytes() {
|
|
445
|
-
const sizes = await Promise.all(this.runRoots.map((runRoot) => directorySize(path.join(runRoot, "threads"))));
|
|
446
|
-
return sizes.reduce((sum, value) => sum + value, 0);
|
|
447
|
-
}
|
|
448
250
|
}
|