@botbotgo/agent-harness 0.0.75 → 0.0.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +2 -1
- package/dist/api.js +3 -0
- package/dist/benchmark/checkpoint-resume-cost-benchmark.d.ts +33 -0
- package/dist/benchmark/checkpoint-resume-cost-benchmark.js +55 -0
- package/dist/benchmark/deepagent-local-model-benchmark.d.ts +27 -0
- package/dist/benchmark/deepagent-local-model-benchmark.js +35 -0
- package/dist/config/agents/direct.yaml +1 -1
- package/dist/config/agents/orchestra.yaml +1 -2
- package/dist/config/workspace.yaml +31 -0
- package/dist/contracts/types.d.ts +38 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/package-version.d.ts +1 -1
- package/dist/package-version.js +1 -1
- package/dist/persistence/file-store.d.ts +3 -40
- package/dist/persistence/file-store.js +5 -2
- package/dist/persistence/sqlite-store.d.ts +68 -0
- package/dist/persistence/sqlite-store.js +569 -0
- package/dist/persistence/types.d.ts +83 -0
- package/dist/persistence/types.js +1 -0
- package/dist/runtime/agent-runtime-adapter.d.ts +3 -0
- package/dist/runtime/agent-runtime-adapter.js +58 -2
- package/dist/runtime/checkpoint-maintenance.d.ts +11 -2
- package/dist/runtime/checkpoint-maintenance.js +41 -5
- package/dist/runtime/harness.d.ts +5 -1
- package/dist/runtime/harness.js +67 -4
- package/dist/runtime/health-monitor.d.ts +81 -0
- package/dist/runtime/health-monitor.js +448 -0
- package/dist/runtime/runtime-record-maintenance.d.ts +43 -0
- package/dist/runtime/runtime-record-maintenance.js +169 -0
- package/dist/runtime/store.d.ts +2 -0
- package/dist/runtime/store.js +38 -20
- package/dist/runtime/support/embedding-models.js +57 -1
- package/dist/runtime/thread-memory-sync.d.ts +3 -2
- package/dist/runtime/thread-memory-sync.js +7 -1
- package/dist/workspace/agent-binding-compiler.js +3 -1
- package/dist/workspace/support/workspace-ref-utils.d.ts +9 -0
- package/dist/workspace/support/workspace-ref-utils.js +38 -0
- package/package.json +2 -2
|
@@ -0,0 +1,448 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { readdir, stat } from "node:fs/promises";
|
|
3
|
+
import { getRuntimeDefaults } from "../workspace/support/workspace-ref-utils.js";
|
|
4
|
+
import { discoverCheckpointMaintenanceTargets } from "./checkpoint-maintenance.js";
|
|
5
|
+
/**
 * Built-in health monitor settings. `readHealthMonitorConfig` falls back to
 * these values for `evaluateIntervalSeconds` and for every threshold number
 * that the workspace does not supply with a valid value.
 *
 * Durations are in seconds, latencies in milliseconds, sizes in bytes.
 */
const DEFAULT_HEALTH_CONFIG = {
    enabled: false,
    evaluateIntervalSeconds: 30,
    emitEvents: true,
    thresholds: {
        // Fraction of failed LLM calls within the window.
        llmErrorRate: { windowSeconds: 60, degradedAbove: 0.15, unhealthyAbove: 0.4 },
        // 95th percentile LLM latency within the window.
        llmP95LatencyMs: { windowSeconds: 300, degradedAbove: 8_000, unhealthyAbove: 20_000 },
        pendingApprovals: { degradedAbove: 20, unhealthyAbove: 100 },
        stuckRunSeconds: { degradedAbove: 300, unhealthyAbove: 900 },
        // 512 MiB / 2 GiB
        checkpointBytes: { degradedAbove: 512 * 1024 ** 2, unhealthyAbove: 2 * 1024 ** 3 },
        // 256 MiB / 1 GiB
        runtimeDbBytes: { degradedAbove: 256 * 1024 ** 2, unhealthyAbove: 1024 ** 3 },
        // 512 MiB / 2 GiB
        artifactBytes: { degradedAbove: 512 * 1024 ** 2, unhealthyAbove: 2 * 1024 ** 3 },
    },
};
|
|
42
|
+
/**
 * Narrows an arbitrary value to an object.
 *
 * @param value Any value read from loosely typed configuration.
 * @returns `value` itself when it is a non-null, non-array object; otherwise `undefined`.
 */
function asObject(value) {
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
        return undefined;
    }
    return value;
}
|
|
45
|
+
/**
 * Reads a strictly positive, finite number.
 *
 * @param value Candidate value from configuration.
 * @param fallback Value returned when `value` is not a finite number greater than zero.
 * @returns `value` when valid, otherwise `fallback`.
 */
function readPositiveNumber(value, fallback) {
    const usable = typeof value === "number" && Number.isFinite(value) && value > 0;
    return usable ? value : fallback;
}
|
|
48
|
+
/**
 * Reads a finite number that is zero or greater.
 *
 * @param value Candidate value from configuration.
 * @param fallback Value returned when `value` is not a finite number >= 0.
 * @returns `value` when valid, otherwise `fallback`.
 */
function readNonNegativeNumber(value, fallback) {
    const usable = typeof value === "number" && Number.isFinite(value) && value >= 0;
    return usable ? value : fallback;
}
|
|
51
|
+
/**
 * Picks the more severe of two health statuses
 * (`healthy` < `degraded` < `unhealthy`).
 *
 * @returns `left` when it is at least as severe as `right`, otherwise `right`.
 */
function maxStatus(left, right) {
    const severity = { healthy: 0, degraded: 1, unhealthy: 2 };
    if (severity[left] >= severity[right]) {
        return left;
    }
    return right;
}
|
|
55
|
+
/**
 * Tells whether two statuses are the same value (strict equality).
 *
 * @returns `true` when `left` and `right` are strictly equal.
 */
function compareStatus(left, right) {
    const unchanged = left === right;
    return unchanged;
}
|
|
58
|
+
/**
 * Computes the 95th percentile of a list of numbers using the nearest-rank
 * method on a sorted copy (the input array is not mutated).
 *
 * @param values Numeric samples.
 * @returns The sample at rank `ceil(0.95 * n)`, or `undefined` for an empty list.
 */
function computeP95(values) {
    const count = values.length;
    if (count === 0) {
        return undefined;
    }
    const ascending = values.slice().sort((lhs, rhs) => lhs - rhs);
    const rankIndex = Math.ceil(count * 0.95) - 1;
    // Clamp so the index always addresses an existing element.
    const clamped = Math.min(count - 1, Math.max(0, rankIndex));
    return ascending[clamped];
}
|
|
66
|
+
/**
 * Formats an epoch-milliseconds timestamp as an ISO 8601 string.
 *
 * @param value Milliseconds since the Unix epoch.
 * @returns The result of `Date.prototype.toISOString` for that instant.
 */
function isoFromMs(value) {
    const instant = new Date(value);
    return instant.toISOString();
}
|
|
69
|
+
/**
 * Builds a health check record.
 *
 * @param status Health status of the check.
 * @param updatedAt Timestamp string of the evaluation.
 * @param reason Optional explanation; the `reason` key is omitted when falsy.
 * @returns `{ status, updatedAt }`, plus `reason` when one was given.
 */
function buildCheck(status, updatedAt, reason) {
    return {
        status,
        updatedAt,
        ...(reason ? { reason } : {}),
    };
}
|
|
72
|
+
/**
 * Maps a measured value onto a health status.
 *
 * Both bounds are inclusive: a value equal to a threshold already counts as
 * crossing it. The unhealthy bound is checked first.
 *
 * @param value Measured value.
 * @param thresholds Object carrying `degradedAbove` and `unhealthyAbove`.
 * @returns `"unhealthy"`, `"degraded"` or `"healthy"`.
 */
function describeThresholdStatus(value, thresholds) {
    const { degradedAbove, unhealthyAbove } = thresholds;
    if (value >= unhealthyAbove) {
        return "unhealthy";
    }
    return value >= degradedAbove ? "degraded" : "healthy";
}
|
|
81
|
+
/**
 * Resolves the health monitor configuration for a workspace.
 *
 * Settings are read from `observability.health` of the runtime defaults
 * returned by `getRuntimeDefaults(workspace.refs)`. Any missing or invalid
 * number falls back to the matching `DEFAULT_HEALTH_CONFIG` entry.
 *
 * - `enabled` is true only when explicitly set to `true`.
 * - `emitEvents` is true unless explicitly set to `false`.
 * - `evaluateIntervalSeconds` and `windowSeconds` must be positive.
 * - `degradedAbove` and `unhealthyAbove` must be non-negative.
 *
 * @param workspace Workspace bundle; only `workspace.refs` is read here.
 * @returns A fully populated configuration object.
 */
export function readHealthMonitorConfig(workspace) {
    const fallbacks = DEFAULT_HEALTH_CONFIG.thresholds;
    const observability = asObject(getRuntimeDefaults(workspace.refs)?.observability);
    const health = asObject(observability?.health);
    const rawThresholds = asObject(health?.thresholds);
    // Reads the degraded/unhealthy pair of one named threshold section.
    const readLevels = (name) => {
        const raw = asObject(rawThresholds?.[name]);
        return {
            degradedAbove: readNonNegativeNumber(raw?.degradedAbove, fallbacks[name].degradedAbove),
            unhealthyAbove: readNonNegativeNumber(raw?.unhealthyAbove, fallbacks[name].unhealthyAbove),
        };
    };
    // Same as readLevels, preceded by the section's sampling window.
    const readWindowedLevels = (name) => {
        const raw = asObject(rawThresholds?.[name]);
        return {
            windowSeconds: readPositiveNumber(raw?.windowSeconds, fallbacks[name].windowSeconds),
            ...readLevels(name),
        };
    };
    return {
        enabled: health?.enabled === true,
        evaluateIntervalSeconds: readPositiveNumber(health?.evaluateIntervalSeconds, DEFAULT_HEALTH_CONFIG.evaluateIntervalSeconds),
        emitEvents: health?.emitEvents !== false,
        thresholds: {
            llmErrorRate: readWindowedLevels("llmErrorRate"),
            llmP95LatencyMs: readWindowedLevels("llmP95LatencyMs"),
            pendingApprovals: readLevels("pendingApprovals"),
            stuckRunSeconds: readLevels("stuckRunSeconds"),
            checkpointBytes: readLevels("checkpointBytes"),
            runtimeDbBytes: readLevels("runtimeDbBytes"),
            artifactBytes: readLevels("artifactBytes"),
        },
    };
}
|
|
131
|
+
/**
 * Returns the size in bytes of a file, treating any `stat` failure
 * (for example a missing file) as a size of zero.
 *
 * @param filePath Path handed to `stat`.
 * @returns The file size, or `0` when `stat` rejects.
 */
async function safeFileSize(filePath) {
    return stat(filePath).then((stats) => stats.size, () => 0);
}
|
|
140
|
+
/**
 * Recursively sums the sizes of all regular files below a directory.
 *
 * Entries that are neither directories nor regular files are ignored.
 * A directory that cannot be listed contributes zero bytes.
 *
 * @param root Directory to measure.
 * @returns Total size in bytes, or `0` when `root` cannot be read.
 */
async function directorySize(root) {
    let entries;
    try {
        entries = await readdir(root, { withFileTypes: true });
    }
    catch {
        return 0;
    }
    let bytes = 0;
    // Entries are measured one after another, in listing order.
    for (const entry of entries) {
        const target = path.join(root, entry.name);
        if (entry.isDirectory()) {
            bytes += await directorySize(target);
        }
        else if (entry.isFile()) {
            bytes += await safeFileSize(target);
        }
    }
    return bytes;
}
|
|
160
|
+
/**
 * Converts the status of a maintenance loop into a health check plus symptoms.
 *
 * - No status object: healthy, reason `"<label> disabled"`.
 * - `consecutiveFailures > 0`: degraded, with one `warn` symptom stamped with
 *   `lastFailedAt` (or `updatedAt` when that is missing).
 * - Otherwise healthy, reported as "completed successfully" when
 *   `lastCompletedAt` is set and as "idle" when it is not.
 *
 * @param label Name used in reasons and in the symptom code.
 * @param status Loop status object, or a falsy value when the loop is absent.
 * @param updatedAt Timestamp string of the current evaluation.
 * @returns `{ check, symptoms }`.
 */
function evaluateMaintenanceStatus(label, status, updatedAt) {
    const healthy = (reason) => ({
        check: buildCheck("healthy", updatedAt, reason),
        symptoms: [],
    });
    if (!status) {
        return healthy(`${label} disabled`);
    }
    if (status.consecutiveFailures > 0) {
        const seenAt = status.lastFailedAt ?? updatedAt;
        const symptom = {
            code: `${label}.loop.failure`,
            severity: "warn",
            message: status.lastError ?? `${label} maintenance loop failed`,
            firstSeenAt: seenAt,
            lastSeenAt: seenAt,
        };
        return {
            check: buildCheck("degraded", updatedAt, `${label} has ${status.consecutiveFailures} consecutive failure(s)`),
            symptoms: [symptom],
        };
    }
    return healthy(status.lastCompletedAt ? `${label} completed successfully` : `${label} idle`);
}
|
|
190
|
+
/**
 * Translates a health status into a symptom severity.
 *
 * @param status Health status.
 * @returns `"error"` for `unhealthy`, `"warn"` for `degraded`, `"info"` for anything else.
 */
function normalizeSeverity(status) {
    switch (status) {
        case "unhealthy":
            return "error";
        case "degraded":
            return "warn";
        default:
            return "info";
    }
}
|
|
199
|
+
/**
 * Builds a symptom record observed at a single point in time.
 *
 * @param code Symptom identifier.
 * @param status Health status, converted to a severity by `normalizeSeverity`.
 * @param message Human-readable description.
 * @param timestamp Used for both `firstSeenAt` and `lastSeenAt`.
 * @returns The symptom object.
 */
function createSymptom(code, status, message, timestamp) {
    const severity = normalizeSeverity(status);
    const symptom = {
        code,
        severity,
        message,
        firstSeenAt: timestamp,
        lastSeenAt: timestamp,
    };
    return symptom;
}
|
|
208
|
+
/**
 * Derives a health snapshot for the runtime from five checks: `runtime`
 * (maintenance loops), `llm`, `persistence`, `capacity` and `workload`.
 *
 * Inputs are the persistence layer (`listRuns`, `listApprovals`), on-disk
 * sizes below each binding's `runRoot`, the maintenance-loop status callbacks
 * on `options`, and the LLM samples recorded through `recordLlmSuccess` /
 * `recordLlmFailure`.
 *
 * When enabled, `start()` evaluates once and then re-evaluates on an interval.
 * An event is published through `options.publishEvent` whenever the overall
 * status differs from the previous snapshot.
 */
export class HealthMonitor {
    /** Width in seconds of the fixed window behind the `*1m` figures of `llmStats`. */
    static STATS_WINDOW_SECONDS = 60;
    options;
    config;
    runRoots;
    checkpointDbPaths;
    llmSamples = [];
    timer = null;
    latestSnapshot = null;
    runtimeEventSequence = 0;
    /** Message of the most recent failed background evaluation, if any. */
    lastEvaluationError;
    /**
     * @param options Carries `workspace`, `persistence`, the maintenance status
     *   getters, the run-slot getters and an optional `publishEvent` callback.
     */
    constructor(options) {
        this.options = options;
        this.config = readHealthMonitorConfig(options.workspace);
        this.runRoots = Array.from(new Set(Array.from(options.workspace.bindings.values()).map((binding) => binding.harnessRuntime.runRoot)));
        this.checkpointDbPaths = Array.from(new Set(discoverCheckpointMaintenanceTargets(options.workspace).map((target) => target.dbPath)));
    }
    /** Records a successful LLM call that took `latencyMs`. */
    recordLlmSuccess(latencyMs, nowMs = Date.now()) {
        this.recordLlmSample({ timestampMs: nowMs, latencyMs, success: true });
    }
    /** Records a failed LLM call that took `latencyMs`. */
    recordLlmFailure(latencyMs, nowMs = Date.now()) {
        this.recordLlmSample({ timestampMs: nowMs, latencyMs, success: false });
    }
    /**
     * Appends a sample and drops leading samples that are older than every
     * window that reads them: both configured windows and the fixed stats
     * window used by `llmStats`.
     */
    recordLlmSample(sample) {
        this.llmSamples.push(sample);
        const retentionSeconds = Math.max(this.config.thresholds.llmErrorRate.windowSeconds, this.config.thresholds.llmP95LatencyMs.windowSeconds, HealthMonitor.STATS_WINDOW_SECONDS);
        const keepAfter = sample.timestampMs - retentionSeconds * 1000;
        // Remove the expired prefix in one operation instead of repeated shift().
        const firstKept = this.llmSamples.findIndex((entry) => !(entry.timestampMs < keepAfter));
        if (firstKept > 0) {
            this.llmSamples.splice(0, firstKept);
        }
    }
    /**
     * Runs one evaluation and schedules periodic re-evaluation.
     * Does nothing when the monitor is disabled or already started.
     * A failure of the initial evaluation is propagated to the caller.
     */
    async start() {
        if (!this.config.enabled || this.timer) {
            return;
        }
        await this.evaluate();
        this.timer = setInterval(() => {
            // A rejected background evaluation must not surface as an unhandled
            // rejection (fatal by default on current Node.js). Keep the previous
            // snapshot and remember the error instead.
            this.evaluate().then(() => {
                this.lastEvaluationError = undefined;
            }, (error) => {
                this.lastEvaluationError = error instanceof Error ? error.message : String(error);
            });
        }, this.config.evaluateIntervalSeconds * 1000);
        this.timer.unref?.();
    }
    /** Cancels the periodic evaluation, if one is scheduled. */
    async stop() {
        if (this.timer) {
            clearInterval(this.timer);
            this.timer = null;
        }
    }
    /** Evaluates immediately and returns the fresh snapshot. */
    async getSnapshot() {
        return this.evaluate();
    }
    /**
     * Gathers all inputs, computes every check, stores the snapshot and, when
     * enabled, publishes an event if the overall status changed.
     *
     * @param nowMs Evaluation time in epoch milliseconds.
     * @returns The new snapshot (`status`, `updatedAt`, `checks`, `symptoms`, `stats`).
     */
    async evaluate(nowMs = Date.now()) {
        const updatedAt = isoFromMs(nowMs);
        const [runs, approvals, runtimeDbBytes, checkpointBytes, artifactBytes] = await Promise.all([
            this.options.persistence.listRuns(),
            this.options.persistence.listApprovals(),
            this.sumRuntimeDbBytes(),
            this.sumCheckpointDbBytes(),
            this.sumArtifactBytes(),
        ]);
        const pendingApprovals = approvals.filter((approval) => approval.status === "pending").length;
        const stuckRuns = this.countStuckRuns(runs, nowMs);
        const stuckStatus = this.classifyStuckRuns(runs, nowMs);
        const runtimeMaintenance = evaluateMaintenanceStatus("checkpoint", this.options.getCheckpointMaintenanceStatus(), updatedAt);
        const recordMaintenance = evaluateMaintenanceStatus("records", this.options.getRuntimeRecordMaintenanceStatus(), updatedAt);
        const runtimeSymptoms = [...runtimeMaintenance.symptoms, ...recordMaintenance.symptoms];
        const runtimeCheck = buildCheck(maxStatus(runtimeMaintenance.check.status, recordMaintenance.check.status), updatedAt, runtimeSymptoms.length > 0 ? runtimeSymptoms.map((symptom) => symptom.message).join("; ") : "runtime loops healthy");
        const llmCheck = this.evaluateLlmCheck(updatedAt, nowMs);
        const persistenceCheck = this.evaluatePersistenceCheck(updatedAt, runtimeDbBytes, runtimeSymptoms);
        const capacityCheck = this.evaluateCapacityCheck(updatedAt, checkpointBytes, runtimeDbBytes, artifactBytes);
        const workloadCheck = this.evaluateWorkloadCheck(updatedAt, pendingApprovals, stuckRuns, stuckStatus);
        const symptoms = [
            ...runtimeSymptoms,
            ...this.capacitySymptoms(capacityCheck, checkpointBytes, runtimeDbBytes, artifactBytes, updatedAt),
            ...this.workloadSymptoms(workloadCheck, pendingApprovals, stuckRuns, updatedAt, stuckStatus),
            ...this.llmSymptoms(llmCheck, updatedAt),
            ...(persistenceCheck.status === "healthy"
                ? []
                : [createSymptom("persistence.runtime-db", persistenceCheck.status, persistenceCheck.reason ?? "runtime persistence degraded", updatedAt)]),
        ];
        const checks = {
            runtime: runtimeCheck,
            llm: llmCheck,
            persistence: persistenceCheck,
            capacity: capacityCheck,
            workload: workloadCheck,
        };
        // The overall status is the most severe status among all checks.
        let overallStatus = "healthy";
        for (const check of Object.values(checks)) {
            overallStatus = maxStatus(overallStatus, check.status);
        }
        const snapshot = {
            status: overallStatus,
            updatedAt,
            checks,
            symptoms,
            stats: {
                activeRunSlots: this.options.getActiveRunSlots(),
                pendingRunSlots: this.options.getPendingRunSlots(),
                pendingApprovals,
                stuckRuns,
                checkpointBytes,
                runtimeDbBytes,
                artifactBytes,
                ...this.llmStats(nowMs),
            },
        };
        const previous = this.latestSnapshot;
        this.latestSnapshot = snapshot;
        // Only transitions are published; the very first snapshot has no predecessor.
        if (this.config.enabled &&
            this.config.emitEvents &&
            this.options.publishEvent &&
            previous &&
            !compareStatus(previous.status, snapshot.status)) {
            await this.options.publishEvent({
                previousStatus: previous.status,
                status: snapshot.status,
                checks: snapshot.checks,
                stats: snapshot.stats,
            });
        }
        return snapshot;
    }
    /**
     * Success rate and p95 latency over the last `STATS_WINDOW_SECONDS`.
     * Both values are `undefined` when the window holds no samples.
     */
    llmStats(nowMs) {
        const windowMs = HealthMonitor.STATS_WINDOW_SECONDS * 1000;
        const recent = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= windowMs);
        const successes = recent.filter((sample) => sample.success).length;
        return {
            llmSuccessRate1m: recent.length > 0 ? successes / recent.length : undefined,
            llmP95LatencyMs1m: computeP95(recent.map((sample) => sample.latencyMs)),
        };
    }
    /**
     * Rates LLM health from the error rate and the p95 latency, each measured
     * over its own configured window; the worse of the two wins.
     */
    evaluateLlmCheck(updatedAt, nowMs) {
        const errorWindow = this.config.thresholds.llmErrorRate.windowSeconds * 1000;
        const latencyWindow = this.config.thresholds.llmP95LatencyMs.windowSeconds * 1000;
        const errorSamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= errorWindow);
        const latencySamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= latencyWindow);
        if (errorSamples.length === 0 && latencySamples.length === 0) {
            return buildCheck("healthy", updatedAt, "no recent llm invocations");
        }
        const errors = errorSamples.filter((sample) => !sample.success).length;
        const errorRate = errorSamples.length > 0 ? errors / errorSamples.length : 0;
        const p95Latency = computeP95(latencySamples.map((sample) => sample.latencyMs));
        let status = describeThresholdStatus(errorRate, this.config.thresholds.llmErrorRate);
        if (p95Latency !== undefined) {
            status = maxStatus(status, describeThresholdStatus(p95Latency, this.config.thresholds.llmP95LatencyMs));
        }
        return buildCheck(status, updatedAt, `errorRate=${errorRate.toFixed(2)}${p95Latency !== undefined ? ` p95LatencyMs=${Math.round(p95Latency)}` : ""}`);
    }
    /** One `llm.invocations` symptom when the LLM check is not healthy. */
    llmSymptoms(check, updatedAt) {
        if (check.status === "healthy") {
            return [];
        }
        return [createSymptom("llm.invocations", check.status, check.reason ?? "llm health degraded", updatedAt)];
    }
    /**
     * Persistence is reported as degraded whenever a maintenance loop reported
     * symptoms; otherwise healthy, with a reason that tells whether any
     * runtime database file has a non-zero size yet.
     */
    evaluatePersistenceCheck(updatedAt, runtimeDbBytes, runtimeSymptoms) {
        if (runtimeSymptoms.length > 0) {
            return buildCheck("degraded", updatedAt, runtimeSymptoms.map((symptom) => symptom.message).join("; "));
        }
        if (runtimeDbBytes === 0) {
            return buildCheck("healthy", updatedAt, "runtime sqlite not materialized yet");
        }
        return buildCheck("healthy", updatedAt, "runtime sqlite accessible");
    }
    /** Worst status among checkpoint, runtime database and artifact sizes. */
    evaluateCapacityCheck(updatedAt, checkpointBytes, runtimeDbBytes, artifactBytes) {
        const { thresholds } = this.config;
        const status = [
            describeThresholdStatus(checkpointBytes, thresholds.checkpointBytes),
            describeThresholdStatus(runtimeDbBytes, thresholds.runtimeDbBytes),
            describeThresholdStatus(artifactBytes, thresholds.artifactBytes),
        ].reduce((worst, current) => maxStatus(worst, current), "healthy");
        return buildCheck(status, updatedAt, `checkpointBytes=${checkpointBytes} runtimeDbBytes=${runtimeDbBytes} artifactBytes=${artifactBytes}`);
    }
    /** One symptom per size measurement that crossed a threshold. */
    capacitySymptoms(check, checkpointBytes, runtimeDbBytes, artifactBytes, updatedAt) {
        if (check.status === "healthy") {
            return [];
        }
        const { thresholds } = this.config;
        const measurements = [
            ["capacity.checkpoints", describeThresholdStatus(checkpointBytes, thresholds.checkpointBytes), `checkpointBytes=${checkpointBytes}`],
            ["capacity.runtime-db", describeThresholdStatus(runtimeDbBytes, thresholds.runtimeDbBytes), `runtimeDbBytes=${runtimeDbBytes}`],
            ["capacity.artifacts", describeThresholdStatus(artifactBytes, thresholds.artifactBytes), `artifactBytes=${artifactBytes}`],
        ];
        return measurements
            .filter(([, status]) => status !== "healthy")
            .map(([code, status, message]) => createSymptom(code, status, message, updatedAt));
    }
    /**
     * Rates the workload from the number of pending approvals and from the
     * severity of stuck runs.
     *
     * @param stuckStatus Severity of the stuck runs as returned by
     *   `classifyStuckRuns`. When omitted, any stuck run counts as degraded.
     */
    evaluateWorkloadCheck(updatedAt, pendingApprovals, stuckRuns, stuckStatus = stuckRuns > 0 ? "degraded" : "healthy") {
        const approvalStatus = describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals);
        return buildCheck(maxStatus(approvalStatus, stuckStatus), updatedAt, `pendingApprovals=${pendingApprovals} stuckRuns=${stuckRuns}`);
    }
    /**
     * Symptoms behind a non-healthy workload check. The stuck-run symptom uses
     * the same severity that `evaluateWorkloadCheck` used, so the check and
     * its symptom never disagree.
     *
     * @param stuckStatus Severity of the stuck runs as returned by
     *   `classifyStuckRuns`. When omitted, any stuck run counts as degraded.
     */
    workloadSymptoms(check, pendingApprovals, stuckRuns, updatedAt, stuckStatus = stuckRuns > 0 ? "degraded" : "healthy") {
        if (check.status === "healthy") {
            return [];
        }
        const symptoms = [];
        const approvalStatus = describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals);
        if (approvalStatus !== "healthy") {
            symptoms.push(createSymptom("workload.pending-approvals", approvalStatus, `pendingApprovals=${pendingApprovals}`, updatedAt));
        }
        if (stuckRuns > 0 && stuckStatus !== "healthy") {
            symptoms.push(createSymptom("workload.stuck-runs", stuckStatus, `stuckRuns=${stuckRuns}`, updatedAt));
        }
        return symptoms;
    }
    /**
     * Ages in seconds of the runs considered stuck: state `running`,
     * `resuming` or `queued`, a parseable `updatedAt`, and no update for at
     * least `stuckRunSeconds.degradedAbove` seconds.
     */
    stuckRunAges(runs, nowMs) {
        const { degradedAbove } = this.config.thresholds.stuckRunSeconds;
        const ages = [];
        for (const run of runs) {
            if (!["running", "resuming", "queued"].includes(run.state)) {
                continue;
            }
            const updatedAtMs = Date.parse(run.updatedAt);
            if (!Number.isFinite(updatedAtMs)) {
                continue;
            }
            const ageSeconds = (nowMs - updatedAtMs) / 1000;
            if (ageSeconds >= degradedAbove) {
                ages.push(ageSeconds);
            }
        }
        return ages;
    }
    /** Number of runs considered stuck (see `stuckRunAges`). */
    countStuckRuns(runs, nowMs) {
        return this.stuckRunAges(runs, nowMs).length;
    }
    /**
     * Severity of the stuck runs, based on how long each has been idle:
     * `unhealthy` once any stuck run reaches `stuckRunSeconds.unhealthyAbove`,
     * otherwise `degraded` when at least one run is stuck. A threshold of zero
     * disables the corresponding level.
     */
    classifyStuckRuns(runs, nowMs) {
        const { degradedAbove, unhealthyAbove } = this.config.thresholds.stuckRunSeconds;
        let worst = "healthy";
        for (const ageSeconds of this.stuckRunAges(runs, nowMs)) {
            if (unhealthyAbove > 0 && ageSeconds >= unhealthyAbove) {
                return "unhealthy";
            }
            if (degradedAbove > 0) {
                worst = "degraded";
            }
        }
        return worst;
    }
    /** Total size of `runtime.sqlite` across all run roots. */
    async sumRuntimeDbBytes() {
        const sizes = await Promise.all(this.runRoots.map((runRoot) => safeFileSize(path.join(runRoot, "runtime.sqlite"))));
        return sizes.reduce((sum, value) => sum + value, 0);
    }
    /** Total size of all discovered checkpoint database files. */
    async sumCheckpointDbBytes() {
        const sizes = await Promise.all(this.checkpointDbPaths.map((dbPath) => safeFileSize(dbPath)));
        return sizes.reduce((sum, value) => sum + value, 0);
    }
    /** Total size of the `threads` directory across all run roots. */
    async sumArtifactBytes() {
        const sizes = await Promise.all(this.runRoots.map((runRoot) => directorySize(path.join(runRoot, "threads"))));
        return sizes.reduce((sum, value) => sum + value, 0);
    }
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import type { WorkspaceBundle } from "../contracts/types.js";
|
|
2
|
+
/** Resolved settings of the runtime record maintenance loop. */
type RuntimeRecordMaintenanceConfig = {
    enabled: boolean;
    /** When sweeps run. */
    schedule: {
        /** Seconds between two scheduled sweeps. */
        intervalSeconds: number;
        /** Whether `start()` performs a sweep before scheduling the interval. */
        runOnStartup: boolean;
    };
    /** Which records are eligible for deletion. */
    policies: {
        /** Age in seconds, measured against a thread's `updated_at`. */
        maxAgeSeconds?: number;
    };
    sqlite: {
        /** Maximum number of threads removed from one database per sweep. */
        sweepBatchSize: number;
        /** Whether `VACUUM` runs after a sweep that deleted at least one thread. */
        vacuum: boolean;
    };
};
|
|
16
|
+
/** One runtime database that the maintenance loop sweeps. */
type RuntimeRecordMaintenanceTarget = {
    /** Path of the SQLite file. */
    dbPath: string;
};
|
|
19
|
+
/** Bookkeeping of the maintenance loop; timestamps are ISO 8601 strings. */
export type RuntimeRecordMaintenanceLoopStatus = {
    /** Start time of the most recent sweep. */
    lastStartedAt?: string;
    /** Time recorded for the most recent successful sweep. */
    lastCompletedAt?: string;
    /** Time recorded for the most recent failed sweep. */
    lastFailedAt?: string;
    /** Failed sweeps since the last success; reset to zero on success. */
    consecutiveFailures: number;
    /** Message of the most recent failure; cleared on success. */
    lastError?: string;
};
|
|
26
|
+
/**
 * Reads the record maintenance settings of a workspace.
 *
 * @returns The resolved configuration, or `null` when record maintenance is not enabled.
 * @throws When a configured number is not positive, or when no cleanup policy is set.
 */
export declare function readRuntimeRecordMaintenanceConfig(workspace: WorkspaceBundle): RuntimeRecordMaintenanceConfig | null;
/** Lists one `runtime.sqlite` target per distinct run root of the workspace bindings. */
export declare function discoverRuntimeRecordMaintenanceTargets(workspace: WorkspaceBundle): RuntimeRecordMaintenanceTarget[];
/**
 * Deletes one batch of expired completed or failed threads, together with
 * their records and files, from a runtime database.
 *
 * @param nowMs Reference time in epoch milliseconds; defaults to the current time.
 * @returns The number of threads removed.
 */
export declare function maintainSqliteRuntimeRecords(dbPath: string, config: RuntimeRecordMaintenanceConfig, nowMs?: number): Promise<{ deletedThreadCount: number }>;
|
|
31
|
+
/** Sweeps every target database on a fixed interval and tracks the outcome. */
export declare class RuntimeRecordMaintenanceLoop {
    private readonly targets;
    private readonly config;
    private timer;
    private running;
    private status;
    constructor(targets: RuntimeRecordMaintenanceTarget[], config: RuntimeRecordMaintenanceConfig);
    /** Sweeps all targets once; records the failure in the status and rethrows it. */
    runOnce(nowMs?: number): Promise<void>;
    /** Returns a copy of the current status. */
    getStatus(): RuntimeRecordMaintenanceLoopStatus;
    /** Starts the interval; sweeps first when `schedule.runOnStartup` is set. */
    start(): Promise<void>;
    /** Cancels the interval. */
    stop(): Promise<void>;
}
|
|
43
|
+
export {};
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import path from "node:path";
|
|
2
|
+
import { rm } from "node:fs/promises";
|
|
3
|
+
import { createClient } from "@libsql/client";
|
|
4
|
+
import { fileExists } from "../utils/fs.js";
|
|
5
|
+
import { getRuntimeDefaults } from "../workspace/support/workspace-ref-utils.js";
|
|
6
|
+
/**
 * Returns `value` when it is a non-null object that is not an array,
 * and `undefined` for everything else.
 */
function asObject(value) {
    const isObject = typeof value === "object" && value !== null;
    return isObject && !Array.isArray(value) ? value : undefined;
}
|
|
9
|
+
/**
 * Validates a configuration number that must be finite and greater than zero.
 *
 * @param value Raw configuration value.
 * @param label Setting name used in the error message.
 * @param allowUndefined When true (the default), `undefined` is passed through.
 * @returns The validated number, or `undefined` when absent and allowed.
 * @throws Error when the value is present but not a positive finite number,
 *   or absent while `allowUndefined` is false.
 */
function readPositiveNumber(value, label, allowUndefined = true) {
    const absent = value === undefined;
    if (absent && allowUndefined) {
        return undefined;
    }
    const valid = typeof value === "number" && Number.isFinite(value) && value > 0;
    if (!valid) {
        throw new Error(`${label} must be a positive number`);
    }
    return value;
}
|
|
18
|
+
/**
 * Resolves the record maintenance configuration from `maintenance.records`
 * of the runtime defaults returned by `getRuntimeDefaults(workspace.refs)`.
 *
 * Defaults: sweep every 3600 seconds, run on startup unless
 * `schedule.runOnStartup` is `false`, batches of 100 threads, and `VACUUM`
 * only when `sqlite.vacuum` is `true`.
 *
 * @param workspace Workspace bundle; only `workspace.refs` is read here.
 * @returns The configuration, or `null` unless `records.enabled === true`.
 * @throws Error when a configured number is not positive, or when
 *   `policies.maxAgeSeconds` is missing.
 */
export function readRuntimeRecordMaintenanceConfig(workspace) {
    const maintenance = asObject(getRuntimeDefaults(workspace.refs)?.maintenance);
    const records = asObject(maintenance?.records);
    if (records?.enabled !== true) {
        return null;
    }
    const schedule = asObject(records.schedule);
    const policies = asObject(records.policies);
    const sqlite = asObject(records.sqlite);
    // Values are validated in this order, so the first invalid one is reported.
    const intervalSeconds = readPositiveNumber(schedule?.intervalSeconds, "runtime.maintenance.records.schedule.intervalSeconds") ?? 3600;
    const maxAgeSeconds = readPositiveNumber(policies?.maxAgeSeconds, "runtime.maintenance.records.policies.maxAgeSeconds");
    const sweepBatchSize = readPositiveNumber(sqlite?.sweepBatchSize, "runtime.maintenance.records.sqlite.sweepBatchSize") ?? 100;
    if (maxAgeSeconds === undefined) {
        throw new Error("runtime.maintenance.records.enabled requires at least one cleanup policy");
    }
    return {
        enabled: true,
        schedule: {
            intervalSeconds,
            runOnStartup: schedule?.runOnStartup !== false,
        },
        policies: {
            maxAgeSeconds,
        },
        sqlite: {
            sweepBatchSize,
            vacuum: sqlite?.vacuum === true,
        },
    };
}
|
|
47
|
+
/**
 * Lists the runtime databases to maintain: one `runtime.sqlite` per distinct
 * `harnessRuntime.runRoot` among the workspace bindings, in first-seen order.
 *
 * @param workspace Workspace bundle whose `bindings` collection is iterated via `.values()`.
 * @returns Array of `{ dbPath }` targets.
 */
export function discoverRuntimeRecordMaintenanceTargets(workspace) {
    const uniqueRoots = new Set(Array.from(workspace.bindings.values(), (binding) => binding.harnessRuntime.runRoot));
    return [...uniqueRoots].map((root) => {
        const dbPath = path.join(root, "runtime.sqlite");
        return { dbPath };
    });
}
|
|
56
|
+
/**
 * Removes one batch of expired threads from a runtime SQLite database.
 *
 * A thread is expired when its status is `completed` or `failed` and its
 * `updated_at` is at or before `nowMs - policies.maxAgeSeconds`. At most
 * `sqlite.sweepBatchSize` threads are processed, oldest first. For each one
 * the artifact files are removed, all rows referencing the thread are deleted
 * in a single write batch, and finally the thread's directory below
 * `<runRoot>/threads` is removed.
 *
 * Identifiers and artifact paths come from the database and are used to build
 * deletion targets. Filesystem removal is therefore restricted to paths that
 * stay strictly inside the `threads` directory; the database rows are deleted
 * either way.
 *
 * @param dbPath Path of the runtime database; a missing file is a no-op.
 * @param config Record maintenance configuration.
 * @param nowMs Reference time in epoch milliseconds.
 * @returns `{ deletedThreadCount }` for this sweep.
 */
export async function maintainSqliteRuntimeRecords(dbPath, config, nowMs = Date.now()) {
    if (!(await fileExists(dbPath))) {
        return { deletedThreadCount: 0 };
    }
    // Joins the segments below `parent` and returns the result only when it
    // is a strict descendant of `parent` (rejects "", "." and ".." escapes).
    const joinInside = (parent, ...segments) => {
        const candidate = path.join(parent, ...segments);
        const relative = path.relative(parent, candidate);
        const escapes = relative === "" ||
            relative === ".." ||
            relative.startsWith(`..${path.sep}`) ||
            path.isAbsolute(relative);
        return escapes ? undefined : candidate;
    };
    const client = createClient({ url: `file:${dbPath}` });
    try {
        const cutoffIso = new Date(nowMs - (config.policies.maxAgeSeconds ?? 0) * 1000).toISOString();
        const threadsRoot = path.join(path.dirname(dbPath), "threads");
        const expired = await client.execute({
            sql: `SELECT thread_id
        FROM threads
        WHERE status IN ('completed', 'failed')
          AND updated_at <= ?
        ORDER BY updated_at ASC, thread_id ASC
        LIMIT ?`,
            args: [cutoffIso, config.sqlite.sweepBatchSize],
        });
        let deletedThreadCount = 0;
        for (const row of expired.rows) {
            const threadId = row.thread_id;
            const threadDir = joinInside(threadsRoot, String(threadId ?? ""));
            if (threadDir) {
                const artifacts = await client.execute({
                    sql: `SELECT run_id, path FROM artifacts WHERE thread_id = ?`,
                    args: [threadId],
                });
                for (const artifact of artifacts.rows) {
                    const runId = String(artifact.run_id ?? "");
                    const artifactPath = String(artifact.path ?? "");
                    if (!runId || !artifactPath) {
                        continue;
                    }
                    const artifactFile = joinInside(threadDir, "runs", runId, artifactPath);
                    if (artifactFile) {
                        await rm(artifactFile, { force: true });
                    }
                }
            }
            await client.batch([
                "DELETE FROM artifacts WHERE thread_id = ?",
                "DELETE FROM approvals WHERE thread_id = ?",
                "DELETE FROM events WHERE thread_id = ?",
                "DELETE FROM run_requests WHERE thread_id = ?",
                "DELETE FROM recovery_intents WHERE thread_id = ?",
                "DELETE FROM thread_messages WHERE thread_id = ?",
                "DELETE FROM runs WHERE thread_id = ?",
                "DELETE FROM threads WHERE thread_id = ?",
            ].map((sql) => ({ sql, args: [threadId] })), "write");
            if (threadDir) {
                await rm(threadDir, { recursive: true, force: true });
            }
            deletedThreadCount += 1;
        }
        if (deletedThreadCount > 0 && config.sqlite.vacuum) {
            await client.execute("VACUUM");
        }
        return { deletedThreadCount };
    }
    finally {
        // Release the database handle on every exit path, including errors.
        client.close();
    }
}
|
|
108
|
+
/**
 * Periodically sweeps expired runtime records from every target database and
 * keeps track of the outcome of the most recent sweeps.
 */
export class RuntimeRecordMaintenanceLoop {
    targets;
    config;
    timer = null;
    running = false;
    /** True while a sweep started by the interval timer is still in progress. */
    sweepInFlight = false;
    status = {
        consecutiveFailures: 0,
    };
    /**
     * @param targets Databases to maintain.
     * @param config Record maintenance configuration.
     */
    constructor(targets, config) {
        this.targets = targets;
        this.config = config;
    }
    /**
     * Sweeps all targets once, one after another.
     *
     * On success the failure counter and last error are cleared. On failure
     * the counter is incremented, the error message is stored and the error
     * is rethrown to the caller.
     *
     * @param nowMs Reference time in epoch milliseconds, also used for the status timestamps.
     */
    async runOnce(nowMs = Date.now()) {
        this.status = {
            ...this.status,
            lastStartedAt: new Date(nowMs).toISOString(),
        };
        try {
            for (const target of this.targets) {
                await maintainSqliteRuntimeRecords(target.dbPath, this.config, nowMs);
            }
            this.status = {
                ...this.status,
                lastCompletedAt: new Date(nowMs).toISOString(),
                consecutiveFailures: 0,
                lastError: undefined,
            };
        }
        catch (error) {
            this.status = {
                ...this.status,
                lastFailedAt: new Date(nowMs).toISOString(),
                consecutiveFailures: this.status.consecutiveFailures + 1,
                lastError: error instanceof Error ? error.message : String(error),
            };
            throw error;
        }
    }
    /** Returns a shallow copy of the current status. */
    getStatus() {
        return { ...this.status };
    }
    /**
     * Starts the loop. When `schedule.runOnStartup` is set, one sweep runs
     * first and its failure is propagated to the caller; in that case the loop
     * is left stopped so that `start()` can be called again.
     */
    async start() {
        if (this.running) {
            return;
        }
        this.running = true;
        if (this.config.schedule.runOnStartup) {
            try {
                await this.runOnce();
            }
            catch (error) {
                this.running = false;
                throw error;
            }
            // stop() may have been called while the startup sweep was running.
            if (!this.running) {
                return;
            }
        }
        this.timer = setInterval(() => {
            // Skip this tick when the previous sweep has not finished yet.
            if (this.sweepInFlight) {
                return;
            }
            this.sweepInFlight = true;
            this.runOnce()
                // runOnce has already recorded the failure in `status`; without
                // this handler the rejection would be unhandled, which is fatal
                // by default on current Node.js.
                .catch(() => undefined)
                .finally(() => {
                this.sweepInFlight = false;
            });
        }, this.config.schedule.intervalSeconds * 1000);
        this.timer.unref?.();
    }
    /** Cancels the interval; a sweep already in progress is not interrupted. */
    async stop() {
        if (this.timer) {
            clearInterval(this.timer);
            this.timer = null;
        }
        this.running = false;
    }
}
|
package/dist/runtime/store.d.ts
CHANGED
|
@@ -24,7 +24,9 @@ export declare class FileBackedStore {
|
|
|
24
24
|
private readonly filePath;
|
|
25
25
|
private readonly delegate;
|
|
26
26
|
private loaded;
|
|
27
|
+
private operationChain;
|
|
27
28
|
constructor(filePath: string);
|
|
29
|
+
private runSerialized;
|
|
28
30
|
private ensureLoaded;
|
|
29
31
|
private persist;
|
|
30
32
|
batch(operations: Parameters<InMemoryStore["batch"]>[0]): Promise<readonly unknown[]>;
|