@botbotgo/agent-harness 0.0.103 → 0.0.105

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- export declare const AGENT_HARNESS_VERSION = "0.0.102";
1
+ export declare const AGENT_HARNESS_VERSION = "0.0.104";
@@ -1 +1 @@
1
- export const AGENT_HARNESS_VERSION = "0.0.102";
1
+ export const AGENT_HARNESS_VERSION = "0.0.104";
@@ -1,22 +1,17 @@
1
1
  import type { RunResult, ThreadSummary } from "../../../contracts/types.js";
2
2
  import type { RuntimePersistence } from "../../../persistence/types.js";
3
3
  import type { ConcurrencyConfig, RecoveryConfig } from "../../../workspace/support/workspace-ref-utils.js";
4
- import { recoverQueuedStartupRun } from "./recovery.js";
5
4
  type Startable = {
6
5
  start(): Promise<void>;
7
6
  };
8
- type StartupRecoveryContext = Parameters<typeof recoverQueuedStartupRun>[0];
9
7
  export declare function initializeHarnessRuntime(input: {
10
8
  persistence: RuntimePersistence;
11
- checkpointMaintenance: Startable | null;
12
- runtimeRecordMaintenance: Startable | null;
13
9
  healthMonitor: Startable | null;
14
- recoverStartupRuns: () => Promise<void>;
15
10
  }): Promise<void>;
16
11
  export declare function recoverStartupRuns(input: {
17
12
  recoveryConfig: RecoveryConfig;
18
13
  persistence: RuntimePersistence;
19
- createStartupRecoveryContext: () => StartupRecoveryContext;
14
+ createStartupRecoveryContext: () => unknown;
20
15
  reclaimExpiredClaimedRuns: (nowIso?: string) => Promise<void>;
21
16
  }): Promise<void>;
22
17
  export declare function reclaimExpiredClaimedRuns(input: {
@@ -1,58 +1,13 @@
1
- import { recoverQueuedStartupRun, recoverResumingStartupRun, recoverRunningStartupRun, } from "./recovery.js";
2
1
  export async function initializeHarnessRuntime(input) {
3
2
  await input.persistence.initialize();
4
- await input.checkpointMaintenance?.start();
5
- await input.runtimeRecordMaintenance?.start();
6
3
  await input.healthMonitor?.start();
7
- await input.recoverStartupRuns();
8
4
  }
9
5
  export async function recoverStartupRuns(input) {
10
- if (!input.recoveryConfig.enabled) {
11
- return;
12
- }
13
- await input.reclaimExpiredClaimedRuns();
14
- const threads = await input.persistence.listSessions();
15
- const recoveryContext = input.createStartupRecoveryContext();
16
- for (const thread of threads) {
17
- const handled = await recoverQueuedStartupRun(recoveryContext, thread) ||
18
- await recoverRunningStartupRun(recoveryContext, thread) ||
19
- await recoverResumingStartupRun(recoveryContext, thread);
20
- if (handled) {
21
- continue;
22
- }
23
- }
6
+ void input;
24
7
  }
25
8
  export async function reclaimExpiredClaimedRuns(input, nowIso = new Date().toISOString()) {
26
- const expiredClaims = await input.persistence.listExpiredClaimedRuns(nowIso);
27
- for (const claim of expiredClaims) {
28
- const thread = await input.persistence.getSession(claim.threadId);
29
- if (!thread) {
30
- await input.persistence.releaseRunClaim(claim.runId);
31
- continue;
32
- }
33
- const lifecycle = await input.persistence.getRunLifecycle(claim.threadId, claim.runId);
34
- if (lifecycle.state === "claimed") {
35
- await input.persistence.enqueueRun({
36
- threadId: claim.threadId,
37
- runId: claim.runId,
38
- priority: claim.priority,
39
- queueKey: claim.queueKey,
40
- availableAt: nowIso,
41
- });
42
- await input.setRunStateAndEmit(claim.threadId, claim.runId, 99, "queued", {
43
- previousState: "claimed",
44
- });
45
- await input.emit(claim.threadId, claim.runId, 100, "run.queued", {
46
- queuePosition: 0,
47
- activeRunCount: input.getActiveRunSlots(),
48
- maxConcurrentRuns: input.concurrencyConfig.maxConcurrentRuns,
49
- recoveredOnStartup: true,
50
- reclaimReason: "expired-lease",
51
- });
52
- continue;
53
- }
54
- await input.persistence.releaseRunClaim(claim.runId);
55
- }
9
+ void input;
10
+ void nowIso;
56
11
  }
57
12
  export async function isStaleRunningRun(input, thread, nowMs = Date.now()) {
58
13
  const control = await input.persistence.getRunControl(thread.latestRunId);
@@ -24,18 +24,6 @@ type HealthMonitorConfig = {
24
24
  degradedAbove: number;
25
25
  unhealthyAbove: number;
26
26
  };
27
- checkpointBytes: {
28
- degradedAbove: number;
29
- unhealthyAbove: number;
30
- };
31
- runtimeDbBytes: {
32
- degradedAbove: number;
33
- unhealthyAbove: number;
34
- };
35
- artifactBytes: {
36
- degradedAbove: number;
37
- unhealthyAbove: number;
38
- };
39
27
  };
40
28
  };
41
29
  type HealthMonitorOptions = {
@@ -51,12 +39,9 @@ export declare function readHealthMonitorConfig(workspace: WorkspaceBundle): Hea
51
39
  export declare class HealthMonitor {
52
40
  private readonly options;
53
41
  private readonly config;
54
- private readonly runRoots;
55
- private readonly checkpointDbPaths;
56
42
  private readonly llmSamples;
57
43
  private timer;
58
44
  private latestSnapshot;
59
- private runtimeEventSequence;
60
45
  constructor(options: HealthMonitorOptions);
61
46
  recordLlmSuccess(latencyMs: number, nowMs?: number): void;
62
47
  recordLlmFailure(latencyMs: number, nowMs?: number): void;
@@ -67,15 +52,7 @@ export declare class HealthMonitor {
67
52
  evaluate(nowMs?: number): Promise<RuntimeHealthSnapshot>;
68
53
  private llmStats;
69
54
  private evaluateLlmCheck;
70
- private llmSymptoms;
71
- private evaluatePersistenceCheck;
72
- private evaluateCapacityCheck;
73
- private capacitySymptoms;
74
55
  private evaluateWorkloadCheck;
75
- private workloadSymptoms;
76
56
  private countStuckRuns;
77
- private sumRuntimeDbBytes;
78
- private sumCheckpointDbBytes;
79
- private sumArtifactBytes;
80
57
  }
81
58
  export {};
@@ -1,7 +1,4 @@
1
- import path from "node:path";
2
- import { readdir, stat } from "node:fs/promises";
3
1
  import { getRuntimeDefaults } from "../../../workspace/support/workspace-ref-utils.js";
4
- import { discoverCheckpointMaintenanceTargets } from "../../maintenance/checkpoint-maintenance.js";
5
2
  const DEFAULT_HEALTH_CONFIG = {
6
3
  enabled: false,
7
4
  evaluateIntervalSeconds: 30,
@@ -25,18 +22,6 @@ const DEFAULT_HEALTH_CONFIG = {
25
22
  degradedAbove: 300,
26
23
  unhealthyAbove: 900,
27
24
  },
28
- checkpointBytes: {
29
- degradedAbove: 512 * 1024 * 1024,
30
- unhealthyAbove: 2 * 1024 * 1024 * 1024,
31
- },
32
- runtimeDbBytes: {
33
- degradedAbove: 256 * 1024 * 1024,
34
- unhealthyAbove: 1024 * 1024 * 1024,
35
- },
36
- artifactBytes: {
37
- degradedAbove: 512 * 1024 * 1024,
38
- unhealthyAbove: 2 * 1024 * 1024 * 1024,
39
- },
40
25
  },
41
26
  };
42
27
  function asObject(value) {
@@ -52,23 +37,18 @@ function maxStatus(left, right) {
52
37
  const rank = { healthy: 0, degraded: 1, unhealthy: 2 };
53
38
  return rank[left] >= rank[right] ? left : right;
54
39
  }
55
- function compareStatus(left, right) {
56
- return left === right;
57
- }
58
- function computeP95(values) {
59
- if (values.length === 0) {
60
- return undefined;
61
- }
62
- const sorted = [...values].sort((a, b) => a - b);
63
- const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil(sorted.length * 0.95) - 1));
64
- return sorted[index];
65
- }
66
- function isoFromMs(value) {
67
- return new Date(value).toISOString();
68
- }
69
40
  function buildCheck(status, updatedAt, reason) {
70
41
  return reason ? { status, updatedAt, reason } : { status, updatedAt };
71
42
  }
43
+ function createSymptom(code, status, message, timestamp) {
44
+ return {
45
+ code,
46
+ severity: status === "unhealthy" ? "error" : status === "degraded" ? "warn" : "info",
47
+ message,
48
+ firstSeenAt: timestamp,
49
+ lastSeenAt: timestamp,
50
+ };
51
+ }
72
52
  function describeThresholdStatus(value, thresholds) {
73
53
  if (value >= thresholds.unhealthyAbove) {
74
54
  return "unhealthy";
@@ -78,6 +58,14 @@ function describeThresholdStatus(value, thresholds) {
78
58
  }
79
59
  return "healthy";
80
60
  }
61
+ function computeP95(values) {
62
+ if (values.length === 0) {
63
+ return undefined;
64
+ }
65
+ const sorted = [...values].sort((a, b) => a - b);
66
+ const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil(sorted.length * 0.95) - 1));
67
+ return sorted[index];
68
+ }
81
69
  export function readHealthMonitorConfig(workspace) {
82
70
  const runtimeDefaults = getRuntimeDefaults(workspace.refs);
83
71
  const observability = asObject(runtimeDefaults?.observability);
@@ -87,9 +75,6 @@ export function readHealthMonitorConfig(workspace) {
87
75
  const llmP95LatencyMs = asObject(thresholds?.llmP95LatencyMs);
88
76
  const pendingApprovals = asObject(thresholds?.pendingApprovals);
89
77
  const stuckRunSeconds = asObject(thresholds?.stuckRunSeconds);
90
- const checkpointBytes = asObject(thresholds?.checkpointBytes);
91
- const runtimeDbBytes = asObject(thresholds?.runtimeDbBytes);
92
- const artifactBytes = asObject(thresholds?.artifactBytes);
93
78
  return {
94
79
  enabled: health?.enabled === true,
95
80
  evaluateIntervalSeconds: readPositiveNumber(health?.evaluateIntervalSeconds, DEFAULT_HEALTH_CONFIG.evaluateIntervalSeconds),
@@ -113,112 +98,18 @@ export function readHealthMonitorConfig(workspace) {
113
98
  degradedAbove: readNonNegativeNumber(stuckRunSeconds?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.stuckRunSeconds.degradedAbove),
114
99
  unhealthyAbove: readNonNegativeNumber(stuckRunSeconds?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.stuckRunSeconds.unhealthyAbove),
115
100
  },
116
- checkpointBytes: {
117
- degradedAbove: readNonNegativeNumber(checkpointBytes?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.checkpointBytes.degradedAbove),
118
- unhealthyAbove: readNonNegativeNumber(checkpointBytes?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.checkpointBytes.unhealthyAbove),
119
- },
120
- runtimeDbBytes: {
121
- degradedAbove: readNonNegativeNumber(runtimeDbBytes?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.runtimeDbBytes.degradedAbove),
122
- unhealthyAbove: readNonNegativeNumber(runtimeDbBytes?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.runtimeDbBytes.unhealthyAbove),
123
- },
124
- artifactBytes: {
125
- degradedAbove: readNonNegativeNumber(artifactBytes?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.artifactBytes.degradedAbove),
126
- unhealthyAbove: readNonNegativeNumber(artifactBytes?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.artifactBytes.unhealthyAbove),
127
- },
128
101
  },
129
102
  };
130
103
  }
131
- async function safeFileSize(filePath) {
132
- try {
133
- const stats = await stat(filePath);
134
- return stats.size;
135
- }
136
- catch {
137
- return 0;
138
- }
139
- }
140
- async function directorySize(root) {
141
- try {
142
- const entries = await readdir(root, { withFileTypes: true });
143
- let total = 0;
144
- for (const entry of entries) {
145
- const fullPath = path.join(root, entry.name);
146
- if (entry.isDirectory()) {
147
- total += await directorySize(fullPath);
148
- continue;
149
- }
150
- if (entry.isFile()) {
151
- total += await safeFileSize(fullPath);
152
- }
153
- }
154
- return total;
155
- }
156
- catch {
157
- return 0;
158
- }
159
- }
160
- function evaluateMaintenanceStatus(label, status, updatedAt) {
161
- if (!status) {
162
- return {
163
- check: buildCheck("healthy", updatedAt, `${label} disabled`),
164
- symptoms: [],
165
- };
166
- }
167
- if (status.consecutiveFailures > 0) {
168
- return {
169
- check: buildCheck("degraded", updatedAt, `${label} has ${status.consecutiveFailures} consecutive failure(s)`),
170
- symptoms: [{
171
- code: `${label}.loop.failure`,
172
- severity: "warn",
173
- message: status.lastError ?? `${label} maintenance loop failed`,
174
- firstSeenAt: status.lastFailedAt ?? updatedAt,
175
- lastSeenAt: status.lastFailedAt ?? updatedAt,
176
- }],
177
- };
178
- }
179
- if (status.lastCompletedAt) {
180
- return {
181
- check: buildCheck("healthy", updatedAt, `${label} completed successfully`),
182
- symptoms: [],
183
- };
184
- }
185
- return {
186
- check: buildCheck("healthy", updatedAt, `${label} idle`),
187
- symptoms: [],
188
- };
189
- }
190
- function normalizeSeverity(status) {
191
- if (status === "unhealthy") {
192
- return "error";
193
- }
194
- if (status === "degraded") {
195
- return "warn";
196
- }
197
- return "info";
198
- }
199
- function createSymptom(code, status, message, timestamp) {
200
- return {
201
- code,
202
- severity: normalizeSeverity(status),
203
- message,
204
- firstSeenAt: timestamp,
205
- lastSeenAt: timestamp,
206
- };
207
- }
208
104
  export class HealthMonitor {
209
105
  options;
210
106
  config;
211
- runRoots;
212
- checkpointDbPaths;
213
107
  llmSamples = [];
214
108
  timer = null;
215
109
  latestSnapshot = null;
216
- runtimeEventSequence = 0;
217
110
  constructor(options) {
218
111
  this.options = options;
219
112
  this.config = readHealthMonitorConfig(options.workspace);
220
- this.runRoots = Array.from(new Set(Array.from(options.workspace.bindings.values()).map((binding) => binding.harnessRuntime.runRoot)));
221
- this.checkpointDbPaths = Array.from(new Set(discoverCheckpointMaintenanceTargets(options.workspace).map((target) => target.dbPath)));
222
113
  }
223
114
  recordLlmSuccess(latencyMs, nowMs = Date.now()) {
224
115
  this.recordLlmSample({ timestampMs: nowMs, latencyMs, success: true });
@@ -253,172 +144,96 @@ export class HealthMonitor {
253
144
  return this.evaluate();
254
145
  }
255
146
  async evaluate(nowMs = Date.now()) {
256
- const updatedAt = isoFromMs(nowMs);
257
- const [runs, approvals, runtimeDbBytes, checkpointBytes, artifactBytes] = await Promise.all([
147
+ const updatedAt = new Date(nowMs).toISOString();
148
+ const [runs, approvals] = await Promise.all([
258
149
  this.options.persistence.listRuns(),
259
150
  this.options.persistence.listApprovals(),
260
- this.sumRuntimeDbBytes(),
261
- this.sumCheckpointDbBytes(),
262
- this.sumArtifactBytes(),
263
151
  ]);
264
152
  const pendingApprovals = approvals.filter((approval) => approval.status === "pending").length;
265
153
  const stuckRuns = this.countStuckRuns(runs, nowMs);
266
- const runtimeMaintenance = evaluateMaintenanceStatus("checkpoint", this.options.getCheckpointMaintenanceStatus(), updatedAt);
267
- const recordMaintenance = evaluateMaintenanceStatus("records", this.options.getRuntimeRecordMaintenanceStatus(), updatedAt);
268
- let runtimeCheck = buildCheck("healthy", updatedAt, "runtime loops healthy");
269
- const runtimeSymptoms = [...runtimeMaintenance.symptoms, ...recordMaintenance.symptoms];
270
- runtimeCheck = buildCheck(maxStatus(runtimeMaintenance.check.status, recordMaintenance.check.status), updatedAt, runtimeSymptoms.length > 0 ? runtimeSymptoms.map((symptom) => symptom.message).join("; ") : "runtime loops healthy");
271
154
  const llmCheck = this.evaluateLlmCheck(updatedAt, nowMs);
272
- const persistenceCheck = this.evaluatePersistenceCheck(updatedAt, runtimeDbBytes, runtimeSymptoms);
273
- const capacityCheck = this.evaluateCapacityCheck(updatedAt, checkpointBytes, runtimeDbBytes, artifactBytes);
274
155
  const workloadCheck = this.evaluateWorkloadCheck(updatedAt, pendingApprovals, stuckRuns);
275
- const symptoms = [
276
- ...runtimeSymptoms,
277
- ...this.capacitySymptoms(capacityCheck, checkpointBytes, runtimeDbBytes, artifactBytes, updatedAt),
278
- ...this.workloadSymptoms(workloadCheck, pendingApprovals, stuckRuns, updatedAt),
279
- ...this.llmSymptoms(llmCheck, updatedAt),
280
- ...(persistenceCheck.status === "healthy"
281
- ? []
282
- : [createSymptom("persistence.runtime-db", persistenceCheck.status, persistenceCheck.reason ?? "runtime persistence degraded", updatedAt)]),
283
- ];
284
- const checks = {
285
- runtime: runtimeCheck,
286
- llm: llmCheck,
287
- persistence: persistenceCheck,
288
- capacity: capacityCheck,
289
- workload: workloadCheck,
290
- };
156
+ const runtimeCheck = buildCheck("healthy", updatedAt, "runtime health monitoring enabled");
157
+ const persistenceCheck = buildCheck("healthy", updatedAt, "runtime persistence accessible");
158
+ const capacityCheck = buildCheck("healthy", updatedAt, "capacity checks disabled");
159
+ const symptoms = [];
160
+ if (describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals) !== "healthy") {
161
+ symptoms.push(createSymptom("workload.pending-approvals", describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals), `pendingApprovals=${pendingApprovals}`, updatedAt));
162
+ }
163
+ if (stuckRuns > 0) {
164
+ symptoms.push(createSymptom("workload.stuck-runs", workloadCheck.status, `stuckRuns=${stuckRuns}`, updatedAt));
165
+ }
166
+ if (llmCheck.status !== "healthy") {
167
+ symptoms.push(createSymptom("llm.invocations", llmCheck.status, llmCheck.reason ?? "llm health degraded", updatedAt));
168
+ }
291
169
  let overallStatus = "healthy";
292
- for (const check of Object.values(checks)) {
170
+ for (const check of [runtimeCheck, llmCheck, persistenceCheck, capacityCheck, workloadCheck]) {
293
171
  overallStatus = maxStatus(overallStatus, check.status);
294
172
  }
295
173
  const snapshot = {
296
174
  status: overallStatus,
297
175
  updatedAt,
298
- checks,
176
+ checks: {
177
+ runtime: runtimeCheck,
178
+ llm: llmCheck,
179
+ persistence: persistenceCheck,
180
+ capacity: capacityCheck,
181
+ workload: workloadCheck,
182
+ },
299
183
  symptoms,
300
184
  stats: {
301
185
  activeRunSlots: this.options.getActiveRunSlots(),
302
186
  pendingRunSlots: this.options.getPendingRunSlots(),
303
187
  pendingApprovals,
304
188
  stuckRuns,
305
- checkpointBytes,
306
- runtimeDbBytes,
307
- artifactBytes,
308
189
  ...this.llmStats(nowMs),
309
190
  },
310
191
  };
311
- const previous = this.latestSnapshot;
312
- this.latestSnapshot = snapshot;
313
192
  if (this.config.enabled &&
314
193
  this.config.emitEvents &&
315
194
  this.options.publishEvent &&
316
- previous &&
317
- !compareStatus(previous.status, snapshot.status)) {
195
+ this.latestSnapshot &&
196
+ this.latestSnapshot.status !== snapshot.status) {
318
197
  await this.options.publishEvent({
319
- previousStatus: previous.status,
198
+ previousStatus: this.latestSnapshot.status,
320
199
  status: snapshot.status,
321
200
  checks: snapshot.checks,
322
201
  stats: snapshot.stats,
323
202
  });
324
203
  }
204
+ this.latestSnapshot = snapshot;
325
205
  return snapshot;
326
206
  }
327
207
  llmStats(nowMs) {
328
- const oneMinuteWindowMs = 60 * 1000;
329
- const recent = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= oneMinuteWindowMs);
208
+ const recent = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= 60 * 1000);
330
209
  const successes = recent.filter((sample) => sample.success).length;
331
- const latencies = recent.map((sample) => sample.latencyMs);
332
210
  return {
333
211
  llmSuccessRate1m: recent.length > 0 ? successes / recent.length : undefined,
334
- llmP95LatencyMs1m: computeP95(latencies),
212
+ llmP95LatencyMs1m: computeP95(recent.map((sample) => sample.latencyMs)),
335
213
  };
336
214
  }
337
215
  evaluateLlmCheck(updatedAt, nowMs) {
338
- const errorWindow = this.config.thresholds.llmErrorRate.windowSeconds * 1000;
339
- const latencyWindow = this.config.thresholds.llmP95LatencyMs.windowSeconds * 1000;
340
- const errorSamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= errorWindow);
341
- const latencySamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= latencyWindow);
216
+ const errorSamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= this.config.thresholds.llmErrorRate.windowSeconds * 1000);
217
+ const latencySamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= this.config.thresholds.llmP95LatencyMs.windowSeconds * 1000);
342
218
  if (errorSamples.length === 0 && latencySamples.length === 0) {
343
219
  return buildCheck("healthy", updatedAt, "no recent llm invocations");
344
220
  }
345
- const errors = errorSamples.filter((sample) => !sample.success).length;
346
- const errorRate = errorSamples.length > 0 ? errors / errorSamples.length : 0;
221
+ const errorRate = errorSamples.length > 0
222
+ ? errorSamples.filter((sample) => !sample.success).length / errorSamples.length
223
+ : 0;
347
224
  const p95Latency = computeP95(latencySamples.map((sample) => sample.latencyMs));
348
- let status = "healthy";
349
- if (errorRate >= this.config.thresholds.llmErrorRate.unhealthyAbove) {
350
- status = "unhealthy";
351
- }
352
- else if (errorRate >= this.config.thresholds.llmErrorRate.degradedAbove) {
353
- status = "degraded";
354
- }
225
+ let status = describeThresholdStatus(errorRate, this.config.thresholds.llmErrorRate);
355
226
  if (p95Latency !== undefined) {
356
- const latencyStatus = describeThresholdStatus(p95Latency, this.config.thresholds.llmP95LatencyMs);
357
- status = maxStatus(status, latencyStatus);
227
+ status = maxStatus(status, describeThresholdStatus(p95Latency, this.config.thresholds.llmP95LatencyMs));
358
228
  }
359
229
  return buildCheck(status, updatedAt, `errorRate=${errorRate.toFixed(2)}${p95Latency !== undefined ? ` p95LatencyMs=${Math.round(p95Latency)}` : ""}`);
360
230
  }
361
- llmSymptoms(check, updatedAt) {
362
- if (check.status === "healthy") {
363
- return [];
364
- }
365
- return [createSymptom("llm.invocations", check.status, check.reason ?? "llm health degraded", updatedAt)];
366
- }
367
- evaluatePersistenceCheck(updatedAt, runtimeDbBytes, runtimeSymptoms) {
368
- if (runtimeSymptoms.length > 0) {
369
- return buildCheck("degraded", updatedAt, runtimeSymptoms.map((symptom) => symptom.message).join("; "));
370
- }
371
- if (runtimeDbBytes === 0) {
372
- return buildCheck("healthy", updatedAt, "runtime sqlite not materialized yet");
373
- }
374
- return buildCheck("healthy", updatedAt, "runtime sqlite accessible");
375
- }
376
- evaluateCapacityCheck(updatedAt, checkpointBytes, runtimeDbBytes, artifactBytes) {
377
- let status = "healthy";
378
- status = maxStatus(status, describeThresholdStatus(checkpointBytes, this.config.thresholds.checkpointBytes));
379
- status = maxStatus(status, describeThresholdStatus(runtimeDbBytes, this.config.thresholds.runtimeDbBytes));
380
- status = maxStatus(status, describeThresholdStatus(artifactBytes, this.config.thresholds.artifactBytes));
381
- return buildCheck(status, updatedAt, `checkpointBytes=${checkpointBytes} runtimeDbBytes=${runtimeDbBytes} artifactBytes=${artifactBytes}`);
382
- }
383
- capacitySymptoms(check, checkpointBytes, runtimeDbBytes, artifactBytes, updatedAt) {
384
- if (check.status === "healthy") {
385
- return [];
386
- }
387
- const symptoms = [];
388
- if (describeThresholdStatus(checkpointBytes, this.config.thresholds.checkpointBytes) !== "healthy") {
389
- symptoms.push(createSymptom("capacity.checkpoints", describeThresholdStatus(checkpointBytes, this.config.thresholds.checkpointBytes), `checkpointBytes=${checkpointBytes}`, updatedAt));
390
- }
391
- if (describeThresholdStatus(runtimeDbBytes, this.config.thresholds.runtimeDbBytes) !== "healthy") {
392
- symptoms.push(createSymptom("capacity.runtime-db", describeThresholdStatus(runtimeDbBytes, this.config.thresholds.runtimeDbBytes), `runtimeDbBytes=${runtimeDbBytes}`, updatedAt));
393
- }
394
- if (describeThresholdStatus(artifactBytes, this.config.thresholds.artifactBytes) !== "healthy") {
395
- symptoms.push(createSymptom("capacity.artifacts", describeThresholdStatus(artifactBytes, this.config.thresholds.artifactBytes), `artifactBytes=${artifactBytes}`, updatedAt));
396
- }
397
- return symptoms;
398
- }
399
231
  evaluateWorkloadCheck(updatedAt, pendingApprovals, stuckRuns) {
400
232
  let status = describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals);
401
- status = maxStatus(status, describeThresholdStatus(stuckRuns, {
402
- degradedAbove: this.config.thresholds.stuckRunSeconds.degradedAbove > 0 ? 1 : Number.MAX_SAFE_INTEGER,
403
- unhealthyAbove: this.config.thresholds.stuckRunSeconds.unhealthyAbove > 0 ? 1 : Number.MAX_SAFE_INTEGER,
404
- }));
405
- return buildCheck(status, updatedAt, `pendingApprovals=${pendingApprovals} stuckRuns=${stuckRuns}`);
406
- }
407
- workloadSymptoms(check, pendingApprovals, stuckRuns, updatedAt) {
408
- if (check.status === "healthy") {
409
- return [];
410
- }
411
- const symptoms = [];
412
- if (describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals) !== "healthy") {
413
- symptoms.push(createSymptom("workload.pending-approvals", describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals), `pendingApprovals=${pendingApprovals}`, updatedAt));
414
- }
415
233
  if (stuckRuns > 0) {
416
- const stuckStatus = stuckRuns >= 1 && this.config.thresholds.stuckRunSeconds.unhealthyAbove <= this.config.thresholds.stuckRunSeconds.degradedAbove
417
- ? "unhealthy"
418
- : "degraded";
419
- symptoms.push(createSymptom("workload.stuck-runs", stuckStatus, `stuckRuns=${stuckRuns}`, updatedAt));
234
+ status = maxStatus(status, stuckRuns >= this.config.thresholds.stuckRunSeconds.unhealthyAbove ? "unhealthy" : "degraded");
420
235
  }
421
- return symptoms;
236
+ return buildCheck(status, updatedAt, `pendingApprovals=${pendingApprovals} stuckRuns=${stuckRuns}`);
422
237
  }
423
238
  countStuckRuns(runs, nowMs) {
424
239
  return runs.filter((run) => {
@@ -429,20 +244,7 @@ export class HealthMonitor {
429
244
  if (!Number.isFinite(updatedAtMs)) {
430
245
  return false;
431
246
  }
432
- const ageSeconds = (nowMs - updatedAtMs) / 1000;
433
- return ageSeconds >= this.config.thresholds.stuckRunSeconds.degradedAbove;
247
+ return (nowMs - updatedAtMs) / 1000 >= this.config.thresholds.stuckRunSeconds.degradedAbove;
434
248
  }).length;
435
249
  }
436
- async sumRuntimeDbBytes() {
437
- const sizes = await Promise.all(this.runRoots.map((runRoot) => safeFileSize(path.join(runRoot, "runtime.sqlite"))));
438
- return sizes.reduce((sum, value) => sum + value, 0);
439
- }
440
- async sumCheckpointDbBytes() {
441
- const sizes = await Promise.all(this.checkpointDbPaths.map((dbPath) => safeFileSize(dbPath)));
442
- return sizes.reduce((sum, value) => sum + value, 0);
443
- }
444
- async sumArtifactBytes() {
445
- const sizes = await Promise.all(this.runRoots.map((runRoot) => directorySize(path.join(runRoot, "threads"))));
446
- return sizes.reduce((sum, value) => sum + value, 0);
447
- }
448
250
  }
@@ -22,8 +22,6 @@ export declare class AgentHarnessRuntime {
22
22
  private readonly threadMemorySync;
23
23
  private readonly unregisterThreadMemorySync;
24
24
  private readonly resolvedRuntimeAdapterOptions;
25
- private readonly checkpointMaintenance;
26
- private readonly runtimeRecordMaintenance;
27
25
  private readonly healthMonitor;
28
26
  private readonly recoveryConfig;
29
27
  private readonly concurrencyConfig;
@@ -9,10 +9,7 @@ import { getConcurrencyConfig, getRecoveryConfig, getRoutingDefaultAgentId, getR
9
9
  import { createHarnessEvent, inferRoutingBindings, renderRuntimeFailure, } from "./support/harness-support.js";
10
10
  import { ThreadMemorySync } from "./harness/system/thread-memory-sync.js";
11
11
  import { FileBackedStore } from "./harness/system/store.js";
12
- import { CheckpointMaintenanceLoop, discoverCheckpointMaintenanceTargets, readCheckpointMaintenanceConfig, } from "./maintenance/checkpoint-maintenance.js";
13
- import { RuntimeRecordMaintenanceLoop, discoverRuntimeRecordMaintenanceTargets, readRuntimeRecordMaintenanceConfig, } from "./maintenance/runtime-record-maintenance.js";
14
- import { HealthMonitor } from "./harness/system/health-monitor.js";
15
- import { readHealthMonitorConfig } from "./harness/system/health-monitor.js";
12
+ import { HealthMonitor, readHealthMonitorConfig, } from "./harness/system/health-monitor.js";
16
13
  import { extractMessageText, normalizeMessageContent } from "../utils/message-content.js";
17
14
  import { buildPersistedRunRequest, normalizeInvocationEnvelope, normalizeRunPriority, resolveRunListeners, } from "./harness/run/helpers.js";
18
15
  import { emitHarnessEvent, emitRunCreatedEvent, emitSyntheticFallbackEvent, requestApprovalAndEmitEvent, setRunStateAndEmitEvent, } from "./harness/events/events.js";
@@ -29,7 +26,7 @@ import { getBindingAdapterKind, getBindingPrimaryTools, getBindingStoreConfig }
29
26
  import { isRuntimeEntryBinding } from "./support/runtime-entry.js";
30
27
  import { describeWorkspaceInventory, listAgentSkills as listWorkspaceAgentSkills, } from "./harness/system/inventory.js";
31
28
  import { createDefaultHealthSnapshot, isInventoryEnabled, isThreadMemorySyncEnabled, } from "./harness/runtime-defaults.js";
32
- import { initializeHarnessRuntime, isStaleRunningRun as isHarnessStaleRunningRun, reclaimExpiredClaimedRuns as reclaimHarnessExpiredClaimedRuns, recoverStartupRuns as recoverHarnessStartupRuns, } from "./harness/run/startup-runtime.js";
29
+ import { initializeHarnessRuntime, isStaleRunningRun as isHarnessStaleRunningRun, } from "./harness/run/startup-runtime.js";
33
30
  import { streamHarnessRun } from "./harness/run/stream-run.js";
34
31
  import { deleteThreadRecord, getPublicApproval, getThreadRecord, listPublicApprovals, } from "./harness/run/thread-records.js";
35
32
  export class AgentHarnessRuntime {
@@ -58,8 +55,6 @@ export class AgentHarnessRuntime {
58
55
  threadMemorySync;
59
56
  unregisterThreadMemorySync;
60
57
  resolvedRuntimeAdapterOptions;
61
- checkpointMaintenance;
62
- runtimeRecordMaintenance;
63
58
  healthMonitor;
64
59
  recoveryConfig;
65
60
  concurrencyConfig;
@@ -127,14 +122,6 @@ export class AgentHarnessRuntime {
127
122
  this.threadMemorySync = null;
128
123
  this.unregisterThreadMemorySync = () => { };
129
124
  }
130
- const checkpointMaintenanceConfig = readCheckpointMaintenanceConfig(workspace);
131
- this.checkpointMaintenance = checkpointMaintenanceConfig
132
- ? new CheckpointMaintenanceLoop(discoverCheckpointMaintenanceTargets(workspace), checkpointMaintenanceConfig)
133
- : null;
134
- const runtimeRecordMaintenanceConfig = readRuntimeRecordMaintenanceConfig(workspace);
135
- this.runtimeRecordMaintenance = runtimeRecordMaintenanceConfig
136
- ? new RuntimeRecordMaintenanceLoop(discoverRuntimeRecordMaintenanceTargets(workspace), runtimeRecordMaintenanceConfig)
137
- : null;
138
125
  this.recoveryConfig = getRecoveryConfig(workspace.refs);
139
126
  this.concurrencyConfig = getConcurrencyConfig(workspace.refs);
140
127
  const healthConfig = readHealthMonitorConfig(workspace);
@@ -146,8 +133,8 @@ export class AgentHarnessRuntime {
146
133
  persistence: this.persistence,
147
134
  getActiveRunSlots: () => this.activeRunSlots,
148
135
  getPendingRunSlots: () => this.pendingRunSlots.length,
149
- getCheckpointMaintenanceStatus: () => this.checkpointMaintenance?.getStatus() ?? null,
150
- getRuntimeRecordMaintenanceStatus: () => this.runtimeRecordMaintenance?.getStatus() ?? null,
136
+ getCheckpointMaintenanceStatus: () => null,
137
+ getRuntimeRecordMaintenanceStatus: () => null,
151
138
  publishEvent: async (payload) => {
152
139
  this.eventBus.publish(createHarnessEvent("__runtime__", "__runtime__", ++this.runtimeEventSequence, "runtime.health.changed", payload));
153
140
  },
@@ -162,10 +149,7 @@ export class AgentHarnessRuntime {
162
149
  async initialize() {
163
150
  await initializeHarnessRuntime({
164
151
  persistence: this.persistence,
165
- checkpointMaintenance: this.checkpointMaintenance,
166
- runtimeRecordMaintenance: this.runtimeRecordMaintenance,
167
152
  healthMonitor: this.healthMonitor,
168
- recoverStartupRuns: () => this.recoverStartupRuns(),
169
153
  });
170
154
  }
171
155
  subscribe(listener) {
@@ -720,8 +704,6 @@ export class AgentHarnessRuntime {
720
704
  }
721
705
  async close() {
722
706
  await this.healthMonitor?.stop();
723
- await this.checkpointMaintenance?.stop();
724
- await this.runtimeRecordMaintenance?.stop();
725
707
  this.unregisterThreadMemorySync();
726
708
  await Promise.allSettled(Array.from(this.backgroundTasks));
727
709
  await this.threadMemorySync?.close();
@@ -739,21 +721,10 @@ export class AgentHarnessRuntime {
739
721
  }, options);
740
722
  }
741
723
  async recoverStartupRuns() {
742
- await recoverHarnessStartupRuns({
743
- recoveryConfig: this.recoveryConfig,
744
- persistence: this.persistence,
745
- createStartupRecoveryContext: () => this.createStartupRecoveryContext(),
746
- reclaimExpiredClaimedRuns: (nowIso) => this.reclaimExpiredClaimedRuns(nowIso),
747
- });
724
+ return;
748
725
  }
749
- async reclaimExpiredClaimedRuns(nowIso = new Date().toISOString()) {
750
- await reclaimHarnessExpiredClaimedRuns({
751
- persistence: this.persistence,
752
- setRunStateAndEmit: (threadId, runId, sequence, state, options) => this.setRunStateAndEmit(threadId, runId, sequence, state, options),
753
- emit: (threadId, runId, sequence, eventType, payload) => this.emit(threadId, runId, sequence, eventType, payload),
754
- concurrencyConfig: this.concurrencyConfig,
755
- getActiveRunSlots: () => this.activeRunSlots,
756
- }, nowIso);
726
+ async reclaimExpiredClaimedRuns(_nowIso = new Date().toISOString()) {
727
+ return;
757
728
  }
758
729
  async isStaleRunningRun(thread, nowMs = Date.now()) {
759
730
  return isHarnessStaleRunningRun({
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@botbotgo/agent-harness",
3
- "version": "0.0.103",
3
+ "version": "0.0.105",
4
4
  "description": "Workspace runtime for multi-agent applications",
5
5
  "type": "module",
6
6
  "packageManager": "npm@10.9.2",