@botbotgo/agent-harness 0.0.75 → 0.0.77

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/dist/api.d.ts +2 -1
  2. package/dist/api.js +3 -0
  3. package/dist/benchmark/checkpoint-resume-cost-benchmark.d.ts +33 -0
  4. package/dist/benchmark/checkpoint-resume-cost-benchmark.js +55 -0
  5. package/dist/benchmark/deepagent-local-model-benchmark.d.ts +27 -0
  6. package/dist/benchmark/deepagent-local-model-benchmark.js +35 -0
  7. package/dist/config/agents/direct.yaml +1 -1
  8. package/dist/config/agents/orchestra.yaml +1 -2
  9. package/dist/config/workspace.yaml +31 -0
  10. package/dist/contracts/types.d.ts +38 -1
  11. package/dist/index.d.ts +1 -1
  12. package/dist/index.js +1 -1
  13. package/dist/package-version.d.ts +1 -1
  14. package/dist/package-version.js +1 -1
  15. package/dist/persistence/file-store.d.ts +3 -40
  16. package/dist/persistence/file-store.js +5 -2
  17. package/dist/persistence/sqlite-store.d.ts +68 -0
  18. package/dist/persistence/sqlite-store.js +569 -0
  19. package/dist/persistence/types.d.ts +83 -0
  20. package/dist/persistence/types.js +1 -0
  21. package/dist/runtime/agent-runtime-adapter.d.ts +3 -0
  22. package/dist/runtime/agent-runtime-adapter.js +58 -2
  23. package/dist/runtime/checkpoint-maintenance.d.ts +11 -2
  24. package/dist/runtime/checkpoint-maintenance.js +41 -5
  25. package/dist/runtime/harness.d.ts +5 -1
  26. package/dist/runtime/harness.js +67 -4
  27. package/dist/runtime/health-monitor.d.ts +81 -0
  28. package/dist/runtime/health-monitor.js +448 -0
  29. package/dist/runtime/runtime-record-maintenance.d.ts +43 -0
  30. package/dist/runtime/runtime-record-maintenance.js +169 -0
  31. package/dist/runtime/store.d.ts +2 -0
  32. package/dist/runtime/store.js +38 -20
  33. package/dist/runtime/support/embedding-models.js +57 -1
  34. package/dist/runtime/thread-memory-sync.d.ts +3 -2
  35. package/dist/runtime/thread-memory-sync.js +7 -1
  36. package/dist/workspace/agent-binding-compiler.js +3 -1
  37. package/dist/workspace/support/workspace-ref-utils.d.ts +9 -0
  38. package/dist/workspace/support/workspace-ref-utils.js +38 -0
  39. package/package.json +2 -2
@@ -64,6 +64,9 @@ function computeRemainingTimeoutMs(deadlineAt, fallbackTimeoutMs) {
64
64
  }
65
65
  return fallbackTimeoutMs ? Math.min(fallbackTimeoutMs, remaining) : remaining;
66
66
  }
67
+ function sleep(ms) {
68
+ return new Promise((resolve) => setTimeout(resolve, ms));
69
+ }
67
70
  function isPlaceholderApiKey(value) {
68
71
  return typeof value === "string" && value.trim().toLowerCase() === "dummy";
69
72
  }
@@ -485,6 +488,57 @@ export class AgentRuntimeAdapter {
485
488
  }
486
489
  return 15_000;
487
490
  }
491
+ resolveProviderRetryPolicy(binding) {
492
+ const resilience = typeof binding.harnessRuntime.resilience === "object" && binding.harnessRuntime.resilience
493
+ ? binding.harnessRuntime.resilience
494
+ : {};
495
+ const providerRetries = typeof resilience.providerRetries === "object" && resilience.providerRetries
496
+ ? resilience.providerRetries
497
+ : {};
498
+ const maxAttempts = typeof providerRetries.maxAttempts === "number" &&
499
+ Number.isFinite(providerRetries.maxAttempts) &&
500
+ providerRetries.maxAttempts > 0
501
+ ? Math.floor(providerRetries.maxAttempts)
502
+ : 2;
503
+ const backoffMs = typeof providerRetries.backoffMs === "number" &&
504
+ Number.isFinite(providerRetries.backoffMs) &&
505
+ providerRetries.backoffMs >= 0
506
+ ? Math.floor(providerRetries.backoffMs)
507
+ : 1_000;
508
+ const retryableMessages = Array.isArray(providerRetries.retryableMessages)
509
+ ? providerRetries.retryableMessages.filter((value) => typeof value === "string" && value.trim().length > 0)
510
+ : [];
511
+ return {
512
+ maxAttempts,
513
+ backoffMs,
514
+ retryableMessages,
515
+ };
516
+ }
517
+ isRetryableProviderError(binding, error) {
518
+ const message = error instanceof Error ? error.message : String(error);
519
+ const normalized = message.toLowerCase();
520
+ const { retryableMessages } = this.resolveProviderRetryPolicy(binding);
521
+ return retryableMessages.some((candidate) => normalized.includes(candidate.toLowerCase()));
522
+ }
523
+ async invokeWithProviderRetry(binding, operation) {
524
+ const retryPolicy = this.resolveProviderRetryPolicy(binding);
525
+ let lastError;
526
+ for (let attempt = 1; attempt <= retryPolicy.maxAttempts; attempt += 1) {
527
+ try {
528
+ return await operation();
529
+ }
530
+ catch (error) {
531
+ lastError = error;
532
+ if (attempt >= retryPolicy.maxAttempts || !this.isRetryableProviderError(binding, error)) {
533
+ throw error;
534
+ }
535
+ if (retryPolicy.backoffMs > 0) {
536
+ await sleep(retryPolicy.backoffMs);
537
+ }
538
+ }
539
+ }
540
+ throw lastError instanceof Error ? lastError : new Error(String(lastError));
541
+ }
488
542
  async withTimeout(producer, timeoutMs, operation, stage = operation.includes("stream") ? "stream" : "invoke") {
489
543
  if (!timeoutMs) {
490
544
  return Promise.resolve(producer());
@@ -1204,8 +1258,10 @@ export class AgentRuntimeAdapter {
1204
1258
  : new Command({ resume: resumePayload });
1205
1259
  let result;
1206
1260
  const callRuntime = async (activeBinding, activeRequest) => {
1207
- const runnable = await this.create(activeBinding);
1208
- return (await this.withTimeout(() => runnable.invoke(activeRequest, { configurable: { thread_id: threadId }, ...(options.context ? { context: options.context } : {}) }), this.resolveBindingTimeout(activeBinding), "agent invoke", "invoke"));
1261
+ return this.invokeWithProviderRetry(activeBinding, async () => {
1262
+ const runnable = await this.create(activeBinding);
1263
+ return (await this.withTimeout(() => runnable.invoke(activeRequest, { configurable: { thread_id: threadId }, ...(options.context ? { context: options.context } : {}) }), this.resolveBindingTimeout(activeBinding), "agent invoke", "invoke"));
1264
+ });
1209
1265
  };
1210
1266
  const callRuntimeWithToolParseRecovery = async (activeRequest) => {
1211
1267
  try {
@@ -18,18 +18,27 @@ type CheckpointMaintenanceTarget = {
18
18
  agentId: string;
19
19
  dbPath: string;
20
20
  };
21
+ export type MaintenanceLoopStatus = {
22
+ lastStartedAt?: string;
23
+ lastCompletedAt?: string;
24
+ lastFailedAt?: string;
25
+ consecutiveFailures: number;
26
+ lastError?: string;
27
+ };
21
28
  export declare function readCheckpointMaintenanceConfig(workspace: WorkspaceBundle): CheckpointMaintenanceConfig | null;
22
29
  export declare function discoverCheckpointMaintenanceTargets(workspace: WorkspaceBundle): CheckpointMaintenanceTarget[];
23
- export declare function maintainSqliteCheckpoints(dbPath: string, config: CheckpointMaintenanceConfig, nowMs?: number): {
30
+ export declare function maintainSqliteCheckpoints(dbPath: string, config: CheckpointMaintenanceConfig, nowMs?: number): Promise<{
24
31
  deletedCount: number;
25
- };
32
+ }>;
26
33
  export declare class CheckpointMaintenanceLoop {
27
34
  private readonly targets;
28
35
  private readonly config;
29
36
  private timer;
30
37
  private running;
38
+ private status;
31
39
  constructor(targets: CheckpointMaintenanceTarget[], config: CheckpointMaintenanceConfig);
32
40
  runOnce(nowMs?: number): Promise<void>;
41
+ getStatus(): MaintenanceLoopStatus;
33
42
  start(): Promise<void>;
34
43
  stop(): Promise<void>;
35
44
  }
@@ -1,5 +1,7 @@
1
1
  import path from "node:path";
2
2
  import { SqliteSaver } from "@langchain/langgraph-checkpoint-sqlite";
3
+ import { listProtectedCheckpointThreadIds } from "../persistence/sqlite-store.js";
4
+ import { fileExists } from "../utils/fs.js";
3
5
  import { getRuntimeDefaults } from "../workspace/support/workspace-ref-utils.js";
4
6
  import { ManagedSqliteSaver } from "./sqlite-maintained-checkpoint-saver.js";
5
7
  function asObject(value) {
@@ -135,11 +137,18 @@ function totalCheckpointBytes(db) {
135
137
  return Number(checkpointsBytes.total ?? 0) + Number(writesBytes.total ?? 0);
136
138
  }
137
139
  export function maintainSqliteCheckpoints(dbPath, config, nowMs = Date.now()) {
140
+ return maintainSqliteCheckpointsInternal(dbPath, config, nowMs);
141
+ }
142
+ async function maintainSqliteCheckpointsInternal(dbPath, config, nowMs) {
143
+ if (!(await fileExists(dbPath))) {
144
+ return { deletedCount: 0 };
145
+ }
138
146
  const saver = new ManagedSqliteSaver(SqliteSaver.fromConnString(dbPath).db);
139
147
  const db = saver.db;
140
148
  try {
141
149
  saver.prepareMaintenance();
142
150
  backfillCheckpointMetadata(db, nowMs);
151
+ const protectedThreadIds = await listProtectedCheckpointThreadIds(path.join(path.dirname(dbPath), "runtime.sqlite"));
143
152
  let deletedCount = 0;
144
153
  if (config.policies.maxAgeSeconds !== undefined) {
145
154
  const cutoffMs = nowMs - config.policies.maxAgeSeconds * 1000;
@@ -154,13 +163,13 @@ export function maintainSqliteCheckpoints(dbPath, config, nowMs = Date.now()) {
154
163
  WHERE meta.created_at_ms <= ?
155
164
  ORDER BY meta.created_at_ms ASC, meta.checkpoint_id ASC
156
165
  LIMIT ?`)
157
- .all(cutoffMs, config.sqlite.sweepBatchSize);
158
- deletedCount += deleteCheckpointRows(db, expired);
166
+ .all(cutoffMs, config.sqlite.sweepBatchSize * 4);
167
+ deletedCount += deleteCheckpointRows(db, expired.filter((row) => !protectedThreadIds.has(row.thread_id)).slice(0, config.sqlite.sweepBatchSize));
159
168
  }
160
169
  if (config.policies.maxBytes !== undefined) {
161
170
  let currentBytes = totalCheckpointBytes(db);
162
171
  while (currentBytes > config.policies.maxBytes) {
163
- const oldest = selectOldestRows(db, config.sqlite.sweepBatchSize);
172
+ const oldest = selectOldestRows(db, config.sqlite.sweepBatchSize * 4).filter((row) => !protectedThreadIds.has(row.thread_id));
164
173
  if (oldest.length === 0) {
165
174
  break;
166
175
  }
@@ -191,15 +200,42 @@ export class CheckpointMaintenanceLoop {
191
200
  config;
192
201
  timer = null;
193
202
  running = false;
203
+ status = {
204
+ consecutiveFailures: 0,
205
+ };
194
206
  constructor(targets, config) {
195
207
  this.targets = targets;
196
208
  this.config = config;
197
209
  }
198
210
  async runOnce(nowMs = Date.now()) {
199
- for (const target of this.targets) {
200
- maintainSqliteCheckpoints(target.dbPath, this.config, nowMs);
211
+ this.status = {
212
+ ...this.status,
213
+ lastStartedAt: new Date(nowMs).toISOString(),
214
+ };
215
+ try {
216
+ for (const target of this.targets) {
217
+ await maintainSqliteCheckpoints(target.dbPath, this.config, nowMs);
218
+ }
219
+ this.status = {
220
+ ...this.status,
221
+ lastCompletedAt: new Date(nowMs).toISOString(),
222
+ consecutiveFailures: 0,
223
+ lastError: undefined,
224
+ };
225
+ }
226
+ catch (error) {
227
+ this.status = {
228
+ ...this.status,
229
+ lastFailedAt: new Date(nowMs).toISOString(),
230
+ consecutiveFailures: this.status.consecutiveFailures + 1,
231
+ lastError: error instanceof Error ? error.message : String(error),
232
+ };
233
+ throw error;
201
234
  }
202
235
  }
236
+ getStatus() {
237
+ return { ...this.status };
238
+ }
203
239
  async start() {
204
240
  if (this.running) {
205
241
  return;
@@ -1,4 +1,4 @@
1
- import type { ApprovalRecord, HarnessEvent, HarnessStreamItem, MessageContent, RunRecord, RunStartOptions, RestartConversationOptions, RuntimeAdapterOptions, ResumeOptions, RunOptions, RunResult, RunSummary, ThreadSummary, ThreadRecord, WorkspaceBundle } from "../contracts/types.js";
1
+ import type { ApprovalRecord, HarnessEvent, HarnessStreamItem, RuntimeHealthSnapshot, MessageContent, RunRecord, RunStartOptions, RestartConversationOptions, RuntimeAdapterOptions, ResumeOptions, RunOptions, RunResult, RunSummary, ThreadSummary, ThreadRecord, WorkspaceBundle } from "../contracts/types.js";
2
2
  import { type ToolMcpServerOptions } from "../mcp.js";
3
3
  import { type InventoryAgentRecord, type InventorySkillRecord } from "./inventory.js";
4
4
  import type { RequirementAssessmentOptions } from "./skill-requirements.js";
@@ -23,10 +23,13 @@ export declare class AgentHarnessRuntime {
23
23
  private readonly unregisterThreadMemorySync;
24
24
  private readonly resolvedRuntimeAdapterOptions;
25
25
  private readonly checkpointMaintenance;
26
+ private readonly runtimeRecordMaintenance;
27
+ private readonly healthMonitor;
26
28
  private readonly recoveryConfig;
27
29
  private readonly concurrencyConfig;
28
30
  private activeRunSlots;
29
31
  private readonly pendingRunSlots;
32
+ private runtimeEventSequence;
30
33
  private toPublicApprovalRecord;
31
34
  private normalizeInvocationEnvelope;
32
35
  private isTerminalRunState;
@@ -42,6 +45,7 @@ export declare class AgentHarnessRuntime {
42
45
  constructor(workspace: WorkspaceBundle, runtimeAdapterOptions?: RuntimeAdapterOptions);
43
46
  initialize(): Promise<void>;
44
47
  subscribe(listener: (event: HarnessEvent) => void): () => void;
48
+ getHealth(): Promise<RuntimeHealthSnapshot>;
45
49
  private getBinding;
46
50
  private listAgentTools;
47
51
  private resolveAgentTools;
@@ -1,5 +1,5 @@
1
1
  import { AUTO_AGENT_ID } from "../contracts/types.js";
2
- import { FilePersistence } from "../persistence/file-store.js";
2
+ import { SqlitePersistence } from "../persistence/sqlite-store.js";
3
3
  import { createPersistentId } from "../utils/id.js";
4
4
  import { AGENT_INTERRUPT_SENTINEL_PREFIX, AgentRuntimeAdapter, RuntimeOperationTimeoutError } from "./agent-runtime-adapter.js";
5
5
  import { createResourceBackendResolver, createResourceToolResolver } from "../resource/resource.js";
@@ -13,6 +13,8 @@ import { resolveCompiledVectorStore, resolveCompiledVectorStoreRef } from "./sup
13
13
  import { ThreadMemorySync } from "./thread-memory-sync.js";
14
14
  import { FileBackedStore } from "./store.js";
15
15
  import { CheckpointMaintenanceLoop, discoverCheckpointMaintenanceTargets, readCheckpointMaintenanceConfig, } from "./checkpoint-maintenance.js";
16
+ import { RuntimeRecordMaintenanceLoop, discoverRuntimeRecordMaintenanceTargets, readRuntimeRecordMaintenanceConfig, } from "./runtime-record-maintenance.js";
17
+ import { HealthMonitor } from "./health-monitor.js";
16
18
  import { extractMessageText, normalizeMessageContent } from "../utils/message-content.js";
17
19
  import { createToolMcpServerFromTools, serveToolsOverStdioFromHarness } from "../mcp.js";
18
20
  import { getBindingAdapterKind, getBindingPrimaryTools, getBindingStoreConfig, isDeepAgentBinding } from "./support/compiled-binding.js";
@@ -38,10 +40,13 @@ export class AgentHarnessRuntime {
38
40
  unregisterThreadMemorySync;
39
41
  resolvedRuntimeAdapterOptions;
40
42
  checkpointMaintenance;
43
+ runtimeRecordMaintenance;
44
+ healthMonitor;
41
45
  recoveryConfig;
42
46
  concurrencyConfig;
43
47
  activeRunSlots = 0;
44
48
  pendingRunSlots = [];
49
+ runtimeEventSequence = 0;
45
50
  toPublicApprovalRecord(approval) {
46
51
  const { toolCallId: _toolCallId, checkpointRef: _checkpointRef, eventRefs: _eventRefs, ...publicApproval } = approval;
47
52
  return publicApproval;
@@ -141,7 +146,7 @@ export class AgentHarnessRuntime {
141
146
  this.workspace = workspace;
142
147
  this.runtimeAdapterOptions = runtimeAdapterOptions;
143
148
  const runRoot = this.defaultRunRoot();
144
- this.persistence = new FilePersistence(runRoot);
149
+ this.persistence = new SqlitePersistence(runRoot);
145
150
  const defaultStoreConfig = this.listHostBindings()[0]?.harnessRuntime.store;
146
151
  this.defaultStore = this.resolveStoreFromConfig(defaultStoreConfig, runRoot) ?? new FileBackedStore(`${runRoot}/store.json`);
147
152
  const runtimeMemoryStoreConfig = typeof this.listHostBindings()[0]?.harnessRuntime.runtimeMemory?.store === "object" &&
@@ -189,17 +194,37 @@ export class AgentHarnessRuntime {
189
194
  this.checkpointMaintenance = checkpointMaintenanceConfig
190
195
  ? new CheckpointMaintenanceLoop(discoverCheckpointMaintenanceTargets(workspace), checkpointMaintenanceConfig)
191
196
  : null;
197
+ const runtimeRecordMaintenanceConfig = readRuntimeRecordMaintenanceConfig(workspace);
198
+ this.runtimeRecordMaintenance = runtimeRecordMaintenanceConfig
199
+ ? new RuntimeRecordMaintenanceLoop(discoverRuntimeRecordMaintenanceTargets(workspace), runtimeRecordMaintenanceConfig)
200
+ : null;
192
201
  this.recoveryConfig = getRecoveryConfig(workspace.refs);
193
202
  this.concurrencyConfig = getConcurrencyConfig(workspace.refs);
203
+ this.healthMonitor = new HealthMonitor({
204
+ workspace,
205
+ persistence: this.persistence,
206
+ getActiveRunSlots: () => this.activeRunSlots,
207
+ getPendingRunSlots: () => this.pendingRunSlots.length,
208
+ getCheckpointMaintenanceStatus: () => this.checkpointMaintenance?.getStatus() ?? null,
209
+ getRuntimeRecordMaintenanceStatus: () => this.runtimeRecordMaintenance?.getStatus() ?? null,
210
+ publishEvent: async (payload) => {
211
+ this.eventBus.publish(createHarnessEvent("__runtime__", "__runtime__", ++this.runtimeEventSequence, "runtime.health.changed", payload));
212
+ },
213
+ });
194
214
  }
195
215
  async initialize() {
196
216
  await this.persistence.initialize();
197
217
  await this.checkpointMaintenance?.start();
218
+ await this.runtimeRecordMaintenance?.start();
219
+ await this.healthMonitor.start();
198
220
  await this.recoverStartupRuns();
199
221
  }
200
222
  subscribe(listener) {
201
223
  return this.eventBus.subscribe(listener);
202
224
  }
225
+ async getHealth() {
226
+ return this.healthMonitor.getSnapshot();
227
+ }
203
228
  getBinding(agentId) {
204
229
  return this.workspace.bindings.get(agentId);
205
230
  }
@@ -457,7 +482,16 @@ export class AgentHarnessRuntime {
457
482
  }
458
483
  async invokeWithHistory(binding, input, threadId, runId, resumePayload, options = {}) {
459
484
  const priorHistory = await this.loadPriorHistory(threadId, runId);
460
- return this.runtimeAdapter.invoke(binding, input, threadId, runId, resumePayload, priorHistory, options);
485
+ const startedAt = Date.now();
486
+ try {
487
+ const result = await this.runtimeAdapter.invoke(binding, input, threadId, runId, resumePayload, priorHistory, options);
488
+ this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
489
+ return result;
490
+ }
491
+ catch (error) {
492
+ this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
493
+ throw error;
494
+ }
461
495
  }
462
496
  buildPersistedRunRequest(input, invocation) {
463
497
  const envelope = invocation.invocation ?? {
@@ -830,6 +864,7 @@ export class AgentHarnessRuntime {
830
864
  return;
831
865
  }
832
866
  let emitted = false;
867
+ let streamActivityObserved = false;
833
868
  const { threadId, runId } = await this.ensureThreadStarted(selectedAgentId, binding, options.input, options.threadId);
834
869
  await this.persistence.saveRunRequest(threadId, runId, this.buildPersistedRunRequest(options.input, invocation));
835
870
  yield { type: "event", event: await this.emitRunCreated(threadId, runId, {
@@ -851,6 +886,7 @@ export class AgentHarnessRuntime {
851
886
  files: invocation.files,
852
887
  })) {
853
888
  if (chunk) {
889
+ streamActivityObserved = true;
854
890
  const normalizedChunk = typeof chunk === "string"
855
891
  ? chunk.startsWith(AGENT_INTERRUPT_SENTINEL_PREFIX)
856
892
  ? { kind: "interrupt", content: chunk.slice(AGENT_INTERRUPT_SENTINEL_PREFIX.length) }
@@ -964,11 +1000,30 @@ export class AgentHarnessRuntime {
964
1000
  return;
965
1001
  }
966
1002
  catch (error) {
967
- if (emitted) {
1003
+ if (emitted || streamActivityObserved) {
1004
+ const runtimeFailure = renderRuntimeFailure(error);
968
1005
  yield { type: "event", event: await this.setRunStateAndEmit(threadId, runId, 6, "failed", {
969
1006
  previousState: "running",
970
1007
  error: error instanceof Error ? error.message : String(error),
971
1008
  }) };
1009
+ yield {
1010
+ type: "content",
1011
+ threadId,
1012
+ runId,
1013
+ agentId: selectedAgentId,
1014
+ content: runtimeFailure,
1015
+ };
1016
+ yield {
1017
+ type: "result",
1018
+ result: {
1019
+ threadId,
1020
+ runId,
1021
+ agentId: selectedAgentId,
1022
+ state: "failed",
1023
+ output: runtimeFailure,
1024
+ finalMessageText: runtimeFailure,
1025
+ },
1026
+ };
972
1027
  return;
973
1028
  }
974
1029
  if (error instanceof RuntimeOperationTimeoutError && error.stage === "invoke") {
@@ -1097,8 +1152,10 @@ export class AgentHarnessRuntime {
1097
1152
  const history = await this.persistence.listThreadMessages(threadId);
1098
1153
  const priorHistory = history.filter((message) => message.runId !== runId);
1099
1154
  const runInput = await this.loadRunInput(threadId, runId);
1155
+ const startedAt = Date.now();
1100
1156
  try {
1101
1157
  const actual = await this.runtimeAdapter.invoke(binding, "", threadId, runId, resumePayload, priorHistory);
1158
+ this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
1102
1159
  await this.persistence.clearRecoveryIntent(threadId, runId);
1103
1160
  const finalized = await this.finalizeContinuedRun(threadId, runId, runInput, actual, {
1104
1161
  previousState: "resuming",
@@ -1112,6 +1169,7 @@ export class AgentHarnessRuntime {
1112
1169
  };
1113
1170
  }
1114
1171
  catch (error) {
1172
+ this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
1115
1173
  throw error;
1116
1174
  }
1117
1175
  }
@@ -1182,7 +1240,9 @@ export class AgentHarnessRuntime {
1182
1240
  };
1183
1241
  }
1184
1242
  async close() {
1243
+ await this.healthMonitor.stop();
1185
1244
  await this.checkpointMaintenance?.stop();
1245
+ await this.runtimeRecordMaintenance?.stop();
1186
1246
  this.unregisterThreadMemorySync();
1187
1247
  await this.threadMemorySync.close();
1188
1248
  }
@@ -1283,8 +1343,10 @@ export class AgentHarnessRuntime {
1283
1343
  const history = await this.persistence.listThreadMessages(thread.threadId);
1284
1344
  const priorHistory = history.filter((message) => message.runId !== thread.latestRunId);
1285
1345
  const runInput = await this.loadRunInput(thread.threadId, thread.latestRunId);
1346
+ const startedAt = Date.now();
1286
1347
  try {
1287
1348
  const actual = await this.runtimeAdapter.invoke(binding, "", thread.threadId, thread.latestRunId, recoveryIntent.resumePayload, priorHistory);
1349
+ this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
1288
1350
  await this.persistence.clearRecoveryIntent(thread.threadId, thread.latestRunId);
1289
1351
  await this.finalizeContinuedRun(thread.threadId, thread.latestRunId, runInput, actual, {
1290
1352
  previousState: "resuming",
@@ -1293,6 +1355,7 @@ export class AgentHarnessRuntime {
1293
1355
  });
1294
1356
  }
1295
1357
  catch (error) {
1358
+ this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
1296
1359
  if (recoveryIntent.attempts + 1 >= this.recoveryConfig.maxRecoveryAttempts) {
1297
1360
  await this.persistence.setRunState(thread.threadId, thread.latestRunId, "failed", recoveryIntent.checkpointRef);
1298
1361
  await this.persistence.clearRecoveryIntent(thread.threadId, thread.latestRunId);
@@ -0,0 +1,81 @@
1
+ import type { RuntimeHealthSnapshot, WorkspaceBundle } from "../contracts/types.js";
2
+ import type { RuntimePersistence } from "../persistence/types.js";
3
+ import type { MaintenanceLoopStatus } from "./checkpoint-maintenance.js";
4
+ type HealthMonitorConfig = {
5
+ enabled: boolean;
6
+ evaluateIntervalSeconds: number;
7
+ emitEvents: boolean;
8
+ thresholds: {
9
+ llmErrorRate: {
10
+ windowSeconds: number;
11
+ degradedAbove: number;
12
+ unhealthyAbove: number;
13
+ };
14
+ llmP95LatencyMs: {
15
+ windowSeconds: number;
16
+ degradedAbove: number;
17
+ unhealthyAbove: number;
18
+ };
19
+ pendingApprovals: {
20
+ degradedAbove: number;
21
+ unhealthyAbove: number;
22
+ };
23
+ stuckRunSeconds: {
24
+ degradedAbove: number;
25
+ unhealthyAbove: number;
26
+ };
27
+ checkpointBytes: {
28
+ degradedAbove: number;
29
+ unhealthyAbove: number;
30
+ };
31
+ runtimeDbBytes: {
32
+ degradedAbove: number;
33
+ unhealthyAbove: number;
34
+ };
35
+ artifactBytes: {
36
+ degradedAbove: number;
37
+ unhealthyAbove: number;
38
+ };
39
+ };
40
+ };
41
+ type HealthMonitorOptions = {
42
+ workspace: WorkspaceBundle;
43
+ persistence: RuntimePersistence;
44
+ getActiveRunSlots: () => number;
45
+ getPendingRunSlots: () => number;
46
+ getCheckpointMaintenanceStatus: () => MaintenanceLoopStatus | null;
47
+ getRuntimeRecordMaintenanceStatus: () => MaintenanceLoopStatus | null;
48
+ publishEvent?: (payload: Record<string, unknown>) => void | Promise<void>;
49
+ };
50
+ export declare function readHealthMonitorConfig(workspace: WorkspaceBundle): HealthMonitorConfig;
51
+ export declare class HealthMonitor {
52
+ private readonly options;
53
+ private readonly config;
54
+ private readonly runRoots;
55
+ private readonly checkpointDbPaths;
56
+ private readonly llmSamples;
57
+ private timer;
58
+ private latestSnapshot;
59
+ private runtimeEventSequence;
60
+ constructor(options: HealthMonitorOptions);
61
+ recordLlmSuccess(latencyMs: number, nowMs?: number): void;
62
+ recordLlmFailure(latencyMs: number, nowMs?: number): void;
63
+ private recordLlmSample;
64
+ start(): Promise<void>;
65
+ stop(): Promise<void>;
66
+ getSnapshot(): Promise<RuntimeHealthSnapshot>;
67
+ evaluate(nowMs?: number): Promise<RuntimeHealthSnapshot>;
68
+ private llmStats;
69
+ private evaluateLlmCheck;
70
+ private llmSymptoms;
71
+ private evaluatePersistenceCheck;
72
+ private evaluateCapacityCheck;
73
+ private capacitySymptoms;
74
+ private evaluateWorkloadCheck;
75
+ private workloadSymptoms;
76
+ private countStuckRuns;
77
+ private sumRuntimeDbBytes;
78
+ private sumCheckpointDbBytes;
79
+ private sumArtifactBytes;
80
+ }
81
+ export {};