@botbotgo/agent-harness 0.0.75 → 0.0.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/dist/api.d.ts +2 -1
  2. package/dist/api.js +3 -0
  3. package/dist/benchmark/checkpoint-resume-cost-benchmark.d.ts +33 -0
  4. package/dist/benchmark/checkpoint-resume-cost-benchmark.js +55 -0
  5. package/dist/benchmark/deepagent-local-model-benchmark.d.ts +27 -0
  6. package/dist/benchmark/deepagent-local-model-benchmark.js +35 -0
  7. package/dist/config/agents/direct.yaml +1 -1
  8. package/dist/config/agents/orchestra.yaml +1 -2
  9. package/dist/config/workspace.yaml +31 -0
  10. package/dist/contracts/types.d.ts +38 -1
  11. package/dist/index.d.ts +1 -1
  12. package/dist/index.js +1 -1
  13. package/dist/package-version.d.ts +1 -1
  14. package/dist/package-version.js +1 -1
  15. package/dist/persistence/file-store.d.ts +3 -40
  16. package/dist/persistence/file-store.js +5 -2
  17. package/dist/persistence/sqlite-store.d.ts +68 -0
  18. package/dist/persistence/sqlite-store.js +569 -0
  19. package/dist/persistence/types.d.ts +83 -0
  20. package/dist/persistence/types.js +1 -0
  21. package/dist/runtime/agent-runtime-adapter.d.ts +3 -0
  22. package/dist/runtime/agent-runtime-adapter.js +58 -2
  23. package/dist/runtime/checkpoint-maintenance.d.ts +11 -2
  24. package/dist/runtime/checkpoint-maintenance.js +41 -5
  25. package/dist/runtime/harness.d.ts +5 -1
  26. package/dist/runtime/harness.js +45 -3
  27. package/dist/runtime/health-monitor.d.ts +81 -0
  28. package/dist/runtime/health-monitor.js +448 -0
  29. package/dist/runtime/runtime-record-maintenance.d.ts +43 -0
  30. package/dist/runtime/runtime-record-maintenance.js +169 -0
  31. package/dist/runtime/store.d.ts +2 -0
  32. package/dist/runtime/store.js +38 -20
  33. package/dist/runtime/support/embedding-models.js +57 -1
  34. package/dist/runtime/thread-memory-sync.d.ts +3 -2
  35. package/dist/runtime/thread-memory-sync.js +7 -1
  36. package/dist/workspace/agent-binding-compiler.js +3 -1
  37. package/dist/workspace/support/workspace-ref-utils.d.ts +9 -0
  38. package/dist/workspace/support/workspace-ref-utils.js +38 -0
  39. package/package.json +2 -2
@@ -64,6 +64,9 @@ function computeRemainingTimeoutMs(deadlineAt, fallbackTimeoutMs) {
64
64
  }
65
65
  return fallbackTimeoutMs ? Math.min(fallbackTimeoutMs, remaining) : remaining;
66
66
  }
67
+ function sleep(ms) {
68
+ return new Promise((resolve) => setTimeout(resolve, ms));
69
+ }
67
70
  function isPlaceholderApiKey(value) {
68
71
  return typeof value === "string" && value.trim().toLowerCase() === "dummy";
69
72
  }
@@ -485,6 +488,57 @@ export class AgentRuntimeAdapter {
485
488
  }
486
489
  return 15_000;
487
490
  }
491
+ resolveProviderRetryPolicy(binding) {
492
+ const resilience = typeof binding.harnessRuntime.resilience === "object" && binding.harnessRuntime.resilience
493
+ ? binding.harnessRuntime.resilience
494
+ : {};
495
+ const providerRetries = typeof resilience.providerRetries === "object" && resilience.providerRetries
496
+ ? resilience.providerRetries
497
+ : {};
498
+ const maxAttempts = typeof providerRetries.maxAttempts === "number" &&
499
+ Number.isFinite(providerRetries.maxAttempts) &&
500
+ providerRetries.maxAttempts > 0
501
+ ? Math.floor(providerRetries.maxAttempts)
502
+ : 2;
503
+ const backoffMs = typeof providerRetries.backoffMs === "number" &&
504
+ Number.isFinite(providerRetries.backoffMs) &&
505
+ providerRetries.backoffMs >= 0
506
+ ? Math.floor(providerRetries.backoffMs)
507
+ : 1_000;
508
+ const retryableMessages = Array.isArray(providerRetries.retryableMessages)
509
+ ? providerRetries.retryableMessages.filter((value) => typeof value === "string" && value.trim().length > 0)
510
+ : [];
511
+ return {
512
+ maxAttempts,
513
+ backoffMs,
514
+ retryableMessages,
515
+ };
516
+ }
517
+ isRetryableProviderError(binding, error) {
518
+ const message = error instanceof Error ? error.message : String(error);
519
+ const normalized = message.toLowerCase();
520
+ const { retryableMessages } = this.resolveProviderRetryPolicy(binding);
521
+ return retryableMessages.some((candidate) => normalized.includes(candidate.toLowerCase()));
522
+ }
523
+ async invokeWithProviderRetry(binding, operation) {
524
+ const retryPolicy = this.resolveProviderRetryPolicy(binding);
525
+ let lastError;
526
+ for (let attempt = 1; attempt <= retryPolicy.maxAttempts; attempt += 1) {
527
+ try {
528
+ return await operation();
529
+ }
530
+ catch (error) {
531
+ lastError = error;
532
+ if (attempt >= retryPolicy.maxAttempts || !this.isRetryableProviderError(binding, error)) {
533
+ throw error;
534
+ }
535
+ if (retryPolicy.backoffMs > 0) {
536
+ await sleep(retryPolicy.backoffMs);
537
+ }
538
+ }
539
+ }
540
+ throw lastError instanceof Error ? lastError : new Error(String(lastError));
541
+ }
488
542
  async withTimeout(producer, timeoutMs, operation, stage = operation.includes("stream") ? "stream" : "invoke") {
489
543
  if (!timeoutMs) {
490
544
  return Promise.resolve(producer());
@@ -1204,8 +1258,10 @@ export class AgentRuntimeAdapter {
1204
1258
  : new Command({ resume: resumePayload });
1205
1259
  let result;
1206
1260
  const callRuntime = async (activeBinding, activeRequest) => {
1207
- const runnable = await this.create(activeBinding);
1208
- return (await this.withTimeout(() => runnable.invoke(activeRequest, { configurable: { thread_id: threadId }, ...(options.context ? { context: options.context } : {}) }), this.resolveBindingTimeout(activeBinding), "agent invoke", "invoke"));
1261
+ return this.invokeWithProviderRetry(activeBinding, async () => {
1262
+ const runnable = await this.create(activeBinding);
1263
+ return (await this.withTimeout(() => runnable.invoke(activeRequest, { configurable: { thread_id: threadId }, ...(options.context ? { context: options.context } : {}) }), this.resolveBindingTimeout(activeBinding), "agent invoke", "invoke"));
1264
+ });
1209
1265
  };
1210
1266
  const callRuntimeWithToolParseRecovery = async (activeRequest) => {
1211
1267
  try {
@@ -18,18 +18,27 @@ type CheckpointMaintenanceTarget = {
18
18
  agentId: string;
19
19
  dbPath: string;
20
20
  };
21
+ export type MaintenanceLoopStatus = {
22
+ lastStartedAt?: string;
23
+ lastCompletedAt?: string;
24
+ lastFailedAt?: string;
25
+ consecutiveFailures: number;
26
+ lastError?: string;
27
+ };
21
28
  export declare function readCheckpointMaintenanceConfig(workspace: WorkspaceBundle): CheckpointMaintenanceConfig | null;
22
29
  export declare function discoverCheckpointMaintenanceTargets(workspace: WorkspaceBundle): CheckpointMaintenanceTarget[];
23
- export declare function maintainSqliteCheckpoints(dbPath: string, config: CheckpointMaintenanceConfig, nowMs?: number): {
30
+ export declare function maintainSqliteCheckpoints(dbPath: string, config: CheckpointMaintenanceConfig, nowMs?: number): Promise<{
24
31
  deletedCount: number;
25
- };
32
+ }>;
26
33
  export declare class CheckpointMaintenanceLoop {
27
34
  private readonly targets;
28
35
  private readonly config;
29
36
  private timer;
30
37
  private running;
38
+ private status;
31
39
  constructor(targets: CheckpointMaintenanceTarget[], config: CheckpointMaintenanceConfig);
32
40
  runOnce(nowMs?: number): Promise<void>;
41
+ getStatus(): MaintenanceLoopStatus;
33
42
  start(): Promise<void>;
34
43
  stop(): Promise<void>;
35
44
  }
@@ -1,5 +1,7 @@
1
1
  import path from "node:path";
2
2
  import { SqliteSaver } from "@langchain/langgraph-checkpoint-sqlite";
3
+ import { listProtectedCheckpointThreadIds } from "../persistence/sqlite-store.js";
4
+ import { fileExists } from "../utils/fs.js";
3
5
  import { getRuntimeDefaults } from "../workspace/support/workspace-ref-utils.js";
4
6
  import { ManagedSqliteSaver } from "./sqlite-maintained-checkpoint-saver.js";
5
7
  function asObject(value) {
@@ -135,11 +137,18 @@ function totalCheckpointBytes(db) {
135
137
  return Number(checkpointsBytes.total ?? 0) + Number(writesBytes.total ?? 0);
136
138
  }
137
139
  export function maintainSqliteCheckpoints(dbPath, config, nowMs = Date.now()) {
140
+ return maintainSqliteCheckpointsInternal(dbPath, config, nowMs);
141
+ }
142
+ async function maintainSqliteCheckpointsInternal(dbPath, config, nowMs) {
143
+ if (!(await fileExists(dbPath))) {
144
+ return { deletedCount: 0 };
145
+ }
138
146
  const saver = new ManagedSqliteSaver(SqliteSaver.fromConnString(dbPath).db);
139
147
  const db = saver.db;
140
148
  try {
141
149
  saver.prepareMaintenance();
142
150
  backfillCheckpointMetadata(db, nowMs);
151
+ const protectedThreadIds = await listProtectedCheckpointThreadIds(path.join(path.dirname(dbPath), "runtime.sqlite"));
143
152
  let deletedCount = 0;
144
153
  if (config.policies.maxAgeSeconds !== undefined) {
145
154
  const cutoffMs = nowMs - config.policies.maxAgeSeconds * 1000;
@@ -154,13 +163,13 @@ export function maintainSqliteCheckpoints(dbPath, config, nowMs = Date.now()) {
154
163
  WHERE meta.created_at_ms <= ?
155
164
  ORDER BY meta.created_at_ms ASC, meta.checkpoint_id ASC
156
165
  LIMIT ?`)
157
- .all(cutoffMs, config.sqlite.sweepBatchSize);
158
- deletedCount += deleteCheckpointRows(db, expired);
166
+ .all(cutoffMs, config.sqlite.sweepBatchSize * 4);
167
+ deletedCount += deleteCheckpointRows(db, expired.filter((row) => !protectedThreadIds.has(row.thread_id)).slice(0, config.sqlite.sweepBatchSize));
159
168
  }
160
169
  if (config.policies.maxBytes !== undefined) {
161
170
  let currentBytes = totalCheckpointBytes(db);
162
171
  while (currentBytes > config.policies.maxBytes) {
163
- const oldest = selectOldestRows(db, config.sqlite.sweepBatchSize);
172
+ const oldest = selectOldestRows(db, config.sqlite.sweepBatchSize * 4).filter((row) => !protectedThreadIds.has(row.thread_id));
164
173
  if (oldest.length === 0) {
165
174
  break;
166
175
  }
@@ -191,15 +200,42 @@ export class CheckpointMaintenanceLoop {
191
200
  config;
192
201
  timer = null;
193
202
  running = false;
203
+ status = {
204
+ consecutiveFailures: 0,
205
+ };
194
206
  constructor(targets, config) {
195
207
  this.targets = targets;
196
208
  this.config = config;
197
209
  }
198
210
  async runOnce(nowMs = Date.now()) {
199
- for (const target of this.targets) {
200
- maintainSqliteCheckpoints(target.dbPath, this.config, nowMs);
211
+ this.status = {
212
+ ...this.status,
213
+ lastStartedAt: new Date(nowMs).toISOString(),
214
+ };
215
+ try {
216
+ for (const target of this.targets) {
217
+ await maintainSqliteCheckpoints(target.dbPath, this.config, nowMs);
218
+ }
219
+ this.status = {
220
+ ...this.status,
221
+ lastCompletedAt: new Date(nowMs).toISOString(),
222
+ consecutiveFailures: 0,
223
+ lastError: undefined,
224
+ };
225
+ }
226
+ catch (error) {
227
+ this.status = {
228
+ ...this.status,
229
+ lastFailedAt: new Date(nowMs).toISOString(),
230
+ consecutiveFailures: this.status.consecutiveFailures + 1,
231
+ lastError: error instanceof Error ? error.message : String(error),
232
+ };
233
+ throw error;
201
234
  }
202
235
  }
236
+ getStatus() {
237
+ return { ...this.status };
238
+ }
203
239
  async start() {
204
240
  if (this.running) {
205
241
  return;
@@ -1,4 +1,4 @@
1
- import type { ApprovalRecord, HarnessEvent, HarnessStreamItem, MessageContent, RunRecord, RunStartOptions, RestartConversationOptions, RuntimeAdapterOptions, ResumeOptions, RunOptions, RunResult, RunSummary, ThreadSummary, ThreadRecord, WorkspaceBundle } from "../contracts/types.js";
1
+ import type { ApprovalRecord, HarnessEvent, HarnessStreamItem, RuntimeHealthSnapshot, MessageContent, RunRecord, RunStartOptions, RestartConversationOptions, RuntimeAdapterOptions, ResumeOptions, RunOptions, RunResult, RunSummary, ThreadSummary, ThreadRecord, WorkspaceBundle } from "../contracts/types.js";
2
2
  import { type ToolMcpServerOptions } from "../mcp.js";
3
3
  import { type InventoryAgentRecord, type InventorySkillRecord } from "./inventory.js";
4
4
  import type { RequirementAssessmentOptions } from "./skill-requirements.js";
@@ -23,10 +23,13 @@ export declare class AgentHarnessRuntime {
23
23
  private readonly unregisterThreadMemorySync;
24
24
  private readonly resolvedRuntimeAdapterOptions;
25
25
  private readonly checkpointMaintenance;
26
+ private readonly runtimeRecordMaintenance;
27
+ private readonly healthMonitor;
26
28
  private readonly recoveryConfig;
27
29
  private readonly concurrencyConfig;
28
30
  private activeRunSlots;
29
31
  private readonly pendingRunSlots;
32
+ private runtimeEventSequence;
30
33
  private toPublicApprovalRecord;
31
34
  private normalizeInvocationEnvelope;
32
35
  private isTerminalRunState;
@@ -42,6 +45,7 @@ export declare class AgentHarnessRuntime {
42
45
  constructor(workspace: WorkspaceBundle, runtimeAdapterOptions?: RuntimeAdapterOptions);
43
46
  initialize(): Promise<void>;
44
47
  subscribe(listener: (event: HarnessEvent) => void): () => void;
48
+ getHealth(): Promise<RuntimeHealthSnapshot>;
45
49
  private getBinding;
46
50
  private listAgentTools;
47
51
  private resolveAgentTools;
@@ -1,5 +1,5 @@
1
1
  import { AUTO_AGENT_ID } from "../contracts/types.js";
2
- import { FilePersistence } from "../persistence/file-store.js";
2
+ import { SqlitePersistence } from "../persistence/sqlite-store.js";
3
3
  import { createPersistentId } from "../utils/id.js";
4
4
  import { AGENT_INTERRUPT_SENTINEL_PREFIX, AgentRuntimeAdapter, RuntimeOperationTimeoutError } from "./agent-runtime-adapter.js";
5
5
  import { createResourceBackendResolver, createResourceToolResolver } from "../resource/resource.js";
@@ -13,6 +13,8 @@ import { resolveCompiledVectorStore, resolveCompiledVectorStoreRef } from "./sup
13
13
  import { ThreadMemorySync } from "./thread-memory-sync.js";
14
14
  import { FileBackedStore } from "./store.js";
15
15
  import { CheckpointMaintenanceLoop, discoverCheckpointMaintenanceTargets, readCheckpointMaintenanceConfig, } from "./checkpoint-maintenance.js";
16
+ import { RuntimeRecordMaintenanceLoop, discoverRuntimeRecordMaintenanceTargets, readRuntimeRecordMaintenanceConfig, } from "./runtime-record-maintenance.js";
17
+ import { HealthMonitor } from "./health-monitor.js";
16
18
  import { extractMessageText, normalizeMessageContent } from "../utils/message-content.js";
17
19
  import { createToolMcpServerFromTools, serveToolsOverStdioFromHarness } from "../mcp.js";
18
20
  import { getBindingAdapterKind, getBindingPrimaryTools, getBindingStoreConfig, isDeepAgentBinding } from "./support/compiled-binding.js";
@@ -38,10 +40,13 @@ export class AgentHarnessRuntime {
38
40
  unregisterThreadMemorySync;
39
41
  resolvedRuntimeAdapterOptions;
40
42
  checkpointMaintenance;
43
+ runtimeRecordMaintenance;
44
+ healthMonitor;
41
45
  recoveryConfig;
42
46
  concurrencyConfig;
43
47
  activeRunSlots = 0;
44
48
  pendingRunSlots = [];
49
+ runtimeEventSequence = 0;
45
50
  toPublicApprovalRecord(approval) {
46
51
  const { toolCallId: _toolCallId, checkpointRef: _checkpointRef, eventRefs: _eventRefs, ...publicApproval } = approval;
47
52
  return publicApproval;
@@ -141,7 +146,7 @@ export class AgentHarnessRuntime {
141
146
  this.workspace = workspace;
142
147
  this.runtimeAdapterOptions = runtimeAdapterOptions;
143
148
  const runRoot = this.defaultRunRoot();
144
- this.persistence = new FilePersistence(runRoot);
149
+ this.persistence = new SqlitePersistence(runRoot);
145
150
  const defaultStoreConfig = this.listHostBindings()[0]?.harnessRuntime.store;
146
151
  this.defaultStore = this.resolveStoreFromConfig(defaultStoreConfig, runRoot) ?? new FileBackedStore(`${runRoot}/store.json`);
147
152
  const runtimeMemoryStoreConfig = typeof this.listHostBindings()[0]?.harnessRuntime.runtimeMemory?.store === "object" &&
@@ -189,17 +194,37 @@ export class AgentHarnessRuntime {
189
194
  this.checkpointMaintenance = checkpointMaintenanceConfig
190
195
  ? new CheckpointMaintenanceLoop(discoverCheckpointMaintenanceTargets(workspace), checkpointMaintenanceConfig)
191
196
  : null;
197
+ const runtimeRecordMaintenanceConfig = readRuntimeRecordMaintenanceConfig(workspace);
198
+ this.runtimeRecordMaintenance = runtimeRecordMaintenanceConfig
199
+ ? new RuntimeRecordMaintenanceLoop(discoverRuntimeRecordMaintenanceTargets(workspace), runtimeRecordMaintenanceConfig)
200
+ : null;
192
201
  this.recoveryConfig = getRecoveryConfig(workspace.refs);
193
202
  this.concurrencyConfig = getConcurrencyConfig(workspace.refs);
203
+ this.healthMonitor = new HealthMonitor({
204
+ workspace,
205
+ persistence: this.persistence,
206
+ getActiveRunSlots: () => this.activeRunSlots,
207
+ getPendingRunSlots: () => this.pendingRunSlots.length,
208
+ getCheckpointMaintenanceStatus: () => this.checkpointMaintenance?.getStatus() ?? null,
209
+ getRuntimeRecordMaintenanceStatus: () => this.runtimeRecordMaintenance?.getStatus() ?? null,
210
+ publishEvent: async (payload) => {
211
+ this.eventBus.publish(createHarnessEvent("__runtime__", "__runtime__", ++this.runtimeEventSequence, "runtime.health.changed", payload));
212
+ },
213
+ });
194
214
  }
195
215
  async initialize() {
196
216
  await this.persistence.initialize();
197
217
  await this.checkpointMaintenance?.start();
218
+ await this.runtimeRecordMaintenance?.start();
219
+ await this.healthMonitor.start();
198
220
  await this.recoverStartupRuns();
199
221
  }
200
222
  subscribe(listener) {
201
223
  return this.eventBus.subscribe(listener);
202
224
  }
225
+ async getHealth() {
226
+ return this.healthMonitor.getSnapshot();
227
+ }
203
228
  getBinding(agentId) {
204
229
  return this.workspace.bindings.get(agentId);
205
230
  }
@@ -457,7 +482,16 @@ export class AgentHarnessRuntime {
457
482
  }
458
483
  async invokeWithHistory(binding, input, threadId, runId, resumePayload, options = {}) {
459
484
  const priorHistory = await this.loadPriorHistory(threadId, runId);
460
- return this.runtimeAdapter.invoke(binding, input, threadId, runId, resumePayload, priorHistory, options);
485
+ const startedAt = Date.now();
486
+ try {
487
+ const result = await this.runtimeAdapter.invoke(binding, input, threadId, runId, resumePayload, priorHistory, options);
488
+ this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
489
+ return result;
490
+ }
491
+ catch (error) {
492
+ this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
493
+ throw error;
494
+ }
461
495
  }
462
496
  buildPersistedRunRequest(input, invocation) {
463
497
  const envelope = invocation.invocation ?? {
@@ -1097,8 +1131,10 @@ export class AgentHarnessRuntime {
1097
1131
  const history = await this.persistence.listThreadMessages(threadId);
1098
1132
  const priorHistory = history.filter((message) => message.runId !== runId);
1099
1133
  const runInput = await this.loadRunInput(threadId, runId);
1134
+ const startedAt = Date.now();
1100
1135
  try {
1101
1136
  const actual = await this.runtimeAdapter.invoke(binding, "", threadId, runId, resumePayload, priorHistory);
1137
+ this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
1102
1138
  await this.persistence.clearRecoveryIntent(threadId, runId);
1103
1139
  const finalized = await this.finalizeContinuedRun(threadId, runId, runInput, actual, {
1104
1140
  previousState: "resuming",
@@ -1112,6 +1148,7 @@ export class AgentHarnessRuntime {
1112
1148
  };
1113
1149
  }
1114
1150
  catch (error) {
1151
+ this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
1115
1152
  throw error;
1116
1153
  }
1117
1154
  }
@@ -1182,7 +1219,9 @@ export class AgentHarnessRuntime {
1182
1219
  };
1183
1220
  }
1184
1221
  async close() {
1222
+ await this.healthMonitor.stop();
1185
1223
  await this.checkpointMaintenance?.stop();
1224
+ await this.runtimeRecordMaintenance?.stop();
1186
1225
  this.unregisterThreadMemorySync();
1187
1226
  await this.threadMemorySync.close();
1188
1227
  }
@@ -1283,8 +1322,10 @@ export class AgentHarnessRuntime {
1283
1322
  const history = await this.persistence.listThreadMessages(thread.threadId);
1284
1323
  const priorHistory = history.filter((message) => message.runId !== thread.latestRunId);
1285
1324
  const runInput = await this.loadRunInput(thread.threadId, thread.latestRunId);
1325
+ const startedAt = Date.now();
1286
1326
  try {
1287
1327
  const actual = await this.runtimeAdapter.invoke(binding, "", thread.threadId, thread.latestRunId, recoveryIntent.resumePayload, priorHistory);
1328
+ this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
1288
1329
  await this.persistence.clearRecoveryIntent(thread.threadId, thread.latestRunId);
1289
1330
  await this.finalizeContinuedRun(thread.threadId, thread.latestRunId, runInput, actual, {
1290
1331
  previousState: "resuming",
@@ -1293,6 +1334,7 @@ export class AgentHarnessRuntime {
1293
1334
  });
1294
1335
  }
1295
1336
  catch (error) {
1337
+ this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
1296
1338
  if (recoveryIntent.attempts + 1 >= this.recoveryConfig.maxRecoveryAttempts) {
1297
1339
  await this.persistence.setRunState(thread.threadId, thread.latestRunId, "failed", recoveryIntent.checkpointRef);
1298
1340
  await this.persistence.clearRecoveryIntent(thread.threadId, thread.latestRunId);
@@ -0,0 +1,81 @@
1
+ import type { RuntimeHealthSnapshot, WorkspaceBundle } from "../contracts/types.js";
2
+ import type { RuntimePersistence } from "../persistence/types.js";
3
+ import type { MaintenanceLoopStatus } from "./checkpoint-maintenance.js";
4
+ type HealthMonitorConfig = {
5
+ enabled: boolean;
6
+ evaluateIntervalSeconds: number;
7
+ emitEvents: boolean;
8
+ thresholds: {
9
+ llmErrorRate: {
10
+ windowSeconds: number;
11
+ degradedAbove: number;
12
+ unhealthyAbove: number;
13
+ };
14
+ llmP95LatencyMs: {
15
+ windowSeconds: number;
16
+ degradedAbove: number;
17
+ unhealthyAbove: number;
18
+ };
19
+ pendingApprovals: {
20
+ degradedAbove: number;
21
+ unhealthyAbove: number;
22
+ };
23
+ stuckRunSeconds: {
24
+ degradedAbove: number;
25
+ unhealthyAbove: number;
26
+ };
27
+ checkpointBytes: {
28
+ degradedAbove: number;
29
+ unhealthyAbove: number;
30
+ };
31
+ runtimeDbBytes: {
32
+ degradedAbove: number;
33
+ unhealthyAbove: number;
34
+ };
35
+ artifactBytes: {
36
+ degradedAbove: number;
37
+ unhealthyAbove: number;
38
+ };
39
+ };
40
+ };
41
+ type HealthMonitorOptions = {
42
+ workspace: WorkspaceBundle;
43
+ persistence: RuntimePersistence;
44
+ getActiveRunSlots: () => number;
45
+ getPendingRunSlots: () => number;
46
+ getCheckpointMaintenanceStatus: () => MaintenanceLoopStatus | null;
47
+ getRuntimeRecordMaintenanceStatus: () => MaintenanceLoopStatus | null;
48
+ publishEvent?: (payload: Record<string, unknown>) => void | Promise<void>;
49
+ };
50
+ export declare function readHealthMonitorConfig(workspace: WorkspaceBundle): HealthMonitorConfig;
51
+ export declare class HealthMonitor {
52
+ private readonly options;
53
+ private readonly config;
54
+ private readonly runRoots;
55
+ private readonly checkpointDbPaths;
56
+ private readonly llmSamples;
57
+ private timer;
58
+ private latestSnapshot;
59
+ private runtimeEventSequence;
60
+ constructor(options: HealthMonitorOptions);
61
+ recordLlmSuccess(latencyMs: number, nowMs?: number): void;
62
+ recordLlmFailure(latencyMs: number, nowMs?: number): void;
63
+ private recordLlmSample;
64
+ start(): Promise<void>;
65
+ stop(): Promise<void>;
66
+ getSnapshot(): Promise<RuntimeHealthSnapshot>;
67
+ evaluate(nowMs?: number): Promise<RuntimeHealthSnapshot>;
68
+ private llmStats;
69
+ private evaluateLlmCheck;
70
+ private llmSymptoms;
71
+ private evaluatePersistenceCheck;
72
+ private evaluateCapacityCheck;
73
+ private capacitySymptoms;
74
+ private evaluateWorkloadCheck;
75
+ private workloadSymptoms;
76
+ private countStuckRuns;
77
+ private sumRuntimeDbBytes;
78
+ private sumCheckpointDbBytes;
79
+ private sumArtifactBytes;
80
+ }
81
+ export {};