@botbotgo/agent-harness 0.0.75 → 0.0.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +2 -1
- package/dist/api.js +3 -0
- package/dist/benchmark/checkpoint-resume-cost-benchmark.d.ts +33 -0
- package/dist/benchmark/checkpoint-resume-cost-benchmark.js +55 -0
- package/dist/benchmark/deepagent-local-model-benchmark.d.ts +27 -0
- package/dist/benchmark/deepagent-local-model-benchmark.js +35 -0
- package/dist/config/agents/direct.yaml +1 -1
- package/dist/config/agents/orchestra.yaml +1 -2
- package/dist/config/workspace.yaml +31 -0
- package/dist/contracts/types.d.ts +38 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/package-version.d.ts +1 -1
- package/dist/package-version.js +1 -1
- package/dist/persistence/file-store.d.ts +3 -40
- package/dist/persistence/file-store.js +5 -2
- package/dist/persistence/sqlite-store.d.ts +68 -0
- package/dist/persistence/sqlite-store.js +569 -0
- package/dist/persistence/types.d.ts +83 -0
- package/dist/persistence/types.js +1 -0
- package/dist/runtime/agent-runtime-adapter.d.ts +3 -0
- package/dist/runtime/agent-runtime-adapter.js +58 -2
- package/dist/runtime/checkpoint-maintenance.d.ts +11 -2
- package/dist/runtime/checkpoint-maintenance.js +41 -5
- package/dist/runtime/harness.d.ts +5 -1
- package/dist/runtime/harness.js +45 -3
- package/dist/runtime/health-monitor.d.ts +81 -0
- package/dist/runtime/health-monitor.js +448 -0
- package/dist/runtime/runtime-record-maintenance.d.ts +43 -0
- package/dist/runtime/runtime-record-maintenance.js +169 -0
- package/dist/runtime/store.d.ts +2 -0
- package/dist/runtime/store.js +38 -20
- package/dist/runtime/support/embedding-models.js +57 -1
- package/dist/runtime/thread-memory-sync.d.ts +3 -2
- package/dist/runtime/thread-memory-sync.js +7 -1
- package/dist/workspace/agent-binding-compiler.js +3 -1
- package/dist/workspace/support/workspace-ref-utils.d.ts +9 -0
- package/dist/workspace/support/workspace-ref-utils.js +38 -0
- package/package.json +2 -2
|
@@ -64,6 +64,9 @@ function computeRemainingTimeoutMs(deadlineAt, fallbackTimeoutMs) {
|
|
|
64
64
|
}
|
|
65
65
|
return fallbackTimeoutMs ? Math.min(fallbackTimeoutMs, remaining) : remaining;
|
|
66
66
|
}
|
|
67
|
+
/**
 * Returns a promise that settles (with no value) once `ms` milliseconds have
 * elapsed on the timer queue.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
67
70
|
/**
 * Reports whether `value` is the literal placeholder key "dummy", ignoring
 * surrounding whitespace and letter case. Non-string values are never placeholders.
 */
function isPlaceholderApiKey(value) {
    if (typeof value !== "string") {
        return false;
    }
    const normalized = value.trim().toLowerCase();
    return normalized === "dummy";
}
|
|
@@ -485,6 +488,57 @@ export class AgentRuntimeAdapter {
|
|
|
485
488
|
}
|
|
486
489
|
return 15_000;
|
|
487
490
|
}
|
|
491
|
+
resolveProviderRetryPolicy(binding) {
|
|
492
|
+
const resilience = typeof binding.harnessRuntime.resilience === "object" && binding.harnessRuntime.resilience
|
|
493
|
+
? binding.harnessRuntime.resilience
|
|
494
|
+
: {};
|
|
495
|
+
const providerRetries = typeof resilience.providerRetries === "object" && resilience.providerRetries
|
|
496
|
+
? resilience.providerRetries
|
|
497
|
+
: {};
|
|
498
|
+
const maxAttempts = typeof providerRetries.maxAttempts === "number" &&
|
|
499
|
+
Number.isFinite(providerRetries.maxAttempts) &&
|
|
500
|
+
providerRetries.maxAttempts > 0
|
|
501
|
+
? Math.floor(providerRetries.maxAttempts)
|
|
502
|
+
: 2;
|
|
503
|
+
const backoffMs = typeof providerRetries.backoffMs === "number" &&
|
|
504
|
+
Number.isFinite(providerRetries.backoffMs) &&
|
|
505
|
+
providerRetries.backoffMs >= 0
|
|
506
|
+
? Math.floor(providerRetries.backoffMs)
|
|
507
|
+
: 1_000;
|
|
508
|
+
const retryableMessages = Array.isArray(providerRetries.retryableMessages)
|
|
509
|
+
? providerRetries.retryableMessages.filter((value) => typeof value === "string" && value.trim().length > 0)
|
|
510
|
+
: [];
|
|
511
|
+
return {
|
|
512
|
+
maxAttempts,
|
|
513
|
+
backoffMs,
|
|
514
|
+
retryableMessages,
|
|
515
|
+
};
|
|
516
|
+
}
|
|
517
|
+
/**
 * Decides whether `error` should trigger a provider retry for `binding`.
 * The error text (its `message` for Error instances, otherwise its string form)
 * is matched case-insensitively as a substring against the binding's
 * configured `retryableMessages`.
 */
isRetryableProviderError(binding, error) {
    const haystack = (error instanceof Error ? error.message : String(error)).toLowerCase();
    const policy = this.resolveProviderRetryPolicy(binding);
    for (const needle of policy.retryableMessages) {
        if (haystack.includes(needle.toLowerCase())) {
            return true;
        }
    }
    return false;
}
|
|
523
|
+
async invokeWithProviderRetry(binding, operation) {
|
|
524
|
+
const retryPolicy = this.resolveProviderRetryPolicy(binding);
|
|
525
|
+
let lastError;
|
|
526
|
+
for (let attempt = 1; attempt <= retryPolicy.maxAttempts; attempt += 1) {
|
|
527
|
+
try {
|
|
528
|
+
return await operation();
|
|
529
|
+
}
|
|
530
|
+
catch (error) {
|
|
531
|
+
lastError = error;
|
|
532
|
+
if (attempt >= retryPolicy.maxAttempts || !this.isRetryableProviderError(binding, error)) {
|
|
533
|
+
throw error;
|
|
534
|
+
}
|
|
535
|
+
if (retryPolicy.backoffMs > 0) {
|
|
536
|
+
await sleep(retryPolicy.backoffMs);
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
throw lastError instanceof Error ? lastError : new Error(String(lastError));
|
|
541
|
+
}
|
|
488
542
|
async withTimeout(producer, timeoutMs, operation, stage = operation.includes("stream") ? "stream" : "invoke") {
|
|
489
543
|
if (!timeoutMs) {
|
|
490
544
|
return Promise.resolve(producer());
|
|
@@ -1204,8 +1258,10 @@ export class AgentRuntimeAdapter {
|
|
|
1204
1258
|
: new Command({ resume: resumePayload });
|
|
1205
1259
|
let result;
|
|
1206
1260
|
// Creates a runnable for the binding and invokes it under the binding's timeout.
// The whole create-and-invoke step is handed to invokeWithProviderRetry, so a
// retried attempt builds a new runnable rather than reusing the previous one.
const callRuntime = async (activeBinding, activeRequest) => {
    const createAndInvoke = async () => {
        const runnable = await this.create(activeBinding);
        const invokeOnce = () => runnable.invoke(activeRequest, {
            configurable: { thread_id: threadId },
            ...(options.context ? { context: options.context } : {}),
        });
        return (await this.withTimeout(invokeOnce, this.resolveBindingTimeout(activeBinding), "agent invoke", "invoke"));
    };
    return this.invokeWithProviderRetry(activeBinding, createAndInvoke);
};
|
|
1210
1266
|
const callRuntimeWithToolParseRecovery = async (activeRequest) => {
|
|
1211
1267
|
try {
|
|
@@ -18,18 +18,27 @@ type CheckpointMaintenanceTarget = {
|
|
|
18
18
|
agentId: string;
|
|
19
19
|
dbPath: string;
|
|
20
20
|
};
|
|
21
|
+
/**
 * Point-in-time status of a background maintenance loop.
 * Timestamps are ISO-8601 strings; each is absent until the corresponding
 * event has happened at least once.
 */
export type MaintenanceLoopStatus = {
    /** Number of failed passes since the last successful one (0 after a success). */
    consecutiveFailures: number;
    /** When the most recent pass began. */
    lastStartedAt?: string;
    /** When the most recent successful pass was recorded. */
    lastCompletedAt?: string;
    /** When the most recent failed pass was recorded. */
    lastFailedAt?: string;
    /** Message of the most recent failure; cleared again by a successful pass. */
    lastError?: string;
};
|
|
21
28
|
/**
 * Reads the checkpoint-maintenance settings for a workspace.
 * @returns The configuration, or `null` when none applies.
 */
export declare function readCheckpointMaintenanceConfig(
    workspace: WorkspaceBundle,
): CheckpointMaintenanceConfig | null;
/**
 * Lists the checkpoint maintenance targets found in a workspace.
 */
export declare function discoverCheckpointMaintenanceTargets(
    workspace: WorkspaceBundle,
): CheckpointMaintenanceTarget[];
|
|
23
|
-
/**
 * Applies the maintenance policies in `config` to the SQLite checkpoint
 * database at `dbPath`.
 * @param nowMs Reference time in epoch milliseconds; optional.
 * @returns A promise for the number of checkpoint rows deleted.
 */
export declare function maintainSqliteCheckpoints(
    dbPath: string,
    config: CheckpointMaintenanceConfig,
    nowMs?: number,
): Promise<{ deletedCount: number }>;
|
|
26
33
|
/**
 * Runs checkpoint maintenance over a fixed list of targets with one shared
 * configuration, and tracks the outcome of its passes.
 */
export declare class CheckpointMaintenanceLoop {
    private readonly targets;
    private readonly config;
    private timer;
    private running;
    private status;
    constructor(targets: CheckpointMaintenanceTarget[], config: CheckpointMaintenanceConfig);
    /**
     * Performs a single maintenance pass over every target.
     * @param nowMs Reference time in epoch milliseconds; optional.
     */
    runOnce(nowMs?: number): Promise<void>;
    /** Returns a copy of the loop's current status. */
    getStatus(): MaintenanceLoopStatus;
    /** Starts the loop; does nothing when it is already running. */
    start(): Promise<void>;
    stop(): Promise<void>;
}
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import path from "node:path";
|
|
2
2
|
import { SqliteSaver } from "@langchain/langgraph-checkpoint-sqlite";
|
|
3
|
+
import { listProtectedCheckpointThreadIds } from "../persistence/sqlite-store.js";
|
|
4
|
+
import { fileExists } from "../utils/fs.js";
|
|
3
5
|
import { getRuntimeDefaults } from "../workspace/support/workspace-ref-utils.js";
|
|
4
6
|
import { ManagedSqliteSaver } from "./sqlite-maintained-checkpoint-saver.js";
|
|
5
7
|
function asObject(value) {
|
|
@@ -135,11 +137,18 @@ function totalCheckpointBytes(db) {
|
|
|
135
137
|
return Number(checkpointsBytes.total ?? 0) + Number(writesBytes.total ?? 0);
|
|
136
138
|
}
|
|
137
139
|
/**
 * Public entry point for checkpoint maintenance of one SQLite database.
 * Supplies the current time as the default for `nowMs` and hands off to the
 * async implementation, returning its promise (resolving to `{ deletedCount }`).
 */
export function maintainSqliteCheckpoints(dbPath, config, nowMs = Date.now()) {
    const outcome = maintainSqliteCheckpointsInternal(dbPath, config, nowMs);
    return outcome;
}
|
|
142
|
+
async function maintainSqliteCheckpointsInternal(dbPath, config, nowMs) {
|
|
143
|
+
if (!(await fileExists(dbPath))) {
|
|
144
|
+
return { deletedCount: 0 };
|
|
145
|
+
}
|
|
138
146
|
const saver = new ManagedSqliteSaver(SqliteSaver.fromConnString(dbPath).db);
|
|
139
147
|
const db = saver.db;
|
|
140
148
|
try {
|
|
141
149
|
saver.prepareMaintenance();
|
|
142
150
|
backfillCheckpointMetadata(db, nowMs);
|
|
151
|
+
const protectedThreadIds = await listProtectedCheckpointThreadIds(path.join(path.dirname(dbPath), "runtime.sqlite"));
|
|
143
152
|
let deletedCount = 0;
|
|
144
153
|
if (config.policies.maxAgeSeconds !== undefined) {
|
|
145
154
|
const cutoffMs = nowMs - config.policies.maxAgeSeconds * 1000;
|
|
@@ -154,13 +163,13 @@ export function maintainSqliteCheckpoints(dbPath, config, nowMs = Date.now()) {
|
|
|
154
163
|
WHERE meta.created_at_ms <= ?
|
|
155
164
|
ORDER BY meta.created_at_ms ASC, meta.checkpoint_id ASC
|
|
156
165
|
LIMIT ?`)
|
|
157
|
-
.all(cutoffMs, config.sqlite.sweepBatchSize);
|
|
158
|
-
deletedCount += deleteCheckpointRows(db, expired);
|
|
166
|
+
.all(cutoffMs, config.sqlite.sweepBatchSize * 4);
|
|
167
|
+
deletedCount += deleteCheckpointRows(db, expired.filter((row) => !protectedThreadIds.has(row.thread_id)).slice(0, config.sqlite.sweepBatchSize));
|
|
159
168
|
}
|
|
160
169
|
if (config.policies.maxBytes !== undefined) {
|
|
161
170
|
let currentBytes = totalCheckpointBytes(db);
|
|
162
171
|
while (currentBytes > config.policies.maxBytes) {
|
|
163
|
-
const oldest = selectOldestRows(db, config.sqlite.sweepBatchSize);
|
|
172
|
+
const oldest = selectOldestRows(db, config.sqlite.sweepBatchSize * 4).filter((row) => !protectedThreadIds.has(row.thread_id));
|
|
164
173
|
if (oldest.length === 0) {
|
|
165
174
|
break;
|
|
166
175
|
}
|
|
@@ -191,15 +200,42 @@ export class CheckpointMaintenanceLoop {
|
|
|
191
200
|
config;
|
|
192
201
|
timer = null;
|
|
193
202
|
running = false;
|
|
203
|
+
status = {
|
|
204
|
+
consecutiveFailures: 0,
|
|
205
|
+
};
|
|
194
206
|
constructor(targets, config) {
|
|
195
207
|
this.targets = targets;
|
|
196
208
|
this.config = config;
|
|
197
209
|
}
|
|
198
210
|
async runOnce(nowMs = Date.now()) {
|
|
199
|
-
|
|
200
|
-
|
|
211
|
+
this.status = {
|
|
212
|
+
...this.status,
|
|
213
|
+
lastStartedAt: new Date(nowMs).toISOString(),
|
|
214
|
+
};
|
|
215
|
+
try {
|
|
216
|
+
for (const target of this.targets) {
|
|
217
|
+
await maintainSqliteCheckpoints(target.dbPath, this.config, nowMs);
|
|
218
|
+
}
|
|
219
|
+
this.status = {
|
|
220
|
+
...this.status,
|
|
221
|
+
lastCompletedAt: new Date(nowMs).toISOString(),
|
|
222
|
+
consecutiveFailures: 0,
|
|
223
|
+
lastError: undefined,
|
|
224
|
+
};
|
|
225
|
+
}
|
|
226
|
+
catch (error) {
|
|
227
|
+
this.status = {
|
|
228
|
+
...this.status,
|
|
229
|
+
lastFailedAt: new Date(nowMs).toISOString(),
|
|
230
|
+
consecutiveFailures: this.status.consecutiveFailures + 1,
|
|
231
|
+
lastError: error instanceof Error ? error.message : String(error),
|
|
232
|
+
};
|
|
233
|
+
throw error;
|
|
201
234
|
}
|
|
202
235
|
}
|
|
236
|
+
getStatus() {
|
|
237
|
+
return { ...this.status };
|
|
238
|
+
}
|
|
203
239
|
async start() {
|
|
204
240
|
if (this.running) {
|
|
205
241
|
return;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ApprovalRecord, HarnessEvent, HarnessStreamItem, MessageContent, RunRecord, RunStartOptions, RestartConversationOptions, RuntimeAdapterOptions, ResumeOptions, RunOptions, RunResult, RunSummary, ThreadSummary, ThreadRecord, WorkspaceBundle } from "../contracts/types.js";
|
|
1
|
+
import type { ApprovalRecord, HarnessEvent, HarnessStreamItem, RuntimeHealthSnapshot, MessageContent, RunRecord, RunStartOptions, RestartConversationOptions, RuntimeAdapterOptions, ResumeOptions, RunOptions, RunResult, RunSummary, ThreadSummary, ThreadRecord, WorkspaceBundle } from "../contracts/types.js";
|
|
2
2
|
import { type ToolMcpServerOptions } from "../mcp.js";
|
|
3
3
|
import { type InventoryAgentRecord, type InventorySkillRecord } from "./inventory.js";
|
|
4
4
|
import type { RequirementAssessmentOptions } from "./skill-requirements.js";
|
|
@@ -23,10 +23,13 @@ export declare class AgentHarnessRuntime {
|
|
|
23
23
|
private readonly unregisterThreadMemorySync;
|
|
24
24
|
private readonly resolvedRuntimeAdapterOptions;
|
|
25
25
|
private readonly checkpointMaintenance;
|
|
26
|
+
private readonly runtimeRecordMaintenance;
|
|
27
|
+
private readonly healthMonitor;
|
|
26
28
|
private readonly recoveryConfig;
|
|
27
29
|
private readonly concurrencyConfig;
|
|
28
30
|
private activeRunSlots;
|
|
29
31
|
private readonly pendingRunSlots;
|
|
32
|
+
private runtimeEventSequence;
|
|
30
33
|
private toPublicApprovalRecord;
|
|
31
34
|
private normalizeInvocationEnvelope;
|
|
32
35
|
private isTerminalRunState;
|
|
@@ -42,6 +45,7 @@ export declare class AgentHarnessRuntime {
|
|
|
42
45
|
constructor(workspace: WorkspaceBundle, runtimeAdapterOptions?: RuntimeAdapterOptions);
|
|
43
46
|
initialize(): Promise<void>;
|
|
44
47
|
subscribe(listener: (event: HarnessEvent) => void): () => void;
|
|
48
|
+
getHealth(): Promise<RuntimeHealthSnapshot>;
|
|
45
49
|
private getBinding;
|
|
46
50
|
private listAgentTools;
|
|
47
51
|
private resolveAgentTools;
|
package/dist/runtime/harness.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { AUTO_AGENT_ID } from "../contracts/types.js";
|
|
2
|
-
import {
|
|
2
|
+
import { SqlitePersistence } from "../persistence/sqlite-store.js";
|
|
3
3
|
import { createPersistentId } from "../utils/id.js";
|
|
4
4
|
import { AGENT_INTERRUPT_SENTINEL_PREFIX, AgentRuntimeAdapter, RuntimeOperationTimeoutError } from "./agent-runtime-adapter.js";
|
|
5
5
|
import { createResourceBackendResolver, createResourceToolResolver } from "../resource/resource.js";
|
|
@@ -13,6 +13,8 @@ import { resolveCompiledVectorStore, resolveCompiledVectorStoreRef } from "./sup
|
|
|
13
13
|
import { ThreadMemorySync } from "./thread-memory-sync.js";
|
|
14
14
|
import { FileBackedStore } from "./store.js";
|
|
15
15
|
import { CheckpointMaintenanceLoop, discoverCheckpointMaintenanceTargets, readCheckpointMaintenanceConfig, } from "./checkpoint-maintenance.js";
|
|
16
|
+
import { RuntimeRecordMaintenanceLoop, discoverRuntimeRecordMaintenanceTargets, readRuntimeRecordMaintenanceConfig, } from "./runtime-record-maintenance.js";
|
|
17
|
+
import { HealthMonitor } from "./health-monitor.js";
|
|
16
18
|
import { extractMessageText, normalizeMessageContent } from "../utils/message-content.js";
|
|
17
19
|
import { createToolMcpServerFromTools, serveToolsOverStdioFromHarness } from "../mcp.js";
|
|
18
20
|
import { getBindingAdapterKind, getBindingPrimaryTools, getBindingStoreConfig, isDeepAgentBinding } from "./support/compiled-binding.js";
|
|
@@ -38,10 +40,13 @@ export class AgentHarnessRuntime {
|
|
|
38
40
|
unregisterThreadMemorySync;
|
|
39
41
|
resolvedRuntimeAdapterOptions;
|
|
40
42
|
checkpointMaintenance;
|
|
43
|
+
runtimeRecordMaintenance;
|
|
44
|
+
healthMonitor;
|
|
41
45
|
recoveryConfig;
|
|
42
46
|
concurrencyConfig;
|
|
43
47
|
activeRunSlots = 0;
|
|
44
48
|
pendingRunSlots = [];
|
|
49
|
+
runtimeEventSequence = 0;
|
|
45
50
|
toPublicApprovalRecord(approval) {
|
|
46
51
|
const { toolCallId: _toolCallId, checkpointRef: _checkpointRef, eventRefs: _eventRefs, ...publicApproval } = approval;
|
|
47
52
|
return publicApproval;
|
|
@@ -141,7 +146,7 @@ export class AgentHarnessRuntime {
|
|
|
141
146
|
this.workspace = workspace;
|
|
142
147
|
this.runtimeAdapterOptions = runtimeAdapterOptions;
|
|
143
148
|
const runRoot = this.defaultRunRoot();
|
|
144
|
-
this.persistence = new
|
|
149
|
+
this.persistence = new SqlitePersistence(runRoot);
|
|
145
150
|
const defaultStoreConfig = this.listHostBindings()[0]?.harnessRuntime.store;
|
|
146
151
|
this.defaultStore = this.resolveStoreFromConfig(defaultStoreConfig, runRoot) ?? new FileBackedStore(`${runRoot}/store.json`);
|
|
147
152
|
const runtimeMemoryStoreConfig = typeof this.listHostBindings()[0]?.harnessRuntime.runtimeMemory?.store === "object" &&
|
|
@@ -189,17 +194,37 @@ export class AgentHarnessRuntime {
|
|
|
189
194
|
this.checkpointMaintenance = checkpointMaintenanceConfig
|
|
190
195
|
? new CheckpointMaintenanceLoop(discoverCheckpointMaintenanceTargets(workspace), checkpointMaintenanceConfig)
|
|
191
196
|
: null;
|
|
197
|
+
const runtimeRecordMaintenanceConfig = readRuntimeRecordMaintenanceConfig(workspace);
|
|
198
|
+
this.runtimeRecordMaintenance = runtimeRecordMaintenanceConfig
|
|
199
|
+
? new RuntimeRecordMaintenanceLoop(discoverRuntimeRecordMaintenanceTargets(workspace), runtimeRecordMaintenanceConfig)
|
|
200
|
+
: null;
|
|
192
201
|
this.recoveryConfig = getRecoveryConfig(workspace.refs);
|
|
193
202
|
this.concurrencyConfig = getConcurrencyConfig(workspace.refs);
|
|
203
|
+
this.healthMonitor = new HealthMonitor({
|
|
204
|
+
workspace,
|
|
205
|
+
persistence: this.persistence,
|
|
206
|
+
getActiveRunSlots: () => this.activeRunSlots,
|
|
207
|
+
getPendingRunSlots: () => this.pendingRunSlots.length,
|
|
208
|
+
getCheckpointMaintenanceStatus: () => this.checkpointMaintenance?.getStatus() ?? null,
|
|
209
|
+
getRuntimeRecordMaintenanceStatus: () => this.runtimeRecordMaintenance?.getStatus() ?? null,
|
|
210
|
+
publishEvent: async (payload) => {
|
|
211
|
+
this.eventBus.publish(createHarnessEvent("__runtime__", "__runtime__", ++this.runtimeEventSequence, "runtime.health.changed", payload));
|
|
212
|
+
},
|
|
213
|
+
});
|
|
194
214
|
}
|
|
195
215
|
async initialize() {
|
|
196
216
|
await this.persistence.initialize();
|
|
197
217
|
await this.checkpointMaintenance?.start();
|
|
218
|
+
await this.runtimeRecordMaintenance?.start();
|
|
219
|
+
await this.healthMonitor.start();
|
|
198
220
|
await this.recoverStartupRuns();
|
|
199
221
|
}
|
|
200
222
|
subscribe(listener) {
|
|
201
223
|
return this.eventBus.subscribe(listener);
|
|
202
224
|
}
|
|
225
|
+
async getHealth() {
|
|
226
|
+
return this.healthMonitor.getSnapshot();
|
|
227
|
+
}
|
|
203
228
|
getBinding(agentId) {
|
|
204
229
|
return this.workspace.bindings.get(agentId);
|
|
205
230
|
}
|
|
@@ -457,7 +482,16 @@ export class AgentHarnessRuntime {
|
|
|
457
482
|
}
|
|
458
483
|
async invokeWithHistory(binding, input, threadId, runId, resumePayload, options = {}) {
|
|
459
484
|
const priorHistory = await this.loadPriorHistory(threadId, runId);
|
|
460
|
-
|
|
485
|
+
const startedAt = Date.now();
|
|
486
|
+
try {
|
|
487
|
+
const result = await this.runtimeAdapter.invoke(binding, input, threadId, runId, resumePayload, priorHistory, options);
|
|
488
|
+
this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
|
|
489
|
+
return result;
|
|
490
|
+
}
|
|
491
|
+
catch (error) {
|
|
492
|
+
this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
|
|
493
|
+
throw error;
|
|
494
|
+
}
|
|
461
495
|
}
|
|
462
496
|
buildPersistedRunRequest(input, invocation) {
|
|
463
497
|
const envelope = invocation.invocation ?? {
|
|
@@ -1097,8 +1131,10 @@ export class AgentHarnessRuntime {
|
|
|
1097
1131
|
const history = await this.persistence.listThreadMessages(threadId);
|
|
1098
1132
|
const priorHistory = history.filter((message) => message.runId !== runId);
|
|
1099
1133
|
const runInput = await this.loadRunInput(threadId, runId);
|
|
1134
|
+
const startedAt = Date.now();
|
|
1100
1135
|
try {
|
|
1101
1136
|
const actual = await this.runtimeAdapter.invoke(binding, "", threadId, runId, resumePayload, priorHistory);
|
|
1137
|
+
this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
|
|
1102
1138
|
await this.persistence.clearRecoveryIntent(threadId, runId);
|
|
1103
1139
|
const finalized = await this.finalizeContinuedRun(threadId, runId, runInput, actual, {
|
|
1104
1140
|
previousState: "resuming",
|
|
@@ -1112,6 +1148,7 @@ export class AgentHarnessRuntime {
|
|
|
1112
1148
|
};
|
|
1113
1149
|
}
|
|
1114
1150
|
catch (error) {
|
|
1151
|
+
this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
|
|
1115
1152
|
throw error;
|
|
1116
1153
|
}
|
|
1117
1154
|
}
|
|
@@ -1182,7 +1219,9 @@ export class AgentHarnessRuntime {
|
|
|
1182
1219
|
};
|
|
1183
1220
|
}
|
|
1184
1221
|
async close() {
|
|
1222
|
+
await this.healthMonitor.stop();
|
|
1185
1223
|
await this.checkpointMaintenance?.stop();
|
|
1224
|
+
await this.runtimeRecordMaintenance?.stop();
|
|
1186
1225
|
this.unregisterThreadMemorySync();
|
|
1187
1226
|
await this.threadMemorySync.close();
|
|
1188
1227
|
}
|
|
@@ -1283,8 +1322,10 @@ export class AgentHarnessRuntime {
|
|
|
1283
1322
|
const history = await this.persistence.listThreadMessages(thread.threadId);
|
|
1284
1323
|
const priorHistory = history.filter((message) => message.runId !== thread.latestRunId);
|
|
1285
1324
|
const runInput = await this.loadRunInput(thread.threadId, thread.latestRunId);
|
|
1325
|
+
const startedAt = Date.now();
|
|
1286
1326
|
try {
|
|
1287
1327
|
const actual = await this.runtimeAdapter.invoke(binding, "", thread.threadId, thread.latestRunId, recoveryIntent.resumePayload, priorHistory);
|
|
1328
|
+
this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
|
|
1288
1329
|
await this.persistence.clearRecoveryIntent(thread.threadId, thread.latestRunId);
|
|
1289
1330
|
await this.finalizeContinuedRun(thread.threadId, thread.latestRunId, runInput, actual, {
|
|
1290
1331
|
previousState: "resuming",
|
|
@@ -1293,6 +1334,7 @@ export class AgentHarnessRuntime {
|
|
|
1293
1334
|
});
|
|
1294
1335
|
}
|
|
1295
1336
|
catch (error) {
|
|
1337
|
+
this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
|
|
1296
1338
|
if (recoveryIntent.attempts + 1 >= this.recoveryConfig.maxRecoveryAttempts) {
|
|
1297
1339
|
await this.persistence.setRunState(thread.threadId, thread.latestRunId, "failed", recoveryIntent.checkpointRef);
|
|
1298
1340
|
await this.persistence.clearRecoveryIntent(thread.threadId, thread.latestRunId);
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import type { RuntimeHealthSnapshot, WorkspaceBundle } from "../contracts/types.js";
|
|
2
|
+
import type { RuntimePersistence } from "../persistence/types.js";
|
|
3
|
+
import type { MaintenanceLoopStatus } from "./checkpoint-maintenance.js";
|
|
4
|
+
/**
 * A pair of limits for one health metric.
 * NOTE(review): presumably the metric is reported as degraded / unhealthy once
 * it exceeds the respective limit — confirm against the implementation.
 */
type HealthThreshold = {
    degradedAbove: number;
    unhealthyAbove: number;
};
/** A threshold for a metric that also carries a `windowSeconds` setting. */
type WindowedHealthThreshold = HealthThreshold & {
    windowSeconds: number;
};
/** Settings that control the runtime health monitor. */
type HealthMonitorConfig = {
    enabled: boolean;
    evaluateIntervalSeconds: number;
    emitEvents: boolean;
    /** Per-metric limits. */
    thresholds: {
        llmErrorRate: WindowedHealthThreshold;
        llmP95LatencyMs: WindowedHealthThreshold;
        pendingApprovals: HealthThreshold;
        stuckRunSeconds: HealthThreshold;
        checkpointBytes: HealthThreshold;
        runtimeDbBytes: HealthThreshold;
        artifactBytes: HealthThreshold;
    };
};
|
|
41
|
+
/**
 * Dependencies handed to the health monitor by its owner. Live values are
 * exposed as getter callbacks rather than as snapshots.
 */
type HealthMonitorOptions = {
    workspace: WorkspaceBundle;
    persistence: RuntimePersistence;
    /** Current number of active run slots. */
    getActiveRunSlots: () => number;
    /** Current number of pending run slots. */
    getPendingRunSlots: () => number;
    /** Status of the checkpoint maintenance loop, or `null` when there is none. */
    getCheckpointMaintenanceStatus: () => MaintenanceLoopStatus | null;
    /** Status of the runtime-record maintenance loop, or `null` when there is none. */
    getRuntimeRecordMaintenanceStatus: () => MaintenanceLoopStatus | null;
    /** Optional sink for event payloads; may be synchronous or asynchronous. */
    publishEvent?: (payload: Record<string, unknown>) => void | Promise<void>;
};
|
|
50
|
+
/** Produces the health monitor configuration for a workspace; always returns a config object. */
export declare function readHealthMonitorConfig(
    workspace: WorkspaceBundle,
): HealthMonitorConfig;
|
|
51
|
+
/**
 * Collects LLM call samples and evaluates runtime health into a
 * `RuntimeHealthSnapshot`.
 */
export declare class HealthMonitor {
    private readonly options;
    private readonly config;
    private readonly runRoots;
    private readonly checkpointDbPaths;
    private readonly llmSamples;
    private timer;
    private latestSnapshot;
    private runtimeEventSequence;
    constructor(options: HealthMonitorOptions);
    /**
     * Records a successful LLM call.
     * @param latencyMs Duration of the call in milliseconds.
     * @param nowMs Sample time in epoch milliseconds; optional.
     */
    recordLlmSuccess(latencyMs: number, nowMs?: number): void;
    /**
     * Records a failed LLM call.
     * @param latencyMs Duration of the call in milliseconds.
     * @param nowMs Sample time in epoch milliseconds; optional.
     */
    recordLlmFailure(latencyMs: number, nowMs?: number): void;
    private recordLlmSample;
    start(): Promise<void>;
    stop(): Promise<void>;
    /** Resolves to a health snapshot. */
    getSnapshot(): Promise<RuntimeHealthSnapshot>;
    /**
     * Evaluates health and resolves to the resulting snapshot.
     * @param nowMs Evaluation time in epoch milliseconds; optional.
     */
    evaluate(nowMs?: number): Promise<RuntimeHealthSnapshot>;
    private llmStats;
    private evaluateLlmCheck;
    private llmSymptoms;
    private evaluatePersistenceCheck;
    private evaluateCapacityCheck;
    private capacitySymptoms;
    private evaluateWorkloadCheck;
    private workloadSymptoms;
    private countStuckRuns;
    private sumRuntimeDbBytes;
    private sumCheckpointDbBytes;
    private sumArtifactBytes;
}
|
|
81
|
+
export {};
|