@botbotgo/agent-harness 0.0.75 → 0.0.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +2 -1
- package/dist/api.js +3 -0
- package/dist/benchmark/checkpoint-resume-cost-benchmark.d.ts +33 -0
- package/dist/benchmark/checkpoint-resume-cost-benchmark.js +55 -0
- package/dist/benchmark/deepagent-local-model-benchmark.d.ts +27 -0
- package/dist/benchmark/deepagent-local-model-benchmark.js +35 -0
- package/dist/config/agents/direct.yaml +1 -1
- package/dist/config/agents/orchestra.yaml +1 -2
- package/dist/config/workspace.yaml +31 -0
- package/dist/contracts/types.d.ts +38 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/package-version.d.ts +1 -1
- package/dist/package-version.js +1 -1
- package/dist/persistence/file-store.d.ts +3 -40
- package/dist/persistence/file-store.js +5 -2
- package/dist/persistence/sqlite-store.d.ts +68 -0
- package/dist/persistence/sqlite-store.js +569 -0
- package/dist/persistence/types.d.ts +83 -0
- package/dist/persistence/types.js +1 -0
- package/dist/runtime/agent-runtime-adapter.d.ts +3 -0
- package/dist/runtime/agent-runtime-adapter.js +58 -2
- package/dist/runtime/checkpoint-maintenance.d.ts +11 -2
- package/dist/runtime/checkpoint-maintenance.js +41 -5
- package/dist/runtime/harness.d.ts +5 -1
- package/dist/runtime/harness.js +67 -4
- package/dist/runtime/health-monitor.d.ts +81 -0
- package/dist/runtime/health-monitor.js +448 -0
- package/dist/runtime/runtime-record-maintenance.d.ts +43 -0
- package/dist/runtime/runtime-record-maintenance.js +169 -0
- package/dist/runtime/store.d.ts +2 -0
- package/dist/runtime/store.js +38 -20
- package/dist/runtime/support/embedding-models.js +57 -1
- package/dist/runtime/thread-memory-sync.d.ts +3 -2
- package/dist/runtime/thread-memory-sync.js +7 -1
- package/dist/workspace/agent-binding-compiler.js +3 -1
- package/dist/workspace/support/workspace-ref-utils.d.ts +9 -0
- package/dist/workspace/support/workspace-ref-utils.js +38 -0
- package/package.json +2 -2
|
@@ -64,6 +64,9 @@ function computeRemainingTimeoutMs(deadlineAt, fallbackTimeoutMs) {
|
|
|
64
64
|
}
|
|
65
65
|
return fallbackTimeoutMs ? Math.min(fallbackTimeoutMs, remaining) : remaining;
|
|
66
66
|
}
|
|
67
|
+
function sleep(ms) {
|
|
68
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
69
|
+
}
|
|
67
70
|
function isPlaceholderApiKey(value) {
|
|
68
71
|
return typeof value === "string" && value.trim().toLowerCase() === "dummy";
|
|
69
72
|
}
|
|
@@ -485,6 +488,57 @@ export class AgentRuntimeAdapter {
|
|
|
485
488
|
}
|
|
486
489
|
return 15_000;
|
|
487
490
|
}
|
|
491
|
+
resolveProviderRetryPolicy(binding) {
|
|
492
|
+
const resilience = typeof binding.harnessRuntime.resilience === "object" && binding.harnessRuntime.resilience
|
|
493
|
+
? binding.harnessRuntime.resilience
|
|
494
|
+
: {};
|
|
495
|
+
const providerRetries = typeof resilience.providerRetries === "object" && resilience.providerRetries
|
|
496
|
+
? resilience.providerRetries
|
|
497
|
+
: {};
|
|
498
|
+
const maxAttempts = typeof providerRetries.maxAttempts === "number" &&
|
|
499
|
+
Number.isFinite(providerRetries.maxAttempts) &&
|
|
500
|
+
providerRetries.maxAttempts > 0
|
|
501
|
+
? Math.floor(providerRetries.maxAttempts)
|
|
502
|
+
: 2;
|
|
503
|
+
const backoffMs = typeof providerRetries.backoffMs === "number" &&
|
|
504
|
+
Number.isFinite(providerRetries.backoffMs) &&
|
|
505
|
+
providerRetries.backoffMs >= 0
|
|
506
|
+
? Math.floor(providerRetries.backoffMs)
|
|
507
|
+
: 1_000;
|
|
508
|
+
const retryableMessages = Array.isArray(providerRetries.retryableMessages)
|
|
509
|
+
? providerRetries.retryableMessages.filter((value) => typeof value === "string" && value.trim().length > 0)
|
|
510
|
+
: [];
|
|
511
|
+
return {
|
|
512
|
+
maxAttempts,
|
|
513
|
+
backoffMs,
|
|
514
|
+
retryableMessages,
|
|
515
|
+
};
|
|
516
|
+
}
|
|
517
|
+
isRetryableProviderError(binding, error) {
|
|
518
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
519
|
+
const normalized = message.toLowerCase();
|
|
520
|
+
const { retryableMessages } = this.resolveProviderRetryPolicy(binding);
|
|
521
|
+
return retryableMessages.some((candidate) => normalized.includes(candidate.toLowerCase()));
|
|
522
|
+
}
|
|
523
|
+
async invokeWithProviderRetry(binding, operation) {
|
|
524
|
+
const retryPolicy = this.resolveProviderRetryPolicy(binding);
|
|
525
|
+
let lastError;
|
|
526
|
+
for (let attempt = 1; attempt <= retryPolicy.maxAttempts; attempt += 1) {
|
|
527
|
+
try {
|
|
528
|
+
return await operation();
|
|
529
|
+
}
|
|
530
|
+
catch (error) {
|
|
531
|
+
lastError = error;
|
|
532
|
+
if (attempt >= retryPolicy.maxAttempts || !this.isRetryableProviderError(binding, error)) {
|
|
533
|
+
throw error;
|
|
534
|
+
}
|
|
535
|
+
if (retryPolicy.backoffMs > 0) {
|
|
536
|
+
await sleep(retryPolicy.backoffMs);
|
|
537
|
+
}
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
throw lastError instanceof Error ? lastError : new Error(String(lastError));
|
|
541
|
+
}
|
|
488
542
|
async withTimeout(producer, timeoutMs, operation, stage = operation.includes("stream") ? "stream" : "invoke") {
|
|
489
543
|
if (!timeoutMs) {
|
|
490
544
|
return Promise.resolve(producer());
|
|
@@ -1204,8 +1258,10 @@ export class AgentRuntimeAdapter {
|
|
|
1204
1258
|
: new Command({ resume: resumePayload });
|
|
1205
1259
|
let result;
|
|
1206
1260
|
const callRuntime = async (activeBinding, activeRequest) => {
|
|
1207
|
-
|
|
1208
|
-
|
|
1261
|
+
return this.invokeWithProviderRetry(activeBinding, async () => {
|
|
1262
|
+
const runnable = await this.create(activeBinding);
|
|
1263
|
+
return (await this.withTimeout(() => runnable.invoke(activeRequest, { configurable: { thread_id: threadId }, ...(options.context ? { context: options.context } : {}) }), this.resolveBindingTimeout(activeBinding), "agent invoke", "invoke"));
|
|
1264
|
+
});
|
|
1209
1265
|
};
|
|
1210
1266
|
const callRuntimeWithToolParseRecovery = async (activeRequest) => {
|
|
1211
1267
|
try {
|
|
@@ -18,18 +18,27 @@ type CheckpointMaintenanceTarget = {
|
|
|
18
18
|
agentId: string;
|
|
19
19
|
dbPath: string;
|
|
20
20
|
};
|
|
21
|
+
export type MaintenanceLoopStatus = {
|
|
22
|
+
lastStartedAt?: string;
|
|
23
|
+
lastCompletedAt?: string;
|
|
24
|
+
lastFailedAt?: string;
|
|
25
|
+
consecutiveFailures: number;
|
|
26
|
+
lastError?: string;
|
|
27
|
+
};
|
|
21
28
|
export declare function readCheckpointMaintenanceConfig(workspace: WorkspaceBundle): CheckpointMaintenanceConfig | null;
|
|
22
29
|
export declare function discoverCheckpointMaintenanceTargets(workspace: WorkspaceBundle): CheckpointMaintenanceTarget[];
|
|
23
|
-
export declare function maintainSqliteCheckpoints(dbPath: string, config: CheckpointMaintenanceConfig, nowMs?: number): {
|
|
30
|
+
export declare function maintainSqliteCheckpoints(dbPath: string, config: CheckpointMaintenanceConfig, nowMs?: number): Promise<{
|
|
24
31
|
deletedCount: number;
|
|
25
|
-
}
|
|
32
|
+
}>;
|
|
26
33
|
export declare class CheckpointMaintenanceLoop {
|
|
27
34
|
private readonly targets;
|
|
28
35
|
private readonly config;
|
|
29
36
|
private timer;
|
|
30
37
|
private running;
|
|
38
|
+
private status;
|
|
31
39
|
constructor(targets: CheckpointMaintenanceTarget[], config: CheckpointMaintenanceConfig);
|
|
32
40
|
runOnce(nowMs?: number): Promise<void>;
|
|
41
|
+
getStatus(): MaintenanceLoopStatus;
|
|
33
42
|
start(): Promise<void>;
|
|
34
43
|
stop(): Promise<void>;
|
|
35
44
|
}
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import path from "node:path";
|
|
2
2
|
import { SqliteSaver } from "@langchain/langgraph-checkpoint-sqlite";
|
|
3
|
+
import { listProtectedCheckpointThreadIds } from "../persistence/sqlite-store.js";
|
|
4
|
+
import { fileExists } from "../utils/fs.js";
|
|
3
5
|
import { getRuntimeDefaults } from "../workspace/support/workspace-ref-utils.js";
|
|
4
6
|
import { ManagedSqliteSaver } from "./sqlite-maintained-checkpoint-saver.js";
|
|
5
7
|
function asObject(value) {
|
|
@@ -135,11 +137,18 @@ function totalCheckpointBytes(db) {
|
|
|
135
137
|
return Number(checkpointsBytes.total ?? 0) + Number(writesBytes.total ?? 0);
|
|
136
138
|
}
|
|
137
139
|
export function maintainSqliteCheckpoints(dbPath, config, nowMs = Date.now()) {
|
|
140
|
+
return maintainSqliteCheckpointsInternal(dbPath, config, nowMs);
|
|
141
|
+
}
|
|
142
|
+
async function maintainSqliteCheckpointsInternal(dbPath, config, nowMs) {
|
|
143
|
+
if (!(await fileExists(dbPath))) {
|
|
144
|
+
return { deletedCount: 0 };
|
|
145
|
+
}
|
|
138
146
|
const saver = new ManagedSqliteSaver(SqliteSaver.fromConnString(dbPath).db);
|
|
139
147
|
const db = saver.db;
|
|
140
148
|
try {
|
|
141
149
|
saver.prepareMaintenance();
|
|
142
150
|
backfillCheckpointMetadata(db, nowMs);
|
|
151
|
+
const protectedThreadIds = await listProtectedCheckpointThreadIds(path.join(path.dirname(dbPath), "runtime.sqlite"));
|
|
143
152
|
let deletedCount = 0;
|
|
144
153
|
if (config.policies.maxAgeSeconds !== undefined) {
|
|
145
154
|
const cutoffMs = nowMs - config.policies.maxAgeSeconds * 1000;
|
|
@@ -154,13 +163,13 @@ export function maintainSqliteCheckpoints(dbPath, config, nowMs = Date.now()) {
|
|
|
154
163
|
WHERE meta.created_at_ms <= ?
|
|
155
164
|
ORDER BY meta.created_at_ms ASC, meta.checkpoint_id ASC
|
|
156
165
|
LIMIT ?`)
|
|
157
|
-
.all(cutoffMs, config.sqlite.sweepBatchSize);
|
|
158
|
-
deletedCount += deleteCheckpointRows(db, expired);
|
|
166
|
+
.all(cutoffMs, config.sqlite.sweepBatchSize * 4);
|
|
167
|
+
deletedCount += deleteCheckpointRows(db, expired.filter((row) => !protectedThreadIds.has(row.thread_id)).slice(0, config.sqlite.sweepBatchSize));
|
|
159
168
|
}
|
|
160
169
|
if (config.policies.maxBytes !== undefined) {
|
|
161
170
|
let currentBytes = totalCheckpointBytes(db);
|
|
162
171
|
while (currentBytes > config.policies.maxBytes) {
|
|
163
|
-
const oldest = selectOldestRows(db, config.sqlite.sweepBatchSize);
|
|
172
|
+
const oldest = selectOldestRows(db, config.sqlite.sweepBatchSize * 4).filter((row) => !protectedThreadIds.has(row.thread_id));
|
|
164
173
|
if (oldest.length === 0) {
|
|
165
174
|
break;
|
|
166
175
|
}
|
|
@@ -191,15 +200,42 @@ export class CheckpointMaintenanceLoop {
|
|
|
191
200
|
config;
|
|
192
201
|
timer = null;
|
|
193
202
|
running = false;
|
|
203
|
+
status = {
|
|
204
|
+
consecutiveFailures: 0,
|
|
205
|
+
};
|
|
194
206
|
constructor(targets, config) {
|
|
195
207
|
this.targets = targets;
|
|
196
208
|
this.config = config;
|
|
197
209
|
}
|
|
198
210
|
async runOnce(nowMs = Date.now()) {
|
|
199
|
-
|
|
200
|
-
|
|
211
|
+
this.status = {
|
|
212
|
+
...this.status,
|
|
213
|
+
lastStartedAt: new Date(nowMs).toISOString(),
|
|
214
|
+
};
|
|
215
|
+
try {
|
|
216
|
+
for (const target of this.targets) {
|
|
217
|
+
await maintainSqliteCheckpoints(target.dbPath, this.config, nowMs);
|
|
218
|
+
}
|
|
219
|
+
this.status = {
|
|
220
|
+
...this.status,
|
|
221
|
+
lastCompletedAt: new Date(nowMs).toISOString(),
|
|
222
|
+
consecutiveFailures: 0,
|
|
223
|
+
lastError: undefined,
|
|
224
|
+
};
|
|
225
|
+
}
|
|
226
|
+
catch (error) {
|
|
227
|
+
this.status = {
|
|
228
|
+
...this.status,
|
|
229
|
+
lastFailedAt: new Date(nowMs).toISOString(),
|
|
230
|
+
consecutiveFailures: this.status.consecutiveFailures + 1,
|
|
231
|
+
lastError: error instanceof Error ? error.message : String(error),
|
|
232
|
+
};
|
|
233
|
+
throw error;
|
|
201
234
|
}
|
|
202
235
|
}
|
|
236
|
+
getStatus() {
|
|
237
|
+
return { ...this.status };
|
|
238
|
+
}
|
|
203
239
|
async start() {
|
|
204
240
|
if (this.running) {
|
|
205
241
|
return;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { ApprovalRecord, HarnessEvent, HarnessStreamItem, MessageContent, RunRecord, RunStartOptions, RestartConversationOptions, RuntimeAdapterOptions, ResumeOptions, RunOptions, RunResult, RunSummary, ThreadSummary, ThreadRecord, WorkspaceBundle } from "../contracts/types.js";
|
|
1
|
+
import type { ApprovalRecord, HarnessEvent, HarnessStreamItem, RuntimeHealthSnapshot, MessageContent, RunRecord, RunStartOptions, RestartConversationOptions, RuntimeAdapterOptions, ResumeOptions, RunOptions, RunResult, RunSummary, ThreadSummary, ThreadRecord, WorkspaceBundle } from "../contracts/types.js";
|
|
2
2
|
import { type ToolMcpServerOptions } from "../mcp.js";
|
|
3
3
|
import { type InventoryAgentRecord, type InventorySkillRecord } from "./inventory.js";
|
|
4
4
|
import type { RequirementAssessmentOptions } from "./skill-requirements.js";
|
|
@@ -23,10 +23,13 @@ export declare class AgentHarnessRuntime {
|
|
|
23
23
|
private readonly unregisterThreadMemorySync;
|
|
24
24
|
private readonly resolvedRuntimeAdapterOptions;
|
|
25
25
|
private readonly checkpointMaintenance;
|
|
26
|
+
private readonly runtimeRecordMaintenance;
|
|
27
|
+
private readonly healthMonitor;
|
|
26
28
|
private readonly recoveryConfig;
|
|
27
29
|
private readonly concurrencyConfig;
|
|
28
30
|
private activeRunSlots;
|
|
29
31
|
private readonly pendingRunSlots;
|
|
32
|
+
private runtimeEventSequence;
|
|
30
33
|
private toPublicApprovalRecord;
|
|
31
34
|
private normalizeInvocationEnvelope;
|
|
32
35
|
private isTerminalRunState;
|
|
@@ -42,6 +45,7 @@ export declare class AgentHarnessRuntime {
|
|
|
42
45
|
constructor(workspace: WorkspaceBundle, runtimeAdapterOptions?: RuntimeAdapterOptions);
|
|
43
46
|
initialize(): Promise<void>;
|
|
44
47
|
subscribe(listener: (event: HarnessEvent) => void): () => void;
|
|
48
|
+
getHealth(): Promise<RuntimeHealthSnapshot>;
|
|
45
49
|
private getBinding;
|
|
46
50
|
private listAgentTools;
|
|
47
51
|
private resolveAgentTools;
|
package/dist/runtime/harness.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { AUTO_AGENT_ID } from "../contracts/types.js";
|
|
2
|
-
import {
|
|
2
|
+
import { SqlitePersistence } from "../persistence/sqlite-store.js";
|
|
3
3
|
import { createPersistentId } from "../utils/id.js";
|
|
4
4
|
import { AGENT_INTERRUPT_SENTINEL_PREFIX, AgentRuntimeAdapter, RuntimeOperationTimeoutError } from "./agent-runtime-adapter.js";
|
|
5
5
|
import { createResourceBackendResolver, createResourceToolResolver } from "../resource/resource.js";
|
|
@@ -13,6 +13,8 @@ import { resolveCompiledVectorStore, resolveCompiledVectorStoreRef } from "./sup
|
|
|
13
13
|
import { ThreadMemorySync } from "./thread-memory-sync.js";
|
|
14
14
|
import { FileBackedStore } from "./store.js";
|
|
15
15
|
import { CheckpointMaintenanceLoop, discoverCheckpointMaintenanceTargets, readCheckpointMaintenanceConfig, } from "./checkpoint-maintenance.js";
|
|
16
|
+
import { RuntimeRecordMaintenanceLoop, discoverRuntimeRecordMaintenanceTargets, readRuntimeRecordMaintenanceConfig, } from "./runtime-record-maintenance.js";
|
|
17
|
+
import { HealthMonitor } from "./health-monitor.js";
|
|
16
18
|
import { extractMessageText, normalizeMessageContent } from "../utils/message-content.js";
|
|
17
19
|
import { createToolMcpServerFromTools, serveToolsOverStdioFromHarness } from "../mcp.js";
|
|
18
20
|
import { getBindingAdapterKind, getBindingPrimaryTools, getBindingStoreConfig, isDeepAgentBinding } from "./support/compiled-binding.js";
|
|
@@ -38,10 +40,13 @@ export class AgentHarnessRuntime {
|
|
|
38
40
|
unregisterThreadMemorySync;
|
|
39
41
|
resolvedRuntimeAdapterOptions;
|
|
40
42
|
checkpointMaintenance;
|
|
43
|
+
runtimeRecordMaintenance;
|
|
44
|
+
healthMonitor;
|
|
41
45
|
recoveryConfig;
|
|
42
46
|
concurrencyConfig;
|
|
43
47
|
activeRunSlots = 0;
|
|
44
48
|
pendingRunSlots = [];
|
|
49
|
+
runtimeEventSequence = 0;
|
|
45
50
|
toPublicApprovalRecord(approval) {
|
|
46
51
|
const { toolCallId: _toolCallId, checkpointRef: _checkpointRef, eventRefs: _eventRefs, ...publicApproval } = approval;
|
|
47
52
|
return publicApproval;
|
|
@@ -141,7 +146,7 @@ export class AgentHarnessRuntime {
|
|
|
141
146
|
this.workspace = workspace;
|
|
142
147
|
this.runtimeAdapterOptions = runtimeAdapterOptions;
|
|
143
148
|
const runRoot = this.defaultRunRoot();
|
|
144
|
-
this.persistence = new
|
|
149
|
+
this.persistence = new SqlitePersistence(runRoot);
|
|
145
150
|
const defaultStoreConfig = this.listHostBindings()[0]?.harnessRuntime.store;
|
|
146
151
|
this.defaultStore = this.resolveStoreFromConfig(defaultStoreConfig, runRoot) ?? new FileBackedStore(`${runRoot}/store.json`);
|
|
147
152
|
const runtimeMemoryStoreConfig = typeof this.listHostBindings()[0]?.harnessRuntime.runtimeMemory?.store === "object" &&
|
|
@@ -189,17 +194,37 @@ export class AgentHarnessRuntime {
|
|
|
189
194
|
this.checkpointMaintenance = checkpointMaintenanceConfig
|
|
190
195
|
? new CheckpointMaintenanceLoop(discoverCheckpointMaintenanceTargets(workspace), checkpointMaintenanceConfig)
|
|
191
196
|
: null;
|
|
197
|
+
const runtimeRecordMaintenanceConfig = readRuntimeRecordMaintenanceConfig(workspace);
|
|
198
|
+
this.runtimeRecordMaintenance = runtimeRecordMaintenanceConfig
|
|
199
|
+
? new RuntimeRecordMaintenanceLoop(discoverRuntimeRecordMaintenanceTargets(workspace), runtimeRecordMaintenanceConfig)
|
|
200
|
+
: null;
|
|
192
201
|
this.recoveryConfig = getRecoveryConfig(workspace.refs);
|
|
193
202
|
this.concurrencyConfig = getConcurrencyConfig(workspace.refs);
|
|
203
|
+
this.healthMonitor = new HealthMonitor({
|
|
204
|
+
workspace,
|
|
205
|
+
persistence: this.persistence,
|
|
206
|
+
getActiveRunSlots: () => this.activeRunSlots,
|
|
207
|
+
getPendingRunSlots: () => this.pendingRunSlots.length,
|
|
208
|
+
getCheckpointMaintenanceStatus: () => this.checkpointMaintenance?.getStatus() ?? null,
|
|
209
|
+
getRuntimeRecordMaintenanceStatus: () => this.runtimeRecordMaintenance?.getStatus() ?? null,
|
|
210
|
+
publishEvent: async (payload) => {
|
|
211
|
+
this.eventBus.publish(createHarnessEvent("__runtime__", "__runtime__", ++this.runtimeEventSequence, "runtime.health.changed", payload));
|
|
212
|
+
},
|
|
213
|
+
});
|
|
194
214
|
}
|
|
195
215
|
async initialize() {
|
|
196
216
|
await this.persistence.initialize();
|
|
197
217
|
await this.checkpointMaintenance?.start();
|
|
218
|
+
await this.runtimeRecordMaintenance?.start();
|
|
219
|
+
await this.healthMonitor.start();
|
|
198
220
|
await this.recoverStartupRuns();
|
|
199
221
|
}
|
|
200
222
|
subscribe(listener) {
|
|
201
223
|
return this.eventBus.subscribe(listener);
|
|
202
224
|
}
|
|
225
|
+
async getHealth() {
|
|
226
|
+
return this.healthMonitor.getSnapshot();
|
|
227
|
+
}
|
|
203
228
|
getBinding(agentId) {
|
|
204
229
|
return this.workspace.bindings.get(agentId);
|
|
205
230
|
}
|
|
@@ -457,7 +482,16 @@ export class AgentHarnessRuntime {
|
|
|
457
482
|
}
|
|
458
483
|
async invokeWithHistory(binding, input, threadId, runId, resumePayload, options = {}) {
|
|
459
484
|
const priorHistory = await this.loadPriorHistory(threadId, runId);
|
|
460
|
-
|
|
485
|
+
const startedAt = Date.now();
|
|
486
|
+
try {
|
|
487
|
+
const result = await this.runtimeAdapter.invoke(binding, input, threadId, runId, resumePayload, priorHistory, options);
|
|
488
|
+
this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
|
|
489
|
+
return result;
|
|
490
|
+
}
|
|
491
|
+
catch (error) {
|
|
492
|
+
this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
|
|
493
|
+
throw error;
|
|
494
|
+
}
|
|
461
495
|
}
|
|
462
496
|
buildPersistedRunRequest(input, invocation) {
|
|
463
497
|
const envelope = invocation.invocation ?? {
|
|
@@ -830,6 +864,7 @@ export class AgentHarnessRuntime {
|
|
|
830
864
|
return;
|
|
831
865
|
}
|
|
832
866
|
let emitted = false;
|
|
867
|
+
let streamActivityObserved = false;
|
|
833
868
|
const { threadId, runId } = await this.ensureThreadStarted(selectedAgentId, binding, options.input, options.threadId);
|
|
834
869
|
await this.persistence.saveRunRequest(threadId, runId, this.buildPersistedRunRequest(options.input, invocation));
|
|
835
870
|
yield { type: "event", event: await this.emitRunCreated(threadId, runId, {
|
|
@@ -851,6 +886,7 @@ export class AgentHarnessRuntime {
|
|
|
851
886
|
files: invocation.files,
|
|
852
887
|
})) {
|
|
853
888
|
if (chunk) {
|
|
889
|
+
streamActivityObserved = true;
|
|
854
890
|
const normalizedChunk = typeof chunk === "string"
|
|
855
891
|
? chunk.startsWith(AGENT_INTERRUPT_SENTINEL_PREFIX)
|
|
856
892
|
? { kind: "interrupt", content: chunk.slice(AGENT_INTERRUPT_SENTINEL_PREFIX.length) }
|
|
@@ -964,11 +1000,30 @@ export class AgentHarnessRuntime {
|
|
|
964
1000
|
return;
|
|
965
1001
|
}
|
|
966
1002
|
catch (error) {
|
|
967
|
-
if (emitted) {
|
|
1003
|
+
if (emitted || streamActivityObserved) {
|
|
1004
|
+
const runtimeFailure = renderRuntimeFailure(error);
|
|
968
1005
|
yield { type: "event", event: await this.setRunStateAndEmit(threadId, runId, 6, "failed", {
|
|
969
1006
|
previousState: "running",
|
|
970
1007
|
error: error instanceof Error ? error.message : String(error),
|
|
971
1008
|
}) };
|
|
1009
|
+
yield {
|
|
1010
|
+
type: "content",
|
|
1011
|
+
threadId,
|
|
1012
|
+
runId,
|
|
1013
|
+
agentId: selectedAgentId,
|
|
1014
|
+
content: runtimeFailure,
|
|
1015
|
+
};
|
|
1016
|
+
yield {
|
|
1017
|
+
type: "result",
|
|
1018
|
+
result: {
|
|
1019
|
+
threadId,
|
|
1020
|
+
runId,
|
|
1021
|
+
agentId: selectedAgentId,
|
|
1022
|
+
state: "failed",
|
|
1023
|
+
output: runtimeFailure,
|
|
1024
|
+
finalMessageText: runtimeFailure,
|
|
1025
|
+
},
|
|
1026
|
+
};
|
|
972
1027
|
return;
|
|
973
1028
|
}
|
|
974
1029
|
if (error instanceof RuntimeOperationTimeoutError && error.stage === "invoke") {
|
|
@@ -1097,8 +1152,10 @@ export class AgentHarnessRuntime {
|
|
|
1097
1152
|
const history = await this.persistence.listThreadMessages(threadId);
|
|
1098
1153
|
const priorHistory = history.filter((message) => message.runId !== runId);
|
|
1099
1154
|
const runInput = await this.loadRunInput(threadId, runId);
|
|
1155
|
+
const startedAt = Date.now();
|
|
1100
1156
|
try {
|
|
1101
1157
|
const actual = await this.runtimeAdapter.invoke(binding, "", threadId, runId, resumePayload, priorHistory);
|
|
1158
|
+
this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
|
|
1102
1159
|
await this.persistence.clearRecoveryIntent(threadId, runId);
|
|
1103
1160
|
const finalized = await this.finalizeContinuedRun(threadId, runId, runInput, actual, {
|
|
1104
1161
|
previousState: "resuming",
|
|
@@ -1112,6 +1169,7 @@ export class AgentHarnessRuntime {
|
|
|
1112
1169
|
};
|
|
1113
1170
|
}
|
|
1114
1171
|
catch (error) {
|
|
1172
|
+
this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
|
|
1115
1173
|
throw error;
|
|
1116
1174
|
}
|
|
1117
1175
|
}
|
|
@@ -1182,7 +1240,9 @@ export class AgentHarnessRuntime {
|
|
|
1182
1240
|
};
|
|
1183
1241
|
}
|
|
1184
1242
|
async close() {
|
|
1243
|
+
await this.healthMonitor.stop();
|
|
1185
1244
|
await this.checkpointMaintenance?.stop();
|
|
1245
|
+
await this.runtimeRecordMaintenance?.stop();
|
|
1186
1246
|
this.unregisterThreadMemorySync();
|
|
1187
1247
|
await this.threadMemorySync.close();
|
|
1188
1248
|
}
|
|
@@ -1283,8 +1343,10 @@ export class AgentHarnessRuntime {
|
|
|
1283
1343
|
const history = await this.persistence.listThreadMessages(thread.threadId);
|
|
1284
1344
|
const priorHistory = history.filter((message) => message.runId !== thread.latestRunId);
|
|
1285
1345
|
const runInput = await this.loadRunInput(thread.threadId, thread.latestRunId);
|
|
1346
|
+
const startedAt = Date.now();
|
|
1286
1347
|
try {
|
|
1287
1348
|
const actual = await this.runtimeAdapter.invoke(binding, "", thread.threadId, thread.latestRunId, recoveryIntent.resumePayload, priorHistory);
|
|
1349
|
+
this.healthMonitor.recordLlmSuccess(Date.now() - startedAt);
|
|
1288
1350
|
await this.persistence.clearRecoveryIntent(thread.threadId, thread.latestRunId);
|
|
1289
1351
|
await this.finalizeContinuedRun(thread.threadId, thread.latestRunId, runInput, actual, {
|
|
1290
1352
|
previousState: "resuming",
|
|
@@ -1293,6 +1355,7 @@ export class AgentHarnessRuntime {
|
|
|
1293
1355
|
});
|
|
1294
1356
|
}
|
|
1295
1357
|
catch (error) {
|
|
1358
|
+
this.healthMonitor.recordLlmFailure(Date.now() - startedAt);
|
|
1296
1359
|
if (recoveryIntent.attempts + 1 >= this.recoveryConfig.maxRecoveryAttempts) {
|
|
1297
1360
|
await this.persistence.setRunState(thread.threadId, thread.latestRunId, "failed", recoveryIntent.checkpointRef);
|
|
1298
1361
|
await this.persistence.clearRecoveryIntent(thread.threadId, thread.latestRunId);
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import type { RuntimeHealthSnapshot, WorkspaceBundle } from "../contracts/types.js";
|
|
2
|
+
import type { RuntimePersistence } from "../persistence/types.js";
|
|
3
|
+
import type { MaintenanceLoopStatus } from "./checkpoint-maintenance.js";
|
|
4
|
+
type HealthMonitorConfig = {
|
|
5
|
+
enabled: boolean;
|
|
6
|
+
evaluateIntervalSeconds: number;
|
|
7
|
+
emitEvents: boolean;
|
|
8
|
+
thresholds: {
|
|
9
|
+
llmErrorRate: {
|
|
10
|
+
windowSeconds: number;
|
|
11
|
+
degradedAbove: number;
|
|
12
|
+
unhealthyAbove: number;
|
|
13
|
+
};
|
|
14
|
+
llmP95LatencyMs: {
|
|
15
|
+
windowSeconds: number;
|
|
16
|
+
degradedAbove: number;
|
|
17
|
+
unhealthyAbove: number;
|
|
18
|
+
};
|
|
19
|
+
pendingApprovals: {
|
|
20
|
+
degradedAbove: number;
|
|
21
|
+
unhealthyAbove: number;
|
|
22
|
+
};
|
|
23
|
+
stuckRunSeconds: {
|
|
24
|
+
degradedAbove: number;
|
|
25
|
+
unhealthyAbove: number;
|
|
26
|
+
};
|
|
27
|
+
checkpointBytes: {
|
|
28
|
+
degradedAbove: number;
|
|
29
|
+
unhealthyAbove: number;
|
|
30
|
+
};
|
|
31
|
+
runtimeDbBytes: {
|
|
32
|
+
degradedAbove: number;
|
|
33
|
+
unhealthyAbove: number;
|
|
34
|
+
};
|
|
35
|
+
artifactBytes: {
|
|
36
|
+
degradedAbove: number;
|
|
37
|
+
unhealthyAbove: number;
|
|
38
|
+
};
|
|
39
|
+
};
|
|
40
|
+
};
|
|
41
|
+
type HealthMonitorOptions = {
|
|
42
|
+
workspace: WorkspaceBundle;
|
|
43
|
+
persistence: RuntimePersistence;
|
|
44
|
+
getActiveRunSlots: () => number;
|
|
45
|
+
getPendingRunSlots: () => number;
|
|
46
|
+
getCheckpointMaintenanceStatus: () => MaintenanceLoopStatus | null;
|
|
47
|
+
getRuntimeRecordMaintenanceStatus: () => MaintenanceLoopStatus | null;
|
|
48
|
+
publishEvent?: (payload: Record<string, unknown>) => void | Promise<void>;
|
|
49
|
+
};
|
|
50
|
+
export declare function readHealthMonitorConfig(workspace: WorkspaceBundle): HealthMonitorConfig;
|
|
51
|
+
export declare class HealthMonitor {
|
|
52
|
+
private readonly options;
|
|
53
|
+
private readonly config;
|
|
54
|
+
private readonly runRoots;
|
|
55
|
+
private readonly checkpointDbPaths;
|
|
56
|
+
private readonly llmSamples;
|
|
57
|
+
private timer;
|
|
58
|
+
private latestSnapshot;
|
|
59
|
+
private runtimeEventSequence;
|
|
60
|
+
constructor(options: HealthMonitorOptions);
|
|
61
|
+
recordLlmSuccess(latencyMs: number, nowMs?: number): void;
|
|
62
|
+
recordLlmFailure(latencyMs: number, nowMs?: number): void;
|
|
63
|
+
private recordLlmSample;
|
|
64
|
+
start(): Promise<void>;
|
|
65
|
+
stop(): Promise<void>;
|
|
66
|
+
getSnapshot(): Promise<RuntimeHealthSnapshot>;
|
|
67
|
+
evaluate(nowMs?: number): Promise<RuntimeHealthSnapshot>;
|
|
68
|
+
private llmStats;
|
|
69
|
+
private evaluateLlmCheck;
|
|
70
|
+
private llmSymptoms;
|
|
71
|
+
private evaluatePersistenceCheck;
|
|
72
|
+
private evaluateCapacityCheck;
|
|
73
|
+
private capacitySymptoms;
|
|
74
|
+
private evaluateWorkloadCheck;
|
|
75
|
+
private workloadSymptoms;
|
|
76
|
+
private countStuckRuns;
|
|
77
|
+
private sumRuntimeDbBytes;
|
|
78
|
+
private sumCheckpointDbBytes;
|
|
79
|
+
private sumArtifactBytes;
|
|
80
|
+
}
|
|
81
|
+
export {};
|