@botbotgo/agent-harness 0.0.104 → 0.0.106
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package-version.d.ts +1 -1
- package/dist/package-version.js +1 -1
- package/dist/runtime/harness/run/routing.d.ts +2 -2
- package/dist/runtime/harness/run/routing.js +3 -4
- package/dist/runtime/harness/system/health-monitor.d.ts +0 -23
- package/dist/runtime/harness/system/health-monitor.js +56 -254
- package/dist/runtime/harness/system/inventory.js +1 -2
- package/dist/runtime/harness.d.ts +0 -1
- package/dist/runtime/harness.js +4 -6
- package/dist/runtime/support/harness-support.d.ts +0 -2
- package/dist/runtime/support/harness-support.js +5 -35
- package/dist/workspace/agent-binding-compiler.js +0 -3
- package/package.json +1 -1
- package/dist/runtime/support/runtime-entry.d.ts +0 -2
- package/dist/runtime/support/runtime-entry.js +0 -3
package/dist/package-version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const AGENT_HARNESS_VERSION = "0.0.
|
|
1
|
+
export declare const AGENT_HARNESS_VERSION = "0.0.105";
|
package/dist/package-version.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export const AGENT_HARNESS_VERSION = "0.0.
|
|
1
|
+
export const AGENT_HARNESS_VERSION = "0.0.105";
|
package/dist/runtime/harness/run/routing.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import { type MessageContent, type ThreadSummary, type WorkspaceBundle } from "../../../contracts/types.js";
|
|
2
|
-
export declare function getDefaultHostAgentId(workspace: WorkspaceBundle, preferredHostAgentId
|
|
2
|
+
export declare function getDefaultHostAgentId(workspace: WorkspaceBundle, preferredHostAgentId?: string): string;
|
|
3
3
|
export declare function resolveSelectedAgentId(options: {
|
|
4
4
|
workspace: WorkspaceBundle;
|
|
5
5
|
input: MessageContent;
|
|
6
6
|
requestedAgentId?: string;
|
|
7
7
|
threadId?: string;
|
|
8
|
-
preferredHostAgentId
|
|
8
|
+
preferredHostAgentId?: string;
|
|
9
9
|
getThreadSummary: (threadId: string) => Promise<ThreadSummary | null>;
|
|
10
10
|
}): Promise<string>;
|
package/dist/runtime/harness/run/routing.js
CHANGED
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
import { AUTO_AGENT_ID } from "../../../contracts/types.js";
|
|
2
2
|
import { inferRoutingBindings } from "../../support/harness-support.js";
|
|
3
|
-
import { isRuntimeEntryBinding } from "../../support/runtime-entry.js";
|
|
4
3
|
export function getDefaultHostAgentId(workspace, preferredHostAgentId) {
|
|
5
|
-
const preferredBinding = workspace.bindings.get(preferredHostAgentId);
|
|
6
|
-
if (preferredBinding
|
|
4
|
+
const preferredBinding = preferredHostAgentId ? workspace.bindings.get(preferredHostAgentId) : undefined;
|
|
5
|
+
if (preferredBinding) {
|
|
7
6
|
return preferredBinding.agent.id;
|
|
8
7
|
}
|
|
9
8
|
return inferRoutingBindings(workspace).primaryBinding?.agent.id ?? "agent";
|
|
@@ -14,7 +13,7 @@ export async function resolveSelectedAgentId(options) {
|
|
|
14
13
|
if (threadId) {
|
|
15
14
|
const thread = await getThreadSummary(threadId);
|
|
16
15
|
const threadBinding = thread ? workspace.bindings.get(thread.agentId) : undefined;
|
|
17
|
-
if (thread?.agentId && threadBinding
|
|
16
|
+
if (thread?.agentId && threadBinding) {
|
|
18
17
|
return thread.agentId;
|
|
19
18
|
}
|
|
20
19
|
}
|
package/dist/runtime/harness/system/health-monitor.d.ts
CHANGED
|
@@ -24,18 +24,6 @@ type HealthMonitorConfig = {
|
|
|
24
24
|
degradedAbove: number;
|
|
25
25
|
unhealthyAbove: number;
|
|
26
26
|
};
|
|
27
|
-
checkpointBytes: {
|
|
28
|
-
degradedAbove: number;
|
|
29
|
-
unhealthyAbove: number;
|
|
30
|
-
};
|
|
31
|
-
runtimeDbBytes: {
|
|
32
|
-
degradedAbove: number;
|
|
33
|
-
unhealthyAbove: number;
|
|
34
|
-
};
|
|
35
|
-
artifactBytes: {
|
|
36
|
-
degradedAbove: number;
|
|
37
|
-
unhealthyAbove: number;
|
|
38
|
-
};
|
|
39
27
|
};
|
|
40
28
|
};
|
|
41
29
|
type HealthMonitorOptions = {
|
|
@@ -51,12 +39,9 @@ export declare function readHealthMonitorConfig(workspace: WorkspaceBundle): Hea
|
|
|
51
39
|
export declare class HealthMonitor {
|
|
52
40
|
private readonly options;
|
|
53
41
|
private readonly config;
|
|
54
|
-
private readonly runRoots;
|
|
55
|
-
private readonly checkpointDbPaths;
|
|
56
42
|
private readonly llmSamples;
|
|
57
43
|
private timer;
|
|
58
44
|
private latestSnapshot;
|
|
59
|
-
private runtimeEventSequence;
|
|
60
45
|
constructor(options: HealthMonitorOptions);
|
|
61
46
|
recordLlmSuccess(latencyMs: number, nowMs?: number): void;
|
|
62
47
|
recordLlmFailure(latencyMs: number, nowMs?: number): void;
|
|
@@ -67,15 +52,7 @@ export declare class HealthMonitor {
|
|
|
67
52
|
evaluate(nowMs?: number): Promise<RuntimeHealthSnapshot>;
|
|
68
53
|
private llmStats;
|
|
69
54
|
private evaluateLlmCheck;
|
|
70
|
-
private llmSymptoms;
|
|
71
|
-
private evaluatePersistenceCheck;
|
|
72
|
-
private evaluateCapacityCheck;
|
|
73
|
-
private capacitySymptoms;
|
|
74
55
|
private evaluateWorkloadCheck;
|
|
75
|
-
private workloadSymptoms;
|
|
76
56
|
private countStuckRuns;
|
|
77
|
-
private sumRuntimeDbBytes;
|
|
78
|
-
private sumCheckpointDbBytes;
|
|
79
|
-
private sumArtifactBytes;
|
|
80
57
|
}
|
|
81
58
|
export {};
|
package/dist/runtime/harness/system/health-monitor.js
CHANGED
|
@@ -1,7 +1,4 @@
|
|
|
1
|
-
import path from "node:path";
|
|
2
|
-
import { readdir, stat } from "node:fs/promises";
|
|
3
1
|
import { getRuntimeDefaults } from "../../../workspace/support/workspace-ref-utils.js";
|
|
4
|
-
import { discoverCheckpointMaintenanceTargets } from "../../maintenance/checkpoint-maintenance.js";
|
|
5
2
|
const DEFAULT_HEALTH_CONFIG = {
|
|
6
3
|
enabled: false,
|
|
7
4
|
evaluateIntervalSeconds: 30,
|
|
@@ -25,18 +22,6 @@ const DEFAULT_HEALTH_CONFIG = {
|
|
|
25
22
|
degradedAbove: 300,
|
|
26
23
|
unhealthyAbove: 900,
|
|
27
24
|
},
|
|
28
|
-
checkpointBytes: {
|
|
29
|
-
degradedAbove: 512 * 1024 * 1024,
|
|
30
|
-
unhealthyAbove: 2 * 1024 * 1024 * 1024,
|
|
31
|
-
},
|
|
32
|
-
runtimeDbBytes: {
|
|
33
|
-
degradedAbove: 256 * 1024 * 1024,
|
|
34
|
-
unhealthyAbove: 1024 * 1024 * 1024,
|
|
35
|
-
},
|
|
36
|
-
artifactBytes: {
|
|
37
|
-
degradedAbove: 512 * 1024 * 1024,
|
|
38
|
-
unhealthyAbove: 2 * 1024 * 1024 * 1024,
|
|
39
|
-
},
|
|
40
25
|
},
|
|
41
26
|
};
|
|
42
27
|
function asObject(value) {
|
|
@@ -52,23 +37,18 @@ function maxStatus(left, right) {
|
|
|
52
37
|
const rank = { healthy: 0, degraded: 1, unhealthy: 2 };
|
|
53
38
|
return rank[left] >= rank[right] ? left : right;
|
|
54
39
|
}
|
|
55
|
-
function compareStatus(left, right) {
|
|
56
|
-
return left === right;
|
|
57
|
-
}
|
|
58
|
-
function computeP95(values) {
|
|
59
|
-
if (values.length === 0) {
|
|
60
|
-
return undefined;
|
|
61
|
-
}
|
|
62
|
-
const sorted = [...values].sort((a, b) => a - b);
|
|
63
|
-
const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil(sorted.length * 0.95) - 1));
|
|
64
|
-
return sorted[index];
|
|
65
|
-
}
|
|
66
|
-
function isoFromMs(value) {
|
|
67
|
-
return new Date(value).toISOString();
|
|
68
|
-
}
|
|
69
40
|
function buildCheck(status, updatedAt, reason) {
|
|
70
41
|
return reason ? { status, updatedAt, reason } : { status, updatedAt };
|
|
71
42
|
}
|
|
43
|
+
function createSymptom(code, status, message, timestamp) {
|
|
44
|
+
return {
|
|
45
|
+
code,
|
|
46
|
+
severity: status === "unhealthy" ? "error" : status === "degraded" ? "warn" : "info",
|
|
47
|
+
message,
|
|
48
|
+
firstSeenAt: timestamp,
|
|
49
|
+
lastSeenAt: timestamp,
|
|
50
|
+
};
|
|
51
|
+
}
|
|
72
52
|
function describeThresholdStatus(value, thresholds) {
|
|
73
53
|
if (value >= thresholds.unhealthyAbove) {
|
|
74
54
|
return "unhealthy";
|
|
@@ -78,6 +58,14 @@ function describeThresholdStatus(value, thresholds) {
|
|
|
78
58
|
}
|
|
79
59
|
return "healthy";
|
|
80
60
|
}
|
|
61
|
+
function computeP95(values) {
|
|
62
|
+
if (values.length === 0) {
|
|
63
|
+
return undefined;
|
|
64
|
+
}
|
|
65
|
+
const sorted = [...values].sort((a, b) => a - b);
|
|
66
|
+
const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil(sorted.length * 0.95) - 1));
|
|
67
|
+
return sorted[index];
|
|
68
|
+
}
|
|
81
69
|
export function readHealthMonitorConfig(workspace) {
|
|
82
70
|
const runtimeDefaults = getRuntimeDefaults(workspace.refs);
|
|
83
71
|
const observability = asObject(runtimeDefaults?.observability);
|
|
@@ -87,9 +75,6 @@ export function readHealthMonitorConfig(workspace) {
|
|
|
87
75
|
const llmP95LatencyMs = asObject(thresholds?.llmP95LatencyMs);
|
|
88
76
|
const pendingApprovals = asObject(thresholds?.pendingApprovals);
|
|
89
77
|
const stuckRunSeconds = asObject(thresholds?.stuckRunSeconds);
|
|
90
|
-
const checkpointBytes = asObject(thresholds?.checkpointBytes);
|
|
91
|
-
const runtimeDbBytes = asObject(thresholds?.runtimeDbBytes);
|
|
92
|
-
const artifactBytes = asObject(thresholds?.artifactBytes);
|
|
93
78
|
return {
|
|
94
79
|
enabled: health?.enabled === true,
|
|
95
80
|
evaluateIntervalSeconds: readPositiveNumber(health?.evaluateIntervalSeconds, DEFAULT_HEALTH_CONFIG.evaluateIntervalSeconds),
|
|
@@ -113,112 +98,18 @@ export function readHealthMonitorConfig(workspace) {
|
|
|
113
98
|
degradedAbove: readNonNegativeNumber(stuckRunSeconds?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.stuckRunSeconds.degradedAbove),
|
|
114
99
|
unhealthyAbove: readNonNegativeNumber(stuckRunSeconds?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.stuckRunSeconds.unhealthyAbove),
|
|
115
100
|
},
|
|
116
|
-
checkpointBytes: {
|
|
117
|
-
degradedAbove: readNonNegativeNumber(checkpointBytes?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.checkpointBytes.degradedAbove),
|
|
118
|
-
unhealthyAbove: readNonNegativeNumber(checkpointBytes?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.checkpointBytes.unhealthyAbove),
|
|
119
|
-
},
|
|
120
|
-
runtimeDbBytes: {
|
|
121
|
-
degradedAbove: readNonNegativeNumber(runtimeDbBytes?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.runtimeDbBytes.degradedAbove),
|
|
122
|
-
unhealthyAbove: readNonNegativeNumber(runtimeDbBytes?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.runtimeDbBytes.unhealthyAbove),
|
|
123
|
-
},
|
|
124
|
-
artifactBytes: {
|
|
125
|
-
degradedAbove: readNonNegativeNumber(artifactBytes?.degradedAbove, DEFAULT_HEALTH_CONFIG.thresholds.artifactBytes.degradedAbove),
|
|
126
|
-
unhealthyAbove: readNonNegativeNumber(artifactBytes?.unhealthyAbove, DEFAULT_HEALTH_CONFIG.thresholds.artifactBytes.unhealthyAbove),
|
|
127
|
-
},
|
|
128
101
|
},
|
|
129
102
|
};
|
|
130
103
|
}
|
|
131
|
-
async function safeFileSize(filePath) {
|
|
132
|
-
try {
|
|
133
|
-
const stats = await stat(filePath);
|
|
134
|
-
return stats.size;
|
|
135
|
-
}
|
|
136
|
-
catch {
|
|
137
|
-
return 0;
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
async function directorySize(root) {
|
|
141
|
-
try {
|
|
142
|
-
const entries = await readdir(root, { withFileTypes: true });
|
|
143
|
-
let total = 0;
|
|
144
|
-
for (const entry of entries) {
|
|
145
|
-
const fullPath = path.join(root, entry.name);
|
|
146
|
-
if (entry.isDirectory()) {
|
|
147
|
-
total += await directorySize(fullPath);
|
|
148
|
-
continue;
|
|
149
|
-
}
|
|
150
|
-
if (entry.isFile()) {
|
|
151
|
-
total += await safeFileSize(fullPath);
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
return total;
|
|
155
|
-
}
|
|
156
|
-
catch {
|
|
157
|
-
return 0;
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
function evaluateMaintenanceStatus(label, status, updatedAt) {
|
|
161
|
-
if (!status) {
|
|
162
|
-
return {
|
|
163
|
-
check: buildCheck("healthy", updatedAt, `${label} disabled`),
|
|
164
|
-
symptoms: [],
|
|
165
|
-
};
|
|
166
|
-
}
|
|
167
|
-
if (status.consecutiveFailures > 0) {
|
|
168
|
-
return {
|
|
169
|
-
check: buildCheck("degraded", updatedAt, `${label} has ${status.consecutiveFailures} consecutive failure(s)`),
|
|
170
|
-
symptoms: [{
|
|
171
|
-
code: `${label}.loop.failure`,
|
|
172
|
-
severity: "warn",
|
|
173
|
-
message: status.lastError ?? `${label} maintenance loop failed`,
|
|
174
|
-
firstSeenAt: status.lastFailedAt ?? updatedAt,
|
|
175
|
-
lastSeenAt: status.lastFailedAt ?? updatedAt,
|
|
176
|
-
}],
|
|
177
|
-
};
|
|
178
|
-
}
|
|
179
|
-
if (status.lastCompletedAt) {
|
|
180
|
-
return {
|
|
181
|
-
check: buildCheck("healthy", updatedAt, `${label} completed successfully`),
|
|
182
|
-
symptoms: [],
|
|
183
|
-
};
|
|
184
|
-
}
|
|
185
|
-
return {
|
|
186
|
-
check: buildCheck("healthy", updatedAt, `${label} idle`),
|
|
187
|
-
symptoms: [],
|
|
188
|
-
};
|
|
189
|
-
}
|
|
190
|
-
function normalizeSeverity(status) {
|
|
191
|
-
if (status === "unhealthy") {
|
|
192
|
-
return "error";
|
|
193
|
-
}
|
|
194
|
-
if (status === "degraded") {
|
|
195
|
-
return "warn";
|
|
196
|
-
}
|
|
197
|
-
return "info";
|
|
198
|
-
}
|
|
199
|
-
function createSymptom(code, status, message, timestamp) {
|
|
200
|
-
return {
|
|
201
|
-
code,
|
|
202
|
-
severity: normalizeSeverity(status),
|
|
203
|
-
message,
|
|
204
|
-
firstSeenAt: timestamp,
|
|
205
|
-
lastSeenAt: timestamp,
|
|
206
|
-
};
|
|
207
|
-
}
|
|
208
104
|
export class HealthMonitor {
|
|
209
105
|
options;
|
|
210
106
|
config;
|
|
211
|
-
runRoots;
|
|
212
|
-
checkpointDbPaths;
|
|
213
107
|
llmSamples = [];
|
|
214
108
|
timer = null;
|
|
215
109
|
latestSnapshot = null;
|
|
216
|
-
runtimeEventSequence = 0;
|
|
217
110
|
constructor(options) {
|
|
218
111
|
this.options = options;
|
|
219
112
|
this.config = readHealthMonitorConfig(options.workspace);
|
|
220
|
-
this.runRoots = Array.from(new Set(Array.from(options.workspace.bindings.values()).map((binding) => binding.harnessRuntime.runRoot)));
|
|
221
|
-
this.checkpointDbPaths = Array.from(new Set(discoverCheckpointMaintenanceTargets(options.workspace).map((target) => target.dbPath)));
|
|
222
113
|
}
|
|
223
114
|
recordLlmSuccess(latencyMs, nowMs = Date.now()) {
|
|
224
115
|
this.recordLlmSample({ timestampMs: nowMs, latencyMs, success: true });
|
|
@@ -253,172 +144,96 @@ export class HealthMonitor {
|
|
|
253
144
|
return this.evaluate();
|
|
254
145
|
}
|
|
255
146
|
async evaluate(nowMs = Date.now()) {
|
|
256
|
-
const updatedAt =
|
|
257
|
-
const [runs, approvals
|
|
147
|
+
const updatedAt = new Date(nowMs).toISOString();
|
|
148
|
+
const [runs, approvals] = await Promise.all([
|
|
258
149
|
this.options.persistence.listRuns(),
|
|
259
150
|
this.options.persistence.listApprovals(),
|
|
260
|
-
this.sumRuntimeDbBytes(),
|
|
261
|
-
this.sumCheckpointDbBytes(),
|
|
262
|
-
this.sumArtifactBytes(),
|
|
263
151
|
]);
|
|
264
152
|
const pendingApprovals = approvals.filter((approval) => approval.status === "pending").length;
|
|
265
153
|
const stuckRuns = this.countStuckRuns(runs, nowMs);
|
|
266
|
-
const runtimeMaintenance = evaluateMaintenanceStatus("checkpoint", this.options.getCheckpointMaintenanceStatus(), updatedAt);
|
|
267
|
-
const recordMaintenance = evaluateMaintenanceStatus("records", this.options.getRuntimeRecordMaintenanceStatus(), updatedAt);
|
|
268
|
-
let runtimeCheck = buildCheck("healthy", updatedAt, "runtime loops healthy");
|
|
269
|
-
const runtimeSymptoms = [...runtimeMaintenance.symptoms, ...recordMaintenance.symptoms];
|
|
270
|
-
runtimeCheck = buildCheck(maxStatus(runtimeMaintenance.check.status, recordMaintenance.check.status), updatedAt, runtimeSymptoms.length > 0 ? runtimeSymptoms.map((symptom) => symptom.message).join("; ") : "runtime loops healthy");
|
|
271
154
|
const llmCheck = this.evaluateLlmCheck(updatedAt, nowMs);
|
|
272
|
-
const persistenceCheck = this.evaluatePersistenceCheck(updatedAt, runtimeDbBytes, runtimeSymptoms);
|
|
273
|
-
const capacityCheck = this.evaluateCapacityCheck(updatedAt, checkpointBytes, runtimeDbBytes, artifactBytes);
|
|
274
155
|
const workloadCheck = this.evaluateWorkloadCheck(updatedAt, pendingApprovals, stuckRuns);
|
|
275
|
-
const
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
llm
|
|
287
|
-
|
|
288
|
-
capacity: capacityCheck,
|
|
289
|
-
workload: workloadCheck,
|
|
290
|
-
};
|
|
156
|
+
const runtimeCheck = buildCheck("healthy", updatedAt, "runtime health monitoring enabled");
|
|
157
|
+
const persistenceCheck = buildCheck("healthy", updatedAt, "runtime persistence accessible");
|
|
158
|
+
const capacityCheck = buildCheck("healthy", updatedAt, "capacity checks disabled");
|
|
159
|
+
const symptoms = [];
|
|
160
|
+
if (describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals) !== "healthy") {
|
|
161
|
+
symptoms.push(createSymptom("workload.pending-approvals", describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals), `pendingApprovals=${pendingApprovals}`, updatedAt));
|
|
162
|
+
}
|
|
163
|
+
if (stuckRuns > 0) {
|
|
164
|
+
symptoms.push(createSymptom("workload.stuck-runs", workloadCheck.status, `stuckRuns=${stuckRuns}`, updatedAt));
|
|
165
|
+
}
|
|
166
|
+
if (llmCheck.status !== "healthy") {
|
|
167
|
+
symptoms.push(createSymptom("llm.invocations", llmCheck.status, llmCheck.reason ?? "llm health degraded", updatedAt));
|
|
168
|
+
}
|
|
291
169
|
let overallStatus = "healthy";
|
|
292
|
-
for (const check of
|
|
170
|
+
for (const check of [runtimeCheck, llmCheck, persistenceCheck, capacityCheck, workloadCheck]) {
|
|
293
171
|
overallStatus = maxStatus(overallStatus, check.status);
|
|
294
172
|
}
|
|
295
173
|
const snapshot = {
|
|
296
174
|
status: overallStatus,
|
|
297
175
|
updatedAt,
|
|
298
|
-
checks
|
|
176
|
+
checks: {
|
|
177
|
+
runtime: runtimeCheck,
|
|
178
|
+
llm: llmCheck,
|
|
179
|
+
persistence: persistenceCheck,
|
|
180
|
+
capacity: capacityCheck,
|
|
181
|
+
workload: workloadCheck,
|
|
182
|
+
},
|
|
299
183
|
symptoms,
|
|
300
184
|
stats: {
|
|
301
185
|
activeRunSlots: this.options.getActiveRunSlots(),
|
|
302
186
|
pendingRunSlots: this.options.getPendingRunSlots(),
|
|
303
187
|
pendingApprovals,
|
|
304
188
|
stuckRuns,
|
|
305
|
-
checkpointBytes,
|
|
306
|
-
runtimeDbBytes,
|
|
307
|
-
artifactBytes,
|
|
308
189
|
...this.llmStats(nowMs),
|
|
309
190
|
},
|
|
310
191
|
};
|
|
311
|
-
const previous = this.latestSnapshot;
|
|
312
|
-
this.latestSnapshot = snapshot;
|
|
313
192
|
if (this.config.enabled &&
|
|
314
193
|
this.config.emitEvents &&
|
|
315
194
|
this.options.publishEvent &&
|
|
316
|
-
|
|
317
|
-
|
|
195
|
+
this.latestSnapshot &&
|
|
196
|
+
this.latestSnapshot.status !== snapshot.status) {
|
|
318
197
|
await this.options.publishEvent({
|
|
319
|
-
previousStatus:
|
|
198
|
+
previousStatus: this.latestSnapshot.status,
|
|
320
199
|
status: snapshot.status,
|
|
321
200
|
checks: snapshot.checks,
|
|
322
201
|
stats: snapshot.stats,
|
|
323
202
|
});
|
|
324
203
|
}
|
|
204
|
+
this.latestSnapshot = snapshot;
|
|
325
205
|
return snapshot;
|
|
326
206
|
}
|
|
327
207
|
llmStats(nowMs) {
|
|
328
|
-
const
|
|
329
|
-
const recent = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= oneMinuteWindowMs);
|
|
208
|
+
const recent = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= 60 * 1000);
|
|
330
209
|
const successes = recent.filter((sample) => sample.success).length;
|
|
331
|
-
const latencies = recent.map((sample) => sample.latencyMs);
|
|
332
210
|
return {
|
|
333
211
|
llmSuccessRate1m: recent.length > 0 ? successes / recent.length : undefined,
|
|
334
|
-
llmP95LatencyMs1m: computeP95(
|
|
212
|
+
llmP95LatencyMs1m: computeP95(recent.map((sample) => sample.latencyMs)),
|
|
335
213
|
};
|
|
336
214
|
}
|
|
337
215
|
evaluateLlmCheck(updatedAt, nowMs) {
|
|
338
|
-
const
|
|
339
|
-
const
|
|
340
|
-
const errorSamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= errorWindow);
|
|
341
|
-
const latencySamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= latencyWindow);
|
|
216
|
+
const errorSamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= this.config.thresholds.llmErrorRate.windowSeconds * 1000);
|
|
217
|
+
const latencySamples = this.llmSamples.filter((sample) => nowMs - sample.timestampMs <= this.config.thresholds.llmP95LatencyMs.windowSeconds * 1000);
|
|
342
218
|
if (errorSamples.length === 0 && latencySamples.length === 0) {
|
|
343
219
|
return buildCheck("healthy", updatedAt, "no recent llm invocations");
|
|
344
220
|
}
|
|
345
|
-
const
|
|
346
|
-
|
|
221
|
+
const errorRate = errorSamples.length > 0
|
|
222
|
+
? errorSamples.filter((sample) => !sample.success).length / errorSamples.length
|
|
223
|
+
: 0;
|
|
347
224
|
const p95Latency = computeP95(latencySamples.map((sample) => sample.latencyMs));
|
|
348
|
-
let status =
|
|
349
|
-
if (errorRate >= this.config.thresholds.llmErrorRate.unhealthyAbove) {
|
|
350
|
-
status = "unhealthy";
|
|
351
|
-
}
|
|
352
|
-
else if (errorRate >= this.config.thresholds.llmErrorRate.degradedAbove) {
|
|
353
|
-
status = "degraded";
|
|
354
|
-
}
|
|
225
|
+
let status = describeThresholdStatus(errorRate, this.config.thresholds.llmErrorRate);
|
|
355
226
|
if (p95Latency !== undefined) {
|
|
356
|
-
|
|
357
|
-
status = maxStatus(status, latencyStatus);
|
|
227
|
+
status = maxStatus(status, describeThresholdStatus(p95Latency, this.config.thresholds.llmP95LatencyMs));
|
|
358
228
|
}
|
|
359
229
|
return buildCheck(status, updatedAt, `errorRate=${errorRate.toFixed(2)}${p95Latency !== undefined ? ` p95LatencyMs=${Math.round(p95Latency)}` : ""}`);
|
|
360
230
|
}
|
|
361
|
-
llmSymptoms(check, updatedAt) {
|
|
362
|
-
if (check.status === "healthy") {
|
|
363
|
-
return [];
|
|
364
|
-
}
|
|
365
|
-
return [createSymptom("llm.invocations", check.status, check.reason ?? "llm health degraded", updatedAt)];
|
|
366
|
-
}
|
|
367
|
-
evaluatePersistenceCheck(updatedAt, runtimeDbBytes, runtimeSymptoms) {
|
|
368
|
-
if (runtimeSymptoms.length > 0) {
|
|
369
|
-
return buildCheck("degraded", updatedAt, runtimeSymptoms.map((symptom) => symptom.message).join("; "));
|
|
370
|
-
}
|
|
371
|
-
if (runtimeDbBytes === 0) {
|
|
372
|
-
return buildCheck("healthy", updatedAt, "runtime sqlite not materialized yet");
|
|
373
|
-
}
|
|
374
|
-
return buildCheck("healthy", updatedAt, "runtime sqlite accessible");
|
|
375
|
-
}
|
|
376
|
-
evaluateCapacityCheck(updatedAt, checkpointBytes, runtimeDbBytes, artifactBytes) {
|
|
377
|
-
let status = "healthy";
|
|
378
|
-
status = maxStatus(status, describeThresholdStatus(checkpointBytes, this.config.thresholds.checkpointBytes));
|
|
379
|
-
status = maxStatus(status, describeThresholdStatus(runtimeDbBytes, this.config.thresholds.runtimeDbBytes));
|
|
380
|
-
status = maxStatus(status, describeThresholdStatus(artifactBytes, this.config.thresholds.artifactBytes));
|
|
381
|
-
return buildCheck(status, updatedAt, `checkpointBytes=${checkpointBytes} runtimeDbBytes=${runtimeDbBytes} artifactBytes=${artifactBytes}`);
|
|
382
|
-
}
|
|
383
|
-
capacitySymptoms(check, checkpointBytes, runtimeDbBytes, artifactBytes, updatedAt) {
|
|
384
|
-
if (check.status === "healthy") {
|
|
385
|
-
return [];
|
|
386
|
-
}
|
|
387
|
-
const symptoms = [];
|
|
388
|
-
if (describeThresholdStatus(checkpointBytes, this.config.thresholds.checkpointBytes) !== "healthy") {
|
|
389
|
-
symptoms.push(createSymptom("capacity.checkpoints", describeThresholdStatus(checkpointBytes, this.config.thresholds.checkpointBytes), `checkpointBytes=${checkpointBytes}`, updatedAt));
|
|
390
|
-
}
|
|
391
|
-
if (describeThresholdStatus(runtimeDbBytes, this.config.thresholds.runtimeDbBytes) !== "healthy") {
|
|
392
|
-
symptoms.push(createSymptom("capacity.runtime-db", describeThresholdStatus(runtimeDbBytes, this.config.thresholds.runtimeDbBytes), `runtimeDbBytes=${runtimeDbBytes}`, updatedAt));
|
|
393
|
-
}
|
|
394
|
-
if (describeThresholdStatus(artifactBytes, this.config.thresholds.artifactBytes) !== "healthy") {
|
|
395
|
-
symptoms.push(createSymptom("capacity.artifacts", describeThresholdStatus(artifactBytes, this.config.thresholds.artifactBytes), `artifactBytes=${artifactBytes}`, updatedAt));
|
|
396
|
-
}
|
|
397
|
-
return symptoms;
|
|
398
|
-
}
|
|
399
231
|
evaluateWorkloadCheck(updatedAt, pendingApprovals, stuckRuns) {
|
|
400
232
|
let status = describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals);
|
|
401
|
-
status = maxStatus(status, describeThresholdStatus(stuckRuns, {
|
|
402
|
-
degradedAbove: this.config.thresholds.stuckRunSeconds.degradedAbove > 0 ? 1 : Number.MAX_SAFE_INTEGER,
|
|
403
|
-
unhealthyAbove: this.config.thresholds.stuckRunSeconds.unhealthyAbove > 0 ? 1 : Number.MAX_SAFE_INTEGER,
|
|
404
|
-
}));
|
|
405
|
-
return buildCheck(status, updatedAt, `pendingApprovals=${pendingApprovals} stuckRuns=${stuckRuns}`);
|
|
406
|
-
}
|
|
407
|
-
workloadSymptoms(check, pendingApprovals, stuckRuns, updatedAt) {
|
|
408
|
-
if (check.status === "healthy") {
|
|
409
|
-
return [];
|
|
410
|
-
}
|
|
411
|
-
const symptoms = [];
|
|
412
|
-
if (describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals) !== "healthy") {
|
|
413
|
-
symptoms.push(createSymptom("workload.pending-approvals", describeThresholdStatus(pendingApprovals, this.config.thresholds.pendingApprovals), `pendingApprovals=${pendingApprovals}`, updatedAt));
|
|
414
|
-
}
|
|
415
233
|
if (stuckRuns > 0) {
|
|
416
|
-
|
|
417
|
-
? "unhealthy"
|
|
418
|
-
: "degraded";
|
|
419
|
-
symptoms.push(createSymptom("workload.stuck-runs", stuckStatus, `stuckRuns=${stuckRuns}`, updatedAt));
|
|
234
|
+
status = maxStatus(status, stuckRuns >= this.config.thresholds.stuckRunSeconds.unhealthyAbove ? "unhealthy" : "degraded");
|
|
420
235
|
}
|
|
421
|
-
return
|
|
236
|
+
return buildCheck(status, updatedAt, `pendingApprovals=${pendingApprovals} stuckRuns=${stuckRuns}`);
|
|
422
237
|
}
|
|
423
238
|
countStuckRuns(runs, nowMs) {
|
|
424
239
|
return runs.filter((run) => {
|
|
@@ -429,20 +244,7 @@ export class HealthMonitor {
|
|
|
429
244
|
if (!Number.isFinite(updatedAtMs)) {
|
|
430
245
|
return false;
|
|
431
246
|
}
|
|
432
|
-
|
|
433
|
-
return ageSeconds >= this.config.thresholds.stuckRunSeconds.degradedAbove;
|
|
247
|
+
return (nowMs - updatedAtMs) / 1000 >= this.config.thresholds.stuckRunSeconds.degradedAbove;
|
|
434
248
|
}).length;
|
|
435
249
|
}
|
|
436
|
-
async sumRuntimeDbBytes() {
|
|
437
|
-
const sizes = await Promise.all(this.runRoots.map((runRoot) => safeFileSize(path.join(runRoot, "runtime.sqlite"))));
|
|
438
|
-
return sizes.reduce((sum, value) => sum + value, 0);
|
|
439
|
-
}
|
|
440
|
-
async sumCheckpointDbBytes() {
|
|
441
|
-
const sizes = await Promise.all(this.checkpointDbPaths.map((dbPath) => safeFileSize(dbPath)));
|
|
442
|
-
return sizes.reduce((sum, value) => sum + value, 0);
|
|
443
|
-
}
|
|
444
|
-
async sumArtifactBytes() {
|
|
445
|
-
const sizes = await Promise.all(this.runRoots.map((runRoot) => directorySize(path.join(runRoot, "threads"))));
|
|
446
|
-
return sizes.reduce((sum, value) => sum + value, 0);
|
|
447
|
-
}
|
|
448
250
|
}
|
package/dist/runtime/harness/system/inventory.js
CHANGED
|
@@ -2,9 +2,8 @@ import { readSkillMetadata } from "../../support/skill-metadata.js";
|
|
|
2
2
|
import { getBindingPrimaryTools } from "../../support/compiled-binding.js";
|
|
3
3
|
import { assessSkillRequirements, } from "./skill-requirements.js";
|
|
4
4
|
import { createRuntimeEnv } from "../../support/runtime-env.js";
|
|
5
|
-
import { isRuntimeEntryBinding } from "../../support/runtime-entry.js";
|
|
6
5
|
function listHostBindings(workspace) {
|
|
7
|
-
return Array.from(workspace.bindings.values())
|
|
6
|
+
return Array.from(workspace.bindings.values());
|
|
8
7
|
}
|
|
9
8
|
export function findAgentBinding(workspace, agentId) {
|
|
10
9
|
return workspace.bindings.get(agentId);
|
package/dist/runtime/harness.d.ts
CHANGED
|
@@ -5,7 +5,6 @@ import type { RequirementAssessmentOptions } from "./harness/system/skill-requir
|
|
|
5
5
|
export declare class AgentHarnessRuntime {
|
|
6
6
|
private readonly workspace;
|
|
7
7
|
private readonly runtimeAdapterOptions;
|
|
8
|
-
private static readonly DEFAULT_HOST_AGENT_ID;
|
|
9
8
|
private static readonly BACKGROUND_EVENT_TYPES;
|
|
10
9
|
private readonly eventBus;
|
|
11
10
|
private readonly persistence;
|
package/dist/runtime/harness.js
CHANGED
|
@@ -23,7 +23,6 @@ import { getDefaultHostAgentId, resolveSelectedAgentId } from "./harness/run/rou
|
|
|
23
23
|
import { resolveCheckpointer, resolveEmbeddingModel, resolveStore, resolveStoreFromConfig, resolveVectorStore, } from "./harness/run/resources.js";
|
|
24
24
|
import { createToolMcpServerFromTools, serveToolsOverStdioFromHarness } from "../mcp.js";
|
|
25
25
|
import { getBindingAdapterKind, getBindingPrimaryTools, getBindingStoreConfig } from "./support/compiled-binding.js";
|
|
26
|
-
import { isRuntimeEntryBinding } from "./support/runtime-entry.js";
|
|
27
26
|
import { describeWorkspaceInventory, listAgentSkills as listWorkspaceAgentSkills, } from "./harness/system/inventory.js";
|
|
28
27
|
import { createDefaultHealthSnapshot, isInventoryEnabled, isThreadMemorySyncEnabled, } from "./harness/runtime-defaults.js";
|
|
29
28
|
import { initializeHarnessRuntime, isStaleRunningRun as isHarnessStaleRunningRun, } from "./harness/run/startup-runtime.js";
|
|
@@ -32,7 +31,6 @@ import { deleteThreadRecord, getPublicApproval, getThreadRecord, listPublicAppro
|
|
|
32
31
|
export class AgentHarnessRuntime {
|
|
33
32
|
workspace;
|
|
34
33
|
runtimeAdapterOptions;
|
|
35
|
-
static DEFAULT_HOST_AGENT_ID = "orchestra";
|
|
36
34
|
static BACKGROUND_EVENT_TYPES = new Set([
|
|
37
35
|
"run.created",
|
|
38
36
|
"run.queued",
|
|
@@ -72,7 +70,7 @@ export class AgentHarnessRuntime {
|
|
|
72
70
|
`${this.workspace.workspaceRoot}/run-data`);
|
|
73
71
|
}
|
|
74
72
|
getDefaultHostAgentId() {
|
|
75
|
-
return getDefaultHostAgentId(this.workspace, this.routingDefaultAgentId
|
|
73
|
+
return getDefaultHostAgentId(this.workspace, this.routingDefaultAgentId);
|
|
76
74
|
}
|
|
77
75
|
async resolveSelectedAgentId(input, requestedAgentId, threadId) {
|
|
78
76
|
return resolveSelectedAgentId({
|
|
@@ -80,7 +78,7 @@ export class AgentHarnessRuntime {
|
|
|
80
78
|
input,
|
|
81
79
|
requestedAgentId,
|
|
82
80
|
threadId,
|
|
83
|
-
preferredHostAgentId: this.routingDefaultAgentId
|
|
81
|
+
preferredHostAgentId: this.routingDefaultAgentId,
|
|
84
82
|
getThreadSummary: (currentThreadId) => this.getSession(currentThreadId),
|
|
85
83
|
});
|
|
86
84
|
}
|
|
@@ -264,14 +262,14 @@ export class AgentHarnessRuntime {
|
|
|
264
262
|
const configuredRule = this.routingRules.find((rule) => matchRoutingRule(rawInput, rule, options));
|
|
265
263
|
if (configuredRule) {
|
|
266
264
|
const configuredBinding = this.workspace.bindings.get(configuredRule.agentId);
|
|
267
|
-
if (configuredBinding
|
|
265
|
+
if (configuredBinding) {
|
|
268
266
|
return configuredBinding.agent.id;
|
|
269
267
|
}
|
|
270
268
|
}
|
|
271
269
|
const defaultBinding = this.routingDefaultAgentId
|
|
272
270
|
? this.workspace.bindings.get(this.routingDefaultAgentId)
|
|
273
271
|
: undefined;
|
|
274
|
-
if (defaultBinding
|
|
272
|
+
if (defaultBinding) {
|
|
275
273
|
return defaultBinding.agent.id;
|
|
276
274
|
}
|
|
277
275
|
return this.getDefaultHostAgentId();
|
package/dist/runtime/support/harness-support.d.ts
CHANGED
|
@@ -11,7 +11,5 @@ export declare function createHarnessEvent(threadId: string, runId: string, sequ
|
|
|
11
11
|
export declare function createPendingApproval(threadId: string, runId: string, checkpointRef: string, input: string, interruptContent?: string): InternalApprovalRecord;
|
|
12
12
|
export declare function inferRoutingBindings(workspace: WorkspaceBundle): {
|
|
13
13
|
primaryBinding: import("../../contracts/workspace.js").CompiledAgentBinding;
|
|
14
|
-
secondaryBinding: import("../../contracts/workspace.js").CompiledAgentBinding | undefined;
|
|
15
|
-
researchBinding: import("../../contracts/workspace.js").CompiledAgentBinding | undefined;
|
|
16
14
|
hostBindings: import("../../contracts/workspace.js").CompiledAgentBinding[];
|
|
17
15
|
};
|
package/dist/runtime/support/harness-support.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { createPersistentId } from "../../utils/id.js";
|
|
2
2
|
import { isDelegationCapableBinding } from "../../workspace/support/agent-capabilities.js";
|
|
3
|
-
import { isRuntimeEntryBinding } from "./runtime-entry.js";
|
|
4
3
|
export function renderRuntimeFailure(error) {
|
|
5
4
|
const message = error instanceof Error ? error.message : String(error);
|
|
6
5
|
return `runtime_error=${message}`.trim();
|
|
@@ -87,38 +86,9 @@ export function createPendingApproval(threadId, runId, checkpointRef, input, int
|
|
|
87
86
|
};
|
|
88
87
|
}
|
|
89
88
|
export function inferRoutingBindings(workspace) {
|
|
90
|
-
const
|
|
91
|
-
const orchestrationHosts =
|
|
92
|
-
const routingHosts = orchestrationHosts.length > 0 ? orchestrationHosts :
|
|
93
|
-
const
|
|
94
|
-
|
|
95
|
-
const researchBinding = routingHosts.find((binding) => binding.agent.id === "research-lite" || binding.agent.id === "research");
|
|
96
|
-
const directBinding = routingHosts.find((binding) => binding.agent.id === "direct");
|
|
97
|
-
const delegationHosts = routingHosts.filter((binding) => isDelegationCapableBinding(binding));
|
|
98
|
-
const deepAgentDelegationHosts = deepAgentHosts.filter((binding) => isDelegationCapableBinding(binding));
|
|
99
|
-
const nonDeepAgentDelegationHosts = nonDeepAgentHosts.filter((binding) => isDelegationCapableBinding(binding));
|
|
100
|
-
const lightweightHosts = routingHosts.filter((binding) => !isDelegationCapableBinding(binding));
|
|
101
|
-
const defaultOrchestratingHost = routingHosts.find((binding) => binding.agent.id === "orchestra") ??
|
|
102
|
-
deepAgentDelegationHosts.find((binding) => (binding.deepAgentParams?.subagents.length ?? 0) > 0) ??
|
|
103
|
-
deepAgentDelegationHosts[0] ??
|
|
104
|
-
deepAgentHosts[0] ??
|
|
105
|
-
nonDeepAgentDelegationHosts.find((binding) => (binding.deepAgentParams?.subagents.length ?? 0) > 0) ??
|
|
106
|
-
nonDeepAgentDelegationHosts[0] ??
|
|
107
|
-
nonDeepAgentHosts[0];
|
|
108
|
-
const delegationPreferredSecondary = deepAgentDelegationHosts.find((binding) => (binding.deepAgentParams?.subagents.length ?? 0) > 0) ??
|
|
109
|
-
deepAgentDelegationHosts[0] ??
|
|
110
|
-
nonDeepAgentDelegationHosts.find((binding) => (binding.deepAgentParams?.subagents.length ?? 0) > 0) ??
|
|
111
|
-
nonDeepAgentDelegationHosts[0] ??
|
|
112
|
-
delegationHosts[0];
|
|
113
|
-
const genericLightweightHost = lightweightHosts.find((binding) => binding.agent.id !== researchBinding?.agent.id);
|
|
114
|
-
const primaryBinding = defaultOrchestratingHost ?? directBinding ?? genericLightweightHost ?? routingHosts[0] ?? runtimeEntryBindings[0];
|
|
115
|
-
const secondaryBinding = genericLightweightHost && genericLightweightHost.agent.id !== primaryBinding?.agent.id
|
|
116
|
-
? genericLightweightHost
|
|
117
|
-
: directBinding && directBinding.agent.id !== primaryBinding?.agent.id
|
|
118
|
-
? directBinding
|
|
119
|
-
: delegationPreferredSecondary && delegationPreferredSecondary.agent.id !== primaryBinding?.agent.id
|
|
120
|
-
? delegationPreferredSecondary
|
|
121
|
-
: routingHosts.find((binding) => binding.agent.id !== primaryBinding?.agent.id) ??
|
|
122
|
-
(orchestrationHosts.length > 0 ? undefined : runtimeEntryBindings.find((binding) => binding.agent.id !== primaryBinding?.agent.id));
|
|
123
|
-
return { primaryBinding, secondaryBinding, researchBinding, hostBindings: runtimeEntryBindings };
|
|
89
|
+
const hostBindings = Array.from(workspace.bindings.values());
|
|
90
|
+
const orchestrationHosts = hostBindings.filter((binding) => binding.agent.executionMode === "deepagent" || Boolean(binding.deepAgentParams));
|
|
91
|
+
const routingHosts = orchestrationHosts.length > 0 ? orchestrationHosts : hostBindings;
|
|
92
|
+
const primaryBinding = routingHosts.find((binding) => isDelegationCapableBinding(binding)) ?? routingHosts[0] ?? hostBindings[0];
|
|
93
|
+
return { primaryBinding, hostBindings };
|
|
124
94
|
}
|
|
@@ -72,9 +72,6 @@ function resolveAgentRuntimeName(agent) {
|
|
|
72
72
|
if (normalizedSourcePath.includes("/packages/framework/")) {
|
|
73
73
|
return `core.${baseName}`;
|
|
74
74
|
}
|
|
75
|
-
if (["direct", "orchestra", "research-lite"].includes(baseName)) {
|
|
76
|
-
return `core.${baseName}`;
|
|
77
|
-
}
|
|
78
75
|
return baseName;
|
|
79
76
|
}
|
|
80
77
|
export function requireTools(tools, refs, ownerId) {
|
package/package.json
CHANGED