@botbotgo/agent-harness 0.0.123 → 0.0.125
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -6
- package/README.zh.md +45 -7
- package/dist/benchmark/upstream-runtime-ab-benchmark.d.ts +32 -2
- package/dist/benchmark/upstream-runtime-ab-benchmark.js +58 -2
- package/dist/contracts/runtime.d.ts +32 -0
- package/dist/package-version.d.ts +1 -1
- package/dist/package-version.js +1 -1
- package/dist/runtime/agent-runtime-adapter.d.ts +3 -3
- package/dist/runtime/agent-runtime-adapter.js +31 -7
- package/dist/runtime/harness/run/helpers.d.ts +1 -1
- package/dist/runtime/harness/run/helpers.js +1 -1
- package/dist/runtime/harness/run/stream-run.js +1 -8
- package/dist/runtime/harness/run/thread-records.d.ts +1 -1
- package/dist/runtime/harness/run/thread-records.js +10 -7
- package/dist/runtime/harness.js +2 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
# @botbotgo/agent-harness
|
|
15
15
|
|
|
16
16
|
<p align="center">
|
|
17
|
-
<strong>The
|
|
17
|
+
<strong>The application runtime for multi-agent products with approvals, recovery, and operator control built in.</strong>
|
|
18
18
|
</p>
|
|
19
19
|
|
|
20
20
|
<p align="center">
|
|
@@ -29,9 +29,38 @@
|
|
|
29
29
|
>
|
|
30
30
|
</p>
|
|
31
31
|
|
|
32
|
+
## What Problem We Solve
|
|
33
|
+
|
|
34
|
+
AI makes it much easier to generate agent logic, tool calls, and workflow code. The hard part moves to operations.
|
|
35
|
+
|
|
36
|
+
Once the demo works, the real software problem changes shape:
|
|
37
|
+
|
|
38
|
+
- more generated logic creates more execution paths to inspect, interrupt, retry, and recover
|
|
39
|
+
- natural-language entrypoints turn approvals and policy boundaries into runtime requirements
|
|
40
|
+
- backend, prompt, and tool changes happen faster, but product-facing behavior still needs one stable control surface
|
|
41
|
+
- MCP and provider-native tooling expand what agents can reach, which raises the bar for governance
|
|
42
|
+
|
|
43
|
+
Teams still need answers to the runtime questions that appear after that shift:
|
|
44
|
+
|
|
45
|
+
- how approvals are resolved and audited
|
|
46
|
+
- how runs, threads, and events stay inspectable
|
|
47
|
+
- how execution recovers after interruption, failure, or restart
|
|
48
|
+
- how routing, concurrency, and maintenance policy stay consistent
|
|
49
|
+
- how backend churn does not leak into the product model
|
|
50
|
+
|
|
51
|
+
`agent-harness` solves that layer. It keeps agent execution upstream while making the application runtime operable, recoverable, and governable.
|
|
52
|
+
|
|
53
|
+
Concretely, that means:
|
|
54
|
+
|
|
55
|
+
- a product-facing approval and operator surface instead of backend-specific middleware state
|
|
56
|
+
- persisted `runs`, `threads`, `approvals`, and `events` as stable runtime records
|
|
57
|
+
- restart-safe recovery and continuation as system-managed behavior
|
|
58
|
+
- YAML-owned routing, concurrency, maintenance, and recovery policy
|
|
59
|
+
- adapter isolation so backend replacement does not redefine the public runtime model
|
|
60
|
+
|
|
32
61
|
## Product Overview
|
|
33
62
|
|
|
34
|
-
`@botbotgo/agent-harness` is a workspace-shaped application runtime for real agent products.
|
|
63
|
+
`@botbotgo/agent-harness` is a workspace-shaped application runtime for real multi-agent products.
|
|
35
64
|
|
|
36
65
|
It is not a new agent framework. It is the runtime layer around LangChain v1 and DeepAgents that turns one workspace into one operable application runtime.
|
|
37
66
|
|
|
@@ -39,7 +68,7 @@ The point is simple:
|
|
|
39
68
|
|
|
40
69
|
- Codex, Claude Code, and Cursor are products for people using agents
|
|
41
70
|
- LangChain v1 and DeepAgents are frameworks for defining agent execution semantics
|
|
42
|
-
- `agent-harness` is the runtime product layer for
|
|
71
|
+
- `agent-harness` is the runtime product layer for operating, recovering, approving, and governing multi-agent applications
|
|
43
72
|
|
|
44
73
|
The product boundary is strict:
|
|
45
74
|
|
|
@@ -61,6 +90,14 @@ The runtime provides:
|
|
|
61
90
|
- local `resources/tools/` `tool({...})` modules and `resources/skills/` discovery
|
|
62
91
|
- persisted threads, runs, approvals, events, queue state, and recovery metadata
|
|
63
92
|
|
|
93
|
+
In practice, the harness exists for the parts that are painful to rebuild inside every agent app:
|
|
94
|
+
|
|
95
|
+
- approval inboxes and human decision flow
|
|
96
|
+
- persisted runs, threads, and inspectable event history
|
|
97
|
+
- runtime-managed recovery after interrupts, failures, or process restart
|
|
98
|
+
- queueing, concurrency, maintenance, and operational policy
|
|
99
|
+
- stable runtime records that stay usable even if the backend changes
|
|
100
|
+
|
|
64
101
|
The repository-owned default config layer is intentionally full-shaped. The shipped YAML keeps explicit defaults for the important runtime and agent knobs so teams can start from concrete config instead of reverse-engineering adapter behavior from source.
|
|
65
102
|
|
|
66
103
|
The default rule is:
|
|
@@ -89,7 +126,7 @@ Recommended orchestration shape for long-running flows:
|
|
|
89
126
|
|
|
90
127
|
## Why This Exists
|
|
91
128
|
|
|
92
|
-
Most agent
|
|
129
|
+
Most agent tooling stops at execution.
|
|
93
130
|
|
|
94
131
|
Real products need a runtime that can answer harder questions:
|
|
95
132
|
|
|
@@ -103,9 +140,10 @@ Real products need a runtime that can answer harder questions:
|
|
|
103
140
|
|
|
104
141
|
## What Makes It Different
|
|
105
142
|
|
|
106
|
-
- It treats `runs`, `threads`, `approvals`, `events`, and recovery as product
|
|
143
|
+
- It treats `runs`, `threads`, `approvals`, `events`, and recovery as first-class product records
|
|
144
|
+
- It gives operators a runtime control surface instead of exposing raw backend internals
|
|
107
145
|
- It keeps checkpoint resume system-managed instead of promoting checkpoint internals into the primary API
|
|
108
|
-
- It lets YAML own assembly
|
|
146
|
+
- It lets YAML own assembly and operating policy while code keeps a tiny surface
|
|
109
147
|
- It goes deep on runtime concerns that upstream libraries do not fully productize
|
|
110
148
|
|
|
111
149
|
## Quick Start
|
package/README.zh.md
CHANGED
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
# @botbotgo/agent-harness
|
|
15
15
|
|
|
16
16
|
<p align="center">
|
|
17
|
-
<strong>面向多 agent
|
|
17
|
+
<strong>面向多 agent 产品的应用运行时:内建审批、恢复与运维控制,而不只是执行。</strong>
|
|
18
18
|
</p>
|
|
19
19
|
|
|
20
20
|
<p align="center">
|
|
@@ -29,9 +29,38 @@
|
|
|
29
29
|
>
|
|
30
30
|
</p>
|
|
31
31
|
|
|
32
|
+
## 我们解决什么问题
|
|
33
|
+
|
|
34
|
+
AI 让 agent 逻辑、工具调用和工作流代码更容易生成,真正变难的是运行时运维。
|
|
35
|
+
|
|
36
|
+
当 demo 跑起来之后,真正的软件问题会换一种形状出现:
|
|
37
|
+
|
|
38
|
+
- 更多生成式逻辑,意味着更多要观测、打断、重试和恢复的执行路径
|
|
39
|
+
- 自然语言入口让审批与策略边界变成运行时要求,而不是事后补丁
|
|
40
|
+
- backend、prompt 和工具变化更快了,但面向产品的控制面仍然必须稳定
|
|
41
|
+
- MCP 与 provider 原生工具扩展了 agent 的可触达范围,也同步抬高了治理要求
|
|
42
|
+
|
|
43
|
+
团队仍然要回答这些运行时问题:
|
|
44
|
+
|
|
45
|
+
- 审批怎么决策、怎么审计
|
|
46
|
+
- runs、threads、events 怎么稳定可查
|
|
47
|
+
- 执行被打断、失败或进程重启后怎么恢复
|
|
48
|
+
- 路由、并发和维护策略怎么保持一致
|
|
49
|
+
- 后端频繁变化时,怎么不让产品模型跟着漂移
|
|
50
|
+
|
|
51
|
+
`agent-harness` 解决的就是这一层。它把 agent 执行留在上游,同时把应用运行时做成可运维、可恢复、可治理的系统。
|
|
52
|
+
|
|
53
|
+
具体来说,就是把这些能力沉到运行时里:
|
|
54
|
+
|
|
55
|
+
- 面向产品的审批与运维控制面,而不是 backend 专属的中间件状态
|
|
56
|
+
- 稳定持久化的 `runs`、`threads`、`approvals` 与 `events` 记录
|
|
57
|
+
- 由系统托管的重启恢复与中断续跑
|
|
58
|
+
- 由 YAML 持有的路由、并发、维护与恢复策略
|
|
59
|
+
- 通过适配器隔离 backend 变化,不让公共运行时模型漂移
|
|
60
|
+
|
|
32
61
|
## 产品概览
|
|
33
62
|
|
|
34
|
-
`@botbotgo/agent-harness`
|
|
63
|
+
`@botbotgo/agent-harness` 是面向真实多 agent 产品的、工作区形态的应用运行时。
|
|
35
64
|
|
|
36
65
|
它不是又一个 agent 框架,而是围绕 LangChain v1 与 DeepAgents 的运行时层:把一个工作区变成一套可运维的应用运行时。
|
|
37
66
|
|
|
@@ -39,7 +68,7 @@
|
|
|
39
68
|
|
|
40
69
|
- Codex、Claude Code、Cursor 是「人用 agent」的产品
|
|
41
70
|
- LangChain v1 与 DeepAgents 是定义 agent 执行语义的框架
|
|
42
|
-
- `agent-harness`
|
|
71
|
+
- `agent-harness` 是负责运维、恢复、审批与治理多 agent 应用的运行时产品层
|
|
43
72
|
|
|
44
73
|
产品边界是清晰的:
|
|
45
74
|
|
|
@@ -61,6 +90,14 @@
|
|
|
61
90
|
- 本地 `resources/tools/` 中 `tool({...})` 工具模块与 `resources/skills/` 的发现
|
|
62
91
|
- 持久化的线程、运行、审批、事件、队列状态与恢复元数据
|
|
63
92
|
|
|
93
|
+
落到实际系统里,harness 主要解决那些每个 agent 应用都不想重复造一遍的运行时难题:
|
|
94
|
+
|
|
95
|
+
- 审批收件箱与人工决策流
|
|
96
|
+
- 持久化的 runs、threads 与可查询事件历史
|
|
97
|
+
- 中断、失败或进程重启后的运行时托管恢复
|
|
98
|
+
- 队列、并发、维护与运维策略
|
|
99
|
+
- 即使后端变更也保持稳定的运行时记录模型
|
|
100
|
+
|
|
64
101
|
仓库自带的默认配置刻意做成「形状完整」。随仓库提供的 YAML 对重要的运行时与 agent 开关给出显式默认值,便于从具体配置起步,而不必从源码反推适配器行为。
|
|
65
102
|
|
|
66
103
|
默认原则是:
|
|
@@ -89,7 +126,7 @@
|
|
|
89
126
|
|
|
90
127
|
## 为何需要它
|
|
91
128
|
|
|
92
|
-
多数 agent
|
|
129
|
+
多数 agent 工具停在「能跑」。
|
|
93
130
|
|
|
94
131
|
真实产品需要能回答更难问题的运行时:
|
|
95
132
|
|
|
@@ -103,10 +140,11 @@
|
|
|
103
140
|
|
|
104
141
|
## 有何不同
|
|
105
142
|
|
|
106
|
-
- 将 `runs`、`threads`、`approvals`、`events`
|
|
143
|
+
- 将 `runs`、`threads`、`approvals`、`events` 与恢复视为一等产品记录
|
|
144
|
+
- 给运维侧提供运行时控制面,而不是暴露原始后端内部结构
|
|
107
145
|
- 将 checkpoint 恢复作为系统管理的行为,而不是把 checkpoint 细节抬成主 API
|
|
108
|
-
-
|
|
109
|
-
-
|
|
146
|
+
- 复杂装配与运行策略交给 YAML,代码面保持极小
|
|
147
|
+
- 在上游库未充分产品化的运行时问题上做深做透
|
|
110
148
|
|
|
111
149
|
## 快速开始
|
|
112
150
|
|
package/dist/benchmark/upstream-runtime-ab-benchmark.d.ts
CHANGED
|
@@ -1,15 +1,23 @@
|
|
|
1
|
-
export declare const DEFAULT_UPSTREAM_BENCHMARK_PATHS: readonly ["harness", "raw-langchain-v1", "raw-deepagent"];
|
|
1
|
+
export declare const DEFAULT_UPSTREAM_BENCHMARK_PATHS: readonly ["harness", "harness-minimal-upstream", "raw-langchain-v1", "raw-deepagent"];
|
|
2
2
|
export declare const DEFAULT_UPSTREAM_BENCHMARK_WORKLOAD: "tool";
|
|
3
|
+
export declare const DEFAULT_UPSTREAM_BENCHMARK_SCENARIOS: readonly ["normal", "complex", "extreme"];
|
|
3
4
|
export type UpstreamBenchmarkPath = (typeof DEFAULT_UPSTREAM_BENCHMARK_PATHS)[number];
|
|
4
5
|
export type UpstreamBenchmarkWorkload = "tool" | "no-tool";
|
|
6
|
+
export type UpstreamBenchmarkScenario = (typeof DEFAULT_UPSTREAM_BENCHMARK_SCENARIOS)[number];
|
|
5
7
|
export type UpstreamBenchmarkRunSummary = {
|
|
6
8
|
providerLabel: string;
|
|
7
9
|
model: string;
|
|
10
|
+
scenario: UpstreamBenchmarkScenario;
|
|
8
11
|
path: UpstreamBenchmarkPath;
|
|
9
12
|
runNumber: number;
|
|
10
13
|
status: "completed" | "failed";
|
|
11
14
|
totalMs: number;
|
|
12
15
|
firstTokenMs: number | null;
|
|
16
|
+
setupMs: number | null;
|
|
17
|
+
firstToolMs: number | null;
|
|
18
|
+
lastToolMs: number | null;
|
|
19
|
+
finalOutputMs: number | null;
|
|
20
|
+
cleanupMs: number | null;
|
|
13
21
|
outputLength: number;
|
|
14
22
|
normalizedOutputLength: number;
|
|
15
23
|
toolCallCount: number;
|
|
@@ -19,6 +27,7 @@ export type UpstreamBenchmarkRunSummary = {
|
|
|
19
27
|
export type UpstreamBenchmarkAggregateSummary = {
|
|
20
28
|
providerLabel: string;
|
|
21
29
|
model: string;
|
|
30
|
+
scenario: UpstreamBenchmarkScenario;
|
|
22
31
|
path: UpstreamBenchmarkPath;
|
|
23
32
|
repetitions: number;
|
|
24
33
|
successCount: number;
|
|
@@ -31,6 +40,11 @@ export type UpstreamBenchmarkAggregateSummary = {
|
|
|
31
40
|
trimmedAvgFirstTokenMs: number | null;
|
|
32
41
|
medianFirstTokenMs: number | null;
|
|
33
42
|
p95FirstTokenMs: number | null;
|
|
43
|
+
avgSetupMs: number | null;
|
|
44
|
+
avgFirstToolMs: number | null;
|
|
45
|
+
avgLastToolMs: number | null;
|
|
46
|
+
avgFinalOutputMs: number | null;
|
|
47
|
+
avgCleanupMs: number | null;
|
|
34
48
|
avgOutputLength: number | null;
|
|
35
49
|
avgNormalizedOutputLength: number | null;
|
|
36
50
|
avgToolCallCount: number | null;
|
|
@@ -44,8 +58,24 @@ export type UpstreamBenchmarkComparison = {
|
|
|
44
58
|
avgFirstTokenMsDelta: number | null;
|
|
45
59
|
avgFirstTokenMsOverheadPct: number | null;
|
|
46
60
|
};
|
|
61
|
+
export type UpstreamBenchmarkPhaseCheckpoint = {
|
|
62
|
+
label: string;
|
|
63
|
+
atMs: number | null;
|
|
64
|
+
};
|
|
65
|
+
export type UpstreamBenchmarkTemperature = "cold" | "warm";
|
|
66
|
+
export type UpstreamBenchmarkDurationSummary = {
|
|
67
|
+
count: number;
|
|
68
|
+
totalMs: number;
|
|
69
|
+
avgMs: number | null;
|
|
70
|
+
maxMs: number | null;
|
|
71
|
+
};
|
|
47
72
|
export declare function resolveUpstreamBenchmarkPaths(rawValue?: string): readonly UpstreamBenchmarkPath[];
|
|
48
73
|
export declare function resolveUpstreamBenchmarkWorkload(rawValue?: string): UpstreamBenchmarkWorkload;
|
|
74
|
+
export declare function resolveUpstreamBenchmarkScenarios(rawValue?: string): readonly UpstreamBenchmarkScenario[];
|
|
49
75
|
export declare function extractLastMatchingToken(output: string, prefixes: readonly string[]): string;
|
|
50
|
-
export declare function aggregateUpstreamBenchmarkRuns(providerLabel: string, model: string, path: UpstreamBenchmarkPath, runs: UpstreamBenchmarkRunSummary[]): UpstreamBenchmarkAggregateSummary;
|
|
76
|
+
export declare function aggregateUpstreamBenchmarkRuns(providerLabel: string, model: string, scenario: UpstreamBenchmarkScenario, path: UpstreamBenchmarkPath, runs: UpstreamBenchmarkRunSummary[]): UpstreamBenchmarkAggregateSummary;
|
|
77
|
+
export declare function withUpstreamBenchmarkCleanup(summary: UpstreamBenchmarkRunSummary, cleanupMs: number | null): UpstreamBenchmarkRunSummary;
|
|
78
|
+
export declare function summarizeUpstreamBenchmarkPhases(checkpoints: readonly UpstreamBenchmarkPhaseCheckpoint[]): Record<string, number | null>;
|
|
79
|
+
export declare function selectUpstreamBenchmarkRunsByTemperature(runs: readonly UpstreamBenchmarkRunSummary[], temperature: UpstreamBenchmarkTemperature): UpstreamBenchmarkRunSummary[];
|
|
80
|
+
export declare function summarizeUpstreamBenchmarkDurations(values: readonly number[]): UpstreamBenchmarkDurationSummary;
|
|
51
81
|
export declare function compareUpstreamBenchmarkPaths(baseline: UpstreamBenchmarkAggregateSummary, candidate: UpstreamBenchmarkAggregateSummary): UpstreamBenchmarkComparison;
|
package/dist/benchmark/upstream-runtime-ab-benchmark.js
CHANGED
|
@@ -1,9 +1,15 @@
|
|
|
1
1
|
export const DEFAULT_UPSTREAM_BENCHMARK_PATHS = Object.freeze([
|
|
2
2
|
"harness",
|
|
3
|
+
"harness-minimal-upstream",
|
|
3
4
|
"raw-langchain-v1",
|
|
4
5
|
"raw-deepagent",
|
|
5
6
|
]);
|
|
6
7
|
export const DEFAULT_UPSTREAM_BENCHMARK_WORKLOAD = "tool";
|
|
8
|
+
export const DEFAULT_UPSTREAM_BENCHMARK_SCENARIOS = Object.freeze([
|
|
9
|
+
"normal",
|
|
10
|
+
"complex",
|
|
11
|
+
"extreme",
|
|
12
|
+
]);
|
|
7
13
|
function average(values) {
|
|
8
14
|
return Number((values.reduce((sum, value) => sum + value, 0) / values.length).toFixed(2));
|
|
9
15
|
}
|
|
@@ -47,12 +53,25 @@ export function resolveUpstreamBenchmarkPaths(rawValue) {
|
|
|
47
53
|
const parsed = rawValue
|
|
48
54
|
.split(",")
|
|
49
55
|
.map((value) => value.trim().toLowerCase())
|
|
50
|
-
.filter((value) => value === "harness" ||
|
|
56
|
+
.filter((value) => value === "harness" ||
|
|
57
|
+
value === "harness-minimal-upstream" ||
|
|
58
|
+
value === "raw-langchain-v1" ||
|
|
59
|
+
value === "raw-deepagent");
|
|
51
60
|
return parsed.length > 0 ? parsed : [...DEFAULT_UPSTREAM_BENCHMARK_PATHS];
|
|
52
61
|
}
|
|
53
62
|
export function resolveUpstreamBenchmarkWorkload(rawValue) {
|
|
54
63
|
return rawValue?.trim().toLowerCase() === "no-tool" ? "no-tool" : DEFAULT_UPSTREAM_BENCHMARK_WORKLOAD;
|
|
55
64
|
}
|
|
65
|
+
export function resolveUpstreamBenchmarkScenarios(rawValue) {
|
|
66
|
+
if (!rawValue) {
|
|
67
|
+
return [...DEFAULT_UPSTREAM_BENCHMARK_SCENARIOS];
|
|
68
|
+
}
|
|
69
|
+
const parsed = rawValue
|
|
70
|
+
.split(",")
|
|
71
|
+
.map((value) => value.trim().toLowerCase())
|
|
72
|
+
.filter((value) => value === "normal" || value === "complex" || value === "extreme");
|
|
73
|
+
return parsed.length > 0 ? parsed : [...DEFAULT_UPSTREAM_BENCHMARK_SCENARIOS];
|
|
74
|
+
}
|
|
56
75
|
export function extractLastMatchingToken(output, prefixes) {
|
|
57
76
|
const normalized = output.replace(/\s+/g, " ").trim();
|
|
58
77
|
let matched = "";
|
|
@@ -69,7 +88,7 @@ export function extractLastMatchingToken(output, prefixes) {
|
|
|
69
88
|
}
|
|
70
89
|
return matched || normalized;
|
|
71
90
|
}
|
|
72
|
-
export function aggregateUpstreamBenchmarkRuns(providerLabel, model, path, runs) {
|
|
91
|
+
export function aggregateUpstreamBenchmarkRuns(providerLabel, model, scenario, path, runs) {
|
|
73
92
|
const successfulRuns = runs.filter((run) => run.status === "completed");
|
|
74
93
|
const totalValues = successfulRuns.map((run) => run.totalMs);
|
|
75
94
|
const firstTokenValues = successfulRuns
|
|
@@ -78,6 +97,7 @@ export function aggregateUpstreamBenchmarkRuns(providerLabel, model, path, runs)
|
|
|
78
97
|
return {
|
|
79
98
|
providerLabel,
|
|
80
99
|
model,
|
|
100
|
+
scenario,
|
|
81
101
|
path,
|
|
82
102
|
repetitions: runs.length,
|
|
83
103
|
successCount: successfulRuns.length,
|
|
@@ -90,12 +110,48 @@ export function aggregateUpstreamBenchmarkRuns(providerLabel, model, path, runs)
|
|
|
90
110
|
trimmedAvgFirstTokenMs: trimmedAverageOrNull(firstTokenValues, 0.1),
|
|
91
111
|
medianFirstTokenMs: medianOrNull(firstTokenValues),
|
|
92
112
|
p95FirstTokenMs: percentileOrNull(firstTokenValues, 0.95),
|
|
113
|
+
avgSetupMs: averageOrNull(successfulRuns.map((run) => run.setupMs).filter((value) => value !== null)),
|
|
114
|
+
avgFirstToolMs: averageOrNull(successfulRuns.map((run) => run.firstToolMs).filter((value) => value !== null)),
|
|
115
|
+
avgLastToolMs: averageOrNull(successfulRuns.map((run) => run.lastToolMs).filter((value) => value !== null)),
|
|
116
|
+
avgFinalOutputMs: averageOrNull(successfulRuns.map((run) => run.finalOutputMs).filter((value) => value !== null)),
|
|
117
|
+
avgCleanupMs: averageOrNull(successfulRuns.map((run) => run.cleanupMs).filter((value) => value !== null)),
|
|
93
118
|
avgOutputLength: averageOrNull(successfulRuns.map((run) => run.outputLength)),
|
|
94
119
|
avgNormalizedOutputLength: averageOrNull(successfulRuns.map((run) => run.normalizedOutputLength)),
|
|
95
120
|
avgToolCallCount: averageOrNull(successfulRuns.map((run) => run.toolCallCount)),
|
|
96
121
|
exactOutputMatchCount: successfulRuns.filter((run) => run.exactOutputMatch).length,
|
|
97
122
|
};
|
|
98
123
|
}
|
|
124
|
+
export function withUpstreamBenchmarkCleanup(summary, cleanupMs) {
|
|
125
|
+
return {
|
|
126
|
+
...summary,
|
|
127
|
+
cleanupMs,
|
|
128
|
+
};
|
|
129
|
+
}
|
|
130
|
+
export function summarizeUpstreamBenchmarkPhases(checkpoints) {
|
|
131
|
+
let previousAtMs = 0;
|
|
132
|
+
const durations = {};
|
|
133
|
+
for (const checkpoint of checkpoints) {
|
|
134
|
+
const key = `${checkpoint.label}Ms`;
|
|
135
|
+
if (checkpoint.atMs === null) {
|
|
136
|
+
durations[key] = null;
|
|
137
|
+
continue;
|
|
138
|
+
}
|
|
139
|
+
durations[key] = Number((checkpoint.atMs - previousAtMs).toFixed(2));
|
|
140
|
+
previousAtMs = checkpoint.atMs;
|
|
141
|
+
}
|
|
142
|
+
return durations;
|
|
143
|
+
}
|
|
144
|
+
export function selectUpstreamBenchmarkRunsByTemperature(runs, temperature) {
|
|
145
|
+
return runs.filter((run) => (temperature === "cold" ? run.runNumber === 1 : run.runNumber > 1));
|
|
146
|
+
}
|
|
147
|
+
export function summarizeUpstreamBenchmarkDurations(values) {
|
|
148
|
+
return {
|
|
149
|
+
count: values.length,
|
|
150
|
+
totalMs: Number(values.reduce((sum, value) => sum + value, 0).toFixed(2)),
|
|
151
|
+
avgMs: values.length > 0 ? average(values) : null,
|
|
152
|
+
maxMs: values.length > 0 ? Number(Math.max(...values).toFixed(2)) : null,
|
|
153
|
+
};
|
|
154
|
+
}
|
|
99
155
|
function computeOverhead(candidate, baseline) {
|
|
100
156
|
if (candidate === null || baseline === null) {
|
|
101
157
|
return { delta: null, pct: null };
|
package/dist/contracts/runtime.d.ts
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
import type { RunState } from "./core.js";
|
|
2
2
|
import type { CompiledAgentBinding, CompiledModel, CompiledTool, ParsedAgentObject, ParsedToolObject, WorkspaceBundle } from "./workspace.js";
|
|
3
|
+
/**
|
|
4
|
+
* Persisted runtime summary for an inspectable conversation thread.
|
|
5
|
+
* This projects upstream session/thread execution state into a stable runtime
|
|
6
|
+
* inspection surface.
|
|
7
|
+
*/
|
|
3
8
|
export type ThreadSummary = {
|
|
4
9
|
agentId: string;
|
|
5
10
|
threadId: string;
|
|
@@ -11,6 +16,11 @@ export type ThreadSummary = {
|
|
|
11
16
|
export type SessionRecord = ThreadSummary;
|
|
12
17
|
export type KnownHarnessEventType = "run.created" | "run.queued" | "run.dequeued" | "run.state.changed" | "run.resumed" | "approval.requested" | "approval.resolved" | "artifact.created" | "output.delta" | "runtime.health.changed" | "runtime.synthetic_fallback";
|
|
13
18
|
export type HarnessEventType = KnownHarnessEventType | (string & {});
|
|
19
|
+
/**
|
|
20
|
+
* Persisted runtime event recorded by the harness runtime.
|
|
21
|
+
* Event payload semantics should stay aligned with upstream/runtime behavior
|
|
22
|
+
* rather than introducing a second execution protocol.
|
|
23
|
+
*/
|
|
14
24
|
export type HarnessEvent = {
|
|
15
25
|
eventId: string;
|
|
16
26
|
eventType: HarnessEventType;
|
|
@@ -45,6 +55,10 @@ export type RuntimeHealthSymptom = {
|
|
|
45
55
|
firstSeenAt: string;
|
|
46
56
|
lastSeenAt: string;
|
|
47
57
|
};
|
|
58
|
+
/**
|
|
59
|
+
* Harness-operated operational state built from persisted records and runtime telemetry.
|
|
60
|
+
* This is runtime ops state, not an upstream execution semantic.
|
|
61
|
+
*/
|
|
48
62
|
export type RuntimeHealthSnapshot = {
|
|
49
63
|
status: HealthStatus;
|
|
50
64
|
updatedAt: string;
|
|
@@ -153,8 +167,15 @@ export type ThreadRunRecord = {
|
|
|
153
167
|
checkpointRef: string | null;
|
|
154
168
|
resumable: boolean;
|
|
155
169
|
};
|
|
170
|
+
/**
|
|
171
|
+
* Persisted run summary projected from upstream execution state plus runtime lifecycle metadata.
|
|
172
|
+
*/
|
|
156
173
|
export type RunSummary = ThreadRunRecord;
|
|
157
174
|
export type RunRecord = RunSummary;
|
|
175
|
+
/**
|
|
176
|
+
* Persisted thread inspection record assembled from runtime records.
|
|
177
|
+
* This is an inspectable projection, not a second thread semantic model.
|
|
178
|
+
*/
|
|
158
179
|
export type ThreadRecord = {
|
|
159
180
|
threadId: string;
|
|
160
181
|
entryAgentId: string;
|
|
@@ -188,6 +209,11 @@ export type RestartConversationOptions = {
|
|
|
188
209
|
mode: "restart-in-thread" | "restart-new-thread";
|
|
189
210
|
input: string;
|
|
190
211
|
};
|
|
212
|
+
/**
|
|
213
|
+
* Persisted approval inspection record.
|
|
214
|
+
* Approval decision semantics should stay aligned with upstream interrupt/approval
|
|
215
|
+
* behavior even though the record is stored and resolved through harness persistence.
|
|
216
|
+
*/
|
|
191
217
|
export type ApprovalRecord = {
|
|
192
218
|
approvalId: string;
|
|
193
219
|
pendingActionId: string;
|
|
@@ -240,6 +266,12 @@ export type RuntimeAdapterOptions = {
|
|
|
240
266
|
checkpointerResolver?: RuntimeCheckpointerResolver;
|
|
241
267
|
storeResolver?: RuntimeStoreResolver;
|
|
242
268
|
backendResolver?: RuntimeBackendResolver;
|
|
269
|
+
/**
|
|
270
|
+
* DeepAgent execution semantics stay upstream-owned.
|
|
271
|
+
* `minimal` keeps harness runtime persistence/ops active while only attaching
|
|
272
|
+
* upstream substrate objects when the binding explicitly needs them.
|
|
273
|
+
*/
|
|
274
|
+
deepAgentUpstreamSubstrateMode?: "full" | "minimal";
|
|
243
275
|
};
|
|
244
276
|
export type ToolKindAdapter = {
|
|
245
277
|
type: string;
|
package/dist/package-version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const AGENT_HARNESS_VERSION = "0.0.
|
|
1
|
+
export declare const AGENT_HARNESS_VERSION = "0.0.124";
|
package/dist/package-version.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export const AGENT_HARNESS_VERSION = "0.0.
|
|
1
|
+
export const AGENT_HARNESS_VERSION = "0.0.124";
|
package/dist/runtime/agent-runtime-adapter.d.ts
CHANGED
|
@@ -32,9 +32,9 @@ export declare function buildDeepAgentCreateParams(input: {
|
|
|
32
32
|
resolvedTools: unknown[];
|
|
33
33
|
resolvedMiddleware: unknown[];
|
|
34
34
|
resolvedSubagents: UpstreamSubagentConfig[];
|
|
35
|
-
resolvedCheckpointer
|
|
36
|
-
resolvedStore
|
|
37
|
-
resolvedBackend
|
|
35
|
+
resolvedCheckpointer?: unknown;
|
|
36
|
+
resolvedStore?: unknown;
|
|
37
|
+
resolvedBackend?: unknown;
|
|
38
38
|
resolvedInterruptOn?: Record<string, {
|
|
39
39
|
allowedDecisions: Array<"approve" | "edit" | "reject">;
|
|
40
40
|
}>;
|
package/dist/runtime/agent-runtime-adapter.js
CHANGED
|
@@ -18,7 +18,7 @@ export { applyDeepAgentDelegationPromptCompatibility, materializeDeepAgentSkillS
|
|
|
18
18
|
export { buildAuthOmittingFetch, normalizeOpenAICompatibleInit } from "./adapter/compat/openai-compatible.js";
|
|
19
19
|
export { buildToolNameMapping, createModelFacingToolNameCandidates, createModelFacingToolNameLookupCandidates, resolveModelFacingToolName, sanitizeToolNameForModel, } from "./adapter/tool/tool-name-mapping.js";
|
|
20
20
|
export { computeRemainingTimeoutMs, isRetryableProviderError, resolveBindingTimeout, resolveProviderRetryPolicy, resolveStreamIdleTimeout, resolveTimeoutMs, } from "./adapter/resilience.js";
|
|
21
|
-
import { getBindingAdapterKind, getBindingExecutionKind, getBindingExecutionParams, getBindingFilesystemConfig, getBindingInterruptCompatibilityRules, getBindingPrimaryModel, getBindingSkills, getBindingSubagents, getBindingToolCount, getBindingPrimaryTools, getBindingSystemPrompt, isDeepAgentBinding, isLangChainBinding, } from "./support/compiled-binding.js";
|
|
21
|
+
import { getBindingBackendConfig, getBindingAdapterKind, getBindingExecutionKind, getBindingExecutionParams, getBindingFilesystemConfig, getBindingInterruptCompatibilityRules, getBindingMemorySources, getBindingMiddlewareConfigs, getBindingPrimaryModel, getBindingSkills, getBindingStoreConfig, getBindingSubagents, getBindingToolCount, getBindingPrimaryTools, getBindingSystemPrompt, isDeepAgentBinding, isLangChainBinding, } from "./support/compiled-binding.js";
|
|
22
22
|
const AGENT_INTERRUPT_SENTINEL_PREFIX = "__agent_harness_interrupt__:";
|
|
23
23
|
const UPSTREAM_BUILTIN_MIDDLEWARE_TOOL_NAMES = Object.freeze([
|
|
24
24
|
"write_todos",
|
|
@@ -116,13 +116,28 @@ export function buildDeepAgentCreateParams(input) {
|
|
|
116
116
|
model: input.resolvedModel,
|
|
117
117
|
tools: input.resolvedTools,
|
|
118
118
|
middleware: input.resolvedMiddleware,
|
|
119
|
-
checkpointer: input.resolvedCheckpointer,
|
|
120
|
-
store: input.resolvedStore,
|
|
121
119
|
subagents: input.resolvedSubagents,
|
|
122
|
-
backend: input.resolvedBackend,
|
|
123
120
|
interruptOn: input.resolvedInterruptOn,
|
|
121
|
+
...(input.resolvedCheckpointer !== undefined ? { checkpointer: input.resolvedCheckpointer } : {}),
|
|
122
|
+
...(input.resolvedStore !== undefined ? { store: input.resolvedStore } : {}),
|
|
123
|
+
...(input.resolvedBackend !== undefined ? { backend: input.resolvedBackend } : {}),
|
|
124
124
|
};
|
|
125
125
|
}
|
|
126
|
+
function shouldAttachMinimalDeepAgentCheckpointer(binding, resolvedInterruptOn) {
|
|
127
|
+
if (binding.harnessRuntime.checkpointer !== undefined) {
|
|
128
|
+
return true;
|
|
129
|
+
}
|
|
130
|
+
return resolvedInterruptOn !== undefined && Object.keys(resolvedInterruptOn).length > 0;
|
|
131
|
+
}
|
|
132
|
+
function shouldAttachMinimalDeepAgentStore(binding) {
|
|
133
|
+
return getBindingStoreConfig(binding) !== undefined || getBindingMemorySources(binding).length > 0;
|
|
134
|
+
}
|
|
135
|
+
function shouldAttachMinimalDeepAgentBackend(binding) {
|
|
136
|
+
return (getBindingBackendConfig(binding) !== undefined ||
|
|
137
|
+
getBindingMemorySources(binding).length > 0 ||
|
|
138
|
+
getBindingSkills(binding).length > 0 ||
|
|
139
|
+
(getBindingMiddlewareConfigs(binding)?.length ?? 0) > 0);
|
|
140
|
+
}
|
|
126
141
|
export class AgentRuntimeAdapter {
|
|
127
142
|
options;
|
|
128
143
|
modelCache = new Map();
|
|
@@ -351,10 +366,19 @@ export class AgentRuntimeAdapter {
|
|
|
351
366
|
const resolvedTools = this.resolveTools(primaryTools, binding);
|
|
352
367
|
const resolvedMiddleware = await this.resolveMiddleware(binding);
|
|
353
368
|
const resolvedSubagents = await this.resolveSubagents(getBindingSubagents(binding), binding);
|
|
354
|
-
const resolvedCheckpointer = resolveRunnableCheckpointer(this.options, binding);
|
|
355
|
-
const resolvedStore = this.options.storeResolver?.(binding);
|
|
356
|
-
const resolvedBackend = this.options.backendResolver?.(binding);
|
|
357
369
|
const resolvedInterruptOn = resolveRunnableInterruptOn(binding);
|
|
370
|
+
const substrateMode = this.options.deepAgentUpstreamSubstrateMode ?? "minimal";
|
|
371
|
+
const resolvedCheckpointer = substrateMode === "minimal"
|
|
372
|
+
? (shouldAttachMinimalDeepAgentCheckpointer(binding, resolvedInterruptOn)
|
|
373
|
+
? resolveRunnableCheckpointer(this.options, binding)
|
|
374
|
+
: undefined)
|
|
375
|
+
: resolveRunnableCheckpointer(this.options, binding);
|
|
376
|
+
const resolvedStore = substrateMode === "minimal"
|
|
377
|
+
? (shouldAttachMinimalDeepAgentStore(binding) ? this.options.storeResolver?.(binding) : undefined)
|
|
378
|
+
: this.options.storeResolver?.(binding);
|
|
379
|
+
const resolvedBackend = substrateMode === "minimal"
|
|
380
|
+
? (shouldAttachMinimalDeepAgentBackend(binding) ? this.options.backendResolver?.(binding) : undefined)
|
|
381
|
+
: this.options.backendResolver?.(binding);
|
|
358
382
|
const resolvedSkills = resolveDeepAgentSkillSourcePaths({
|
|
359
383
|
workspaceRoot: binding.harnessRuntime.workspaceRoot,
|
|
360
384
|
runRoot: binding.harnessRuntime.runRoot,
|
package/dist/runtime/harness/run/helpers.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { ApprovalRecord, HarnessEvent, InternalApprovalRecord, InvocationEnvelope, RunListeners, RunOptions, RunResult, RunStartOptions, MessageContent } from "../../../contracts/types.js";
|
|
2
|
-
export declare function
|
|
2
|
+
export declare function toInspectableApprovalRecord(approval: InternalApprovalRecord): ApprovalRecord;
|
|
3
3
|
export declare function normalizeInvocationEnvelope(options: RunStartOptions): {
|
|
4
4
|
context?: Record<string, unknown>;
|
|
5
5
|
state?: Record<string, unknown>;
|
package/dist/runtime/harness/run/helpers.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { normalizeMessageContent } from "../../../utils/message-content.js";
|
|
2
|
-
export function
|
|
2
|
+
export function toInspectableApprovalRecord(approval) {
|
|
3
3
|
const { toolCallId: _toolCallId, checkpointRef: _checkpointRef, eventRefs: _eventRefs, ...publicApproval } = approval;
|
|
4
4
|
return publicApproval;
|
|
5
5
|
}
|
package/dist/runtime/harness/run/stream-run.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { AGENT_INTERRUPT_SENTINEL_PREFIX, RuntimeOperationTimeoutError } from "../../agent-runtime-adapter.js";
|
|
2
2
|
import { renderRuntimeFailure, renderToolFailure } from "../../support/harness-support.js";
|
|
3
|
-
import { createContentBlocksItem, createToolResultKey,
|
|
3
|
+
import { createContentBlocksItem, createToolResultKey, } from "../events/streaming.js";
|
|
4
4
|
function normalizeStreamChunk(chunk) {
|
|
5
5
|
if (typeof chunk === "string") {
|
|
6
6
|
if (chunk.startsWith(AGENT_INTERRUPT_SENTINEL_PREFIX)) {
|
|
@@ -33,7 +33,6 @@ export async function* streamHarnessRun(options) {
|
|
|
33
33
|
let releaseRunSlot = async () => undefined;
|
|
34
34
|
let emitted = false;
|
|
35
35
|
let streamActivityObserved = false;
|
|
36
|
-
const emitOutputDelta = (content) => emitOutputDeltaAndCreateItem(options.emit, options.threadId, options.runId, options.selectedAgentId, content);
|
|
37
36
|
try {
|
|
38
37
|
const [priorHistory, acquiredReleaseRunSlot] = await Promise.all([
|
|
39
38
|
priorHistoryPromise,
|
|
@@ -114,12 +113,10 @@ export async function* streamHarnessRun(options) {
|
|
|
114
113
|
}
|
|
115
114
|
emitted = true;
|
|
116
115
|
assistantOutput += normalizedChunk.content;
|
|
117
|
-
yield await emitOutputDelta(normalizedChunk.content);
|
|
118
116
|
}
|
|
119
117
|
if (!assistantOutput && toolErrors.length > 0) {
|
|
120
118
|
assistantOutput = toolErrors.join("\n\n");
|
|
121
119
|
emitted = true;
|
|
122
|
-
yield await emitOutputDelta(assistantOutput);
|
|
123
120
|
}
|
|
124
121
|
if (!assistantOutput) {
|
|
125
122
|
const actual = await options.invokeWithHistory(options.binding, options.input, options.threadId, options.runId);
|
|
@@ -129,7 +126,6 @@ export async function* streamHarnessRun(options) {
|
|
|
129
126
|
if (actual.output) {
|
|
130
127
|
assistantOutput = actual.output;
|
|
131
128
|
emitted = true;
|
|
132
|
-
yield await emitOutputDelta(actual.output);
|
|
133
129
|
}
|
|
134
130
|
}
|
|
135
131
|
await options.appendAssistantMessage(options.threadId, options.runId, assistantOutput);
|
|
@@ -216,9 +212,6 @@ export async function* streamHarnessRun(options) {
|
|
|
216
212
|
if (Array.isArray(actual.contentBlocks) && actual.contentBlocks.length > 0) {
|
|
217
213
|
yield createContentBlocksItem(options.threadId, options.runId, options.selectedAgentId, actual.contentBlocks);
|
|
218
214
|
}
|
|
219
|
-
if (actual.output) {
|
|
220
|
-
yield await emitOutputDelta(actual.output);
|
|
221
|
-
}
|
|
222
215
|
yield {
|
|
223
216
|
type: "result",
|
|
224
217
|
result: {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import type { ApprovalRecord, ThreadRecord, ThreadSummary } from "../../../contracts/types.js";
|
|
2
2
|
import type { RuntimePersistence } from "../../../persistence/types.js";
|
|
3
|
-
export declare function
|
|
3
|
+
export declare function buildThreadInspectionRecord(input: {
|
|
4
4
|
persistence: RuntimePersistence;
|
|
5
5
|
getSession: (threadId: string) => Promise<ThreadSummary | null>;
|
|
6
6
|
}, threadId: string): Promise<ThreadRecord | null>;
|
|
@@ -1,5 +1,10 @@
|
|
|
1
|
-
import { isTerminalRunState,
|
|
2
|
-
|
|
1
|
+
import { isTerminalRunState, toInspectableApprovalRecord } from "./helpers.js";
|
|
2
|
+
function selectLatestPendingApproval(approvals) {
|
|
3
|
+
return approvals
|
|
4
|
+
.filter((approval) => approval.status === "pending")
|
|
5
|
+
.sort((left, right) => right.requestedAt.localeCompare(left.requestedAt))[0];
|
|
6
|
+
}
|
|
7
|
+
export async function buildThreadInspectionRecord(input, threadId) {
|
|
3
8
|
const [threadSummary, meta, messages, runs] = await Promise.all([
|
|
4
9
|
input.getSession(threadId),
|
|
5
10
|
input.persistence.getThreadMeta(threadId),
|
|
@@ -11,9 +16,7 @@ export async function getThreadRecord(input, threadId) {
|
|
|
11
16
|
}
|
|
12
17
|
const latestRunId = threadSummary.latestRunId;
|
|
13
18
|
const latestApprovals = await input.persistence.getRunApprovals(threadId, latestRunId);
|
|
14
|
-
const pendingApproval = latestApprovals
|
|
15
|
-
.filter((approval) => approval.status === "pending")
|
|
16
|
-
.sort((left, right) => right.requestedAt.localeCompare(left.requestedAt))[0];
|
|
19
|
+
const pendingApproval = selectLatestPendingApproval(latestApprovals);
|
|
17
20
|
return {
|
|
18
21
|
threadId,
|
|
19
22
|
entryAgentId: meta.entryAgentId,
|
|
@@ -36,11 +39,11 @@ export async function getThreadRecord(input, threadId) {
|
|
|
36
39
|
}
|
|
37
40
|
export async function listPublicApprovals(input, filter) {
|
|
38
41
|
const approvals = await input.persistence.listApprovals(filter);
|
|
39
|
-
return approvals.map((approval) =>
|
|
42
|
+
return approvals.map((approval) => toInspectableApprovalRecord(approval));
|
|
40
43
|
}
|
|
41
44
|
export async function getPublicApproval(input, approvalId) {
|
|
42
45
|
const approval = await input.persistence.getApproval(approvalId);
|
|
43
|
-
return approval ?
|
|
46
|
+
return approval ? toInspectableApprovalRecord(approval) : null;
|
|
44
47
|
}
|
|
45
48
|
export async function deleteThreadRecord(input, threadId) {
|
|
46
49
|
const thread = await input.getThread(threadId);
|
package/dist/runtime/harness.js
CHANGED
|
@@ -29,7 +29,7 @@ import { resolveRuntimeAdapterOptions } from "./support/runtime-adapter-options.
|
|
|
29
29
|
import { initializeHarnessRuntime, reclaimExpiredClaimedRuns as reclaimHarnessExpiredClaimedRuns, recoverStartupRuns as recoverHarnessStartupRuns, isStaleRunningRun as isHarnessStaleRunningRun, } from "./harness/run/startup-runtime.js";
|
|
30
30
|
import { streamHarnessRun } from "./harness/run/stream-run.js";
|
|
31
31
|
import { defaultRequestedAgentId, prepareRunStart } from "./harness/run/start-run.js";
|
|
32
|
-
import { deleteThreadRecord, getPublicApproval,
|
|
32
|
+
import { buildThreadInspectionRecord, deleteThreadRecord, getPublicApproval, listPublicApprovals, } from "./harness/run/thread-records.js";
|
|
33
33
|
export class AgentHarnessRuntime {
|
|
34
34
|
workspace;
|
|
35
35
|
runtimeAdapterOptions;
|
|
@@ -199,7 +199,7 @@ export class AgentHarnessRuntime {
|
|
|
199
199
|
return this.persistence.getSession(threadId);
|
|
200
200
|
}
|
|
201
201
|
async getThread(threadId) {
|
|
202
|
-
return
|
|
202
|
+
return buildThreadInspectionRecord({
|
|
203
203
|
persistence: this.persistence,
|
|
204
204
|
getSession: (currentThreadId) => this.getSession(currentThreadId),
|
|
205
205
|
}, threadId);
|