@parkgogogo/openclaw-reflection 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -11
- package/README.zh-CN.md +47 -11
- package/assets/memory-flowchart.png +0 -0
- package/assets/openclaw-reflection-logo.png +0 -0
- package/package.json +4 -2
- package/src/evals/cli.ts +15 -0
- package/src/evals/comparison.ts +248 -0
- package/src/evals/models.ts +125 -0
- package/src/evals/reporting.ts +123 -0
- package/src/evals/runner.ts +62 -0
- package/src/index.ts +66 -1
- package/src/write-guardian/audit-log.ts +71 -0
- package/src/write-guardian/index.ts +49 -7
package/README.md
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# OpenClaw Reflection
|
|
2
2
|
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="./assets/openclaw-reflection-logo.png" alt="OpenClaw Reflection logo" width="180" />
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<p align="center"><strong>Make OpenClaw's native memory system sharper without replacing it.</strong></p>
|
|
8
|
+
|
|
3
9
|

|
|
4
10
|

|
|
5
11
|

|
|
@@ -7,8 +13,6 @@
|
|
|
7
13
|
|
|
8
14
|
Chinese version: [README.zh-CN.md](./README.zh-CN.md)
|
|
9
15
|
|
|
10
|
-
**Make OpenClaw's native memory system sharper without replacing it.**
|
|
11
|
-
|
|
12
16
|
OpenClaw Reflection is an additive layer on top of OpenClaw's built-in Markdown memory system. It captures message flow, keeps thread noise out of long-term memory, writes durable knowledge into the same human-readable memory files OpenClaw already uses, and periodically consolidates them so your agent gets sharper over time instead of messier.
|
|
13
17
|
|
|
14
18
|
## Current Scope
|
|
@@ -103,6 +107,13 @@ Put the following under `plugins.entries.openclaw-reflection` in your OpenClaw c
|
|
|
103
107
|
|
|
104
108
|
Once the gateway restarts, Reflection will begin listening to `message_received` and `before_message_write`, then writing curated memory files into your configured `workspaceDir`.
|
|
105
109
|
|
|
110
|
+
### Observability command
|
|
111
|
+
|
|
112
|
+
- Reflection now writes an independent write_guardian audit log to:
|
|
113
|
+
- `<workspaceDir>/.openclaw-reflection/write-guardian.log.jsonl`
|
|
114
|
+
- Registers a command: `/openclaw-reflection`
|
|
115
|
+
- The command returns the 10 most recent write_guardian actions (written/refused/failed/skipped), including the decision, target file, and reason for each.
|
|
116
|
+
|
|
106
117
|
## What You Get
|
|
107
118
|
|
|
108
119
|
| You want | Reflection gives you |
|
|
@@ -114,15 +125,7 @@ Once the gateway restarts, Reflection will begin listening to `message_received`
|
|
|
114
125
|
|
|
115
126
|
## How It Works
|
|
116
127
|
|
|
117
|
-
```mermaid
|
|
118
|
-
flowchart LR
|
|
119
|
-
A["Incoming conversation"] --> B["Session buffer"]
|
|
120
|
-
B --> C["memory_gate"]
|
|
121
|
-
C -->|durable fact| D["write_guardian"]
|
|
122
|
-
C -->|thread noise| E["No write"]
|
|
123
|
-
D --> F["MEMORY.md / USER.md / SOUL.md / IDENTITY.md / TOOLS.md"]
|
|
124
|
-
F --> G["Scheduled consolidation"]
|
|
125
|
-
```
|
|
128
|
+

|
|
126
129
|
|
|
127
130
|
In practice, the pipeline is simple:
|
|
128
131
|
|
|
@@ -200,10 +203,43 @@ pnpm run typecheck
|
|
|
200
203
|
pnpm run eval:memory-gate
|
|
201
204
|
pnpm run eval:write-guardian
|
|
202
205
|
pnpm run eval:all
|
|
206
|
+
|
|
207
|
+
node evals/run.mjs \
|
|
208
|
+
--suite memory-gate \
|
|
209
|
+
--models-config evals/models.json \
|
|
210
|
+
--baseline grok-fast \
|
|
211
|
+
--output evals/results/$(date +%F)-memory-gate-matrix.json \
|
|
212
|
+
--markdown-output evals/results/$(date +%F)-memory-gate-matrix.md
|
|
203
213
|
```
|
|
204
214
|
|
|
215
|
+
`evals/models.json` defines only the comparison matrix. The shared provider endpoint and key still come from `EVAL_BASE_URL` and `EVAL_API_KEY`. JSON output is the source of truth for automation and history, while the Markdown artifact is the readable leaderboard summary.
|
|
216
|
+
|
|
205
217
|
More eval details: [evals/README.md](./evals/README.md)
|
|
206
218
|
|
|
219
|
+
## Model Selection
|
|
220
|
+
|
|
221
|
+
Benchmark date: `2026-03-09`
|
|
222
|
+
Scope: `memory_gate` only, `18` cases, shared OpenRouter-compatible `EVAL_*` route
|
|
223
|
+
|
|
224
|
+
| Model | Pass/Total | Accuracy | Errors (P/S/E) | Recommendation | Best For |
|
|
225
|
+
| --- | --- | --- | --- | --- | --- |
|
|
226
|
+
| `x-ai/grok-4.1-fast` | `17/18` | `94.4%` | `0/0/0` | Default baseline | Daily eval baseline |
|
|
227
|
+
| `qwen/qwen3.5-flash-02-23` | `17/18` | `94.4%` | `0/1/0` | Good backup option | Cost-sensitive cross-checks |
|
|
228
|
+
| `google/gemini-2.5-flash-lite` | `16/18` | `88.9%` | `0/0/0` | Fast iteration candidate | Cheap prompt iteration |
|
|
229
|
+
| `inception/mercury-2` | `11/18` | `61.1%` | `0/0/0` | Not recommended as default | Exploratory comparisons only |
|
|
230
|
+
| `minimax/minimax-m2.5` | `9/18` | `50.0%` | `0/0/0` | Not recommended as default | Occasional sanity checks only |
|
|
231
|
+
| `openai/gpt-4o-mini` | `4/18` | `22.2%` | `18/0/0` | Not recommended on current route | Avoid on current OpenRouter path |
|
|
232
|
+
|
|
233
|
+
How to choose:
|
|
234
|
+
|
|
235
|
+
- Default to `x-ai/grok-4.1-fast` because it had the best overall stability in this round with no internal errors.
|
|
236
|
+
- Use `qwen/qwen3.5-flash-02-23` as the strongest backup when you want similar accuracy but can tolerate one schema failure in this benchmark.
|
|
237
|
+
- Use `google/gemini-2.5-flash-lite` for cheaper, faster prompt iteration when slightly lower boundary accuracy is acceptable.
|
|
238
|
+
- Avoid `inception/mercury-2` and `minimax/minimax-m2.5` as defaults because they frequently collapse `SOUL`, `IDENTITY`, or `NO_WRITE` boundaries into the wrong bucket.
|
|
239
|
+
- Avoid `openai/gpt-4o-mini` on the current OpenRouter/Azure-backed route because all `18` cases surfaced provider-side structured-output errors.
|
|
240
|
+
|
|
241
|
+
Source artifact: [2026-03-09-memory-gate-openrouter-model-benchmark.md](./evals/results/2026-03-09-memory-gate-openrouter-model-benchmark.md)
|
|
242
|
+
|
|
207
243
|
## Links
|
|
208
244
|
|
|
209
245
|
- OpenClaw plugin docs: [docs.openclaw.ai/tools/plugin](https://docs.openclaw.ai/tools/plugin)
|
package/README.zh-CN.md
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# OpenClaw Reflection
|
|
2
2
|
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="./assets/openclaw-reflection-logo.png" alt="OpenClaw Reflection logo" width="180" />
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<p align="center"><strong>在不替换 OpenClaw 原生记忆体系的前提下,让 Markdown 记忆更干净、更稳定、更可持续。</strong></p>
|
|
8
|
+
|
|
3
9
|
英文版: [README.md](./README.md)
|
|
4
10
|
|
|
5
11
|

|
|
@@ -7,8 +13,6 @@
|
|
|
7
13
|

|
|
8
14
|

|
|
9
15
|
|
|
10
|
-
**在不替换 OpenClaw 原生记忆体系的前提下,让 Markdown 记忆更干净、更稳定、更可持续。**
|
|
11
|
-
|
|
12
16
|
OpenClaw Reflection 是叠加在 OpenClaw 原生 Markdown memory 之上的一层增强插件。它负责监听消息流,过滤线程噪音,把真正长期有效的信息写回 OpenClaw 的核心记忆文件,并定期整理这些文件,避免长期使用后越记越乱。
|
|
13
17
|
|
|
14
18
|
## 当前支持范围
|
|
@@ -98,6 +102,13 @@ openclaw plugins install @parkgogogo/openclaw-reflection
|
|
|
98
102
|
|
|
99
103
|
Gateway 重启后,Reflection 就会开始监听 `message_received` 和 `before_message_write`,并把整理后的长期信息写入你配置的 `workspaceDir`。
|
|
100
104
|
|
|
105
|
+
### 可观测性命令
|
|
106
|
+
|
|
107
|
+
- Reflection 现在会给 write_guardian 单独写一份审计日志:
|
|
108
|
+
- `<workspaceDir>/.openclaw-reflection/write-guardian.log.jsonl`
|
|
109
|
+
- 注册命令:`/openclaw-reflection`
|
|
110
|
+
- 返回最近 10 条 write_guardian 行为(written/refused/failed/skipped),包含 decision、目标文件和原因。
|
|
111
|
+
|
|
101
112
|
## 你会得到什么
|
|
102
113
|
|
|
103
114
|
| 你想要的能力 | Reflection 提供的结果 |
|
|
@@ -109,15 +120,7 @@ Gateway 重启后,Reflection 就会开始监听 `message_received` 和 `before
|
|
|
109
120
|
|
|
110
121
|
## 它如何工作
|
|
111
122
|
|
|
112
|
-
```mermaid
|
|
113
|
-
flowchart LR
|
|
114
|
-
A["Incoming conversation"] --> B["Session buffer"]
|
|
115
|
-
B --> C["memory_gate"]
|
|
116
|
-
C -->|durable fact| D["write_guardian"]
|
|
117
|
-
C -->|thread noise| E["No write"]
|
|
118
|
-
D --> F["MEMORY.md / USER.md / SOUL.md / IDENTITY.md / TOOLS.md"]
|
|
119
|
-
F --> G["Scheduled consolidation"]
|
|
120
|
-
```
|
|
123
|
+

|
|
121
124
|
|
|
122
125
|
流程很直接:
|
|
123
126
|
|
|
@@ -174,10 +177,43 @@ pnpm run typecheck
|
|
|
174
177
|
pnpm run eval:memory-gate
|
|
175
178
|
pnpm run eval:write-guardian
|
|
176
179
|
pnpm run eval:all
|
|
180
|
+
|
|
181
|
+
node evals/run.mjs \
|
|
182
|
+
--suite memory-gate \
|
|
183
|
+
--models-config evals/models.json \
|
|
184
|
+
--baseline grok-fast \
|
|
185
|
+
--output evals/results/$(date +%F)-memory-gate-matrix.json \
|
|
186
|
+
--markdown-output evals/results/$(date +%F)-memory-gate-matrix.md
|
|
177
187
|
```
|
|
178
188
|
|
|
189
|
+
`evals/models.json` 只用来定义多模型对比矩阵;共享的 provider endpoint 和 key 仍然来自 `EVAL_BASE_URL` 与 `EVAL_API_KEY`。JSON 输出是后续自动化和历史追踪的基准,Markdown 输出则是给人看的 leaderboard 摘要。
|
|
190
|
+
|
|
179
191
|
更多评测说明见 [evals/README.md](./evals/README.md)。
|
|
180
192
|
|
|
193
|
+
## 模型选择
|
|
194
|
+
|
|
195
|
+
评测日期:`2026-03-09`
|
|
196
|
+
范围:仅 `memory_gate`,共 `18` 个 case,共享 OpenRouter 兼容的 `EVAL_*` 路由
|
|
197
|
+
|
|
198
|
+
| 模型 | Pass/Total | 准确率 | 错误数 (P/S/E) | 建议 | 适用场景 |
|
|
199
|
+
| --- | --- | --- | --- | --- | --- |
|
|
200
|
+
| `x-ai/grok-4.1-fast` | `17/18` | `94.4%` | `0/0/0` | 默认基线 | 日常 eval 基线 |
|
|
201
|
+
| `qwen/qwen3.5-flash-02-23` | `17/18` | `94.4%` | `0/1/0` | 优秀备选 | 对成本敏感的交叉验证 |
|
|
202
|
+
| `google/gemini-2.5-flash-lite` | `16/18` | `88.9%` | `0/0/0` | 便宜快速候选 | 低成本 prompt 迭代 |
|
|
203
|
+
| `inception/mercury-2` | `11/18` | `61.1%` | `0/0/0` | 不建议默认使用 | 仅做探索性对比 |
|
|
204
|
+
| `minimax/minimax-m2.5` | `9/18` | `50.0%` | `0/0/0` | 不建议默认使用 | 偶尔做 sanity check |
|
|
205
|
+
| `openai/gpt-4o-mini` | `4/18` | `22.2%` | `18/0/0` | 当前路由下不建议使用 | 避免在当前 OpenRouter 路径使用 |
|
|
206
|
+
|
|
207
|
+
如何选择:
|
|
208
|
+
|
|
209
|
+
- 默认优先用 `x-ai/grok-4.1-fast`,因为这一轮里它的整体稳定性最好,而且没有内部错误。
|
|
210
|
+
- 如果想要接近的准确率,同时能接受一次 schema 失败,可以把 `qwen/qwen3.5-flash-02-23` 作为最强备选。
|
|
211
|
+
- 如果更看重低成本和快速迭代,可以用 `google/gemini-2.5-flash-lite`,但要接受它在部分 `TOOLS` 边界上略弱。
|
|
212
|
+
- 不要把 `inception/mercury-2` 和 `minimax/minimax-m2.5` 当默认基线,因为它们经常把 `SOUL`、`IDENTITY` 或 `NO_WRITE` 判到错误类别。
|
|
213
|
+
- 当前 OpenRouter/Azure 路由下不要选 `openai/gpt-4o-mini`,因为 `18` 个 case 全都触发了 provider 侧 structured-output 错误。
|
|
214
|
+
|
|
215
|
+
源结果见:[2026-03-09-memory-gate-openrouter-model-benchmark.md](./evals/results/2026-03-09-memory-gate-openrouter-model-benchmark.md)
|
|
216
|
+
|
|
181
217
|
## 链接
|
|
182
218
|
|
|
183
219
|
- OpenClaw plugin docs: [docs.openclaw.ai/tools/plugin](https://docs.openclaw.ai/tools/plugin)
|
|
Binary file
|
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@parkgogogo/openclaw-reflection",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.1.3",
|
|
4
4
|
"description": "OpenClaw plugin that enhances native Markdown memory with filtering, curation, and consolidation",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.ts",
|
|
7
7
|
"files": [
|
|
8
|
+
"assets/",
|
|
8
9
|
"src/",
|
|
9
10
|
"openclaw.plugin.json",
|
|
10
11
|
"README.md",
|
|
@@ -20,8 +21,9 @@
|
|
|
20
21
|
"url": "https://github.com/parkgogogo/openclaw-reflection/issues"
|
|
21
22
|
},
|
|
22
23
|
"scripts": {
|
|
23
|
-
"build": "tsc
|
|
24
|
+
"build": "tsc -p tsconfig.json",
|
|
24
25
|
"clean": "rm -rf logs",
|
|
26
|
+
"test": "pnpm run build && node --test tests/*.test.mjs",
|
|
25
27
|
"typecheck": "tsc --noEmit",
|
|
26
28
|
"e2e:openclaw-plugin": "bash scripts/e2e-openclaw-plugin.sh",
|
|
27
29
|
"eval:memory-gate": "pnpm exec tsc && node evals/run.mjs --suite memory-gate",
|
package/src/evals/cli.ts
CHANGED
|
@@ -7,6 +7,11 @@ export interface EvalCliOptions {
|
|
|
7
7
|
sharedDatasetPath?: string;
|
|
8
8
|
memoryGateDatasetPath?: string;
|
|
9
9
|
writeGuardianDatasetPath?: string;
|
|
10
|
+
modelsConfigPath?: string;
|
|
11
|
+
models?: string[];
|
|
12
|
+
baselineModelId?: string;
|
|
13
|
+
outputPath?: string;
|
|
14
|
+
markdownOutputPath?: string;
|
|
10
15
|
}
|
|
11
16
|
|
|
12
17
|
function getArgValue(argv: string[], flag: string): string | undefined {
|
|
@@ -34,6 +39,11 @@ function parseSuite(value: string | undefined): EvalSuite {
|
|
|
34
39
|
}
|
|
35
40
|
|
|
36
41
|
export function parseEvalCliOptions(argv: string[]): EvalCliOptions {
|
|
42
|
+
const models = getArgValue(argv, "--models")
|
|
43
|
+
?.split(",")
|
|
44
|
+
.map((modelId) => modelId.trim())
|
|
45
|
+
.filter((modelId) => modelId !== "");
|
|
46
|
+
|
|
37
47
|
return {
|
|
38
48
|
suite: parseSuite(getArgValue(argv, "--suite")),
|
|
39
49
|
useJudge: !argv.includes("--no-judge"),
|
|
@@ -41,5 +51,10 @@ export function parseEvalCliOptions(argv: string[]): EvalCliOptions {
|
|
|
41
51
|
sharedDatasetPath: getArgValue(argv, "--shared-dataset"),
|
|
42
52
|
memoryGateDatasetPath: getArgValue(argv, "--memory-gate-dataset"),
|
|
43
53
|
writeGuardianDatasetPath: getArgValue(argv, "--write-guardian-dataset"),
|
|
54
|
+
modelsConfigPath: getArgValue(argv, "--models-config"),
|
|
55
|
+
models,
|
|
56
|
+
baselineModelId: getArgValue(argv, "--baseline"),
|
|
57
|
+
outputPath: getArgValue(argv, "--output"),
|
|
58
|
+
markdownOutputPath: getArgValue(argv, "--markdown-output"),
|
|
44
59
|
};
|
|
45
60
|
}
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
MemoryGateCaseResult,
|
|
3
|
+
SingleModelRunReport,
|
|
4
|
+
WriteGuardianCaseResult,
|
|
5
|
+
} from "./runner.js";
|
|
6
|
+
import type { EvalSuite } from "./cli.js";
|
|
7
|
+
|
|
8
|
+
export interface RankedModelReport {
|
|
9
|
+
modelId: string;
|
|
10
|
+
passed: number;
|
|
11
|
+
total: number;
|
|
12
|
+
errorCounts?: SingleModelRunReport["summary"]["errorCounts"];
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
export interface BaselineDiff {
|
|
16
|
+
modelId: string;
|
|
17
|
+
regressedCases: string[];
|
|
18
|
+
improvedCases: string[];
|
|
19
|
+
disagreementCases: string[];
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
export interface HardestCase {
|
|
23
|
+
scenarioId: string;
|
|
24
|
+
failedBy: string[];
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export interface DisagreementCase {
|
|
28
|
+
scenarioId: string;
|
|
29
|
+
modelIds: string[];
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
export interface MultiModelComparisonReport {
|
|
33
|
+
runId: string;
|
|
34
|
+
timestamp: string;
|
|
35
|
+
suite: EvalSuite;
|
|
36
|
+
baselineModelId?: string;
|
|
37
|
+
models: SingleModelRunReport[];
|
|
38
|
+
comparison: {
|
|
39
|
+
ranking: RankedModelReport[];
|
|
40
|
+
baselineDiffs: BaselineDiff[];
|
|
41
|
+
hardestCases: HardestCase[];
|
|
42
|
+
disagreementCases: DisagreementCase[];
|
|
43
|
+
};
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
type EvalCaseResult = MemoryGateCaseResult | WriteGuardianCaseResult;
|
|
47
|
+
|
|
48
|
+
function getScenarioId(result: EvalCaseResult): string {
|
|
49
|
+
return result.scenarioId;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
function getTotalErrors(report: SingleModelRunReport): number {
|
|
53
|
+
const errorCounts = report.summary.errorCounts;
|
|
54
|
+
if (!errorCounts) {
|
|
55
|
+
return 0;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
return (
|
|
59
|
+
errorCounts.provider_error +
|
|
60
|
+
errorCounts.schema_error +
|
|
61
|
+
errorCounts.execution_error
|
|
62
|
+
);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
function getCaseSignature(result: EvalCaseResult): string {
|
|
66
|
+
if ("actualDecision" in result) {
|
|
67
|
+
return JSON.stringify({
|
|
68
|
+
pass: result.pass,
|
|
69
|
+
actualDecision: result.actualDecision,
|
|
70
|
+
decisionPass: result.decisionPass,
|
|
71
|
+
candidatePass: result.candidatePass,
|
|
72
|
+
errorType: result.errorType,
|
|
73
|
+
});
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
return JSON.stringify({
|
|
77
|
+
pass: result.pass,
|
|
78
|
+
actualShouldWrite: result.actualShouldWrite,
|
|
79
|
+
toolTrace: result.actualToolTrace,
|
|
80
|
+
});
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
function buildResultMap(report: SingleModelRunReport): Map<string, EvalCaseResult> {
|
|
84
|
+
return new Map(
|
|
85
|
+
report.results.map((result) => [getScenarioId(result as EvalCaseResult), result as EvalCaseResult])
|
|
86
|
+
);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
export function rankModelReports(
|
|
90
|
+
reports: SingleModelRunReport[]
|
|
91
|
+
): RankedModelReport[] {
|
|
92
|
+
return reports
|
|
93
|
+
.map((report) => ({
|
|
94
|
+
modelId: report.modelId,
|
|
95
|
+
passed: report.summary.passed,
|
|
96
|
+
total: report.summary.total,
|
|
97
|
+
errorCounts: report.summary.errorCounts,
|
|
98
|
+
totalErrors: getTotalErrors(report),
|
|
99
|
+
}))
|
|
100
|
+
.sort((left, right) => {
|
|
101
|
+
if (right.passed !== left.passed) {
|
|
102
|
+
return right.passed - left.passed;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
if (left.totalErrors !== right.totalErrors) {
|
|
106
|
+
return left.totalErrors - right.totalErrors;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
return left.modelId.localeCompare(right.modelId);
|
|
110
|
+
})
|
|
111
|
+
.map(({ totalErrors: _totalErrors, ...report }) => report);
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
export function buildBaselineDiffs(
|
|
115
|
+
reports: SingleModelRunReport[],
|
|
116
|
+
baselineModelId: string
|
|
117
|
+
): BaselineDiff[] {
|
|
118
|
+
const baselineReport = reports.find((report) => report.modelId === baselineModelId);
|
|
119
|
+
if (!baselineReport) {
|
|
120
|
+
throw new Error(`Missing baseline model: ${baselineModelId}`);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
const baselineResults = buildResultMap(baselineReport);
|
|
124
|
+
|
|
125
|
+
return reports
|
|
126
|
+
.filter((report) => report.modelId !== baselineModelId)
|
|
127
|
+
.map((report) => {
|
|
128
|
+
const reportResults = buildResultMap(report);
|
|
129
|
+
const regressedCases: string[] = [];
|
|
130
|
+
const improvedCases: string[] = [];
|
|
131
|
+
const disagreementCases: string[] = [];
|
|
132
|
+
|
|
133
|
+
for (const [scenarioId, baselineResult] of baselineResults.entries()) {
|
|
134
|
+
const candidateResult = reportResults.get(scenarioId);
|
|
135
|
+
if (!candidateResult) {
|
|
136
|
+
continue;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
if (baselineResult.pass && !candidateResult.pass) {
|
|
140
|
+
regressedCases.push(scenarioId);
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
if (!baselineResult.pass && candidateResult.pass) {
|
|
144
|
+
improvedCases.push(scenarioId);
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
if (getCaseSignature(baselineResult) !== getCaseSignature(candidateResult)) {
|
|
148
|
+
disagreementCases.push(scenarioId);
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
return {
|
|
153
|
+
modelId: report.modelId,
|
|
154
|
+
regressedCases,
|
|
155
|
+
improvedCases,
|
|
156
|
+
disagreementCases,
|
|
157
|
+
};
|
|
158
|
+
});
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
export function findHardestCases(
|
|
162
|
+
reports: SingleModelRunReport[]
|
|
163
|
+
): HardestCase[] {
|
|
164
|
+
const failedByScenario = new Map<string, string[]>();
|
|
165
|
+
|
|
166
|
+
for (const report of reports) {
|
|
167
|
+
for (const result of report.results) {
|
|
168
|
+
const caseResult = result as EvalCaseResult;
|
|
169
|
+
if (caseResult.pass) {
|
|
170
|
+
continue;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
const scenarioId = getScenarioId(caseResult);
|
|
174
|
+
const failedBy = failedByScenario.get(scenarioId) ?? [];
|
|
175
|
+
failedBy.push(report.modelId);
|
|
176
|
+
failedByScenario.set(scenarioId, failedBy);
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
return [...failedByScenario.entries()]
|
|
181
|
+
.map(([scenarioId, failedBy]) => ({
|
|
182
|
+
scenarioId,
|
|
183
|
+
failedBy,
|
|
184
|
+
}))
|
|
185
|
+
.sort((left, right) => {
|
|
186
|
+
if (right.failedBy.length !== left.failedBy.length) {
|
|
187
|
+
return right.failedBy.length - left.failedBy.length;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
return left.scenarioId.localeCompare(right.scenarioId);
|
|
191
|
+
});
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
export function findDisagreementCases(
|
|
195
|
+
reports: SingleModelRunReport[]
|
|
196
|
+
): DisagreementCase[] {
|
|
197
|
+
const cases = new Map<string, Array<{ modelId: string; signature: string }>>();
|
|
198
|
+
|
|
199
|
+
for (const report of reports) {
|
|
200
|
+
for (const result of report.results) {
|
|
201
|
+
const caseResult = result as EvalCaseResult;
|
|
202
|
+
const scenarioId = getScenarioId(caseResult);
|
|
203
|
+
const entries = cases.get(scenarioId) ?? [];
|
|
204
|
+
entries.push({
|
|
205
|
+
modelId: report.modelId,
|
|
206
|
+
signature: getCaseSignature(caseResult),
|
|
207
|
+
});
|
|
208
|
+
cases.set(scenarioId, entries);
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
return [...cases.entries()]
|
|
213
|
+
.filter(([, entries]) => new Set(entries.map((entry) => entry.signature)).size > 1)
|
|
214
|
+
.map(([scenarioId, entries]) => ({
|
|
215
|
+
scenarioId,
|
|
216
|
+
modelIds: entries.map((entry) => entry.modelId),
|
|
217
|
+
}))
|
|
218
|
+
.sort((left, right) => left.scenarioId.localeCompare(right.scenarioId));
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
export function buildMultiModelComparisonReport(input: {
|
|
222
|
+
suite: EvalSuite;
|
|
223
|
+
modelReports: SingleModelRunReport[];
|
|
224
|
+
baselineModelId?: string;
|
|
225
|
+
timestamp?: string;
|
|
226
|
+
runId?: string;
|
|
227
|
+
}): MultiModelComparisonReport {
|
|
228
|
+
const timestamp = input.timestamp ?? new Date().toISOString();
|
|
229
|
+
const baselineModelId =
|
|
230
|
+
input.baselineModelId ??
|
|
231
|
+
(input.modelReports.length > 0 ? input.modelReports[0].modelId : undefined);
|
|
232
|
+
|
|
233
|
+
return {
|
|
234
|
+
runId: input.runId ?? `${input.suite}-${timestamp}`,
|
|
235
|
+
timestamp,
|
|
236
|
+
suite: input.suite,
|
|
237
|
+
baselineModelId,
|
|
238
|
+
models: input.modelReports,
|
|
239
|
+
comparison: {
|
|
240
|
+
ranking: rankModelReports(input.modelReports),
|
|
241
|
+
baselineDiffs: baselineModelId
|
|
242
|
+
? buildBaselineDiffs(input.modelReports, baselineModelId)
|
|
243
|
+
: [],
|
|
244
|
+
hardestCases: findHardestCases(input.modelReports),
|
|
245
|
+
disagreementCases: findDisagreementCases(input.modelReports),
|
|
246
|
+
},
|
|
247
|
+
};
|
|
248
|
+
}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
|
|
3
|
+
export interface EvalModelProfile {
|
|
4
|
+
id: string;
|
|
5
|
+
label: string;
|
|
6
|
+
model: string;
|
|
7
|
+
enabled: boolean;
|
|
8
|
+
tags?: string[];
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
export interface ResolvedEvalModelProfile extends EvalModelProfile {
|
|
12
|
+
baseURL: string;
|
|
13
|
+
apiKey: string;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
interface LoadEvalModelProfilesInput {
|
|
17
|
+
configPath: string;
|
|
18
|
+
selectedModelIds?: string[];
|
|
19
|
+
env?: NodeJS.ProcessEnv;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
function isRecord(value: unknown): value is Record<string, unknown> {
|
|
23
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
function parseEvalModelProfile(value: unknown): EvalModelProfile {
|
|
27
|
+
if (!isRecord(value)) {
|
|
28
|
+
throw new Error("Eval model profile must be an object");
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
const {
|
|
32
|
+
id,
|
|
33
|
+
label,
|
|
34
|
+
model,
|
|
35
|
+
enabled,
|
|
36
|
+
tags,
|
|
37
|
+
} = value;
|
|
38
|
+
|
|
39
|
+
if (typeof id !== "string" || id.trim() === "") {
|
|
40
|
+
throw new Error("Eval model profile id must be a non-empty string");
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
if (typeof label !== "string" || label.trim() === "") {
|
|
44
|
+
throw new Error(`Eval model profile ${id} label must be a non-empty string`);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
if (typeof model !== "string" || model.trim() === "") {
|
|
48
|
+
throw new Error(`Eval model profile ${id} model must be a non-empty string`);
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
if (typeof enabled !== "boolean") {
|
|
52
|
+
throw new Error(`Eval model profile ${id} enabled must be a boolean`);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
if (
|
|
56
|
+
tags !== undefined &&
|
|
57
|
+
(!Array.isArray(tags) || tags.some((tag) => typeof tag !== "string"))
|
|
58
|
+
) {
|
|
59
|
+
throw new Error(`Eval model profile ${id} tags must be a string array`);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
return {
|
|
63
|
+
id,
|
|
64
|
+
label,
|
|
65
|
+
model,
|
|
66
|
+
enabled,
|
|
67
|
+
tags,
|
|
68
|
+
};
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
function parseEvalModelConfig(content: string): EvalModelProfile[] {
|
|
72
|
+
const parsed: unknown = JSON.parse(content);
|
|
73
|
+
|
|
74
|
+
if (!isRecord(parsed) || !Array.isArray(parsed.profiles)) {
|
|
75
|
+
throw new Error("Eval model config must contain a profiles array");
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
return parsed.profiles.map((profile) => parseEvalModelProfile(profile));
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
export async function loadEvalModelProfiles(
|
|
82
|
+
input: LoadEvalModelProfilesInput
|
|
83
|
+
): Promise<ResolvedEvalModelProfile[]> {
|
|
84
|
+
const env = input.env ?? process.env;
|
|
85
|
+
const baseURL = env.EVAL_BASE_URL;
|
|
86
|
+
const apiKey = env.EVAL_API_KEY;
|
|
87
|
+
if (
|
|
88
|
+
typeof baseURL !== "string" ||
|
|
89
|
+
baseURL.trim() === "" ||
|
|
90
|
+
typeof apiKey !== "string" ||
|
|
91
|
+
apiKey.trim() === ""
|
|
92
|
+
) {
|
|
93
|
+
throw new Error(
|
|
94
|
+
"Missing required env vars for model comparison: EVAL_BASE_URL, EVAL_API_KEY"
|
|
95
|
+
);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
const profiles = parseEvalModelConfig(await readFile(input.configPath, "utf8"));
|
|
99
|
+
const enabledProfiles = profiles.filter((profile) => profile.enabled);
|
|
100
|
+
|
|
101
|
+
if (enabledProfiles.length === 0) {
|
|
102
|
+
throw new Error("Eval model config has no enabled profiles");
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
const selectedModelIds =
|
|
106
|
+
input.selectedModelIds?.filter((modelId) => modelId.trim() !== "") ?? [];
|
|
107
|
+
|
|
108
|
+
const filteredProfiles =
|
|
109
|
+
selectedModelIds.length === 0
|
|
110
|
+
? enabledProfiles
|
|
111
|
+
: selectedModelIds.map((modelId) => {
|
|
112
|
+
const profile = enabledProfiles.find((candidate) => candidate.id === modelId);
|
|
113
|
+
if (!profile) {
|
|
114
|
+
throw new Error(`Unknown model ids: ${modelId}`);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
return profile;
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
return filteredProfiles.map((profile) => ({
|
|
121
|
+
...profile,
|
|
122
|
+
baseURL,
|
|
123
|
+
apiKey,
|
|
124
|
+
}));
|
|
125
|
+
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
|
|
4
|
+
import type { MultiModelComparisonReport } from "./comparison.js";
|
|
5
|
+
|
|
6
|
+
function formatErrorCounts(
|
|
7
|
+
errorCounts: MultiModelComparisonReport["models"][number]["summary"]["errorCounts"]
|
|
8
|
+
): string {
|
|
9
|
+
if (!errorCounts) {
|
|
10
|
+
return "0/0/0";
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
return `${errorCounts.provider_error}/${errorCounts.schema_error}/${errorCounts.execution_error}`;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
export function renderComparisonMarkdown(
|
|
17
|
+
report: MultiModelComparisonReport
|
|
18
|
+
): string {
|
|
19
|
+
const lines = [
|
|
20
|
+
"# Eval Comparison Report",
|
|
21
|
+
"",
|
|
22
|
+
`- Run ID: ${report.runId}`,
|
|
23
|
+
`- Timestamp: ${report.timestamp}`,
|
|
24
|
+
`- Suite: ${report.suite}`,
|
|
25
|
+
];
|
|
26
|
+
|
|
27
|
+
if (report.baselineModelId) {
|
|
28
|
+
lines.push(`- Baseline: ${report.baselineModelId}`);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
lines.push(
|
|
32
|
+
"",
|
|
33
|
+
"## Leaderboard",
|
|
34
|
+
"",
|
|
35
|
+
"| Model | Passed | Total | Errors (provider/schema/execution) |",
|
|
36
|
+
"| --- | --- | --- | --- |"
|
|
37
|
+
);
|
|
38
|
+
|
|
39
|
+
for (const entry of report.comparison.ranking) {
|
|
40
|
+
lines.push(
|
|
41
|
+
`| ${entry.modelId} | ${entry.passed} | ${entry.total} | ${formatErrorCounts(
|
|
42
|
+
entry.errorCounts
|
|
43
|
+
)} |`
|
|
44
|
+
);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
lines.push("", "## Baseline Diffs", "");
|
|
48
|
+
if (report.comparison.baselineDiffs.length === 0) {
|
|
49
|
+
lines.push("No baseline diffs.");
|
|
50
|
+
} else {
|
|
51
|
+
for (const diff of report.comparison.baselineDiffs) {
|
|
52
|
+
lines.push(`### ${diff.modelId}`);
|
|
53
|
+
lines.push(`- Regressed: ${diff.regressedCases.join(", ") || "(none)"}`);
|
|
54
|
+
lines.push(`- Improved: ${diff.improvedCases.join(", ") || "(none)"}`);
|
|
55
|
+
lines.push(
|
|
56
|
+
`- Disagreements: ${diff.disagreementCases.join(", ") || "(none)"}`
|
|
57
|
+
);
|
|
58
|
+
lines.push("");
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
lines.push("## Hardest Cases", "");
|
|
63
|
+
if (report.comparison.hardestCases.length === 0) {
|
|
64
|
+
lines.push("No failed cases.");
|
|
65
|
+
} else {
|
|
66
|
+
for (const hardestCase of report.comparison.hardestCases) {
|
|
67
|
+
lines.push(
|
|
68
|
+
`- ${hardestCase.scenarioId}: ${hardestCase.failedBy.join(", ")}`
|
|
69
|
+
);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
lines.push("", "## Disagreement Cases", "");
|
|
74
|
+
if (report.comparison.disagreementCases.length === 0) {
|
|
75
|
+
lines.push("No disagreement cases.");
|
|
76
|
+
} else {
|
|
77
|
+
for (const disagreement of report.comparison.disagreementCases) {
|
|
78
|
+
lines.push(
|
|
79
|
+
`- ${disagreement.scenarioId}: ${disagreement.modelIds.join(", ")}`
|
|
80
|
+
);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
return `${lines.join("\n")}\n`;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
export async function writeComparisonReports(input: {
|
|
88
|
+
report: MultiModelComparisonReport;
|
|
89
|
+
outputPath?: string;
|
|
90
|
+
markdownOutputPath?: string;
|
|
91
|
+
}): Promise<{
|
|
92
|
+
jsonWritten: boolean;
|
|
93
|
+
markdownWritten: boolean;
|
|
94
|
+
writtenPaths: string[];
|
|
95
|
+
}> {
|
|
96
|
+
const writtenPaths: string[] = [];
|
|
97
|
+
|
|
98
|
+
if (input.outputPath) {
|
|
99
|
+
await mkdir(path.dirname(input.outputPath), { recursive: true });
|
|
100
|
+
await writeFile(
|
|
101
|
+
input.outputPath,
|
|
102
|
+
`${JSON.stringify(input.report, null, 2)}\n`,
|
|
103
|
+
"utf8"
|
|
104
|
+
);
|
|
105
|
+
writtenPaths.push(input.outputPath);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
if (input.markdownOutputPath) {
|
|
109
|
+
await mkdir(path.dirname(input.markdownOutputPath), { recursive: true });
|
|
110
|
+
await writeFile(
|
|
111
|
+
input.markdownOutputPath,
|
|
112
|
+
renderComparisonMarkdown(input.report),
|
|
113
|
+
"utf8"
|
|
114
|
+
);
|
|
115
|
+
writtenPaths.push(input.markdownOutputPath);
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
return {
|
|
119
|
+
jsonWritten: Boolean(input.outputPath),
|
|
120
|
+
markdownWritten: Boolean(input.markdownOutputPath),
|
|
121
|
+
writtenPaths,
|
|
122
|
+
};
|
|
123
|
+
}
|
package/src/evals/runner.ts
CHANGED
|
@@ -5,6 +5,7 @@ import { mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
|
|
5
5
|
import { LLMService } from "../llm/service.js";
|
|
6
6
|
import { MemoryGateAnalyzer } from "../memory-gate/analyzer.js";
|
|
7
7
|
import { WriteGuardian } from "../write-guardian/index.js";
|
|
8
|
+
import type { EvalSuite } from "./cli.js";
|
|
8
9
|
import type {
|
|
9
10
|
AgentStep,
|
|
10
11
|
LLMService as LLMServiceContract,
|
|
@@ -59,6 +60,7 @@ export interface MemoryGateCaseResult {
|
|
|
59
60
|
expectedDecision: MemoryGateOutput["decision"];
|
|
60
61
|
actualCandidateFact?: string;
|
|
61
62
|
expectedCandidateFact?: string;
|
|
63
|
+
errorType?: "provider_error" | "schema_error" | "execution_error";
|
|
62
64
|
error?: string;
|
|
63
65
|
}
|
|
64
66
|
|
|
@@ -77,6 +79,21 @@ export interface WriteGuardianCaseResult {
|
|
|
77
79
|
/**
 * Aggregate outcome of a benchmark run.
 */
export interface BenchmarkSummary {
  /** Number of case results produced by the run. */
  total: number;
  /** Number of case results whose `pass` flag is true. */
  passed: number;
  /**
   * Per-category tally of classified errors. Optional, so a summary that
   * carries no error classification remains a valid value.
   */
  errorCounts?: Record<
    "provider_error" | "schema_error" | "execution_error",
    number
  >;
}
|
|
88
|
+
|
|
89
|
+
/**
 * Report describing one benchmark suite executed against a single model.
 */
export interface SingleModelRunReport {
  // --- which model / which suite ---
  /** Identifier of the evaluated model. */
  modelId: string;
  /** Display label of the evaluated model. */
  modelLabel: string;
  /** Suite this report belongs to. */
  suite: EvalSuite;

  // --- timing ---
  /** Start marker of the run (string form; presumably an ISO timestamp - confirm with the producer). */
  startedAt: string;
  /** End marker of the run (same format as `startedAt`). */
  finishedAt: string;

  // --- outcome ---
  /** Aggregate pass/total figures for the run. */
  summary: BenchmarkSummary;
  /** Per-case results; the array is homogeneous for a given suite. */
  results: Array<MemoryGateCaseResult> | Array<WriteGuardianCaseResult>;
}
|
|
81
98
|
|
|
82
99
|
export interface Judge {
|
|
@@ -138,6 +155,38 @@ function normalizeFileContent(content: string): string {
|
|
|
138
155
|
return normalized.endsWith("\n") ? normalized : `${normalized}\n`;
|
|
139
156
|
}
|
|
140
157
|
|
|
158
|
+
/**
 * Builds a fresh error tally with every category set to zero.
 *
 * A new object is created on each call, so the caller may increment the
 * counters without affecting any other tally.
 *
 * @returns Zero-initialised counters for provider, schema and execution errors.
 */
function createEmptyErrorCounts(): NonNullable<BenchmarkSummary["errorCounts"]> {
  const zeroed: NonNullable<BenchmarkSummary["errorCounts"]> = {
    provider_error: 0,
    schema_error: 0,
    execution_error: 0,
  };

  return zeroed;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Maps a free-text reason/error message onto a known error category.
 *
 * Matching is a case-sensitive substring test. Markers are checked in the
 * order listed, so a message containing both markers is classified as a
 * provider error.
 *
 * @param message Text to inspect; may be absent.
 * @returns The matching category, or `undefined` when the message is
 *          missing/empty or contains no known marker.
 */
function classifyMemoryGateError(
  message: string | undefined
): MemoryGateCaseResult["errorType"] | undefined {
  if (!message) {
    return undefined;
  }

  // Ordered (marker, category) pairs; the first hit wins.
  const markers = [
    ["Provider request failed", "provider_error"],
    ["Schema validation failed", "schema_error"],
  ] as const;

  const hit = markers.find(([needle]) => message.includes(needle));
  return hit?.[1];
}
|
|
183
|
+
|
|
184
|
+
/**
 * Returns a shallow copy of the supplied report.
 *
 * Only the top-level object is duplicated; nested values such as `summary`
 * and `results` are shared with the input.
 *
 * @param input Report to copy.
 * @returns A new object carrying the same top-level properties.
 */
export function buildSingleModelRunReport(
  input: SingleModelRunReport
): SingleModelRunReport {
  const copy: SingleModelRunReport = { ...input };
  return copy;
}
|
|
189
|
+
|
|
141
190
|
export async function evaluateMemoryGateBenchmark(input: {
|
|
142
191
|
scenarios: SharedScenario[];
|
|
143
192
|
benchmarkCases: MemoryGateBenchmarkCase[];
|
|
@@ -148,6 +197,7 @@ export async function evaluateMemoryGateBenchmark(input: {
|
|
|
148
197
|
const scenarioMap = buildScenarioMap(input.scenarios);
|
|
149
198
|
const results: MemoryGateCaseResult[] = [];
|
|
150
199
|
const logger = input.logger ?? createNoopLogger();
|
|
200
|
+
const errorCounts = createEmptyErrorCounts();
|
|
151
201
|
|
|
152
202
|
for (const benchmarkCase of input.benchmarkCases) {
|
|
153
203
|
const scenario = scenarioMap.get(benchmarkCase.scenario_id);
|
|
@@ -188,6 +238,10 @@ export async function evaluateMemoryGateBenchmark(input: {
|
|
|
188
238
|
}
|
|
189
239
|
|
|
190
240
|
const pass = decisionPass && candidatePass;
|
|
241
|
+
const errorType = classifyMemoryGateError(actual.reason);
|
|
242
|
+
if (errorType) {
|
|
243
|
+
errorCounts[errorType] += 1;
|
|
244
|
+
}
|
|
191
245
|
results.push({
|
|
192
246
|
scenarioId: benchmarkCase.scenario_id,
|
|
193
247
|
pass,
|
|
@@ -198,6 +252,8 @@ export async function evaluateMemoryGateBenchmark(input: {
|
|
|
198
252
|
expectedDecision: benchmarkCase.expected_decision,
|
|
199
253
|
actualCandidateFact: actual.candidateFact,
|
|
200
254
|
expectedCandidateFact: benchmarkCase.expected_candidate_fact,
|
|
255
|
+
errorType,
|
|
256
|
+
error: errorType ? actual.reason : undefined,
|
|
201
257
|
});
|
|
202
258
|
logger.info("EvalRunner", "Completed memory_gate case", {
|
|
203
259
|
scenarioId: benchmarkCase.scenario_id,
|
|
@@ -206,9 +262,12 @@ export async function evaluateMemoryGateBenchmark(input: {
|
|
|
206
262
|
candidatePass,
|
|
207
263
|
judgeUsed,
|
|
208
264
|
actualDecision: actual.decision,
|
|
265
|
+
errorType,
|
|
209
266
|
});
|
|
210
267
|
} catch (error) {
|
|
211
268
|
const reason = getErrorMessage(error);
|
|
269
|
+
const errorType = classifyMemoryGateError(reason) ?? "execution_error";
|
|
270
|
+
errorCounts[errorType] += 1;
|
|
212
271
|
results.push({
|
|
213
272
|
scenarioId: benchmarkCase.scenario_id,
|
|
214
273
|
pass: false,
|
|
@@ -218,11 +277,13 @@ export async function evaluateMemoryGateBenchmark(input: {
|
|
|
218
277
|
actualDecision: "NO_WRITE",
|
|
219
278
|
expectedDecision: benchmarkCase.expected_decision,
|
|
220
279
|
expectedCandidateFact: benchmarkCase.expected_candidate_fact,
|
|
280
|
+
errorType,
|
|
221
281
|
error: reason,
|
|
222
282
|
});
|
|
223
283
|
logger.error("EvalRunner", "memory_gate case failed", {
|
|
224
284
|
scenarioId: benchmarkCase.scenario_id,
|
|
225
285
|
reason,
|
|
286
|
+
errorType,
|
|
226
287
|
});
|
|
227
288
|
}
|
|
228
289
|
}
|
|
@@ -231,6 +292,7 @@ export async function evaluateMemoryGateBenchmark(input: {
|
|
|
231
292
|
summary: {
|
|
232
293
|
total: results.length,
|
|
233
294
|
passed: results.filter((result) => result.pass).length,
|
|
295
|
+
errorCounts,
|
|
234
296
|
},
|
|
235
297
|
results,
|
|
236
298
|
};
|
package/src/index.ts
CHANGED
|
@@ -12,6 +12,10 @@ import {
|
|
|
12
12
|
MemoryGateAnalyzer,
|
|
13
13
|
} from "./memory-gate/index.js";
|
|
14
14
|
import { WriteGuardian } from "./write-guardian/index.js";
|
|
15
|
+
import {
|
|
16
|
+
WriteGuardianAuditLog,
|
|
17
|
+
type WriteGuardianAuditEntry,
|
|
18
|
+
} from "./write-guardian/audit-log.js";
|
|
15
19
|
import {
|
|
16
20
|
handleBeforeMessageWrite,
|
|
17
21
|
handleMessageReceived,
|
|
@@ -47,6 +51,10 @@ export interface PluginAPI {
|
|
|
47
51
|
handler: (event: unknown, context?: unknown) => void,
|
|
48
52
|
options?: { priority?: number }
|
|
49
53
|
) => void;
|
|
54
|
+
registerCommand?: (
|
|
55
|
+
command: string,
|
|
56
|
+
handler: (args?: string) => string | Promise<string>
|
|
57
|
+
) => void;
|
|
50
58
|
}
|
|
51
59
|
|
|
52
60
|
let bufferManager: SessionBufferManager | null = null;
|
|
@@ -54,6 +62,54 @@ let gatewayLogger: PluginLogger | null = null;
|
|
|
54
62
|
let fileLogger: FileLogger | null = null;
|
|
55
63
|
let isRegistered = false;
|
|
56
64
|
|
|
65
|
+
/**
 * Renders audit entries as a numbered, human-readable list.
 *
 * Each entry becomes one line of " | "-separated segments: the 1-based
 * position with timestamp and status, the decision, and then the target
 * file, reason and candidate fact when those are present and non-empty.
 *
 * @param entries Entries to render, in display order.
 * @returns The rendered lines joined by newlines, or a fixed placeholder
 *          message when there are no entries.
 */
function formatWriteGuardianAudit(entries: WriteGuardianAuditEntry[]): string {
  if (entries.length === 0) {
    return "No write_guardian records found.";
  }

  const renderEntry = (record: WriteGuardianAuditEntry, position: number): string => {
    // The first two segments are always present.
    const segments: string[] = [
      `${position + 1}. [${record.timestamp}] ${record.status}`,
      `decision=${record.decision}`,
    ];

    // Optional segments are appended only when they carry a value.
    if (record.targetFile) {
      segments.push(`file=${record.targetFile}`);
    }
    if (record.reason) {
      segments.push(`reason=${record.reason}`);
    }
    if (record.candidateFact) {
      segments.push(`fact=${record.candidateFact}`);
    }

    return segments.join(" | ");
  };

  return entries.map((record, position) => renderEntry(record, position)).join("\n");
}
|
|
86
|
+
|
|
87
|
+
/**
 * Registers the `/openclaw-reflection` command with the host, when the host
 * supports command registration.
 *
 * The command returns a rendering of the ten most recent write_guardian
 * audit entries, or a fixed notice when no audit log was supplied.
 *
 * @param api Host plugin API; `registerCommand` is optional on it.
 * @param logger Receives a lifecycle message for both the skip and the
 *               success path.
 * @param auditLog Audit log to read from; omitted when none is available.
 */
function registerReflectionCommand(
  api: PluginAPI,
  logger: FileLogger,
  auditLog?: WriteGuardianAuditLog
): void {
  const commandName = "/openclaw-reflection";

  // Hosts without command support are tolerated: log and bail out.
  if (typeof api.registerCommand !== "function") {
    logger.info("PluginLifecycle", "registerCommand unavailable, skip command registration", {
      command: commandName,
    });
    return;
  }

  const showRecentAudit = async (): Promise<string> => {
    if (auditLog) {
      const recent = await auditLog.readRecent(10);
      return formatWriteGuardianAudit(recent);
    }

    return "write_guardian audit log unavailable: workspace is not configured.";
  };

  api.registerCommand(commandName, showRecentAudit);

  logger.info("PluginLifecycle", "Registered plugin command", {
    command: commandName,
  });
}
|
|
112
|
+
|
|
57
113
|
function getErrorMessage(error: unknown): string {
|
|
58
114
|
if (error instanceof Error) {
|
|
59
115
|
return error.message;
|
|
@@ -206,6 +262,7 @@ export default function activate(api: PluginAPI): void {
|
|
|
206
262
|
|
|
207
263
|
let memoryGate: MemoryGateAnalyzer | undefined;
|
|
208
264
|
let writeGuardian: WriteGuardian | undefined;
|
|
265
|
+
let writeGuardianAuditLog: WriteGuardianAuditLog | undefined;
|
|
209
266
|
|
|
210
267
|
if (config.memoryGate.enabled && llmService) {
|
|
211
268
|
memoryGate = new MemoryGateAnalyzer(llmService, logger);
|
|
@@ -217,7 +274,13 @@ export default function activate(api: PluginAPI): void {
|
|
|
217
274
|
}
|
|
218
275
|
|
|
219
276
|
if (llmService && workspaceDir) {
|
|
220
|
-
|
|
277
|
+
writeGuardianAuditLog = new WriteGuardianAuditLog(workspaceDir);
|
|
278
|
+
writeGuardian = new WriteGuardian(
|
|
279
|
+
{ workspaceDir },
|
|
280
|
+
logger,
|
|
281
|
+
llmService,
|
|
282
|
+
writeGuardianAuditLog
|
|
283
|
+
);
|
|
221
284
|
logger.info("PluginLifecycle", "write_guardian initialized", {
|
|
222
285
|
workspaceDir,
|
|
223
286
|
});
|
|
@@ -303,6 +366,8 @@ export default function activate(api: PluginAPI): void {
|
|
|
303
366
|
}
|
|
304
367
|
);
|
|
305
368
|
|
|
369
|
+
registerReflectionCommand(api, logger, writeGuardianAuditLog);
|
|
370
|
+
|
|
306
371
|
gatewayLogger.info("[Reflection] Message hooks registered");
|
|
307
372
|
logger.info("PluginLifecycle", "Message hooks registered");
|
|
308
373
|
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import * as fs from "node:fs";
|
|
2
|
+
import * as fsp from "node:fs/promises";
|
|
3
|
+
import * as path from "node:path";
|
|
4
|
+
|
|
5
|
+
/** Outcome categories an audit entry can record. */
type WriteGuardianAuditStatus = "written" | "refused" | "failed" | "skipped";

/**
 * One record of the write_guardian audit log (one JSON object per line).
 */
export interface WriteGuardianAuditEntry {
  /** Time the entry was recorded, as an ISO-8601 string. */
  timestamp: string;
  /** Decision the write attempt was made for. */
  decision: string;
  /** Outcome of the write attempt. */
  status: WriteGuardianAuditStatus;
  /** File the write was aimed at, when one had been determined. */
  targetFile?: string;
  /** Explanation accompanying the outcome, when one was given. */
  reason?: string;
  /** Candidate fact the write attempt concerned, when present. */
  candidateFact?: string;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Reports whether `error` is a Node.js "no such file or directory" failure.
 *
 * The structured `code` property is inspected rather than the message text,
 * so an unrelated error whose message merely mentions "ENOENT" (for example
 * inside a path) is not mistaken for a missing file.
 */
function isMissingFileError(error: unknown): boolean {
  return (
    typeof error === "object" &&
    error !== null &&
    (error as { code?: unknown }).code === "ENOENT"
  );
}

/**
 * Append-only JSONL audit trail stored at
 * `<workspaceDir>/.openclaw-reflection/write-guardian.log.jsonl`.
 */
export class WriteGuardianAuditLog {
  private readonly filePath: string;

  /**
   * @param workspaceDir Directory under which the `.openclaw-reflection`
   *                     log directory is created (synchronously) if absent.
   */
  constructor(workspaceDir: string) {
    const logDir = path.join(workspaceDir, ".openclaw-reflection");
    this.filePath = path.join(logDir, "write-guardian.log.jsonl");

    // A recursive mkdir succeeds when the directory already exists, so no
    // existsSync pre-check (and no check-then-create race) is needed.
    fs.mkdirSync(logDir, { recursive: true });
  }

  /**
   * Appends one entry as a single JSON line, stamped with the current time.
   *
   * @param entry Entry fields without the timestamp.
   * @throws Whatever `appendFile` rejects with (e.g. the log directory was
   *         removed after construction).
   */
  async append(entry: Omit<WriteGuardianAuditEntry, "timestamp">): Promise<void> {
    const serialized = JSON.stringify({
      timestamp: new Date().toISOString(),
      ...entry,
    });

    await fsp.appendFile(this.filePath, `${serialized}\n`, "utf8");
  }

  /**
   * Reads the most recent entries, newest first.
   *
   * Blank lines, lines that are not valid JSON, and JSON values that are
   * not plain objects are skipped. A log file that does not exist yet is
   * treated as empty.
   *
   * @param limit Maximum number of entries to return. Fractions are
   *              truncated; zero, negative or NaN yields an empty array.
   * @returns Up to `limit` entries ordered from newest to oldest.
   * @throws Any read failure other than the file being absent.
   */
  async readRecent(limit: number): Promise<WriteGuardianAuditEntry[]> {
    const count = Math.trunc(limit);
    // slice(-0) is slice(0), which would return the WHOLE log for a limit of
    // zero; negative and NaN limits are equally meaningless. Bail out early.
    if (!(count > 0)) {
      return [];
    }

    let content: string;
    try {
      content = await fsp.readFile(this.filePath, "utf8");
    } catch (error) {
      if (isMissingFileError(error)) {
        return [];
      }

      throw error;
    }

    const entries: WriteGuardianAuditEntry[] = [];
    for (const rawLine of content.split("\n")) {
      const line = rawLine.trim();
      if (line.length === 0) {
        continue;
      }

      let value: unknown;
      try {
        value = JSON.parse(line);
      } catch {
        // Tolerate a corrupt or partially written line.
        continue;
      }

      // Valid JSON that is not an object (42, "x", null, []) is not an entry.
      if (typeof value === "object" && value !== null && !Array.isArray(value)) {
        entries.push(value as WriteGuardianAuditEntry);
      }
    }

    return entries.slice(-count).reverse();
  }
}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import * as path from "path";
|
|
2
2
|
import type { AgentTool, LLMService, MemoryGateOutput, Logger } from "../types.js";
|
|
3
3
|
import { readFile, writeFileWithLock } from "../utils/file-utils.js";
|
|
4
|
+
import { WriteGuardianAuditLog } from "./audit-log.js";
|
|
4
5
|
|
|
5
6
|
type UpdateDecision =
|
|
6
7
|
| "UPDATE_MEMORY"
|
|
@@ -93,16 +94,25 @@ export class WriteGuardian {
|
|
|
93
94
|
private config: WriteGuardianConfig;
|
|
94
95
|
private logger: Logger;
|
|
95
96
|
private llmService: LLMService;
|
|
96
|
-
|
|
97
|
-
|
|
97
|
+
private auditLog?: WriteGuardianAuditLog;
|
|
98
|
+
|
|
99
|
+
/**
 * Stores the collaborators this guardian works with.
 *
 * @param config Guardian configuration.
 * @param logger Destination for diagnostic messages.
 * @param llmService LLM service the guardian uses.
 * @param auditLog Optional audit log; when omitted the instance keeps no
 *                 audit log reference.
 */
constructor(
  config: WriteGuardianConfig,
  logger: Logger,
  llmService: LLMService,
  auditLog?: WriteGuardianAuditLog
) {
  // Optional collaborator first, then the required ones; the assignments
  // are independent of each other.
  this.auditLog = auditLog;
  this.llmService = llmService;
  this.logger = logger;
  this.config = config;
}
|
|
102
110
|
|
|
103
111
|
async write(output: MemoryGateOutput): Promise<WriteGuardianWriteResult> {
|
|
104
112
|
if (!isUpdateDecision(output.decision)) {
|
|
105
|
-
|
|
113
|
+
const result = { status: "skipped", reason: "not an update decision" } as const;
|
|
114
|
+
await this.recordAudit(output, result);
|
|
115
|
+
return result;
|
|
106
116
|
}
|
|
107
117
|
|
|
108
118
|
const candidateFact = output.candidateFact?.trim();
|
|
@@ -111,7 +121,9 @@ export class WriteGuardian {
|
|
|
111
121
|
decision: output.decision,
|
|
112
122
|
reason: output.reason,
|
|
113
123
|
});
|
|
114
|
-
|
|
124
|
+
const result = { status: "skipped", reason: "missing candidate fact" } as const;
|
|
125
|
+
await this.recordAudit(output, result);
|
|
126
|
+
return result;
|
|
115
127
|
}
|
|
116
128
|
|
|
117
129
|
const targetFile = TARGET_FILES[output.decision];
|
|
@@ -141,14 +153,18 @@ export class WriteGuardian {
|
|
|
141
153
|
filePath,
|
|
142
154
|
reason,
|
|
143
155
|
});
|
|
144
|
-
|
|
156
|
+
const writeResult = { status: "refused", reason } as const;
|
|
157
|
+
await this.recordAudit(output, writeResult, targetFile);
|
|
158
|
+
return writeResult;
|
|
145
159
|
}
|
|
146
160
|
|
|
147
161
|
this.logger.info("WriteGuardian", "write_guardian rewrote target file", {
|
|
148
162
|
decision: output.decision,
|
|
149
163
|
filePath,
|
|
150
164
|
});
|
|
151
|
-
|
|
165
|
+
const writeResult = { status: "written" } as const;
|
|
166
|
+
await this.recordAudit(output, writeResult, targetFile);
|
|
167
|
+
return writeResult;
|
|
152
168
|
} catch (error) {
|
|
153
169
|
const reason = getErrorMessage(error);
|
|
154
170
|
this.logger.error("WriteGuardian", "write_guardian execution failed", {
|
|
@@ -156,7 +172,33 @@ export class WriteGuardian {
|
|
|
156
172
|
filePath,
|
|
157
173
|
reason,
|
|
158
174
|
});
|
|
159
|
-
|
|
175
|
+
const writeResult = { status: "failed", reason } as const;
|
|
176
|
+
await this.recordAudit(output, writeResult, targetFile);
|
|
177
|
+
return writeResult;
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
/**
 * Best-effort recording of one write outcome in the audit log.
 *
 * Does nothing when no audit log was supplied. A failure to append is
 * logged as a warning and never propagated, so auditing cannot change the
 * result of a write.
 *
 * @param output Memory-gate output the write attempt was based on.
 * @param result Outcome of the write attempt.
 * @param targetFile File the write was aimed at, if one had been chosen.
 */
private async recordAudit(
  output: MemoryGateOutput,
  result: WriteGuardianWriteResult,
  targetFile?: CuratedFilename
): Promise<void> {
  const sink = this.auditLog;
  if (!sink) {
    return;
  }

  try {
    const entry = {
      decision: output.decision,
      targetFile,
      status: result.status,
      reason: result.reason,
      candidateFact: output.candidateFact,
    };
    await sink.append(entry);
  } catch (error) {
    const failure = getErrorMessage(error);
    this.logger.warn("WriteGuardian", "Failed to append write_guardian audit log", {
      reason: failure,
    });
  }
}
|
|
162
204
|
|