poe-code 3.0.269 → 3.0.271
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/agent.js +3 -31
- package/dist/cli/commands/agent.js.map +1 -1
- package/dist/index.js +1143 -909
- package/dist/index.js.map +4 -4
- package/dist/metafile.json +1 -1
- package/dist/providers/gemini-cli.js +10 -8
- package/dist/providers/gemini-cli.js.map +2 -2
- package/dist/providers/poe-agent.js +1 -0
- package/dist/providers/poe-agent.js.map +2 -2
- package/package.json +1 -1
- package/packages/agent-eval/dist/check/check.js +3 -0
- package/packages/agent-eval/dist/report/load.js +116 -2
- package/packages/agent-eval/dist/run/fixture-copy.d.ts +1 -0
- package/packages/agent-eval/dist/run/fixture-copy.js +24 -0
- package/packages/agent-eval/dist/run/run.js +2 -0
- package/packages/agent-eval/dist/schema.js +61 -20
- package/packages/agent-eval/dist/source/config.js +43 -1
- package/packages/agent-eval/dist/source/registry.js +3 -0
- package/packages/agent-spawn/dist/acp/session-update-converter.d.ts +7 -4
- package/packages/agent-spawn/dist/acp/session-update-converter.js +16 -14
- package/packages/agent-spawn/dist/acp/types.d.ts +15 -3
- package/packages/agent-spawn/dist/index.d.ts +2 -0
- package/packages/agent-spawn/dist/index.js +1 -0
package/package.json
CHANGED
|
@@ -7,6 +7,7 @@ import { loadEval } from "../source/registry.js";
|
|
|
7
7
|
import { cloneTarget } from "../run/clone.js";
|
|
8
8
|
import { runScorer } from "../run/scorer.js";
|
|
9
9
|
import { assertCanonicalContainedPath, assertCanonicalDestinationPath, resolveContainedPath } from "../path-boundary.js";
|
|
10
|
+
import { assertNoSymlinksInDirectoryTree } from "../run/fixture-copy.js";
|
|
10
11
|
export async function evalCheck(opts) {
|
|
11
12
|
const startedAt = Date.now();
|
|
12
13
|
const source = await openSource(opts.sourceDir);
|
|
@@ -66,6 +67,7 @@ async function copyDirectoryIfPresent(sourceDir, destDir) {
|
|
|
66
67
|
}
|
|
67
68
|
throw error;
|
|
68
69
|
}
|
|
70
|
+
await assertNoSymlinksInDirectoryTree(sourceDir, "starter");
|
|
69
71
|
await cp(sourceDir, destDir, {
|
|
70
72
|
recursive: true,
|
|
71
73
|
force: true
|
|
@@ -75,6 +77,7 @@ async function copyOracleSolution(input) {
|
|
|
75
77
|
const destDir = resolveCloneRelativePath(input.cloneDir, input.solutionDest);
|
|
76
78
|
await assertCanonicalDestinationPath(input.cloneDir, destDir, "oracle.solution_dest");
|
|
77
79
|
await mkdir(destDir, { recursive: true });
|
|
80
|
+
await assertNoSymlinksInDirectoryTree(input.solutionDir, "oracle.solution");
|
|
78
81
|
await cp(input.solutionDir, destDir, {
|
|
79
82
|
recursive: true,
|
|
80
83
|
force: true
|
|
@@ -80,7 +80,7 @@ async function enrichAggregatedCell(cell, outDir) {
|
|
|
80
80
|
}
|
|
81
81
|
}
|
|
82
82
|
async function enrichMatchedRunResult(runId, resultPath) {
|
|
83
|
-
const result = parseJson(await readFile(resultPath, "utf8"), resultPath);
|
|
83
|
+
const result = validateRunResult(parseJson(await readFile(resultPath, "utf8"), resultPath), resultPath);
|
|
84
84
|
if (result.runId !== runId) {
|
|
85
85
|
throw new Error(`Run result "${runId}" embeds mismatched runId "${result.runId}"`);
|
|
86
86
|
}
|
|
@@ -158,7 +158,7 @@ async function enrichRunResult(result, runDir) {
|
|
|
158
158
|
async function loadTraceSummary(tracePath, outDir) {
|
|
159
159
|
try {
|
|
160
160
|
await assertCanonicalOutputFile(outDir, tracePath);
|
|
161
|
-
const trace = parseJson(await readFile(tracePath, "utf8"), tracePath);
|
|
161
|
+
const trace = validateTrace(parseJson(await readFile(tracePath, "utf8"), tracePath), tracePath);
|
|
162
162
|
return {
|
|
163
163
|
available: true,
|
|
164
164
|
eventCount: trace.events.length,
|
|
@@ -173,6 +173,120 @@ async function loadTraceSummary(tracePath, outDir) {
|
|
|
173
173
|
throw error;
|
|
174
174
|
}
|
|
175
175
|
}
|
|
176
|
+
/**
 * Checks that a parsed result.json value has every field the report loader reads.
 *
 * Checks run in a fixed order and the first violation throws, so the error
 * always names the earliest offending field.
 *
 * @param {unknown} value - Parsed JSON content of the result file.
 * @param {string} filePath - Path of the file; used only in error messages.
 * @returns {object} The same value, once all checks have passed.
 * @throws {Error} When a field is missing, has the wrong type, or is out of range.
 */
function validateRunResult(value, filePath) {
  const artifact = "result.json";
  const result = requireRecord(value, filePath, artifact, []);

  // Top-level identity fields: all must be non-empty strings.
  for (const key of ["runId", "eval", "agent", "model", "planKind", "verdict"]) {
    requireString(result[key], filePath, artifact, [key]);
  }
  requireNonNegativeInteger(result.iterations, filePath, artifact, ["iterations"]);
  requireNonNegativeNumber(result.durationMs, filePath, artifact, ["durationMs"]);
  requireRange(result.correctness, 0, 1, filePath, artifact, ["correctness"]);

  const usage = requireRecord(result.usage, filePath, artifact, ["usage"]);
  for (const key of ["inputTokens", "outputTokens"]) {
    requireNonNegativeInteger(usage[key], filePath, artifact, ["usage", key]);
  }
  // cachedTokens and costUsd are optional: validated only when present.
  if (usage.cachedTokens !== undefined) {
    requireNonNegativeInteger(usage.cachedTokens, filePath, artifact, ["usage", "cachedTokens"]);
  }
  if (usage.costUsd !== undefined) {
    requireNonNegativeNumber(usage.costUsd, filePath, artifact, ["usage", "costUsd"]);
  }

  const tests = requireRecord(result.tests, filePath, artifact, ["tests"]);
  const passedCount = requireNonNegativeInteger(tests.passed, filePath, artifact, ["tests", "passed"]);
  const totalCount = requireNonNegativeInteger(tests.total, filePath, artifact, ["tests", "total"]);
  // Cross-field check: more passed tests than total tests is inconsistent.
  if (passedCount > totalCount) {
    throw invalidArtifactField(
      filePath,
      artifact,
      ["tests", "passed"],
      "integer less than or equal to tests.total",
      passedCount
    );
  }
  requireRange(tests.pass_rate, 0, 1, filePath, artifact, ["tests", "pass_rate"]);
  if (!Array.isArray(tests.cases)) {
    throw invalidArtifactField(filePath, artifact, ["tests", "cases"], "array", tests.cases);
  }

  const scoring = requireRecord(result.scoring, filePath, artifact, ["scoring"]);
  for (const part of ["tests", "judge"]) {
    validateScoringComponent(scoring[part], filePath, ["scoring", part]);
  }

  requireBoolean(result.cheated, filePath, artifact, ["cheated"]);
  requireRecord(result.cheatReport, filePath, artifact, ["cheatReport"]);
  return result;
}
|
|
219
|
+
/**
 * Validates one scoring component object of a result.json.
 *
 * @param {unknown} value - The component value to check.
 * @param {string} filePath - Path of the file; used only in error messages.
 * @param {string[]} path - Field path of the component inside result.json.
 * @returns {void}
 * @throws {Error} On the first field with a wrong type or out-of-range weight.
 */
function validateScoringComponent(value, filePath, path) {
  const artifact = "result.json";
  const component = requireRecord(value, filePath, artifact, path);

  for (const flag of ["configured", "required"]) {
    requireBoolean(component[flag], filePath, artifact, [...path, flag]);
  }
  // Both weights must be finite numbers within [0, 1].
  for (const weight of ["configuredWeight", "effectiveWeight"]) {
    requireRange(component[weight], 0, 1, filePath, artifact, [...path, weight]);
  }
  requireString(component.status, filePath, artifact, [...path, "status"]);
}
|
|
233
|
+
/**
 * Checks that a parsed trace.json value is an object carrying an `events` array.
 *
 * @param {unknown} value - Parsed JSON content of the trace file.
 * @param {string} filePath - Path of the file; used only in error messages.
 * @returns {object} The same value when it is valid.
 * @throws {Error} When the value is not an object or `events` is not an array.
 */
function validateTrace(value, filePath) {
  const artifact = "trace.json";
  const trace = requireRecord(value, filePath, artifact, []);
  if (Array.isArray(trace.events)) {
    return trace;
  }
  throw invalidArtifactField(filePath, artifact, ["events"], "array", trace.events);
}
|
|
240
|
+
/**
 * Returns `value` when it is a non-null, non-array object; throws otherwise.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} filePath - Path of the artifact file, for the error message.
 * @param {string} artifact - Artifact label, for the error message.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {object} The unchanged value.
 * @throws {Error} When the value is null, an array, or not an object.
 */
function requireRecord(value, filePath, artifact, path) {
  const isObjectRecord = value !== null && typeof value === "object" && !Array.isArray(value);
  if (!isObjectRecord) {
    throw invalidArtifactField(filePath, artifact, path, "object", value);
  }
  return value;
}
|
|
246
|
+
/**
 * Returns `value` when it is a string with at least one character; throws otherwise.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} filePath - Path of the artifact file, for the error message.
 * @param {string} artifact - Artifact label, for the error message.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {string} The unchanged value.
 * @throws {Error} When the value is not a string or is the empty string.
 */
function requireString(value, filePath, artifact, path) {
  if (typeof value !== "string" || value.length === 0) {
    throw invalidArtifactField(filePath, artifact, path, "non-empty string", value);
  }
  return value;
}
|
|
252
|
+
/**
 * Returns `value` when it is a boolean; throws otherwise.
 *
 * Returns the validated value (rather than nothing) so it follows the same
 * contract as the sibling validators that hand back what they checked.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} filePath - Path of the artifact file, for the error message.
 * @param {string} artifact - Artifact label, for the error message.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {boolean} The unchanged value.
 * @throws {Error} When the value is not `true` or `false`.
 */
function requireBoolean(value, filePath, artifact, path) {
  if (typeof value !== "boolean") {
    throw invalidArtifactField(filePath, artifact, path, "boolean", value);
  }
  return value;
}
|
|
257
|
+
/**
 * Returns `value` when it is an integer number that is zero or greater.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} filePath - Path of the artifact file, for the error message.
 * @param {string} artifact - Artifact label, for the error message.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {number} The unchanged value.
 * @throws {Error} When the value is not a non-negative integer.
 */
function requireNonNegativeInteger(value, filePath, artifact, path) {
  // Number.isInteger is false for every non-number, so no separate typeof test is needed.
  const isValid = Number.isInteger(value) && value >= 0;
  if (!isValid) {
    throw invalidArtifactField(filePath, artifact, path, "non-negative integer", value);
  }
  return value;
}
|
|
263
|
+
/**
 * Returns `value` when it is a finite number that is zero or greater.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} filePath - Path of the artifact file, for the error message.
 * @param {string} artifact - Artifact label, for the error message.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {number} The unchanged value.
 * @throws {Error} When the value is not a finite, non-negative number.
 */
function requireNonNegativeNumber(value, filePath, artifact, path) {
  // Number.isFinite is false for every non-number, NaN and ±Infinity.
  const isValid = Number.isFinite(value) && value >= 0;
  if (!isValid) {
    throw invalidArtifactField(filePath, artifact, path, "non-negative number", value);
  }
  return value;
}
|
|
269
|
+
/**
 * Returns `value` when it is a finite number within the inclusive range [min, max].
 *
 * Returns the validated value (rather than nothing) so it follows the same
 * contract as the sibling validators that hand back what they checked.
 *
 * @param {unknown} value - Candidate value.
 * @param {number} min - Inclusive lower bound.
 * @param {number} max - Inclusive upper bound.
 * @param {string} filePath - Path of the artifact file, for the error message.
 * @param {string} artifact - Artifact label, for the error message.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {number} The unchanged value.
 * @throws {Error} When the value is not a finite number or lies outside the range.
 */
function requireRange(value, min, max, filePath, artifact, path) {
  if (typeof value === "number" && Number.isFinite(value) && value >= min && value <= max) {
    return value;
  }
  throw invalidArtifactField(filePath, artifact, path, `number from ${min} through ${max}`, value);
}
|
|
275
|
+
/**
 * Builds (does not throw) the Error describing an invalid artifact field.
 *
 * @param {string} filePath - Path of the artifact file.
 * @param {string} artifact - Artifact label, e.g. "result.json".
 * @param {string[]} path - Field path inside the artifact.
 * @param {string} expected - Description of the expected value.
 * @param {unknown} received - The value that was actually found.
 * @returns {Error} An error whose message names the file, field, expectation and received value.
 */
function invalidArtifactField(filePath, artifact, path, expected, received) {
  const location = formatIssuePath(path);
  const shown = formatReceived(received);
  return new Error(`Invalid ${artifact} in ${filePath} (${location}): expected ${expected}, received ${shown}.`);
}
|
|
278
|
+
/**
 * Renders a field path as a dotted string for error messages.
 *
 * @param {string[]} path - Path segments.
 * @returns {string} Segments joined with "."; "value" when the joined result is empty.
 */
function formatIssuePath(path) {
  const dotted = path.join(".");
  return dotted === "" ? "value" : dotted;
}
|
|
281
|
+
/**
 * Renders a received value for an error message.
 *
 * Strings are JSON-quoted, arrays and other objects are reduced to the words
 * "array" / "object", and everything else goes through String().
 *
 * @param {unknown} value - The value that failed validation.
 * @returns {string} Short textual form of the value.
 */
function formatReceived(value) {
  if (value === null || typeof value !== "object") {
    return typeof value === "string" ? JSON.stringify(value) : String(value);
  }
  return Array.isArray(value) ? "array" : "object";
}
|
|
176
290
|
async function assertCanonicalOutputFile(outDir, filePath) {
|
|
177
291
|
const canonicalOutDir = await realpath(path.resolve(outDir));
|
|
178
292
|
const canonicalFilePath = await realpath(filePath);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function assertNoSymlinksInDirectoryTree(rootDir: string, label: string): Promise<void>;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { lstat, readdir } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
/**
 * Rejects when `rootDir`, or any entry found while descending into it, is a symbolic link.
 *
 * @param {string} rootDir - Path where the walk starts.
 * @param {string} label - Name placed at the front of the error message.
 * @returns {Promise<void>} Resolves once the whole tree has been walked without finding a symlink.
 * @throws {Error} When a symbolic link is encountered.
 */
export async function assertNoSymlinksInDirectoryTree(rootDir, label) {
  return walkDirectoryTree(rootDir, label);
}
|
|
6
|
+
/**
 * Depth-first walk that throws on the first symbolic link it meets.
 *
 * `targetPath` itself is inspected with lstat, so a symlinked root is reported
 * rather than followed. Non-directories end the walk for that path.
 *
 * @param {string} targetPath - File or directory to inspect.
 * @param {string} label - Name placed at the front of the error message.
 * @returns {Promise<void>}
 * @throws {Error} When `targetPath` or one of its descendants is a symbolic link.
 */
async function walkDirectoryTree(targetPath, label) {
  const info = await lstat(targetPath);
  if (info.isSymbolicLink()) {
    throw new Error(`${label} must not contain symbolic links: ${targetPath}`);
  }
  if (info.isDirectory()) {
    const children = await readdir(targetPath, { withFileTypes: true });
    for (const child of children) {
      const childPath = path.join(targetPath, child.name);
      if (child.isSymbolicLink()) {
        throw new Error(`${label} must not contain symbolic links: ${childPath}`);
      }
      if (child.isDirectory()) {
        // Descend immediately so entries are visited in the same order as listed.
        await walkDirectoryTree(childPath, label);
      }
    }
  }
}
|
|
@@ -19,6 +19,7 @@ import { verifyOracle } from "./oracle.js";
|
|
|
19
19
|
import { runScorer } from "./scorer.js";
|
|
20
20
|
import { createTraceNormalizer } from "./trace/normalize.js";
|
|
21
21
|
import { writeRunCompletion, writeRunEvidence, writeRunResult } from "./result-writer.js";
|
|
22
|
+
import { assertNoSymlinksInDirectoryTree } from "./fixture-copy.js";
|
|
22
23
|
export class EvalFrameworkError extends Error {
|
|
23
24
|
constructor(message) {
|
|
24
25
|
super(message);
|
|
@@ -258,6 +259,7 @@ async function copyStarterIfPresent(starterDir, cloneDir) {
|
|
|
258
259
|
}
|
|
259
260
|
throw error;
|
|
260
261
|
}
|
|
262
|
+
await assertNoSymlinksInDirectoryTree(starterDir, "starter");
|
|
261
263
|
await cp(starterDir, cloneDir, {
|
|
262
264
|
recursive: true,
|
|
263
265
|
force: true
|
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
import { S, validate } from "toolcraft-schema";
|
|
2
2
|
import path from "node:path";
|
|
3
|
+
const nonEmptyString = S.String({ minLength: 1 });
|
|
4
|
+
const positiveInteger = S.Number({ jsonType: "integer", minimum: 1 });
|
|
5
|
+
const nonNegativeInteger = S.Number({ jsonType: "integer", minimum: 0 });
|
|
6
|
+
const scoringWeight = S.Number({ minimum: 0, maximum: 1 });
|
|
3
7
|
const metricEvaluatorSchema = S.OneOf({
|
|
4
8
|
discriminator: "kind",
|
|
5
9
|
branches: {
|
|
@@ -38,41 +42,41 @@ const metricSchema = S.Object({
|
|
|
38
42
|
* oracle.solution_dest to copy it under a clone-root-relative subdirectory.
|
|
39
43
|
*/
|
|
40
44
|
export const evalYamlSchema = S.Object({
|
|
41
|
-
id:
|
|
42
|
-
title:
|
|
45
|
+
id: nonEmptyString,
|
|
46
|
+
title: nonEmptyString,
|
|
43
47
|
target: S.Object({
|
|
44
|
-
repo:
|
|
45
|
-
ref:
|
|
46
|
-
plan_dest: S.Optional(S.String({ default: "docs/plans/eval-task.md" }))
|
|
48
|
+
repo: nonEmptyString,
|
|
49
|
+
ref: nonEmptyString,
|
|
50
|
+
plan_dest: S.Optional(S.String({ default: "docs/plans/eval-task.md", minLength: 1 }))
|
|
47
51
|
}),
|
|
48
52
|
scorer: S.Optional(S.Object({
|
|
49
|
-
command:
|
|
53
|
+
command: nonEmptyString,
|
|
50
54
|
cwd: S.Optional(S.String({ default: "" })),
|
|
51
|
-
result_path:
|
|
52
|
-
timeout_ms:
|
|
55
|
+
result_path: nonEmptyString,
|
|
56
|
+
timeout_ms: nonNegativeInteger
|
|
53
57
|
})),
|
|
54
58
|
oracle: S.Object({
|
|
55
|
-
path: S.Optional(S.String({ default: "oracle" })),
|
|
56
|
-
solution_dest: S.Optional(S.String({ default: "." }))
|
|
59
|
+
path: S.Optional(S.String({ default: "oracle", minLength: 1 })),
|
|
60
|
+
solution_dest: S.Optional(S.String({ default: ".", minLength: 1 }))
|
|
57
61
|
}),
|
|
58
62
|
budget: S.Object({
|
|
59
|
-
max_iterations:
|
|
60
|
-
max_tokens:
|
|
61
|
-
wall_clock_ms:
|
|
63
|
+
max_iterations: positiveInteger,
|
|
64
|
+
max_tokens: positiveInteger,
|
|
65
|
+
wall_clock_ms: positiveInteger
|
|
62
66
|
}),
|
|
63
67
|
judge: S.Object({
|
|
64
|
-
agent:
|
|
65
|
-
model:
|
|
66
|
-
rubric: S.Array(
|
|
68
|
+
agent: nonEmptyString,
|
|
69
|
+
model: nonEmptyString,
|
|
70
|
+
rubric: S.Array(nonEmptyString, { minItems: 1 })
|
|
67
71
|
}),
|
|
68
72
|
weights: S.Object({
|
|
69
|
-
tests:
|
|
70
|
-
judge:
|
|
73
|
+
tests: scoringWeight,
|
|
74
|
+
judge: scoringWeight
|
|
71
75
|
}),
|
|
72
76
|
metrics: S.Optional(S.Array(metricSchema)),
|
|
73
77
|
verify: S.Optional(S.Object({
|
|
74
|
-
command:
|
|
75
|
-
timeout_ms:
|
|
78
|
+
command: nonEmptyString,
|
|
79
|
+
timeout_ms: nonNegativeInteger
|
|
76
80
|
}))
|
|
77
81
|
});
|
|
78
82
|
Object.freeze(evalYamlSchema.shape);
|
|
@@ -89,6 +93,7 @@ export function validateEvalYaml(value, filePath = "eval.yaml") {
|
|
|
89
93
|
const result = validate(evalYamlSchema, value);
|
|
90
94
|
if (result.ok) {
|
|
91
95
|
const issues = [
|
|
96
|
+
...validateNonBlankStrings(result.value),
|
|
92
97
|
...validateTarget(result.value.target),
|
|
93
98
|
...validateMetrics(result.value.metrics)
|
|
94
99
|
];
|
|
@@ -99,6 +104,42 @@ export function validateEvalYaml(value, filePath = "eval.yaml") {
|
|
|
99
104
|
}
|
|
100
105
|
throw new EvalYamlValidationError(formatIssues(filePath, result.issues), result.issues);
|
|
101
106
|
}
|
|
107
|
+
/**
 * Collects an issue for every string field that contains only whitespace.
 *
 * Optional fields that are undefined are skipped. Fixed fields are reported
 * first, followed by blank judge.rubric entries in index order.
 *
 * @param {object} value - Eval configuration value whose string fields are inspected.
 * @returns {object[]} Issues built by blankStringIssue; empty when nothing is blank.
 */
function validateNonBlankStrings(value) {
  const isBlank = (text) => text.trim().length === 0;
  const candidates = [
    [["id"], value.id],
    [["title"], value.title],
    [["target", "repo"], value.target.repo],
    [["target", "ref"], value.target.ref],
    [["target", "plan_dest"], value.target.plan_dest],
    [["scorer", "command"], value.scorer?.command],
    [["scorer", "result_path"], value.scorer?.result_path],
    [["oracle", "path"], value.oracle.path],
    [["oracle", "solution_dest"], value.oracle.solution_dest],
    [["judge", "agent"], value.judge.agent],
    [["judge", "model"], value.judge.model],
    [["verify", "command"], value.verify?.command]
  ];

  const issues = candidates
    .filter(([, fieldValue]) => fieldValue !== undefined && isBlank(fieldValue))
    .map(([fieldPath, fieldValue]) => blankStringIssue(fieldPath, fieldValue));

  const rubric = value.judge.rubric;
  for (let index = 0; index < rubric.length; index += 1) {
    const rubricLine = rubric[index];
    if (isBlank(rubricLine)) {
      issues.push(blankStringIssue(["judge", "rubric", String(index)], rubricLine));
    }
  }
  return issues;
}
|
|
135
|
+
/**
 * Builds the issue object reported for a whitespace-only string field.
 *
 * @param {string[]} path - Field path of the offending value.
 * @param {string} value - The blank value; recorded in JSON-quoted form.
 * @returns {{path: string[], expected: string, received: string, message: string}} The issue.
 */
function blankStringIssue(path, value) {
  const received = JSON.stringify(value);
  const dottedPath = path.join(".");
  return {
    path,
    expected: "non-blank string",
    received,
    message: `${dottedPath} must not be blank.`
  };
}
|
|
102
143
|
function validateTarget(target) {
|
|
103
144
|
const destination = target.plan_dest ?? "docs/plans/eval-task.md";
|
|
104
145
|
if (path.isAbsolute(destination)) {
|
|
@@ -39,7 +39,9 @@ export async function loadSourceConfig(source, fs = nodeFs) {
|
|
|
39
39
|
if (!isRecord(parsed)) {
|
|
40
40
|
throw new Error(`${configPath} must contain a JSON object.`);
|
|
41
41
|
}
|
|
42
|
-
|
|
42
|
+
const merged = deepMerge(cloneDefaultConfig(), parsed);
|
|
43
|
+
validateSourceConfig(merged, configPath);
|
|
44
|
+
return merged;
|
|
43
45
|
}
|
|
44
46
|
function cloneDefaultConfig() {
|
|
45
47
|
return {
|
|
@@ -74,6 +76,46 @@ function deepMerge(base, patch) {
|
|
|
74
76
|
}
|
|
75
77
|
return result;
|
|
76
78
|
}
|
|
79
|
+
/**
 * Validates the fields of a source config object, throwing on the first bad one.
 *
 * @param {object} config - Config object to check.
 * @param {string} configPath - Path of the config file; used only in error messages.
 * @returns {void}
 * @throws {Error} When a checked field has the wrong type, is blank, or is out of range.
 */
function validateSourceConfig(config, configPath) {
  const judge = requireRecord(config.judge, configPath, "judge");
  for (const key of ["agent", "model"]) {
    requireNonBlankString(judge[key], configPath, `judge.${key}`);
  }
  requireNonBlankString(config.out, configPath, "out");

  const weights = requireRecord(config.weights, configPath, "weights");
  for (const key of ["tests", "judge"]) {
    requireWeight(weights[key], configPath, `weights.${key}`);
  }

  // null skips the check; any other value must be a non-blank string.
  const cacheDir = config.clone_cache_dir;
  if (cacheDir !== null) {
    requireNonBlankString(cacheDir, configPath, "clone_cache_dir");
  }
}
|
|
91
|
+
/**
 * Returns `value` when isRecord accepts it; throws a config error otherwise.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} configPath - Path of the config file, for the error message.
 * @param {string} fieldPath - Dotted name of the field, for the error message.
 * @returns {object} The unchanged value.
 * @throws {Error} When the value is not a record.
 */
function requireRecord(value, configPath, fieldPath) {
  if (!isRecord(value)) {
    throw invalidConfigField(configPath, fieldPath, "object", value);
  }
  return value;
}
|
|
97
|
+
/**
 * Throws unless `value` is a string containing at least one non-whitespace character.
 *
 * @param {unknown} value - Candidate value.
 * @param {string} configPath - Path of the config file, for the error message.
 * @param {string} fieldPath - Dotted name of the field, for the error message.
 * @returns {void}
 * @throws {Error} "string" expectation for non-strings, "non-blank string" for blank ones.
 */
function requireNonBlankString(value, configPath, fieldPath) {
  let unmetExpectation = null;
  if (typeof value !== "string") {
    unmetExpectation = "string";
  } else if (value.trim() === "") {
    unmetExpectation = "non-blank string";
  }
  if (unmetExpectation !== null) {
    throw invalidConfigField(configPath, fieldPath, unmetExpectation, value);
  }
}
|
|
105
|
+
/**
 * Throws unless `value` is a finite number within the inclusive range [0, 1].
 *
 * @param {unknown} value - Candidate weight.
 * @param {string} configPath - Path of the config file, for the error message.
 * @param {string} fieldPath - Dotted name of the field, for the error message.
 * @returns {void}
 * @throws {Error} When the value is not a number or lies outside [0, 1].
 */
function requireWeight(value, configPath, fieldPath) {
  // Number.isFinite is false for non-numbers, NaN and ±Infinity.
  const isValidWeight = Number.isFinite(value) && value >= 0 && value <= 1;
  if (!isValidWeight) {
    throw invalidConfigField(configPath, fieldPath, "number from 0 through 1", value);
  }
}
|
|
110
|
+
/**
 * Builds (does not throw) the Error describing an invalid config field.
 *
 * @param {string} configPath - Path of the config file.
 * @param {string} fieldPath - Dotted name of the offending field.
 * @param {string} expected - Description of the expected value.
 * @param {unknown} received - The value that was actually found.
 * @returns {Error} An error whose message names the file, field, expectation and received value.
 */
function invalidConfigField(configPath, fieldPath, expected, received) {
  const shown = formatReceived(received);
  return new Error(`${configPath} (${fieldPath}): expected ${expected}, received ${shown}.`);
}
|
|
113
|
+
/**
 * Renders a received value for a config error message.
 *
 * Strings are JSON-quoted. Arrays and other objects are reduced to the words
 * "array" / "object": passing them to String() would produce unhelpful text
 * such as "[object Object]" or an empty string for `[]`. Everything else goes
 * through String().
 *
 * @param {unknown} value - The value that failed validation.
 * @returns {string} Short textual form of the value.
 */
function formatReceived(value) {
  if (typeof value === "string") {
    return JSON.stringify(value);
  }
  if (typeof value === "object" && value !== null) {
    return Array.isArray(value) ? "array" : "object";
  }
  return String(value);
}
|
|
77
119
|
/**
 * Tells whether `value` is a non-null object that is not an array.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True for such objects, false for everything else.
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
@@ -34,6 +34,9 @@ export async function loadEval(source, id, fs = nodeFs) {
|
|
|
34
34
|
await assertFsCanonicalContainedPath(fs, source.rootDir, evalYamlPath, "eval.yaml");
|
|
35
35
|
await assertFsCanonicalContainedPath(fs, source.rootDir, planPath, "plan.md");
|
|
36
36
|
const evalYaml = validateEvalYaml(parseYamlFile(await fs.readFile(evalYamlPath, "utf8"), evalYamlPath), evalYamlPath);
|
|
37
|
+
if (evalYaml.id !== id) {
|
|
38
|
+
throw new Error(`Eval id mismatch in ${evalYamlPath}: expected "${id}", found "${evalYaml.id}".`);
|
|
39
|
+
}
|
|
37
40
|
const plan = parsePlanMarkdown(await fs.readFile(planPath, "utf8"), planPath);
|
|
38
41
|
return {
|
|
39
42
|
id: evalYaml.id,
|
|
@@ -1,10 +1,13 @@
|
|
|
1
|
-
import type { SessionUpdate, ToolKind } from "../../../poe-acp-client/dist/index.js";
|
|
2
|
-
import type { AcpEvent } from "./types.js";
|
|
1
|
+
import type { SessionUpdate as AcpClientSessionUpdate, ToolKind as AcpClientToolKind } from "../../../poe-acp-client/dist/index.js";
|
|
2
|
+
import type { AcpEvent, SessionUpdate as LegacySessionUpdate, ToolKind as LegacyToolKind } from "./types.js";
|
|
3
|
+
type ConvertibleSessionUpdate = AcpClientSessionUpdate | LegacySessionUpdate;
|
|
4
|
+
type ConvertibleToolKind = AcpClientToolKind | LegacyToolKind;
|
|
3
5
|
export interface ToolRenderState {
|
|
4
6
|
startedToolCalls: Set<string>;
|
|
5
7
|
toolCallKinds: Map<string, string>;
|
|
6
8
|
toolCallTitles: Map<string, string>;
|
|
7
9
|
}
|
|
8
10
|
export declare function createToolRenderState(): ToolRenderState;
|
|
9
|
-
export declare function toRenderKind(kind:
|
|
10
|
-
export declare function sessionUpdateToEvents(update:
|
|
11
|
+
export declare function toRenderKind(kind: ConvertibleToolKind | undefined | null): string;
|
|
12
|
+
export declare function sessionUpdateToEvents(update: ConvertibleSessionUpdate, state: ToolRenderState): AcpEvent[];
|
|
13
|
+
export {};
|
|
@@ -2,13 +2,13 @@ export function createToolRenderState() {
|
|
|
2
2
|
return {
|
|
3
3
|
startedToolCalls: new Set(),
|
|
4
4
|
toolCallKinds: new Map(),
|
|
5
|
-
toolCallTitles: new Map()
|
|
5
|
+
toolCallTitles: new Map()
|
|
6
6
|
};
|
|
7
7
|
}
|
|
8
8
|
export function toRenderKind(kind) {
|
|
9
9
|
if (kind === "execute")
|
|
10
10
|
return "exec";
|
|
11
|
-
if (kind === "write")
|
|
11
|
+
if (kind === "write" || kind === "edit")
|
|
12
12
|
return "edit";
|
|
13
13
|
if (kind === "read")
|
|
14
14
|
return "read";
|
|
@@ -60,7 +60,7 @@ export function sessionUpdateToEvents(update, state) {
|
|
|
60
60
|
const usage = {
|
|
61
61
|
event: "usage",
|
|
62
62
|
inputTokens,
|
|
63
|
-
outputTokens
|
|
63
|
+
outputTokens
|
|
64
64
|
};
|
|
65
65
|
if (cachedTokens > 0) {
|
|
66
66
|
usage.cachedTokens = cachedTokens;
|
|
@@ -79,31 +79,33 @@ export function sessionUpdateToEvents(update, state) {
|
|
|
79
79
|
return [];
|
|
80
80
|
}
|
|
81
81
|
state.startedToolCalls.add(update.toolCallId);
|
|
82
|
-
return [
|
|
82
|
+
return [
|
|
83
|
+
{
|
|
83
84
|
event: "tool_start",
|
|
84
85
|
kind: renderKind,
|
|
85
86
|
title,
|
|
86
|
-
id: update.toolCallId
|
|
87
|
-
}
|
|
87
|
+
id: update.toolCallId
|
|
88
|
+
}
|
|
89
|
+
];
|
|
88
90
|
}
|
|
89
91
|
if (update.sessionUpdate === "tool_call_update") {
|
|
90
|
-
const renderKind =
|
|
91
|
-
|
|
92
|
-
|
|
92
|
+
const renderKind = (update.kind == null ? undefined : toRenderKind(update.kind)) ||
|
|
93
|
+
state.toolCallKinds.get(update.toolCallId) ||
|
|
94
|
+
"other";
|
|
93
95
|
state.toolCallKinds.set(update.toolCallId, renderKind);
|
|
94
96
|
const events = [];
|
|
95
97
|
const toolTitle = toToolTitle(state.toolCallTitles.get(update.toolCallId) ?? update.toolCallId, update.locations);
|
|
96
98
|
state.toolCallTitles.set(update.toolCallId, toolTitle);
|
|
97
99
|
const status = update.status;
|
|
98
|
-
const shouldStart = !state.startedToolCalls.has(update.toolCallId)
|
|
99
|
-
|
|
100
|
+
const shouldStart = !state.startedToolCalls.has(update.toolCallId) &&
|
|
101
|
+
(status === "pending" || status === "in_progress");
|
|
100
102
|
if (shouldStart) {
|
|
101
103
|
state.startedToolCalls.add(update.toolCallId);
|
|
102
104
|
events.push({
|
|
103
105
|
event: "tool_start",
|
|
104
106
|
kind: renderKind,
|
|
105
107
|
title: toolTitle,
|
|
106
|
-
id: update.toolCallId
|
|
108
|
+
id: update.toolCallId
|
|
107
109
|
});
|
|
108
110
|
}
|
|
109
111
|
if (status === "completed" || status === "failed" || status === "cancelled") {
|
|
@@ -113,14 +115,14 @@ export function sessionUpdateToEvents(update, state) {
|
|
|
113
115
|
event: "tool_start",
|
|
114
116
|
kind: renderKind,
|
|
115
117
|
title: toolTitle,
|
|
116
|
-
id: update.toolCallId
|
|
118
|
+
id: update.toolCallId
|
|
117
119
|
});
|
|
118
120
|
}
|
|
119
121
|
events.push({
|
|
120
122
|
event: "tool_complete",
|
|
121
123
|
kind: renderKind,
|
|
122
124
|
path: extractToolOutputText(update),
|
|
123
|
-
id: update.toolCallId
|
|
125
|
+
id: update.toolCallId
|
|
124
126
|
});
|
|
125
127
|
}
|
|
126
128
|
return events;
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
*/
|
|
19
19
|
export type ToolKind = "read" | "edit" | "delete" | "move" | "search" | "execute" | "think" | "fetch" | "switch_mode" | "other";
|
|
20
20
|
/** ACP-compatible type - @see https://agentclientprotocol.com/ - no package dependency, structural compatibility only */
|
|
21
|
-
export type ToolCallStatus = "pending" | "in_progress" | "completed" | "failed";
|
|
21
|
+
export type ToolCallStatus = "pending" | "in_progress" | "completed" | "failed" | "cancelled";
|
|
22
22
|
/** ACP-compatible type - @see https://agentclientprotocol.com/ - no package dependency, structural compatibility only */
|
|
23
23
|
export interface ContentChunk {
|
|
24
24
|
type: "text";
|
|
@@ -39,9 +39,12 @@ export interface ToolCall {
|
|
|
39
39
|
sessionUpdate: "tool_call";
|
|
40
40
|
toolCallId: string;
|
|
41
41
|
title: string;
|
|
42
|
+
content?: ToolCallContent[];
|
|
42
43
|
kind?: ToolKind;
|
|
44
|
+
locations?: ToolCallLocation[];
|
|
43
45
|
status?: ToolCallStatus;
|
|
44
46
|
rawInput?: unknown;
|
|
47
|
+
rawOutput?: unknown;
|
|
45
48
|
_meta?: Record<string, unknown>;
|
|
46
49
|
}
|
|
47
50
|
/** ACP-compatible type - @see https://agentclientprotocol.com/ - no package dependency, structural compatibility only */
|
|
@@ -54,13 +57,22 @@ export type ToolCallContent = {
|
|
|
54
57
|
data: string;
|
|
55
58
|
};
|
|
56
59
|
/** ACP-compatible type - @see https://agentclientprotocol.com/ - no package dependency, structural compatibility only */
|
|
60
|
+
export interface ToolCallLocation {
|
|
61
|
+
path: string;
|
|
62
|
+
lineNumber?: number | null;
|
|
63
|
+
_meta?: Record<string, unknown>;
|
|
64
|
+
}
|
|
65
|
+
/** ACP-compatible type - @see https://agentclientprotocol.com/ - no package dependency, structural compatibility only */
|
|
57
66
|
export interface ToolCallUpdate {
|
|
58
67
|
sessionUpdate: "tool_call_update";
|
|
59
68
|
toolCallId: string;
|
|
60
69
|
kind?: ToolKind;
|
|
61
|
-
status?: ToolCallStatus;
|
|
70
|
+
status?: ToolCallStatus | null;
|
|
62
71
|
rawOutput?: unknown;
|
|
63
|
-
|
|
72
|
+
rawInput?: unknown;
|
|
73
|
+
content?: ToolCallContent[] | null;
|
|
74
|
+
locations?: ToolCallLocation[] | null;
|
|
75
|
+
title?: string | null;
|
|
64
76
|
_meta?: Record<string, unknown>;
|
|
65
77
|
}
|
|
66
78
|
/** ACP-compatible type - @see https://agentclientprotocol.com/ - no package dependency, structural compatibility only */
|
|
@@ -17,6 +17,8 @@ export { spawnInteractive } from "./spawn-interactive.js";
|
|
|
17
17
|
export { spawnAutonomous } from "./autonomous.js";
|
|
18
18
|
export type { AutonomousOptions, StreamingSpawnFn, StreamingSpawnReturn } from "./autonomous.js";
|
|
19
19
|
export { renderAcpEvent, renderAcpStream, renderSessionUpdateStream } from "./acp/renderer.js";
|
|
20
|
+
export { createToolRenderState, sessionUpdateToEvents } from "./acp/session-update-converter.js";
|
|
21
|
+
export type { ToolRenderState } from "./acp/session-update-converter.js";
|
|
20
22
|
export type { LogEntry, MalformedSpawnLogRecord, ReadSpawnLogOptions } from "./acp/replay.js";
|
|
21
23
|
export { findLatestLog, listSpawnLogs, pickRandomLog, readSpawnLog, replaySpawnLog } from "./acp/replay.js";
|
|
22
24
|
export type { SpawnStreamingOptions, SpawnStreamingResult } from "./acp/spawn.js";
|
|
@@ -14,6 +14,7 @@ export { createSpawnParallel, SpawnParallelError } from "./parallel.js";
|
|
|
14
14
|
export { spawnInteractive } from "./spawn-interactive.js";
|
|
15
15
|
export { spawnAutonomous } from "./autonomous.js";
|
|
16
16
|
export { renderAcpEvent, renderAcpStream, renderSessionUpdateStream } from "./acp/renderer.js";
|
|
17
|
+
export { createToolRenderState, sessionUpdateToEvents } from "./acp/session-update-converter.js";
|
|
17
18
|
export { findLatestLog, listSpawnLogs, pickRandomLog, readSpawnLog, replaySpawnLog } from "./acp/replay.js";
|
|
18
19
|
export { spawnStreaming } from "./acp/spawn.js";
|
|
19
20
|
export { spawnAcp } from "./acp/spawn-acp.js";
|