poe-code 3.0.270 → 3.0.272
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/commands/gaslight.js +13 -1
- package/dist/cli/commands/gaslight.js.map +1 -1
- package/dist/index.js +1264 -903
- package/dist/index.js.map +4 -4
- package/dist/metafile.json +1 -1
- package/package.json +1 -1
- package/packages/agent-eval/dist/check/check.js +3 -0
- package/packages/agent-eval/dist/report/load.js +116 -2
- package/packages/agent-eval/dist/run/fixture-copy.d.ts +1 -0
- package/packages/agent-eval/dist/run/fixture-copy.js +24 -0
- package/packages/agent-eval/dist/run/run.js +2 -0
- package/packages/agent-eval/dist/schema.js +61 -20
- package/packages/agent-eval/dist/source/config.js +43 -1
- package/packages/agent-eval/dist/source/registry.js +3 -0
- package/packages/agent-gaslight/dist/ingest.js +64 -18
- package/packages/agent-gaslight/dist/run.js +50 -3
package/package.json
CHANGED
|
@@ -7,6 +7,7 @@ import { loadEval } from "../source/registry.js";
|
|
|
7
7
|
import { cloneTarget } from "../run/clone.js";
|
|
8
8
|
import { runScorer } from "../run/scorer.js";
|
|
9
9
|
import { assertCanonicalContainedPath, assertCanonicalDestinationPath, resolveContainedPath } from "../path-boundary.js";
|
|
10
|
+
import { assertNoSymlinksInDirectoryTree } from "../run/fixture-copy.js";
|
|
10
11
|
export async function evalCheck(opts) {
|
|
11
12
|
const startedAt = Date.now();
|
|
12
13
|
const source = await openSource(opts.sourceDir);
|
|
@@ -66,6 +67,7 @@ async function copyDirectoryIfPresent(sourceDir, destDir) {
|
|
|
66
67
|
}
|
|
67
68
|
throw error;
|
|
68
69
|
}
|
|
70
|
+
await assertNoSymlinksInDirectoryTree(sourceDir, "starter");
|
|
69
71
|
await cp(sourceDir, destDir, {
|
|
70
72
|
recursive: true,
|
|
71
73
|
force: true
|
|
@@ -75,6 +77,7 @@ async function copyOracleSolution(input) {
|
|
|
75
77
|
const destDir = resolveCloneRelativePath(input.cloneDir, input.solutionDest);
|
|
76
78
|
await assertCanonicalDestinationPath(input.cloneDir, destDir, "oracle.solution_dest");
|
|
77
79
|
await mkdir(destDir, { recursive: true });
|
|
80
|
+
await assertNoSymlinksInDirectoryTree(input.solutionDir, "oracle.solution");
|
|
78
81
|
await cp(input.solutionDir, destDir, {
|
|
79
82
|
recursive: true,
|
|
80
83
|
force: true
|
|
@@ -80,7 +80,7 @@ async function enrichAggregatedCell(cell, outDir) {
|
|
|
80
80
|
}
|
|
81
81
|
}
|
|
82
82
|
async function enrichMatchedRunResult(runId, resultPath) {
|
|
83
|
-
const result = parseJson(await readFile(resultPath, "utf8"), resultPath);
|
|
83
|
+
const result = validateRunResult(parseJson(await readFile(resultPath, "utf8"), resultPath), resultPath);
|
|
84
84
|
if (result.runId !== runId) {
|
|
85
85
|
throw new Error(`Run result "${runId}" embeds mismatched runId "${result.runId}"`);
|
|
86
86
|
}
|
|
@@ -158,7 +158,7 @@ async function enrichRunResult(result, runDir) {
|
|
|
158
158
|
async function loadTraceSummary(tracePath, outDir) {
|
|
159
159
|
try {
|
|
160
160
|
await assertCanonicalOutputFile(outDir, tracePath);
|
|
161
|
-
const trace = parseJson(await readFile(tracePath, "utf8"), tracePath);
|
|
161
|
+
const trace = validateTrace(parseJson(await readFile(tracePath, "utf8"), tracePath), tracePath);
|
|
162
162
|
return {
|
|
163
163
|
available: true,
|
|
164
164
|
eventCount: trace.events.length,
|
|
@@ -173,6 +173,120 @@ async function loadTraceSummary(tracePath, outDir) {
|
|
|
173
173
|
throw error;
|
|
174
174
|
}
|
|
175
175
|
}
|
|
176
|
+
/**
 * Validates the shape of a parsed `result.json` artifact.
 *
 * Checks run in a fixed order and the first failing field throws, so the
 * reported field is always the earliest invalid one.
 *
 * @param {unknown} value - Parsed JSON content of the artifact.
 * @param {string} filePath - Path of the artifact, used in error messages.
 * @returns {object} The same value, once every required field has been checked.
 * @throws {Error} When a field is missing, has the wrong type, or is out of range.
 */
function validateRunResult(value, filePath) {
  const artifact = "result.json";
  const result = requireRecord(value, filePath, artifact, []);

  // Identity and classification fields must all be non-empty strings.
  for (const key of ["runId", "eval", "agent", "model", "planKind", "verdict"]) {
    requireString(result[key], filePath, artifact, [key]);
  }
  requireNonNegativeInteger(result.iterations, filePath, artifact, ["iterations"]);
  requireNonNegativeNumber(result.durationMs, filePath, artifact, ["durationMs"]);
  requireRange(result.correctness, 0, 1, filePath, artifact, ["correctness"]);

  // Token usage: input/output counts are mandatory, cache and cost are optional.
  const usage = requireRecord(result.usage, filePath, artifact, ["usage"]);
  for (const key of ["inputTokens", "outputTokens"]) {
    requireNonNegativeInteger(usage[key], filePath, artifact, ["usage", key]);
  }
  if (usage.cachedTokens !== undefined) {
    requireNonNegativeInteger(usage.cachedTokens, filePath, artifact, ["usage", "cachedTokens"]);
  }
  if (usage.costUsd !== undefined) {
    requireNonNegativeNumber(usage.costUsd, filePath, artifact, ["usage", "costUsd"]);
  }

  // Test summary: counts must be consistent with each other.
  const tests = requireRecord(result.tests, filePath, artifact, ["tests"]);
  const passed = requireNonNegativeInteger(tests.passed, filePath, artifact, ["tests", "passed"]);
  const total = requireNonNegativeInteger(tests.total, filePath, artifact, ["tests", "total"]);
  if (passed > total) {
    throw invalidArtifactField(
      filePath,
      artifact,
      ["tests", "passed"],
      "integer less than or equal to tests.total",
      passed
    );
  }
  requireRange(tests.pass_rate, 0, 1, filePath, artifact, ["tests", "pass_rate"]);
  if (!Array.isArray(tests.cases)) {
    throw invalidArtifactField(filePath, artifact, ["tests", "cases"], "array", tests.cases);
  }

  // Scoring breakdown: one component for tests, one for the judge.
  const scoring = requireRecord(result.scoring, filePath, artifact, ["scoring"]);
  for (const key of ["tests", "judge"]) {
    validateScoringComponent(scoring[key], filePath, ["scoring", key]);
  }

  requireBoolean(result.cheated, filePath, artifact, ["cheated"]);
  requireRecord(result.cheatReport, filePath, artifact, ["cheatReport"]);
  return result;
}
|
|
219
|
+
/**
 * Validates one scoring component of a `result.json` artifact.
 *
 * @param {unknown} value - The component to validate.
 * @param {string} filePath - Path of the artifact, used in error messages.
 * @param {string[]} path - Field path of the component inside the artifact.
 * @throws {Error} When the component or one of its fields is invalid.
 */
function validateScoringComponent(value, filePath, path) {
  const artifact = "result.json";
  const component = requireRecord(value, filePath, artifact, path);
  for (const flag of ["configured", "required"]) {
    requireBoolean(component[flag], filePath, artifact, [...path, flag]);
  }
  // Both weights are fractions in the closed interval [0, 1].
  for (const weight of ["configuredWeight", "effectiveWeight"]) {
    requireRange(component[weight], 0, 1, filePath, artifact, [...path, weight]);
  }
  requireString(component.status, filePath, artifact, [...path, "status"]);
}
|
|
233
|
+
/**
 * Validates the minimal shape of a parsed `trace.json` artifact: an object
 * carrying an `events` array.
 *
 * @param {unknown} value - Parsed JSON content of the artifact.
 * @param {string} filePath - Path of the artifact, used in error messages.
 * @returns {object} The same value once validated.
 * @throws {Error} When the value is not an object or `events` is not an array.
 */
function validateTrace(value, filePath) {
  const artifact = "trace.json";
  const trace = requireRecord(value, filePath, artifact, []);
  if (Array.isArray(trace.events)) {
    return trace;
  }
  throw invalidArtifactField(filePath, artifact, ["events"], "array", trace.events);
}
|
|
240
|
+
/**
 * Ensures a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to check.
 * @param {string} filePath - Path of the artifact, used in error messages.
 * @param {string} artifact - Artifact name, used in error messages.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {object} The same value when it is a plain record.
 * @throws {Error} When the value is null, an array, or not an object.
 */
function requireRecord(value, filePath, artifact, path) {
  const isPlainRecord = value !== null && typeof value === "object" && !Array.isArray(value);
  if (!isPlainRecord) {
    throw invalidArtifactField(filePath, artifact, path, "object", value);
  }
  return value;
}
|
|
246
|
+
/**
 * Ensures a value is a string with at least one character.
 *
 * @param {unknown} value - Value to check.
 * @param {string} filePath - Path of the artifact, used in error messages.
 * @param {string} artifact - Artifact name, used in error messages.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {string} The same value when valid.
 * @throws {Error} When the value is not a string or is empty.
 */
function requireString(value, filePath, artifact, path) {
  const isNonEmptyString = typeof value === "string" && value.length > 0;
  if (!isNonEmptyString) {
    throw invalidArtifactField(filePath, artifact, path, "non-empty string", value);
  }
  return value;
}
|
|
252
|
+
/**
 * Ensures a value is a boolean.
 *
 * Returns the validated value, matching the sibling `require*` helpers that
 * hand their input back on success. Callers that ignore the return value are
 * unaffected.
 *
 * @param {unknown} value - Value to check.
 * @param {string} filePath - Path of the artifact, used in error messages.
 * @param {string} artifact - Artifact name, used in error messages.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {boolean} The same value when it is a boolean.
 * @throws {Error} When the value is not a boolean.
 */
function requireBoolean(value, filePath, artifact, path) {
  if (typeof value === "boolean") {
    return value;
  }
  throw invalidArtifactField(filePath, artifact, path, "boolean", value);
}
|
|
257
|
+
/**
 * Ensures a value is an integer greater than or equal to zero.
 *
 * @param {unknown} value - Value to check.
 * @param {string} filePath - Path of the artifact, used in error messages.
 * @param {string} artifact - Artifact name, used in error messages.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {number} The same value when valid.
 * @throws {Error} When the value is not a non-negative integer.
 */
function requireNonNegativeInteger(value, filePath, artifact, path) {
  const isValid = typeof value === "number" && Number.isInteger(value) && value >= 0;
  if (!isValid) {
    throw invalidArtifactField(filePath, artifact, path, "non-negative integer", value);
  }
  return value;
}
|
|
263
|
+
/**
 * Ensures a value is a finite number greater than or equal to zero.
 *
 * @param {unknown} value - Value to check.
 * @param {string} filePath - Path of the artifact, used in error messages.
 * @param {string} artifact - Artifact name, used in error messages.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {number} The same value when valid.
 * @throws {Error} When the value is not a finite, non-negative number.
 */
function requireNonNegativeNumber(value, filePath, artifact, path) {
  const isValid = typeof value === "number" && Number.isFinite(value) && value >= 0;
  if (!isValid) {
    throw invalidArtifactField(filePath, artifact, path, "non-negative number", value);
  }
  return value;
}
|
|
269
|
+
/**
 * Ensures a value is a finite number inside the closed interval [min, max].
 *
 * Returns the validated value, matching the sibling `require*` helpers that
 * hand their input back on success. Callers that ignore the return value are
 * unaffected.
 *
 * @param {unknown} value - Value to check.
 * @param {number} min - Inclusive lower bound.
 * @param {number} max - Inclusive upper bound.
 * @param {string} filePath - Path of the artifact, used in error messages.
 * @param {string} artifact - Artifact name, used in error messages.
 * @param {string[]} path - Field path inside the artifact.
 * @returns {number} The same value when it lies within the range.
 * @throws {Error} When the value is not a finite number within the range.
 */
function requireRange(value, min, max, filePath, artifact, path) {
  if (typeof value === "number" && Number.isFinite(value) && value >= min && value <= max) {
    return value;
  }
  throw invalidArtifactField(filePath, artifact, path, `number from ${min} through ${max}`, value);
}
|
|
275
|
+
/**
 * Builds (but does not throw) the error describing an invalid artifact field.
 *
 * @param {string} filePath - Path of the artifact.
 * @param {string} artifact - Artifact name.
 * @param {string[]} path - Field path inside the artifact.
 * @param {string} expected - Description of the expected value.
 * @param {unknown} received - The value actually found.
 * @returns {Error} Error carrying the formatted message.
 */
function invalidArtifactField(filePath, artifact, path, expected, received) {
  const location = formatIssuePath(path);
  const actual = formatReceived(received);
  return new Error(`Invalid ${artifact} in ${filePath} (${location}): expected ${expected}, received ${actual}.`);
}
|
|
278
|
+
/**
 * Renders a field path as a dotted string.
 *
 * @param {string[]} path - Field path segments.
 * @returns {string} The dotted path, or "value" when the joined path is empty.
 */
function formatIssuePath(path) {
  const dotted = path.join(".");
  return dotted === "" ? "value" : dotted;
}
|
|
281
|
+
/**
 * Renders a received value for an error message.
 *
 * Strings are JSON-quoted, arrays and objects are summarized by kind, and
 * everything else (including null) goes through `String`.
 *
 * @param {unknown} value - The value that failed validation.
 * @returns {string} Short description of the value.
 */
function formatReceived(value) {
  switch (typeof value) {
    case "string":
      return JSON.stringify(value);
    case "object":
      // typeof null is "object", so null falls through to String().
      if (value === null) {
        return String(value);
      }
      return Array.isArray(value) ? "array" : "object";
    default:
      return String(value);
  }
}
|
|
176
290
|
async function assertCanonicalOutputFile(outDir, filePath) {
|
|
177
291
|
const canonicalOutDir = await realpath(path.resolve(outDir));
|
|
178
292
|
const canonicalFilePath = await realpath(filePath);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function assertNoSymlinksInDirectoryTree(rootDir: string, label: string): Promise<void>;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { lstat, readdir } from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
/**
 * Rejects when `rootDir` itself, or any entry beneath it, is a symbolic link.
 *
 * @param {string} rootDir - Path to inspect.
 * @param {string} label - Name used as the prefix of the error message.
 * @returns {Promise<void>} Resolves when no symbolic link is found.
 * @throws {Error} When a symbolic link is encountered.
 */
export async function assertNoSymlinksInDirectoryTree(rootDir, label) {
  // Public entry point; the recursive traversal lives in walkDirectoryTree.
  await walkDirectoryTree(rootDir, label);
}
|
|
6
|
+
/**
 * Recursively walks `targetPath` and throws on the first symbolic link found.
 *
 * The target is inspected with `lstat`, so a link is reported rather than
 * followed. Non-directory targets end the walk.
 *
 * @param {string} targetPath - File or directory to inspect.
 * @param {string} label - Name used as the prefix of the error message.
 * @returns {Promise<void>} Resolves when no symbolic link is found.
 * @throws {Error} When the target or any descendant is a symbolic link.
 */
async function walkDirectoryTree(targetPath, label) {
  const symlinkError = (offendingPath) =>
    new Error(`${label} must not contain symbolic links: ${offendingPath}`);

  const stats = await lstat(targetPath);
  if (stats.isSymbolicLink()) {
    throw symlinkError(targetPath);
  }
  if (stats.isDirectory()) {
    const children = await readdir(targetPath, { withFileTypes: true });
    for (const child of children) {
      const childPath = path.join(targetPath, child.name);
      if (child.isSymbolicLink()) {
        throw symlinkError(childPath);
      }
      if (child.isDirectory()) {
        await walkDirectoryTree(childPath, label);
      }
    }
  }
}
|
|
@@ -19,6 +19,7 @@ import { verifyOracle } from "./oracle.js";
|
|
|
19
19
|
import { runScorer } from "./scorer.js";
|
|
20
20
|
import { createTraceNormalizer } from "./trace/normalize.js";
|
|
21
21
|
import { writeRunCompletion, writeRunEvidence, writeRunResult } from "./result-writer.js";
|
|
22
|
+
import { assertNoSymlinksInDirectoryTree } from "./fixture-copy.js";
|
|
22
23
|
export class EvalFrameworkError extends Error {
|
|
23
24
|
constructor(message) {
|
|
24
25
|
super(message);
|
|
@@ -258,6 +259,7 @@ async function copyStarterIfPresent(starterDir, cloneDir) {
|
|
|
258
259
|
}
|
|
259
260
|
throw error;
|
|
260
261
|
}
|
|
262
|
+
await assertNoSymlinksInDirectoryTree(starterDir, "starter");
|
|
261
263
|
await cp(starterDir, cloneDir, {
|
|
262
264
|
recursive: true,
|
|
263
265
|
force: true
|
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
import { S, validate } from "toolcraft-schema";
|
|
2
2
|
import path from "node:path";
|
|
3
|
+
const nonEmptyString = S.String({ minLength: 1 });
|
|
4
|
+
const positiveInteger = S.Number({ jsonType: "integer", minimum: 1 });
|
|
5
|
+
const nonNegativeInteger = S.Number({ jsonType: "integer", minimum: 0 });
|
|
6
|
+
const scoringWeight = S.Number({ minimum: 0, maximum: 1 });
|
|
3
7
|
const metricEvaluatorSchema = S.OneOf({
|
|
4
8
|
discriminator: "kind",
|
|
5
9
|
branches: {
|
|
@@ -38,41 +42,41 @@ const metricSchema = S.Object({
|
|
|
38
42
|
* oracle.solution_dest to copy it under a clone-root-relative subdirectory.
|
|
39
43
|
*/
|
|
40
44
|
export const evalYamlSchema = S.Object({
|
|
41
|
-
id:
|
|
42
|
-
title:
|
|
45
|
+
id: nonEmptyString,
|
|
46
|
+
title: nonEmptyString,
|
|
43
47
|
target: S.Object({
|
|
44
|
-
repo:
|
|
45
|
-
ref:
|
|
46
|
-
plan_dest: S.Optional(S.String({ default: "docs/plans/eval-task.md" }))
|
|
48
|
+
repo: nonEmptyString,
|
|
49
|
+
ref: nonEmptyString,
|
|
50
|
+
plan_dest: S.Optional(S.String({ default: "docs/plans/eval-task.md", minLength: 1 }))
|
|
47
51
|
}),
|
|
48
52
|
scorer: S.Optional(S.Object({
|
|
49
|
-
command:
|
|
53
|
+
command: nonEmptyString,
|
|
50
54
|
cwd: S.Optional(S.String({ default: "" })),
|
|
51
|
-
result_path:
|
|
52
|
-
timeout_ms:
|
|
55
|
+
result_path: nonEmptyString,
|
|
56
|
+
timeout_ms: nonNegativeInteger
|
|
53
57
|
})),
|
|
54
58
|
oracle: S.Object({
|
|
55
|
-
path: S.Optional(S.String({ default: "oracle" })),
|
|
56
|
-
solution_dest: S.Optional(S.String({ default: "." }))
|
|
59
|
+
path: S.Optional(S.String({ default: "oracle", minLength: 1 })),
|
|
60
|
+
solution_dest: S.Optional(S.String({ default: ".", minLength: 1 }))
|
|
57
61
|
}),
|
|
58
62
|
budget: S.Object({
|
|
59
|
-
max_iterations:
|
|
60
|
-
max_tokens:
|
|
61
|
-
wall_clock_ms:
|
|
63
|
+
max_iterations: positiveInteger,
|
|
64
|
+
max_tokens: positiveInteger,
|
|
65
|
+
wall_clock_ms: positiveInteger
|
|
62
66
|
}),
|
|
63
67
|
judge: S.Object({
|
|
64
|
-
agent:
|
|
65
|
-
model:
|
|
66
|
-
rubric: S.Array(
|
|
68
|
+
agent: nonEmptyString,
|
|
69
|
+
model: nonEmptyString,
|
|
70
|
+
rubric: S.Array(nonEmptyString, { minItems: 1 })
|
|
67
71
|
}),
|
|
68
72
|
weights: S.Object({
|
|
69
|
-
tests:
|
|
70
|
-
judge:
|
|
73
|
+
tests: scoringWeight,
|
|
74
|
+
judge: scoringWeight
|
|
71
75
|
}),
|
|
72
76
|
metrics: S.Optional(S.Array(metricSchema)),
|
|
73
77
|
verify: S.Optional(S.Object({
|
|
74
|
-
command:
|
|
75
|
-
timeout_ms:
|
|
78
|
+
command: nonEmptyString,
|
|
79
|
+
timeout_ms: nonNegativeInteger
|
|
76
80
|
}))
|
|
77
81
|
});
|
|
78
82
|
Object.freeze(evalYamlSchema.shape);
|
|
@@ -89,6 +93,7 @@ export function validateEvalYaml(value, filePath = "eval.yaml") {
|
|
|
89
93
|
const result = validate(evalYamlSchema, value);
|
|
90
94
|
if (result.ok) {
|
|
91
95
|
const issues = [
|
|
96
|
+
...validateNonBlankStrings(result.value),
|
|
92
97
|
...validateTarget(result.value.target),
|
|
93
98
|
...validateMetrics(result.value.metrics)
|
|
94
99
|
];
|
|
@@ -99,6 +104,42 @@ export function validateEvalYaml(value, filePath = "eval.yaml") {
|
|
|
99
104
|
}
|
|
100
105
|
throw new EvalYamlValidationError(formatIssues(filePath, result.issues), result.issues);
|
|
101
106
|
}
|
|
107
|
+
/**
 * Collects an issue for every string field of an eval definition that
 * contains only whitespace.
 *
 * Optional fields that are `undefined` are skipped. Rubric lines are checked
 * individually and reported by index.
 *
 * @param {object} value - Eval definition to inspect.
 * @returns {object[]} One issue per blank field, in declaration order.
 */
function validateNonBlankStrings(value) {
  const isBlank = (text) => text.trim().length === 0;
  const { target, scorer, oracle, judge, verify } = value;

  const candidates = [
    { fieldPath: ["id"], text: value.id },
    { fieldPath: ["title"], text: value.title },
    { fieldPath: ["target", "repo"], text: target.repo },
    { fieldPath: ["target", "ref"], text: target.ref },
    { fieldPath: ["target", "plan_dest"], text: target.plan_dest },
    { fieldPath: ["scorer", "command"], text: scorer?.command },
    { fieldPath: ["scorer", "result_path"], text: scorer?.result_path },
    { fieldPath: ["oracle", "path"], text: oracle.path },
    { fieldPath: ["oracle", "solution_dest"], text: oracle.solution_dest },
    { fieldPath: ["judge", "agent"], text: judge.agent },
    { fieldPath: ["judge", "model"], text: judge.model },
    { fieldPath: ["verify", "command"], text: verify?.command }
  ];

  const issues = [];
  for (const { fieldPath, text } of candidates) {
    if (text === undefined) {
      continue;
    }
    if (isBlank(text)) {
      issues.push(blankStringIssue(fieldPath, text));
    }
  }

  const rubric = judge.rubric;
  for (let index = 0; index < rubric.length; index += 1) {
    const rubricLine = rubric[index];
    if (isBlank(rubricLine)) {
      issues.push(blankStringIssue(["judge", "rubric", String(index)], rubricLine));
    }
  }
  return issues;
}
|
|
135
|
+
/**
 * Builds the validation issue reported for a whitespace-only string field.
 *
 * @param {string[]} path - Field path of the offending value.
 * @param {string} value - The blank value that was received.
 * @returns {{path: string[], expected: string, received: string, message: string}}
 *   Issue record with the received value JSON-quoted.
 */
function blankStringIssue(path, value) {
  const dottedPath = path.join(".");
  const received = JSON.stringify(value);
  return {
    path,
    expected: "non-blank string",
    received,
    message: `${dottedPath} must not be blank.`
  };
}
|
|
102
143
|
function validateTarget(target) {
|
|
103
144
|
const destination = target.plan_dest ?? "docs/plans/eval-task.md";
|
|
104
145
|
if (path.isAbsolute(destination)) {
|
|
@@ -39,7 +39,9 @@ export async function loadSourceConfig(source, fs = nodeFs) {
|
|
|
39
39
|
if (!isRecord(parsed)) {
|
|
40
40
|
throw new Error(`${configPath} must contain a JSON object.`);
|
|
41
41
|
}
|
|
42
|
-
|
|
42
|
+
const merged = deepMerge(cloneDefaultConfig(), parsed);
|
|
43
|
+
validateSourceConfig(merged, configPath);
|
|
44
|
+
return merged;
|
|
43
45
|
}
|
|
44
46
|
function cloneDefaultConfig() {
|
|
45
47
|
return {
|
|
@@ -74,6 +76,46 @@ function deepMerge(base, patch) {
|
|
|
74
76
|
}
|
|
75
77
|
return result;
|
|
76
78
|
}
|
|
79
|
+
/**
 * Validates a merged source configuration and throws on the first invalid field.
 *
 * @param {object} config - Configuration to validate.
 * @param {string} configPath - Path of the config file, used in error messages.
 * @throws {Error} When a required field is missing, mistyped, blank, or out of range.
 */
function validateSourceConfig(config, configPath) {
  const judge = requireRecord(config.judge, configPath, "judge");
  for (const key of ["agent", "model"]) {
    requireNonBlankString(judge[key], configPath, `judge.${key}`);
  }
  requireNonBlankString(config.out, configPath, "out");

  const weights = requireRecord(config.weights, configPath, "weights");
  for (const key of ["tests", "judge"]) {
    requireWeight(weights[key], configPath, `weights.${key}`);
  }

  // Only an explicit null skips the check; any other value must be a non-blank string.
  const cacheDir = config.clone_cache_dir;
  if (cacheDir === null) {
    return;
  }
  requireNonBlankString(cacheDir, configPath, "clone_cache_dir");
}
|
|
91
|
+
/**
 * Ensures a configuration value is a record, as decided by `isRecord`.
 *
 * @param {unknown} value - Value to check.
 * @param {string} configPath - Path of the config file, used in error messages.
 * @param {string} fieldPath - Dotted name of the field being checked.
 * @returns {object} The same value when it is a record.
 * @throws {Error} When the value is not a record.
 */
function requireRecord(value, configPath, fieldPath) {
  if (!isRecord(value)) {
    throw invalidConfigField(configPath, fieldPath, "object", value);
  }
  return value;
}
|
|
97
|
+
/**
 * Ensures a configuration value is a string containing non-whitespace text.
 *
 * @param {unknown} value - Value to check.
 * @param {string} configPath - Path of the config file, used in error messages.
 * @param {string} fieldPath - Dotted name of the field being checked.
 * @throws {Error} When the value is not a string, or is blank after trimming.
 */
function requireNonBlankString(value, configPath, fieldPath) {
  const isString = typeof value === "string";
  if (isString && value.trim().length > 0) {
    return;
  }
  // Non-strings and blank strings are reported with different expectations.
  const expected = isString ? "non-blank string" : "string";
  throw invalidConfigField(configPath, fieldPath, expected, value);
}
|
|
105
|
+
/**
 * Ensures a configuration value is a finite number in the closed interval [0, 1].
 *
 * @param {unknown} value - Value to check.
 * @param {string} configPath - Path of the config file, used in error messages.
 * @param {string} fieldPath - Dotted name of the field being checked.
 * @throws {Error} When the value is not a finite number from 0 through 1.
 */
function requireWeight(value, configPath, fieldPath) {
  const isValidWeight =
    typeof value === "number" && Number.isFinite(value) && value >= 0 && value <= 1;
  if (isValidWeight) {
    return;
  }
  throw invalidConfigField(configPath, fieldPath, "number from 0 through 1", value);
}
|
|
110
|
+
/**
 * Builds (but does not throw) the error describing an invalid config field.
 *
 * @param {string} configPath - Path of the config file.
 * @param {string} fieldPath - Dotted name of the offending field.
 * @param {string} expected - Description of the expected value.
 * @param {unknown} received - The value actually found.
 * @returns {Error} Error carrying the formatted message.
 */
function invalidConfigField(configPath, fieldPath, expected, received) {
  const actual = formatReceived(received);
  return new Error(`${configPath} (${fieldPath}): expected ${expected}, received ${actual}.`);
}
|
|
113
|
+
/**
 * Renders a received configuration value for an error message.
 *
 * Strings are JSON-quoted. Arrays and objects are summarized as "array" and
 * "object", because `String()` would render them as "1,2" or
 * "[object Object]", which is useless in a diagnostic. Everything else,
 * including null, goes through `String`.
 *
 * @param {unknown} value - The value that failed validation.
 * @returns {string} Short description of the value.
 */
function formatReceived(value) {
  if (typeof value === "string") {
    return JSON.stringify(value);
  }
  // typeof null is "object"; exclude it so null still renders as "null".
  if (typeof value === "object" && value !== null) {
    return Array.isArray(value) ? "array" : "object";
  }
  return String(value);
}
|
|
77
119
|
/**
 * Tells whether a value is a non-null object that is not an array.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for non-null, non-array objects.
 */
function isRecord(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
|
@@ -34,6 +34,9 @@ export async function loadEval(source, id, fs = nodeFs) {
|
|
|
34
34
|
await assertFsCanonicalContainedPath(fs, source.rootDir, evalYamlPath, "eval.yaml");
|
|
35
35
|
await assertFsCanonicalContainedPath(fs, source.rootDir, planPath, "plan.md");
|
|
36
36
|
const evalYaml = validateEvalYaml(parseYamlFile(await fs.readFile(evalYamlPath, "utf8"), evalYamlPath), evalYamlPath);
|
|
37
|
+
if (evalYaml.id !== id) {
|
|
38
|
+
throw new Error(`Eval id mismatch in ${evalYamlPath}: expected "${id}", found "${evalYaml.id}".`);
|
|
39
|
+
}
|
|
37
40
|
const plan = parsePlanMarkdown(await fs.readFile(planPath, "utf8"), planPath);
|
|
38
41
|
return {
|
|
39
42
|
id: evalYaml.id,
|
|
@@ -50,10 +50,28 @@ function sanitizeAgentForFileName(agent) {
|
|
|
50
50
|
/**
 * Resolves a possibly relative path against a working directory.
 *
 * @param {string} cwd - Directory that relative paths are joined onto.
 * @param {string} filePath - Absolute or relative path.
 * @returns {string} `filePath` unchanged when absolute, otherwise joined onto `cwd`.
 */
function resolvePath(cwd, filePath) {
  if (path.isAbsolute(filePath)) {
    return filePath;
  }
  return path.join(cwd, filePath);
}
|
|
53
|
+
/**
 * Trims a required string option and rejects values with no content.
 *
 * Non-string input (for example a missing option) is rejected with the same
 * labelled message instead of crashing on `.trim()`.
 *
 * @param {unknown} value - Option value to check.
 * @param {string} label - Option name used in the error message.
 * @returns {string} The trimmed value.
 * @throws {TypeError} When the value is not a string.
 * @throws {Error} When the value is empty after trimming.
 */
function requireNonEmptyString(value, label) {
  const message = `${label} must be a non-empty string.`;
  if (typeof value !== "string") {
    // TypeError matches the error class the unguarded `.trim()` call raised.
    throw new TypeError(message);
  }
  const trimmed = value.trim();
  if (trimmed.length === 0) {
    throw new Error(message);
  }
  return trimmed;
}
|
|
60
|
+
/**
 * Trims an optional string option, passing `undefined` through untouched.
 *
 * Any other non-string input (including null) is rejected with the labelled
 * message instead of crashing on `.trim()`.
 *
 * @param {unknown} value - Option value to check; `undefined` means "not provided".
 * @param {string} label - Option name used in the error message.
 * @returns {string | undefined} The trimmed value, or `undefined` when not provided.
 * @throws {TypeError} When the value is provided but is not a string.
 * @throws {Error} When the value is empty after trimming.
 */
function resolveOptionalNonEmptyString(value, label) {
  if (value === undefined) {
    return undefined;
  }
  const message = `${label} must be a non-empty string when provided.`;
  if (typeof value !== "string") {
    // TypeError matches the error class the unguarded `.trim()` call raised.
    throw new TypeError(message);
  }
  const trimmed = value.trim();
  if (trimmed.length === 0) {
    throw new Error(message);
  }
  return trimmed;
}
|
|
53
70
|
async function resolveOutputPath(fs, cwd, analysisAgent, outputPath) {
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
71
|
+
const normalizedOutputPath = resolveOptionalNonEmptyString(outputPath, "outputPath");
|
|
72
|
+
if (normalizedOutputPath) {
|
|
73
|
+
const absolutePath = resolvePath(cwd, normalizedOutputPath);
|
|
74
|
+
return { absolutePath, resultPath: normalizedOutputPath };
|
|
57
75
|
}
|
|
58
76
|
const configDirectory = path.join(cwd, ".poe-code");
|
|
59
77
|
const defaultPath = path.join(configDirectory, "gaslight.yaml");
|
|
@@ -77,6 +95,9 @@ function resolveSince(value) {
|
|
|
77
95
|
return milliseconds === null ? undefined : new Date(Date.now() - milliseconds);
|
|
78
96
|
}
|
|
79
97
|
if (value instanceof Date) {
|
|
98
|
+
if (!Number.isFinite(value.getTime())) {
|
|
99
|
+
throw new Error(`Invalid since date "${String(value)}".`);
|
|
100
|
+
}
|
|
80
101
|
return value;
|
|
81
102
|
}
|
|
82
103
|
const milliseconds = parseDuration(value);
|
|
@@ -93,6 +114,13 @@ async function resolveDataPath(cwd, keepDataPath) {
|
|
|
93
114
|
const resultPath = path.join(".poe-code", "ingest", `human-prompts-${process.pid}-${Date.now()}-${process.hrtime.bigint()}.md`);
|
|
94
115
|
return { absolutePath: path.join(cwd, resultPath), resultPath };
|
|
95
116
|
}
|
|
117
|
+
/**
 * Resolves the prompt limit, falling back to 200 when none is given.
 *
 * @param {number | null | undefined} value - Requested limit.
 * @returns {number} The requested limit, or 200 when `value` is null or undefined.
 * @throws {Error} When the resolved limit is not a positive integer.
 */
function resolveLimit(value) {
  const defaultLimit = 200;
  const limit = value ?? defaultLimit;
  const isPositiveInteger = Number.isInteger(limit) && limit > 0;
  if (isPositiveInteger) {
    return limit;
  }
  throw new Error("limit must be a positive integer.");
}
|
|
96
124
|
function buildAnalysisPrompt(dataPath) {
|
|
97
125
|
return [
|
|
98
126
|
"Read this curated Markdown file of human prompts from coding-agent traces:",
|
|
@@ -111,7 +139,7 @@ function buildAnalysisPrompt(dataPath) {
|
|
|
111
139
|
"- Do not put review questions, validation checks, cleanup checks, commit checks, or release checks in `prompt`; those belong in `followups`.",
|
|
112
140
|
"- Prefer concise followups that generalize across tasks.",
|
|
113
141
|
"- Do not produce two followups for the same workflow step; merge semantic duplicates.",
|
|
114
|
-
|
|
142
|
+
'- Repeated short prompts like "commit" are evidence for one well-placed workflow check, not multiple followups.',
|
|
115
143
|
"- Order followups as a useful review sequence: quality, verification, cleanup, then commit or release when supported by the evidence.",
|
|
116
144
|
"- Do not include project secrets, file paths, names, tokens, or one-off task details.",
|
|
117
145
|
"- Preserve the user's direct style when it is reusable.",
|
|
@@ -148,8 +176,7 @@ function stripIdeSelection(value) {
|
|
|
148
176
|
if (endIndex === -1) {
|
|
149
177
|
return stripped;
|
|
150
178
|
}
|
|
151
|
-
stripped =
|
|
152
|
-
stripped.slice(0, startIndex) + stripped.slice(endIndex + closeTag.length);
|
|
179
|
+
stripped = stripped.slice(0, startIndex) + stripped.slice(endIndex + closeTag.length);
|
|
153
180
|
}
|
|
154
181
|
}
|
|
155
182
|
function normalizePromptText(value, cwd, homeDir) {
|
|
@@ -234,12 +261,7 @@ function buildRepeatedShortPromptSection(records, cwd, homeDir) {
|
|
|
234
261
|
.sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]))
|
|
235
262
|
.slice(0, 12);
|
|
236
263
|
if (repeated.length === 0) {
|
|
237
|
-
return [
|
|
238
|
-
"## Repeated short prompts",
|
|
239
|
-
"",
|
|
240
|
-
"No repeated short prompts were found.",
|
|
241
|
-
""
|
|
242
|
-
];
|
|
264
|
+
return ["## Repeated short prompts", "", "No repeated short prompts were found.", ""];
|
|
243
265
|
}
|
|
244
266
|
return [
|
|
245
267
|
"## Repeated short prompts",
|
|
@@ -418,7 +440,10 @@ function extractGeneratedConfigContent(stdout) {
|
|
|
418
440
|
async function writeGeneratedConfig(fs, content, absoluteOutputPath) {
|
|
419
441
|
const yaml = extractYamlCandidate(extractGeneratedConfigContent(content));
|
|
420
442
|
parseGaslightConfig(yaml, "generated gaslight config", { rejectExtraKeys: true });
|
|
421
|
-
|
|
443
|
+
const outputDirectory = path.dirname(absoluteOutputPath);
|
|
444
|
+
await assertNotSymlink(fs, outputDirectory, "Output directory");
|
|
445
|
+
await fs.mkdir(outputDirectory, { recursive: true });
|
|
446
|
+
await assertNotSymlink(fs, outputDirectory, "Output directory");
|
|
422
447
|
const temporaryPath = `${absoluteOutputPath}.tmp-${process.pid}-${Date.now()}`;
|
|
423
448
|
await fs.writeFile(temporaryPath, `${yaml}\n`, { encoding: "utf8" });
|
|
424
449
|
if (fs.rename) {
|
|
@@ -427,19 +452,40 @@ async function writeGeneratedConfig(fs, content, absoluteOutputPath) {
|
|
|
427
452
|
}
|
|
428
453
|
await fs.writeFile(absoluteOutputPath, `${yaml}\n`, { encoding: "utf8" });
|
|
429
454
|
}
|
|
455
|
+
/**
 * Rejects when `targetPath` is a symbolic link.
 *
 * The check is skipped when the file-system adapter has no `lstat`. Errors
 * that `isMissingFile` recognizes are ignored.
 *
 * @param {object} fs - File-system adapter; `lstat` is optional.
 * @param {string} targetPath - Path to inspect.
 * @param {string} label - Name used as the prefix of the error message.
 * @returns {Promise<void>} Resolves when the path is not a symbolic link.
 * @throws {Error} When the path is a symbolic link, or on any other lstat failure.
 */
async function assertNotSymlink(fs, targetPath, label) {
  if (fs.lstat) {
    try {
      const isLink = (await fs.lstat(targetPath)).isSymbolicLink();
      if (isLink) {
        throw new Error(`${label} cannot be a symbolic link: ${targetPath}`);
      }
    }
    catch (error) {
      if (!isMissingFile(error)) {
        throw error;
      }
    }
  }
}
|
|
430
472
|
export async function ingestGaslight(options) {
|
|
431
473
|
const cwd = options.cwd ?? process.cwd();
|
|
432
474
|
const homeDir = options.homeDir ?? os.homedir();
|
|
433
475
|
const fs = (options.fs ?? nodeFs);
|
|
434
476
|
const spawn = options.spawn ?? defaultSpawn;
|
|
435
477
|
const collectHumanPrompts = options.collectHumanPrompts ?? collectHumanPromptsWithStats;
|
|
478
|
+
const analysisAgent = requireNonEmptyString(options.analysisAgent, "analysisAgent");
|
|
479
|
+
const model = resolveOptionalNonEmptyString(options.model, "model");
|
|
480
|
+
const limit = resolveLimit(options.limit);
|
|
481
|
+
const outputPathOption = resolveOptionalNonEmptyString(options.outputPath, "outputPath");
|
|
436
482
|
const since = resolveSince(options.since);
|
|
437
483
|
const collection = await collectHumanPrompts({
|
|
438
484
|
sources: options.sources,
|
|
439
485
|
cwd,
|
|
440
486
|
homeDir,
|
|
441
487
|
since,
|
|
442
|
-
limit
|
|
488
|
+
limit,
|
|
443
489
|
allWorkspaces: options.allWorkspaces,
|
|
444
490
|
fs
|
|
445
491
|
});
|
|
@@ -458,20 +504,20 @@ export async function ingestGaslight(options) {
|
|
|
458
504
|
try {
|
|
459
505
|
options.onEvent?.({
|
|
460
506
|
type: "analysis.started",
|
|
461
|
-
agent:
|
|
507
|
+
agent: analysisAgent,
|
|
462
508
|
dataPath: dataPath.absolutePath
|
|
463
509
|
});
|
|
464
|
-
const result = await spawn(
|
|
510
|
+
const result = await spawn(analysisAgent, {
|
|
465
511
|
prompt: buildAnalysisPrompt(dataPath.absolutePath),
|
|
466
512
|
cwd,
|
|
467
513
|
mode: "read",
|
|
468
|
-
...(
|
|
514
|
+
...(model ? { model } : {})
|
|
469
515
|
});
|
|
470
516
|
if (result.exitCode !== 0) {
|
|
471
517
|
const message = result.stderr.trim() || result.stdout.trim() || `exit code ${result.exitCode}`;
|
|
472
518
|
throw new Error(`Gaslight ingest analysis failed: ${message}`);
|
|
473
519
|
}
|
|
474
|
-
const outputPath = await resolveOutputPath(fs, cwd,
|
|
520
|
+
const outputPath = await resolveOutputPath(fs, cwd, analysisAgent, outputPathOption);
|
|
475
521
|
await writeGeneratedConfig(fs, result.stdout, outputPath.absolutePath);
|
|
476
522
|
options.onEvent?.({ type: "config.written", path: outputPath.resultPath });
|
|
477
523
|
return {
|
|
@@ -91,6 +91,23 @@ async function archivePlan(fs, cwd, planPath) {
|
|
|
91
91
|
}
|
|
92
92
|
return archivedPath;
|
|
93
93
|
}
|
|
94
|
+
/**
 * Computes where a plan file is archived: an `archive` directory beside the
 * plan, keeping the plan's file name.
 *
 * @param {string} cwd - Directory that a relative plan path is resolved against.
 * @param {string} planPath - Absolute or relative path of the plan.
 * @returns {string} Absolute path of the archived plan.
 */
function archivePathForPlan(cwd, planPath) {
  const resolvedPlan = path.resolve(cwd, planPath);
  const planDirectory = path.dirname(resolvedPlan);
  const planFileName = path.basename(resolvedPlan);
  return path.join(planDirectory, "archive", planFileName);
}
|
|
98
|
+
/**
 * Verifies that a plan can be archived: the archive destination must not
 * already exist, and the archive directory is passed to `rejectArchiveSymlink`.
 *
 * Existence is probed by reading the destination. A read error recognized by
 * `isMissingFile` means the destination is free.
 *
 * @param {object} fs - File-system adapter providing `readFile`.
 * @param {string} cwd - Directory that a relative plan path is resolved against.
 * @param {string} planPath - Absolute or relative path of the plan.
 * @returns {Promise<void>} Resolves when the destination is available.
 * @throws {Error} When the destination exists, or on any other failure.
 */
async function assertArchiveDestinationAvailable(fs, cwd, planPath) {
  const destination = archivePathForPlan(cwd, planPath);
  try {
    await fs.readFile(destination, "utf8");
    // A successful read means something already occupies the destination.
    throw new Error(`Archive destination already exists: ${destination}`);
  }
  catch (error) {
    const destinationIsFree = isMissingFile(error);
    if (!destinationIsFree) {
      throw error;
    }
  }
  const archiveDirectory = path.dirname(destination);
  await rejectArchiveSymlink(fs, archiveDirectory);
}
|
|
94
111
|
function validateInlineConfig(prompt, followups) {
|
|
95
112
|
if ((prompt === undefined) !== (followups === undefined)) {
|
|
96
113
|
throw new Error("prompt and followups must be provided together.");
|
|
@@ -103,6 +120,23 @@ function validateInlineConfig(prompt, followups) {
|
|
|
103
120
|
throw new Error("followups must be a non-empty array of non-empty strings.");
|
|
104
121
|
}
|
|
105
122
|
}
|
|
123
|
+
/**
 * Trims a required string option and rejects values with no content.
 *
 * Non-string input (for example a missing option) is rejected with the same
 * labelled message instead of crashing on `.trim()`.
 *
 * @param {unknown} value - Option value to check.
 * @param {string} label - Option name used in the error message.
 * @returns {string} The trimmed value.
 * @throws {TypeError} When the value is not a string.
 * @throws {Error} When the value is empty after trimming.
 */
function requireNonEmptyString(value, label) {
  const message = `${label} must be a non-empty string.`;
  if (typeof value !== "string") {
    // TypeError matches the error class the unguarded `.trim()` call raised.
    throw new TypeError(message);
  }
  const trimmed = value.trim();
  if (trimmed.length === 0) {
    throw new Error(message);
  }
  return trimmed;
}
|
|
130
|
+
/**
 * Trims the optional `model` option, passing `undefined` through untouched.
 *
 * Any other non-string input (including null) is rejected with the documented
 * message instead of crashing on `.trim()`.
 *
 * @param {unknown} value - Model option; `undefined` means "not provided".
 * @returns {string | undefined} The trimmed model name, or `undefined` when not provided.
 * @throws {TypeError} When the value is provided but is not a string.
 * @throws {Error} When the value is empty after trimming.
 */
function resolveModel(value) {
  if (value === undefined) {
    return undefined;
  }
  const message = "model must be a non-empty string when provided.";
  if (typeof value !== "string") {
    // TypeError matches the error class the unguarded `.trim()` call raised.
    throw new TypeError(message);
  }
  const trimmed = value.trim();
  if (trimmed.length === 0) {
    throw new Error(message);
  }
  return trimmed;
}
|
|
106
140
|
function resolvePlanPaths(options) {
|
|
107
141
|
if (options.planPaths.length === 0) {
|
|
108
142
|
throw new Error("Provide at least one plan path.");
|
|
@@ -112,17 +146,30 @@ function resolvePlanPaths(options) {
|
|
|
112
146
|
throw new Error("plan paths must be non-empty strings.");
|
|
113
147
|
}
|
|
114
148
|
}
|
|
115
|
-
|
|
149
|
+
const planPaths = options.planPaths.map((planPath) => planPath.trim());
|
|
150
|
+
const seen = new Map();
|
|
151
|
+
for (const planPath of planPaths) {
|
|
152
|
+
const resolvedPath = path.resolve(options.cwd ?? process.cwd(), planPath);
|
|
153
|
+
const duplicate = seen.get(resolvedPath);
|
|
154
|
+
if (duplicate !== undefined) {
|
|
155
|
+
throw new Error(`Duplicate plan path: ${duplicate}`);
|
|
156
|
+
}
|
|
157
|
+
seen.set(resolvedPath, planPath);
|
|
158
|
+
}
|
|
159
|
+
return planPaths;
|
|
116
160
|
}
|
|
117
161
|
export async function runGaslight(options) {
|
|
118
162
|
const cwd = options.cwd ?? process.cwd();
|
|
119
163
|
const homeDir = options.homeDir ?? os.homedir();
|
|
120
164
|
const fs = (options.fs ?? nodeFs);
|
|
121
165
|
const spawn = options.spawn ?? defaultSpawn;
|
|
166
|
+
const agent = requireNonEmptyString(options.agent, "agent");
|
|
167
|
+
const model = resolveModel(options.model);
|
|
122
168
|
validateInlineConfig(options.prompt, options.followups);
|
|
123
169
|
const planPaths = resolvePlanPaths(options);
|
|
124
170
|
for (const planPath of planPaths) {
|
|
125
171
|
await requirePlan(fs, cwd, planPath);
|
|
172
|
+
await assertArchiveDestinationAvailable(fs, cwd, planPath);
|
|
126
173
|
}
|
|
127
174
|
const config = options.prompt !== undefined && options.followups !== undefined
|
|
128
175
|
? { prompt: options.prompt.trim(), followups: options.followups.map((value) => value.trim()) }
|
|
@@ -146,11 +193,11 @@ export async function runGaslight(options) {
|
|
|
146
193
|
planIndex: planIndex + 1,
|
|
147
194
|
totalPlans: planPaths.length
|
|
148
195
|
});
|
|
149
|
-
const result = await spawn(
|
|
196
|
+
const result = await spawn(agent, {
|
|
150
197
|
prompt,
|
|
151
198
|
cwd,
|
|
152
199
|
mode: options.mode ?? "edit",
|
|
153
|
-
...(
|
|
200
|
+
...(model ? { model } : {}),
|
|
154
201
|
...(resumeThreadId ? { resumeThreadId } : {}),
|
|
155
202
|
...(options.signal ? { signal: options.signal } : {})
|
|
156
203
|
});
|