agent-skill-evals 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +155 -0
- package/dist/agent/index.d.mts +3 -0
- package/dist/agent/index.mjs +2 -0
- package/dist/agent-CM7fIL_C.mjs +1525 -0
- package/dist/agent-CM7fIL_C.mjs.map +1 -0
- package/dist/assertion-entries-CfmNt-fp.d.mts +9 -0
- package/dist/assertion-entries-CfmNt-fp.d.mts.map +1 -0
- package/dist/assertions/index.d.mts +47 -0
- package/dist/assertions/index.d.mts.map +1 -0
- package/dist/assertions/index.mjs +574 -0
- package/dist/assertions/index.mjs.map +1 -0
- package/dist/index-4l7TCFny.d.mts +90 -0
- package/dist/index-4l7TCFny.d.mts.map +1 -0
- package/dist/internal-services-5-mRgNls.mjs +226 -0
- package/dist/internal-services-5-mRgNls.mjs.map +1 -0
- package/dist/internal-services-DbsekQ_K.d.mts +76 -0
- package/dist/internal-services-DbsekQ_K.d.mts.map +1 -0
- package/dist/skill-checks/index.d.mts +113 -0
- package/dist/skill-checks/index.d.mts.map +1 -0
- package/dist/skill-checks/index.mjs +408 -0
- package/dist/skill-checks/index.mjs.map +1 -0
- package/package.json +56 -0
|
@@ -0,0 +1,1525 @@
|
|
|
1
|
+
import { c as parseRuntimeTestFields, n as FileSystem, r as NodeServicesLive, t as Environment } from "./internal-services-5-mRgNls.mjs";
|
|
2
|
+
import { basename, dirname, isAbsolute, join, relative, resolve } from "node:path";
|
|
3
|
+
import * as Clock from "effect/Clock";
|
|
4
|
+
import * as Either from "effect/Either";
|
|
5
|
+
import * as Effect from "effect/Effect";
|
|
6
|
+
import * as Layer from "effect/Layer";
|
|
7
|
+
import * as ParseResult from "effect/ParseResult";
|
|
8
|
+
import * as Schema from "effect/Schema";
|
|
9
|
+
import * as Context from "effect/Context";
|
|
10
|
+
import { spawn } from "node:child_process";
|
|
11
|
+
import "effect/Stream";
|
|
12
|
+
import { createHash } from "node:crypto";
|
|
13
|
+
//#region src/runtime-checks/_helpers.ts
|
|
14
|
+
/**
 * Build a check result object.
 *
 * @param {boolean} pass - Whether the check passed.
 * @param {string} reason - Human-readable explanation of the outcome.
 * @param {unknown} [evidence] - Optional supporting data, stored unchanged.
 * @returns {{pass: boolean, score: number, reason: string, evidence: unknown}}
 *   `score` is 1 when `pass` is truthy and 0 otherwise.
 */
function result(pass, reason, evidence) {
  const score = pass ? 1 : 0;
  return { pass, score, reason, evidence };
}
|
|
22
|
+
/**
 * Build a failing result for an assertion whose arguments were invalid.
 *
 * @param {string} reason - Explanation of what was wrong with the assertion.
 * @returns {object} A result with `pass: false` and no evidence.
 */
function validationFailure(reason) {
  const passed = false;
  return result(passed, reason);
}
|
|
25
|
+
/**
 * Map a "matched" boolean to a pass result based on mode. Used by effect
 * types that don't self-encode polarity (file.exists, tool.called, ...).
 *
 * - should: pass = matched
 * - should_not: pass = !matched
 * - precondition: pass = matched (precondition asserts a current state)
 * - anything else: failing result naming the unsupported mode
 *
 * The reason always describes what was observed (`reasonMatched` when
 * `matched` is true, `reasonUnmatched` otherwise), independent of polarity.
 *
 * @param {boolean} matched - Whether the checked condition was observed.
 * @param {string} mode - "should", "should_not" or "precondition".
 * @param {string} reasonMatched - Reason text used when `matched` is true.
 * @param {string} reasonUnmatched - Reason text used when `matched` is false.
 * @returns {object} A result object; never `undefined`.
 */
function applyMode(matched, mode, reasonMatched, reasonUnmatched) {
  const reason = matched ? reasonMatched : reasonUnmatched;
  switch (mode) {
    case "should":
    case "precondition":
      return result(matched, reason);
    case "should_not":
      return result(!matched, reason);
    default:
      // Without this branch an unknown or missing mode produced `undefined`,
      // which callers would hand on as if it were a result object.
      return result(false, `unsupported assertion mode: ${String(mode)}`);
  }
}
|
|
40
|
+
//#endregion
|
|
41
|
+
//#region src/runtime-checks/schemas.ts
|
|
42
|
+
// A string that still has at least one character after trimming.
const NonEmptyString = Schema.String.pipe(
  Schema.filter((value) => value.trim().length > 0, { identifier: "NonEmptyString" })
);
const OptionalString$1 = Schema.optional(Schema.String);

// Arguments for checks that take a single path (file.exists, file.created, file.not_modified).
const PathArgsSchema = Schema.Struct({ path: NonEmptyString });

// file.contains: `text` may be the empty string, `path` may not.
const FileContainsArgsSchema = Schema.Struct({
  path: NonEmptyString,
  text: Schema.String
});

// file.changes_outside_scope: at least one non-empty scope entry is required.
const FileChangesOutsideScopeArgsSchema = Schema.Struct({
  scope: Schema.Array(NonEmptyString).pipe(Schema.minItems(1))
});

// code.pattern_exists / code.no_pattern.
const CodePatternArgsSchema = Schema.Struct({
  glob: NonEmptyString,
  pattern: NonEmptyString
});

// verifier.succeeds / verifier.fails.
const VerifierArgsSchema = Schema.Struct({
  run: NonEmptyString,
  args: Schema.optional(Schema.Array(Schema.String)),
  timeoutMs: Schema.optional(Schema.Number)
});

// tool.called: the tool name is mandatory, the other selectors are optional.
const ToolCalledArgsSchema = Schema.Struct({
  tool: NonEmptyString,
  provider: OptionalString$1,
  server: OptionalString$1,
  args_match: Schema.optional(Schema.Unknown)
});

// tool.not_called: every selector is optional at the schema level.
const ToolNotCalledArgsSchema = Schema.Struct({
  tool: OptionalString$1,
  provider: OptionalString$1,
  server: OptionalString$1,
  args_match: Schema.optional(Schema.Unknown)
});

// skill.loaded: include/exclude lists plus optional filters on the load event.
const SkillLoadedArgsSchema = Schema.Struct({
  should_include: Schema.optional(Schema.Array(NonEmptyString)),
  should_exclude: Schema.optional(Schema.Array(NonEmptyString)),
  delivery: Schema.optional(Schema.Literal("native", "mcp")),
  provider: OptionalString$1,
  server: OptionalString$1,
  source: OptionalString$1
});
|
|
79
|
+
/**
 * Decode an assertion's arguments against a schema.
 *
 * @param {object} schema - Schema passed to `Schema.decodeUnknownEither`.
 * @param {unknown} assertion - Raw assertion; `null`/`undefined` decode as `{}`.
 * @param {string} invalidReason - Reason used when decoding fails.
 * @returns {object} The decoded arguments, or a validation-failure result
 *   (see `isValidationFailure`) carrying `invalidReason`.
 */
function decodeCheckArgs(schema, assertion, invalidReason) {
  const decode = Schema.decodeUnknownEither(schema, { errors: "all" });
  const outcome = decode(assertion ?? {});
  if (Either.isRight(outcome)) return outcome.right;
  return validationFailure(invalidReason);
}
|
|
83
|
+
/**
 * Tell a failing result object apart from successfully decoded arguments.
 *
 * @param {unknown} value - Value returned by an argument decoder.
 * @returns {boolean} True when `value` is an object with a `pass` property
 *   equal to `false` and a string `reason`.
 */
function isValidationFailure(value) {
  if (!value || typeof value !== "object") return false;
  if (!("pass" in value)) return false;
  return value.pass === false && typeof value.reason === "string";
}
|
|
86
|
+
/**
 * Decode the arguments of a `tool.not_called` assertion.
 *
 * The schema makes every selector optional, so decoding alone accepts `{}`.
 * An assertion without any selector would match every recorded call, so it is
 * rejected here with the same reason used for a schema failure. A selector
 * counts as present under the same rules the call matcher applies: `tool`,
 * `provider` and `server` must be truthy, `args_match` must not be undefined.
 *
 * @param {unknown} assertion - Raw assertion object.
 * @returns {object} Decoded arguments, or a validation-failure result.
 */
function decodeToolNotCalledArgs(assertion) {
  const invalidReason = "tool.not_called: assertion must include at least one selector";
  const decoded = decodeCheckArgs(ToolNotCalledArgsSchema, assertion, invalidReason);
  if (isValidationFailure(decoded)) return decoded;
  const hasSelector =
    Boolean(decoded.tool) ||
    Boolean(decoded.provider) ||
    Boolean(decoded.server) ||
    decoded.args_match !== void 0;
  return hasSelector ? decoded : validationFailure(invalidReason);
}
|
|
89
|
+
//#endregion
|
|
90
|
+
//#region src/runtime-checks/verifier-succeeds.ts
|
|
91
|
+
/**
 * Runtime check `verifier.succeeds`: runs `assertion.run` (with optional
 * `assertion.args`) through `ctx.world.exec` and treats exit code 0 as a match.
 * The timeout defaults to 60000 ms when `assertion.timeoutMs` is not given.
 */
const verifierSucceeds = {
  type: "verifier.succeeds",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `mode` and `world.exec`.
   * @returns An Effect yielding the check result.
   */
  verify(ctx) {
    return Effect.gen(function* () {
      const a = decodeCheckArgs(VerifierArgsSchema, ctx.assertion, "verifier.succeeds: assertion.run must be a non-empty string");
      if (isValidationFailure(a)) return a;
      const r = yield* ctx.world.exec(a.run, a.args ?? [], { timeoutMs: a.timeoutMs ?? 6e4 });
      // Both reason strings are built before applyMode picks one, so a missing
      // stderr must not throw even when the command exited 0.
      const stderrExcerpt = typeof r.stderr === "string" ? r.stderr.slice(0, 200) : "";
      return applyMode(
        r.exitCode === 0,
        ctx.mode,
        `verifier.succeeds: ${a.run} exited 0`,
        `verifier.succeeds: ${a.run} exited ${r.exitCode}: ${stderrExcerpt}`
      );
    });
  }
};
|
|
102
|
+
//#endregion
|
|
103
|
+
//#region src/runtime-checks/verifier-fails.ts
|
|
104
|
+
/**
 * Runtime check `verifier.fails`: runs `assertion.run` through
 * `ctx.world.exec` and treats any non-zero exit code as a match.
 */
const verifierFails = {
  type: "verifier.fails",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `mode` and `world.exec`.
   * @returns An Effect yielding the check result.
   */
  verify(ctx) {
    return Effect.gen(function* () {
      const args = decodeCheckArgs(VerifierArgsSchema, ctx.assertion, "verifier.fails: assertion.run must be a non-empty string");
      if (isValidationFailure(args)) return args;
      // 60 s default when the assertion does not set its own timeout.
      const timeoutMs = args.timeoutMs ?? 6e4;
      const outcome = yield* ctx.world.exec(args.run, args.args ?? [], { timeoutMs });
      const failed = outcome.exitCode !== 0;
      const reasonFailed = `verifier.fails: ${args.run} exited ${outcome.exitCode} (failed as expected)`;
      const reasonSucceeded = `verifier.fails: ${args.run} unexpectedly exited 0`;
      return applyMode(failed, ctx.mode, reasonFailed, reasonSucceeded);
    });
  }
};
|
|
115
|
+
//#endregion
|
|
116
|
+
//#region src/runtime-checks/file-exists.ts
|
|
117
|
+
/**
 * Runtime check `file.exists`: a file counts as present when
 * `ctx.world.readFile(path)` yields anything other than `null`.
 */
const fileExists = {
  type: "file.exists",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `mode` and `world.readFile`.
   * @returns An Effect yielding the check result.
   */
  verify(ctx) {
    return Effect.gen(function* () {
      const args = decodeCheckArgs(PathArgsSchema, ctx.assertion, "file.exists: assertion.path must be a non-empty string");
      if (isValidationFailure(args)) return args;
      const content = yield* ctx.world.readFile(args.path);
      const present = content !== null;
      return applyMode(present, ctx.mode, `file.exists: ${args.path} present`, `file.exists: ${args.path} not found`);
    });
  }
};
|
|
127
|
+
//#endregion
|
|
128
|
+
//#region src/runtime-checks/file-created.ts
|
|
129
|
+
/**
 * Runtime check `file.created`: matches when the recorded file writes contain
 * an event for exactly `assertion.path` with `op === "create"`.
 */
const fileCreated = {
  type: "file.created",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `mode` and `evidence.filesWritten()`.
   * @returns An Effect that succeeds with the check result.
   */
  verify(ctx) {
    const args = decodeCheckArgs(PathArgsSchema, ctx.assertion, "file.created: assertion.path must be a non-empty string");
    if (isValidationFailure(args)) return Effect.succeed(args);
    const isCreateOfPath = (event) => event.path === args.path && event.op === "create";
    const wasCreated = ctx.evidence.filesWritten().some(isCreateOfPath);
    const outcome = applyMode(wasCreated, ctx.mode, `file.created: ${args.path} created`, `file.created: ${args.path} was not created`);
    return Effect.succeed(outcome);
  }
};
|
|
138
|
+
//#endregion
|
|
139
|
+
//#region src/runtime-checks/file-not-modified.ts
|
|
140
|
+
/**
 * Runtime check `file.not_modified`: matches when no recorded file write
 * (of any operation) has a path equal to `assertion.path`.
 */
const fileNotModified = {
  type: "file.not_modified",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `mode` and `evidence.filesWritten()`.
   * @returns An Effect that succeeds with the check result.
   */
  verify(ctx) {
    const args = decodeCheckArgs(PathArgsSchema, ctx.assertion, "file.not_modified: assertion.path must be a non-empty string");
    if (isValidationFailure(args)) return Effect.succeed(args);
    const untouched = ctx.evidence.filesWritten().every((event) => event.path !== args.path);
    const outcome = applyMode(untouched, ctx.mode, `file.not_modified: ${args.path} unchanged`, `file.not_modified: ${args.path} was modified`);
    return Effect.succeed(outcome);
  }
};
|
|
149
|
+
//#endregion
|
|
150
|
+
//#region src/runtime-checks/file-contains.ts
|
|
151
|
+
/**
 * Runtime check `file.contains`: matches when the file can be read (content
 * is not `null`) and its content includes `assertion.text`. Reason strings
 * quote at most the first 40 characters of the text.
 */
const fileContains = {
  type: "file.contains",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `mode` and `world.readFile`.
   * @returns An Effect yielding the check result.
   */
  verify(ctx) {
    return Effect.gen(function* () {
      const args = decodeCheckArgs(FileContainsArgsSchema, ctx.assertion, "file.contains: assertion.path must be a non-empty string and assertion.text must be a string");
      if (isValidationFailure(args)) return args;
      const content = yield* ctx.world.readFile(args.path);
      const excerpt = args.text.slice(0, 40);
      const found = content !== null && content.includes(args.text);
      let reasonUnmatched;
      if (content === null) {
        reasonUnmatched = `file.contains: ${args.path} not found`;
      } else {
        reasonUnmatched = `file.contains: ${args.path} missing "${excerpt}"`;
      }
      return applyMode(found, ctx.mode, `file.contains: ${args.path} contains "${excerpt}"`, reasonUnmatched);
    });
  }
};
|
|
162
|
+
//#endregion
|
|
163
|
+
//#region src/runtime-checks/code-pattern-exists.ts
|
|
164
|
+
/**
 * Runtime check `code.pattern_exists`: compiles `assertion.pattern` as a
 * regular expression (no flags) and matches when at least one file listed by
 * `ctx.world.listFiles(assertion.glob)` has content the pattern matches.
 * Files whose content is falsy (`null` or empty) are never counted.
 */
const codePatternExists = {
  type: "code.pattern_exists",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `mode`,
   *   `world.listFiles` and `world.readFile`.
   * @returns An Effect yielding the check result; an invalid regex yields a
   *   failing result regardless of mode.
   */
  verify(ctx) {
    return Effect.gen(function* () {
      const args = decodeCheckArgs(CodePatternArgsSchema, ctx.assertion, "code.pattern_exists: assertion.glob and assertion.pattern must be non-empty strings");
      if (isValidationFailure(args)) return args;
      let matcher;
      try {
        matcher = new RegExp(args.pattern);
      } catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        return {
          pass: false,
          score: 0,
          reason: `code.pattern_exists: invalid assertion.pattern regex: ${detail}`
        };
      }
      const candidates = yield* ctx.world.listFiles(args.glob);
      const hits = [];
      for (const file of candidates) {
        const text = yield* ctx.world.readFile(file);
        if (!text) continue;
        if (matcher.test(text)) hits.push(file);
      }
      // Only the first three matching files are named in the reason.
      const preview = hits.slice(0, 3).join(", ");
      return applyMode(hits.length > 0, ctx.mode, `code.pattern_exists: /${args.pattern}/ found in ${preview}`, `code.pattern_exists: /${args.pattern}/ not found in any ${args.glob}`);
    });
  }
};
|
|
190
|
+
//#endregion
|
|
191
|
+
//#region src/runtime-checks/code-no-pattern.ts
|
|
192
|
+
/**
 * Polarity-inverted alias of code.pattern_exists. "matched" means
 * "no pattern matches anywhere", which is the natural reading of the name.
 *
 * The regex is validated here before delegating: the delegate reports an
 * invalid pattern as a failing result, and negating that would otherwise be
 * read as "pattern absent".
 */
const codeNoPattern = {
  type: "code.no_pattern",
  /**
   * @param {object} ctx - Check context, forwarded to `codePatternExists`
   *   with `mode` forced to "should".
   * @returns An Effect yielding the check result.
   */
  verify(ctx) {
    return Effect.gen(function* () {
      const args = decodeCheckArgs(CodePatternArgsSchema, ctx.assertion, "code.no_pattern: assertion.glob and assertion.pattern must be non-empty strings");
      if (isValidationFailure(args)) return args;
      try {
        new RegExp(args.pattern);
      } catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        return {
          pass: false,
          score: 0,
          reason: `code.no_pattern: invalid assertion.pattern regex: ${detail}`
        };
      }
      const delegateCtx = { ...ctx, mode: "should" };
      const inner = yield* codePatternExists.verify(delegateCtx);
      const absent = !inner.pass;
      return applyMode(absent, ctx.mode, `code.no_pattern: pattern absent`, `code.no_pattern: pattern present (${inner.reason})`);
    });
  }
};
|
|
219
|
+
//#endregion
|
|
220
|
+
//#region src/runtime-checks/file-changes-outside-scope.ts
|
|
221
|
+
/**
 * Runtime check `file.changes_outside_scope`: matches when at least one
 * recorded file write has a path that starts with none of the
 * `assertion.scope` entries. The comparison is a plain string prefix test, so
 * a scope entry "src" also covers a path such as "src-old/x".
 */
const fileChangesOutsideScope = {
  type: "file.changes_outside_scope",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `mode` and `evidence.filesWritten()`.
   * @returns An Effect that succeeds with the check result.
   */
  verify(ctx) {
    const args = decodeCheckArgs(FileChangesOutsideScopeArgsSchema, ctx.assertion, "file.changes_outside_scope: assertion.scope must contain at least one non-empty string");
    if (isValidationFailure(args)) return Effect.succeed(args);
    const inScope = (file) => args.scope.some((prefix) => file.path.startsWith(prefix));
    const outside = ctx.evidence.filesWritten().filter((file) => !inScope(file));
    // Only the first three offending paths are named in the reason.
    const sample = outside.slice(0, 3).map((file) => file.path).join(", ");
    const outcome = applyMode(outside.length > 0, ctx.mode, `file.changes_outside_scope: ${outside.length} file(s) outside scope: ${sample}`, `file.changes_outside_scope: no changes outside scope`);
    return Effect.succeed(outcome);
  }
};
|
|
231
|
+
//#endregion
|
|
232
|
+
//#region src/runtime-checks/_match.ts
|
|
233
|
+
/**
 * Recursively test whether `actual` satisfies `expected`.
 *
 * - Strictly equal values match.
 * - Arrays match only another array of the same length, element by element.
 * - Non-array objects match when every own enumerable key of `expected`
 *   matches the same key on `actual`; extra keys on `actual` are ignored.
 * - Everything else does not match.
 *
 * @param {unknown} actual - Observed value.
 * @param {unknown} expected - Pattern to satisfy.
 * @returns {boolean}
 */
function matchesSubset(actual, expected) {
  if (expected === actual) return true;
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual) || actual.length !== expected.length) return false;
    return expected.every((item, index) => matchesSubset(actual[index], item));
  }
  const isObject = (candidate) => Boolean(candidate) && typeof candidate === "object";
  if (!isObject(expected)) return false;
  if (!isObject(actual) || Array.isArray(actual)) return false;
  for (const [key, value] of Object.entries(expected)) {
    if (!matchesSubset(actual[key], value)) return false;
  }
  return true;
}
|
|
243
|
+
//#endregion
|
|
244
|
+
//#region src/runtime-checks/_call-match.ts
|
|
245
|
+
/**
 * Test a recorded tool call against a selector.
 *
 * A falsy `tool`, `provider` or `server` selector is ignored; a truthy one
 * must equal the call's field exactly. `args_match`, when not undefined, must
 * be satisfied by `call.args` according to `matchesSubset`.
 *
 * @param {object} call - Recorded call (`tool`, `provider`, `server`, `args`).
 * @param {object} args - Selector (`tool`, `provider`, `server`, `args_match`).
 * @returns {boolean}
 */
function matchesRecordedCall(call, args) {
  if (args.tool && call.tool !== args.tool) return false;
  if (args.provider && call.provider !== args.provider) return false;
  if (args.server && call.server !== args.server) return false;
  if (args.args_match === void 0) return true;
  return matchesSubset(call.args, args.args_match);
}
|
|
248
|
+
//#endregion
|
|
249
|
+
//#region src/runtime-checks/tool-called.ts
|
|
250
|
+
/**
 * Runtime check `tool.called`: matches when any recorded tool call satisfies
 * the assertion's selector.
 *
 * When no tool calls were recorded at all, the check fails outright without
 * consulting `mode`.
 * NOTE(review): that includes `should_not`; confirm this is intended.
 */
const toolCalled = {
  type: "tool.called",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `evidence.toolCalls()` and `mode`.
   * @returns An Effect that succeeds with the check result.
   */
  verify({ assertion, evidence, mode }) {
    const selector = decodeCheckArgs(ToolCalledArgsSchema, assertion, "tool.called: assertion.tool must be a non-empty string");
    if (isValidationFailure(selector)) return Effect.succeed(selector);
    const recorded = evidence.toolCalls();
    if (!recorded.length) {
      const noEvidence = {
        pass: false,
        score: 0,
        reason: "tool.called: no built-in tool evidence found"
      };
      return Effect.succeed(noEvidence);
    }
    const wasCalled = recorded.some((call) => matchesRecordedCall(call, selector));
    const outcome = applyMode(wasCalled, mode, "tool.called: matched built-in tool call", "tool.called: matching call not found");
    return Effect.succeed(outcome);
  }
};
|
|
265
|
+
//#endregion
|
|
266
|
+
//#region src/runtime-checks/tool-not-called.ts
|
|
267
|
+
/**
 * Runtime check `tool.not_called`: fails when any recorded tool call matches
 * the assertion's selector and passes otherwise. The result does not depend
 * on the context's mode.
 */
const toolNotCalled = {
  type: "tool.not_called",
  /**
   * @param {object} ctx - Check context; reads `assertion` and `evidence.toolCalls()`.
   * @returns An Effect that succeeds with the check result.
   */
  verify({ assertion, evidence }) {
    const selector = decodeToolNotCalledArgs(assertion);
    if (isValidationFailure(selector)) return Effect.succeed(selector);
    const observed = evidence.toolCalls().some((call) => matchesRecordedCall(call, selector));
    if (observed) {
      return Effect.succeed({
        pass: false,
        score: 0,
        reason: "tool.not_called: forbidden built-in tool call observed"
      });
    }
    return Effect.succeed({
      pass: true,
      score: 1,
      reason: "tool.not_called: no matching built-in tool calls observed"
    });
  }
};
|
|
284
|
+
//#endregion
|
|
285
|
+
//#region src/runtime-checks/skill-loaded.ts
|
|
286
|
+
/**
 * Runtime check `skill.loaded`: filters recorded skill-load events by the
 * optional `delivery`/`provider`/`server`/`source` selectors, then matches
 * when every `should_include` skill is among them and no `should_exclude`
 * skill is. An assertion with both lists empty fails regardless of mode.
 */
const skillLoaded = {
  type: "skill.loaded",
  /**
   * @param {object} ctx - Check context; reads `assertion`, `evidence.skillsLoaded()` and `mode`.
   * @returns An Effect that succeeds with the check result.
   */
  verify({ assertion, evidence, mode }) {
    const args = decodeCheckArgs(SkillLoadedArgsSchema, assertion, "skill.loaded: assertion must be an object");
    if (isValidationFailure(args)) return Effect.succeed(args);
    const required = args.should_include ?? [];
    const banned = args.should_exclude ?? [];
    if (required.length === 0 && banned.length === 0) {
      return Effect.succeed({
        pass: false,
        score: 0,
        reason: "skill.loaded: declare should_include or should_exclude"
      });
    }
    const loadedSkills = new Set();
    for (const event of evidence.skillsLoaded()) {
      if (matchesLoad(event, args)) loadedSkills.add(event.skill);
    }
    const missing = required.filter((skill) => !loadedSkills.has(skill));
    const forbidden = banned.filter((skill) => loadedSkills.has(skill));
    const satisfied = missing.length === 0 && forbidden.length === 0;
    const mismatch = skillLoadedMismatchReason(missing, forbidden);
    return Effect.succeed(applyMode(satisfied, mode, "skill.loaded: expected skill context observed", mismatch));
  }
};
|
|
307
|
+
/**
 * Format the reason for a `skill.loaded` mismatch.
 *
 * @param {string[]} missing - Required skills that were not loaded.
 * @param {string[]} forbidden - Excluded skills that were loaded.
 * @returns {string} "skill.loaded: " followed by the non-empty parts joined with "; ".
 */
function skillLoadedMismatchReason(missing, forbidden) {
  const parts = [];
  if (missing.length) parts.push(`missing loaded skill(s): ${missing.join(", ")}`);
  if (forbidden.length) parts.push(`forbidden loaded skill(s): ${forbidden.join(", ")}`);
  return `skill.loaded: ${parts.join("; ")}`;
}
|
|
310
|
+
/**
 * Test a skill-load event against the optional selectors of an assertion.
 * A falsy selector is ignored; a truthy one must equal the event's field.
 *
 * @param {object} event - Skill-load event.
 * @param {object} args - Selector with optional `delivery`, `provider`, `server`, `source`.
 * @returns {boolean}
 */
function matchesLoad(event, args) {
  const selectorFields = ["delivery", "provider", "server", "source"];
  return selectorFields.every((field) => !args[field] || event[field] === args[field]);
}
|
|
313
|
+
//#endregion
|
|
314
|
+
//#region src/runtime-checks/index.ts
|
|
315
|
+
// Built-in runtime checks. Each entry exposes a `type` string and a `verify`
// function; registries built from this list are keyed by `type`.
const corePlugins = [
  verifierSucceeds,
  verifierFails,
  fileExists,
  fileCreated,
  fileContains,
  fileNotModified,
  fileChangesOutsideScope,
  codePatternExists,
  codeNoPattern,
  toolCalled,
  toolNotCalled,
  skillLoaded
];
|
|
329
|
+
/**
 * Build a registry of runtime checks keyed by their `type`.
 *
 * @param {object[]} [extra=[]] - Additional checks; an entry whose `type`
 *   equals an earlier one replaces it.
 * @returns {Map<string, object>} Core checks followed by `extra`.
 */
function buildRegistry(extra = []) {
  const entries = [...corePlugins, ...extra].map((plugin) => [plugin.type, plugin]);
  return /* @__PURE__ */ new Map(entries);
}
|
|
334
|
+
buildRegistry();
|
|
335
|
+
//#endregion
|
|
336
|
+
//#region src/runtime-checks/catalog.ts
|
|
337
|
+
var RuntimeCheckCatalog = class extends Context.Tag("agent-skill-evals/promptfoo/RuntimeCheckCatalog")() {};
|
|
338
|
+
/**
 * Create a catalog over a list of runtime checks.
 *
 * @param {Iterable<object>} [checks=corePlugins] - Checks to index by `type`;
 *   a later check with the same `type` replaces an earlier one.
 * @returns {{all: Function, get: Function, knownTypes: Function}} `all()`
 *   returns a new array of checks, `get(type)` the check or `undefined`, and
 *   `knownTypes()` a new Set of the registered type names.
 */
function runtimeCheckCatalogFromChecks(checks = corePlugins) {
  const byType = /* @__PURE__ */ new Map();
  for (const entry of checks) {
    byType.set(entry.type, entry);
  }
  const all = () => Array.from(byType.values());
  const get = (type) => byType.get(type);
  const knownTypes = () => new Set(byType.keys());
  return { all, get, knownTypes };
}
|
|
347
|
+
/**
 * Build a Layer that provides `RuntimeCheckCatalog` with the core checks
 * followed by any extra ones.
 *
 * @param {object[]} [extraChecks=[]] - Checks appended after the core checks.
 * @returns A Layer created with `Layer.succeed`.
 */
function makeRuntimeCheckCatalogLayer(extraChecks = []) {
  const checks = [...corePlugins, ...extraChecks];
  const catalog = runtimeCheckCatalogFromChecks(checks);
  return Layer.succeed(RuntimeCheckCatalog, catalog);
}
|
|
350
|
+
const RuntimeCheckCatalogLive = makeRuntimeCheckCatalogLayer();
|
|
351
|
+
/**
 * Look up a runtime check by type in the `RuntimeCheckCatalog` service.
 *
 * @param {string} type - Check type name.
 * @returns An Effect that maps the catalog to `catalog.get(type)`.
 */
function getRuntimeCheck(type) {
  const lookup = (catalog) => catalog.get(type);
  return Effect.map(RuntimeCheckCatalog, lookup);
}
|
|
354
|
+
Effect.map(RuntimeCheckCatalog, (catalog) => catalog.knownTypes());
|
|
355
|
+
//#endregion
|
|
356
|
+
//#region src/evidence-types.ts
|
|
357
|
+
const EVIDENCE_SCHEMA_VERSION = "agent-skill-evals.evidence.v1";
|
|
358
|
+
//#endregion
|
|
359
|
+
//#region src/evidence-schema.ts
|
|
360
|
+
const OptionalString = Schema.optional(Schema.String);
const OptionalNumber = Schema.optional(Schema.Number);
const StringArray = Schema.mutable(Schema.Array(Schema.String));

// One executed command; `args` defaults to an empty array when omitted.
const CommandEventSchema = Schema.Struct({
  command: Schema.String,
  args: Schema.optionalWith(StringArray, { default: () => [] }),
  exitCode: Schema.Number,
  signal: OptionalString,
  stdout: OptionalString,
  stderr: OptionalString,
  startedAt: Schema.Number,
  durationMs: Schema.Number
});

// One file write, tagged with the kind of operation.
const FileEventSchema = Schema.Struct({
  path: Schema.String,
  op: Schema.Literal("create", "modify", "delete")
});

// One tool call; `args` and `result` are kept as opaque values.
const ToolCallEventSchema = Schema.Struct({
  tool: Schema.String,
  provider: OptionalString,
  server: OptionalString,
  args: Schema.optional(Schema.Unknown),
  result: Schema.optional(Schema.Unknown),
  startedAt: Schema.Number,
  durationMs: Schema.Number
});

// One skill load, delivered either natively or over MCP.
const SkillLoadEventSchema = Schema.Struct({
  skill: Schema.String,
  delivery: Schema.Literal("native", "mcp"),
  provider: OptionalString,
  server: OptionalString,
  source: OptionalString,
  startedAt: Schema.Number
});

// Token usage counters; every counter is optional.
const UsageSchema = Schema.Struct({
  inputTokens: OptionalNumber,
  outputTokens: OptionalNumber,
  totalTokens: OptionalNumber,
  cacheReadTokens: OptionalNumber,
  cacheWriteTokens: OptionalNumber
});

const RunSummarySchema = Schema.Struct({
  runDir: Schema.String,
  worldPath: Schema.String,
  fixture: Schema.String,
  durationMs: OptionalNumber
});

// Full evidence snapshot. The event lists, `output` and `usage` fall back to
// empty defaults when omitted; `schemaVersion` must equal EVIDENCE_SCHEMA_VERSION.
const EvidenceSnapshotSchema = Schema.Struct({
  schemaVersion: Schema.Literal(EVIDENCE_SCHEMA_VERSION),
  output: Schema.optionalWith(Schema.String, { default: () => "" }),
  run: RunSummarySchema,
  commands: Schema.optionalWith(Schema.mutable(Schema.Array(CommandEventSchema)), { default: () => [] }),
  filesWritten: Schema.optionalWith(Schema.mutable(Schema.Array(FileEventSchema)), { default: () => [] }),
  toolCalls: Schema.optionalWith(Schema.mutable(Schema.Array(ToolCallEventSchema)), { default: () => [] }),
  skillsLoaded: Schema.optionalWith(Schema.mutable(Schema.Array(SkillLoadEventSchema)), { default: () => [] }),
  usage: Schema.optionalWith(UsageSchema, { default: () => ({}) }),
  extensions: Schema.optional(Schema.Record({
    key: Schema.String,
    value: Schema.Unknown
  }))
});

// Decoder that collects all schema errors rather than stopping at the first.
const decodeEvidenceSnapshot = Schema.decodeUnknownEither(EvidenceSnapshotSchema, { errors: "all" });
|
|
422
|
+
/**
 * Decode an evidence snapshot without throwing.
 *
 * @param {unknown} input - Candidate snapshot.
 * @returns An Either: Right with the decoded snapshot, or Left with an
 *   `Error` whose message is the tree-formatted parse error.
 */
function decodeEvidenceSnapshotEither(input) {
  const outcome = decodeEvidenceSnapshot(input);
  if (!Either.isRight(outcome)) {
    const message = ParseResult.TreeFormatter.formatErrorSync(outcome.left);
    return Either.left(new Error(message));
  }
  return Either.right(outcome.right);
}
|
|
427
|
+
/**
 * Decode an evidence snapshot, throwing on failure.
 *
 * @param {unknown} input - Candidate snapshot.
 * @returns The decoded snapshot.
 * @throws {Error} The error produced by `decodeEvidenceSnapshotEither` when decoding fails.
 */
function parseEvidenceSnapshot(input) {
  const outcome = decodeEvidenceSnapshotEither(input);
  if (!Either.isRight(outcome)) throw outcome.left;
  return outcome.right;
}
|
|
432
|
+
//#endregion
|
|
433
|
+
//#region src/agent/evidence.ts
|
|
434
|
+
// Default rules for recognising a skill load in a tool call. Each pattern is
// compiled as a regular expression and exposes the skill name through the
// named group `skill`.
const DEFAULT_SKILL_EVIDENCE_CONFIG = {
  mcpResource: {
    // Dot-separated paths into the tool-call args where a URI is looked up.
    uriArgPaths: ["uri"],
    uriPatterns: ["^skill://(?<skill>[^/]+)/SKILL\\.md$"]
  },
  mcpTool: {
    toolPatterns: ["^load_(?<skill>[A-Za-z0-9_-]+)_skill$"]
  }
};
|
|
441
|
+
/**
 * Mutable accumulator for the evidence gathered during a run: commands, file
 * writes, tool calls, skill loads, usage counters and the final output.
 */
var EvidenceCollector = class EvidenceCollector {
  /** Skill-detection rules, merged with the defaults in the constructor. */
  skillEvidenceConfig;
  /** Working snapshot, mutated in place by the add and set methods. */
  snapshot = {
    schemaVersion: EVIDENCE_SCHEMA_VERSION,
    output: "",
    run: {
      runDir: "",
      worldPath: "",
      fixture: ""
    },
    commands: [],
    filesWritten: [],
    toolCalls: [],
    skillsLoaded: [],
    usage: {}
  };

  /**
   * @param {object} [skillEvidenceConfig={}] - Overrides for the default skill-detection rules.
   */
  constructor(skillEvidenceConfig = {}) {
    this.skillEvidenceConfig = mergeSkillEvidenceConfig(skillEvidenceConfig);
  }

  /** Record an executed command. */
  addCommand(e) {
    const { commands } = this.snapshot;
    commands.push(e);
  }

  /** Record a file write. */
  addFileWrite(e) {
    const { filesWritten } = this.snapshot;
    filesWritten.push(e);
  }

  /**
   * Record a tool call and, when the call is recognised as a skill load,
   * record that skill load as well.
   */
  addToolCall(e) {
    this.snapshot.toolCalls.push(e);
    const derivedLoad = skillLoadFromToolCall(e, this.skillEvidenceConfig);
    if (!derivedLoad) return;
    this.addSkillLoad(derivedLoad);
  }

  /**
   * Record a skill load unless one with the same skill, delivery, provider
   * and server is already present.
   */
  addSkillLoad(e) {
    const isSameLoad = (existing) =>
      existing.skill === e.skill &&
      existing.delivery === e.delivery &&
      existing.provider === e.provider &&
      existing.server === e.server;
    const alreadyRecorded = this.snapshot.skillsLoaded.some(isSameLoad);
    if (alreadyRecorded) return;
    this.snapshot.skillsLoaded.push(e);
  }

  /** Replace the usage counters. */
  setUsage(u) {
    this.snapshot.usage = u;
  }

  /** Merge usage counters into the current ones via `mergeUsage`. */
  addUsage(u) {
    const combined = mergeUsage(this.snapshot.usage, u);
    this.snapshot.usage = combined;
  }

  /** Replace the recorded output text. */
  setOutput(output) {
    this.snapshot.output = output;
  }

  /** Replace the run summary. */
  setRun(run) {
    this.snapshot.run = run;
  }

  /**
   * Produce a validated snapshot from shallow copies of the current state.
   *
   * @returns The result of `parseEvidenceSnapshot`.
   * @throws When the current state does not decode as a snapshot.
   */
  toSnapshot() {
    const current = this.snapshot;
    const extensions = current.extensions ? { ...current.extensions } : void 0;
    return parseEvidenceSnapshot({
      schemaVersion: current.schemaVersion,
      output: current.output,
      run: { ...current.run },
      commands: [...current.commands],
      filesWritten: [...current.filesWritten],
      toolCalls: [...current.toolCalls],
      skillsLoaded: [...current.skillsLoaded],
      usage: { ...current.usage },
      extensions
    });
  }

  /**
   * Write the snapshot under `runDir` using the Node service layer.
   *
   * @param {string} runDir - Target directory.
   * @returns {Promise<string>} Path of the written file.
   */
  async writeTo(runDir) {
    const program = writeEvidenceToEffect(this, runDir).pipe(Effect.provide(NodeServicesLive));
    return Effect.runPromise(program);
  }

  /**
   * Create a collector (with default skill-detection rules) from a snapshot.
   *
   * @param {unknown} snapshot - Snapshot to validate and adopt.
   * @returns {EvidenceCollector}
   * @throws When `snapshot` does not decode.
   */
  static fromSnapshot(snapshot) {
    const restored = new EvidenceCollector();
    restored.snapshot = parseEvidenceSnapshot(snapshot);
    return restored;
  }
};
|
|
509
|
+
/**
 * Fill a partial skill-evidence config with defaults.
 *
 * Each of `mcpResource.uriArgPaths`, `mcpResource.uriPatterns` and
 * `mcpTool.toolPatterns` falls back to its default only when it is `null` or
 * `undefined`. `nativeArgs` is copied over only when truthy.
 *
 * @param {object} config - Partial configuration.
 * @returns {object} A new config object.
 */
function mergeSkillEvidenceConfig(config) {
  const defaults = DEFAULT_SKILL_EVIDENCE_CONFIG;
  const resource = config.mcpResource;
  const tool = config.mcpTool;
  const merged = {
    mcpResource: {
      uriArgPaths: resource?.uriArgPaths ?? defaults.mcpResource.uriArgPaths,
      uriPatterns: resource?.uriPatterns ?? defaults.mcpResource.uriPatterns
    },
    mcpTool: {
      toolPatterns: tool?.toolPatterns ?? defaults.mcpTool.toolPatterns
    }
  };
  if (config.nativeArgs) merged.nativeArgs = config.nativeArgs;
  return merged;
}
|
|
519
|
+
/**
 * Derive a skill-load event from a tool call, if the call looks like one.
 *
 * When a string URI is found in the call args, only the URI patterns are
 * consulted; otherwise the tool name is tested against the tool patterns.
 * NOTE(review): a URI that matches no pattern does not fall back to the
 * tool-name patterns; confirm this is intended.
 *
 * @param {object} event - Tool-call event (`tool`, `args`, `server`, `provider`, `startedAt`).
 * @param {object} config - Skill-evidence configuration.
 * @returns {object|undefined} An "mcp" skill-load event, or `undefined`.
 */
function skillLoadFromToolCall(event, config) {
  const uriPaths = config.mcpResource?.uriArgPaths ?? [];
  const uri = skillUriFromArgs(event.args, uriPaths);
  let skill;
  if (uri) {
    skill = skillFromPattern(uri, config.mcpResource?.uriPatterns ?? []);
  } else {
    skill = skillFromTool(event.tool, config.mcpTool?.toolPatterns ?? []);
  }
  if (!skill) return void 0;
  const server = event.server ?? serverFromArgs(event.args);
  // Keys are added in a fixed order; provider and server appear only when truthy.
  const load = { skill, delivery: "mcp" };
  if (event.provider) load.provider = event.provider;
  if (server) load.server = server;
  load.source = event.tool;
  load.startedAt = event.startedAt;
  return load;
}
|
|
533
|
+
/**
 * Find the first string value stored at one of the given paths.
 *
 * @param {unknown} args - Tool-call arguments.
 * @param {Iterable<string>} paths - Dot-separated paths, tried in order.
 * @returns {string|undefined} The first string found, or `undefined`.
 */
function skillUriFromArgs(args, paths) {
  for (const candidatePath of paths) {
    const candidate = valueAtPath(args, candidatePath);
    if (typeof candidate !== "string") continue;
    return candidate;
  }
  return void 0;
}
|
|
539
|
+
/**
 * Read a string `server` property from tool-call arguments.
 *
 * @param {unknown} args - Tool-call arguments.
 * @returns {string|undefined} `args.server` when `args` is a non-array
 *   object and the property is a string; otherwise `undefined`.
 */
function serverFromArgs(args) {
  const isRecord = Boolean(args) && typeof args === "object" && !Array.isArray(args);
  if (!isRecord) return void 0;
  const { server } = args;
  return typeof server === "string" ? server : void 0;
}
|
|
544
|
+
/**
 * Extract a skill name from a string using the first pattern that yields one.
 *
 * Patterns are compiled case-insensitively; a pattern that fails to compile
 * is skipped. The name is taken from the named group `skill`, falling back to
 * the first capture group.
 *
 * @param {string} value - Text to match (a URI or a tool name).
 * @param {Iterable<string>} patterns - Regular-expression sources, tried in order.
 * @returns {string|undefined} The extracted, non-empty name, or `undefined`.
 */
function skillFromPattern(value, patterns) {
  const compile = (source) => {
    try {
      return new RegExp(source, "i");
    } catch {
      // Invalid patterns are deliberately ignored.
      return null;
    }
  };
  for (const source of patterns) {
    const regex = compile(source);
    if (regex === null) continue;
    const found = regex.exec(value);
    const skill = found?.groups?.skill ?? found?.[1];
    if (skill) return skill;
  }
  return void 0;
}
|
|
557
|
+
/**
 * Extract a skill name from a tool name and replace every underscore in it
 * with a hyphen.
 *
 * @param {string} tool - Tool name.
 * @param {Iterable<string>} patterns - Regular-expression sources passed to `skillFromPattern`.
 * @returns {string|undefined} The normalised name, or `undefined` when no pattern yields one.
 */
function skillFromTool(tool, patterns) {
  const rawName = skillFromPattern(tool, patterns);
  return rawName?.replace(/_/g, "-");
}
|
|
560
|
+
/**
 * Follow a dot-separated path through nested non-array objects.
 *
 * @param {unknown} value - Root value.
 * @param {string} path - Dot-separated property names.
 * @returns {unknown} The value at the path, or `undefined` as soon as a step
 *   would have to descend into something that is not a non-array object.
 */
function valueAtPath(value, path) {
  const segments = path.split(".");
  let cursor = value;
  for (let i = 0; i < segments.length; i += 1) {
    const traversable = Boolean(cursor) && typeof cursor === "object" && !Array.isArray(cursor);
    if (!traversable) return void 0;
    cursor = cursor[segments[i]];
  }
  return cursor;
}
|
|
568
|
+
/**
 * Add two numbers, treating `undefined` as "no value" rather than zero.
 *
 * @param {number|undefined} a
 * @param {number|undefined} b
 * @returns {number|undefined} The sum when both are defined, the defined one
 *   when only one is, and `undefined` when neither is.
 */
function addOptionalNumbers(a, b) {
  if (a !== void 0 && b !== void 0) return a + b;
  return a === void 0 ? b : a;
}
|
|
573
|
+
/**
 * Combine two usage records counter by counter using `addOptionalNumbers`.
 *
 * @param {object} a - First usage record.
 * @param {object} b - Second usage record.
 * @returns {object} A new record that always carries all five counter keys;
 *   a counter absent from both inputs is `undefined`.
 */
function mergeUsage(a, b) {
  const counters = ["inputTokens", "outputTokens", "totalTokens", "cacheReadTokens", "cacheWriteTokens"];
  return Object.fromEntries(counters.map((name) => [name, addOptionalNumbers(a[name], b[name])]));
}
|
|
582
|
+
/**
 * Write a collector's snapshot as pretty-printed JSON (two-space indent) to
 * `evidence.json` inside `runDir`, using the `FileSystem` service.
 *
 * @param {object} collector - Object exposing `toSnapshot()`.
 * @param {string} runDir - Target directory.
 * @returns An Effect yielding the path of the written file.
 */
function writeEvidenceToEffect(collector, runDir) {
  return Effect.gen(function* () {
    const fileSystem = yield* FileSystem;
    const target = join(runDir, "evidence.json");
    const serialized = JSON.stringify(collector.toSnapshot(), null, 2);
    yield* fileSystem.writeText(target, serialized);
    return target;
  });
}
|
|
590
|
+
/**
 * Wrap a snapshot in accessor functions.
 *
 * @param {object} s - Evidence snapshot.
 * @returns {object} Accessors `commands`, `filesWritten`, `toolCalls`,
 *   `skillsLoaded` and `usage`; each reads the matching snapshot property at
 *   call time and returns it without copying.
 */
function evidenceFromSnapshot(s) {
  const accessor = (key) => () => s[key];
  return {
    commands: accessor("commands"),
    filesWritten: accessor("filesWritten"),
    toolCalls: accessor("toolCalls"),
    skillsLoaded: accessor("skillsLoaded"),
    usage: accessor("usage")
  };
}
|
|
599
|
+
//#endregion
|
|
600
|
+
//#region src/agent/command-runner.ts
|
|
601
|
+
// Default number of characters of stdout/stderr kept per stream.
const DEFAULT_OUTPUT_LIMIT = 4096;
// NOTE(review): presumably the wait between a polite and a forced kill; the
// code that uses it is outside this view.
const KILL_GRACE_MS = 1e3;
// NOTE(review): presumably a short wait for trailing stdio after exit; the
// code that uses it is outside this view.
const EXIT_STDIO_FLUSH_MS = 50;
// Service tag under which a process runner implementation is provided.
var ProcessRunner = class extends Context.Tag("agent-skill-evals/promptfoo/ProcessRunner")() {};
|
|
605
|
+
/**
 * Append a chunk to a buffer string, keeping only the last `limit` characters.
 *
 * @param {string} current - Text accumulated so far.
 * @param {string} chunk - Text to append.
 * @param {number} limit - Maximum length to keep; zero or less yields "".
 * @returns {string} The concatenation, trimmed from the front when too long.
 */
function appendLimited(current, chunk, limit) {
  if (limit <= 0) return "";
  const combined = current + chunk;
  const overflow = combined.length - limit;
  return overflow > 0 ? combined.slice(overflow) : combined;
}
|
|
610
|
+
/**
 * Send a signal to a child process and, off Windows, to its process group.
 *
 * On win32 the signal goes to the child itself. Elsewhere it goes to the
 * negated pid, and falls back to the child alone if that throws. Any error
 * from the fallback is ignored: this is best-effort cleanup. Nothing happens
 * when the child has no pid.
 *
 * @param {object} child - Child process exposing `pid` and `kill(signal)`.
 * @param {string} signal - Signal name, e.g. "SIGTERM".
 * @returns {void}
 */
function killProcessGroup(child, signal) {
  if (!child.pid) return;
  const killChildOnly = () => {
    child.kill(signal);
  };
  try {
    if (process.platform === "win32") {
      killChildOnly();
    } else {
      process.kill(-child.pid, signal);
    }
  } catch {
    try {
      killChildOnly();
    } catch {}
  }
}
|
|
621
|
+
/**
 * Run a command through the `ProcessRunner` service.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} [args=[]] - Command arguments.
 * @param {object} options - Options forwarded unchanged to `runner.run`.
 * @returns An Effect that resolves the runner and delegates to `runner.run`.
 */
function runCommandEffect(command, args = [], options) {
  const delegate = (runner) => runner.run(command, args, options);
  return Effect.flatMap(ProcessRunner, delegate);
}
|
|
624
|
+
const ProcessRunnerLive = Layer.succeed(ProcessRunner, { run: nodeRunCommandEffect });
|
|
625
|
+
/**
 * Node implementation of `ProcessRunner.run`: spawn `command`, capture bounded
 * stdout/stderr, and resolve with a result record once the run settles.
 *
 * The run settles exactly once, on whichever happens first:
 * - the child's "close" event;
 * - `EXIT_STDIO_FLUSH_MS` after its "exit" event (when "close" has not fired);
 * - a spawn "error";
 * - `KILL_GRACE_MS` after the timeout kill;
 * - interruption of the Effect (abort signal).
 *
 * @param command - executable to spawn.
 * @param args - argument list (copied before use).
 * @param options - `cwd`, `env`, `stdin`, `timeoutMs`, `stdoutLimit`,
 *   `stderrLimit`, and optional `onStdout` / `onStderr` chunk callbacks.
 * @returns an Effect yielding `{ command, args, exitCode, signal?, stdout,
 *   stderr, startedAt, timedOut, error?, durationMs }`.
 */
function nodeRunCommandEffect(command, args, options) {
  const stdoutLimit = options.stdoutLimit ?? DEFAULT_OUTPUT_LIMIT;
  const stderrLimit = options.stderrLimit ?? DEFAULT_OUTPUT_LIMIT;
  return Effect.gen(function* () {
    const startedAt = yield* Clock.currentTimeMillis;
    return yield* Effect.async((resume, signal) => {
      const child = spawn(command, [...args], {
        cwd: options.cwd,
        env: options.env,
        // Own process group on POSIX so the whole tree can be signalled.
        detached: process.platform !== "win32",
        stdio: ["pipe", "pipe", "pipe"]
      });
      // Decode at the stream level so a multi-byte UTF-8 sequence split
      // across two chunks is not turned into replacement characters.
      child.stdout?.setEncoding("utf8");
      child.stderr?.setEncoding("utf8");
      // A child that exits before reading its stdin makes the write fail
      // (EPIPE); without a listener that stream error would be uncaught.
      child.stdin?.on("error", () => {});
      let stdout = "";
      let stderr = "";
      let timedOut = false;
      let settled = false;
      let timeoutTimer = null;
      let forceTimer = null;
      let exitFlushTimer = null;
      let abortListener = null;
      // Build a result from the output captured *at the moment of the call*.
      const snapshot = (exitCode, extra = {}) => ({
        command,
        args: [...args],
        exitCode,
        ...extra,
        stdout,
        stderr,
        startedAt,
        timedOut
      });
      const finish = (result) => {
        if (settled) return;
        settled = true;
        if (timeoutTimer) clearTimeout(timeoutTimer);
        if (forceTimer) clearTimeout(forceTimer);
        if (exitFlushTimer) clearTimeout(exitFlushTimer);
        child.stdout?.destroy();
        child.stderr?.destroy();
        child.stdin?.destroy();
        if (abortListener) signal.removeEventListener("abort", abortListener);
        // Signal whatever is left in the child's process group.
        if (process.platform !== "win32") killProcessGroup(child, "SIGTERM");
        resume(Clock.currentTimeMillis.pipe(Effect.map((endedAt) => ({
          ...result,
          durationMs: endedAt - startedAt
        }))));
      };
      abortListener = () => {
        // Interruption is reported through the same `timedOut` flag.
        timedOut = true;
        stderr = appendLimited(stderr, `${stderr ? "\n" : ""}agent-skill-evals: command interrupted`, stderrLimit);
        killProcessGroup(child, "SIGKILL");
        finish(snapshot(-1, { signal: "SIGKILL" }));
      };
      signal.addEventListener("abort", abortListener, { once: true });
      child.stdout?.on("data", (chunk) => {
        const text = chunk.toString();
        stdout = appendLimited(stdout, text, stdoutLimit);
        options.onStdout?.(text);
      });
      child.stderr?.on("data", (chunk) => {
        const text = chunk.toString();
        stderr = appendLimited(stderr, text, stderrLimit);
        options.onStderr?.(text);
      });
      child.on("error", (error) => {
        stderr = appendLimited(stderr, String(error), stderrLimit);
        finish(snapshot(-1, { error }));
      });
      child.on("exit", (code, exitSignal) => {
        // The process is gone, so a still-pending timeout no longer applies.
        if (timeoutTimer) clearTimeout(timeoutTimer);
        // Build the result when the flush delay elapses, not now, so output
        // that arrives during the delay is part of the reported result.
        exitFlushTimer = setTimeout(() => {
          finish(snapshot(code ?? -1, { signal: exitSignal ?? void 0 }));
        }, EXIT_STDIO_FLUSH_MS);
      });
      child.on("close", (code, closeSignal) => {
        finish(snapshot(code ?? -1, { signal: closeSignal ?? void 0 }));
      });
      if (options.stdin !== void 0) child.stdin?.write(options.stdin);
      child.stdin?.end();
      if (options.timeoutMs) timeoutTimer = setTimeout(() => {
        timedOut = true;
        stderr = appendLimited(stderr, `${stderr ? "\n" : ""}agent-skill-evals: command timed out after ${options.timeoutMs}ms`, stderrLimit);
        killProcessGroup(child, "SIGKILL");
        // If neither "exit" nor "close" settles the run, settle it here.
        forceTimer = setTimeout(() => {
          finish(snapshot(-1, { signal: "SIGKILL" }));
        }, KILL_GRACE_MS);
      }, options.timeoutMs);
    });
  });
}
|
|
751
|
+
//#endregion
|
|
752
|
+
//#region src/agent/jsonl-stream.ts
|
|
753
|
+
/**
 * Parse one JSONL line and return a new event list with the parsed value
 * appended. Blank lines and lines that are not valid JSON are skipped, in
 * which case the original list is returned unchanged.
 *
 * @param events - events collected so far (never mutated).
 * @param line - one line of text.
 * @returns `events` itself, or a new array ending with the parsed value.
 */
function appendLine(events, line) {
  if (line.trim() === "") return events;
  let parsed;
  try {
    parsed = JSON.parse(line);
  } catch {
    // Non-JSON output lines are ignored by design.
    return events;
  }
  return events.concat([parsed]);
}
|
|
761
|
+
/**
 * Fold a chunk of text into the parser state.
 *
 * The previous leftover is prepended, the text is split on "\n", every
 * complete line is passed to `appendLine`, and the final (possibly partial)
 * segment becomes the new leftover.
 *
 * @param state - `{ leftover, events }` from the previous step.
 * @param text - newly received text.
 * @returns the next `{ leftover, events }` state.
 */
function parseChunk(state, text) {
  const segments = (state.leftover + text).split("\n");
  const tail = segments.pop();
  let events = state.events;
  for (const segment of segments) events = appendLine(events, segment);
  return {
    leftover: tail ?? "",
    events
  };
}
|
|
768
|
+
/**
 * Create an incremental JSONL parser that carries partial lines across chunks.
 *
 * @returns an object with:
 *   - `push(chunk)`: events parsed from the lines completed by this chunk;
 *   - `finish()`: events parsed from the remaining unterminated text, which
 *     is then cleared.
 */
function createJsonlEventParser() {
  let pending = "";
  const push = (chunk) => {
    const { leftover, events } = parseChunk({ leftover: pending, events: [] }, chunk);
    pending = leftover;
    return events;
  };
  const finish = () => {
    const events = appendLine([], pending);
    pending = "";
    return events;
  };
  return { push, finish };
}
|
|
786
|
+
//#endregion
|
|
787
|
+
//#region src/agent/adapters.ts
|
|
788
|
+
/**
 * Place the prompt into an argument list.
 *
 * The last "-" argument acts as a placeholder and is replaced by the prompt;
 * when there is no "-" the prompt is appended. The input list is not mutated.
 *
 * @param args - configured arguments.
 * @param prompt - prompt text to insert.
 * @returns a new argument list containing the prompt.
 */
function argsWithPrompt(args, prompt) {
  const slot = args.lastIndexOf("-");
  if (slot < 0) return [...args, prompt];
  const withPrompt = [...args];
  withPrompt[slot] = prompt;
  return withPrompt;
}
|
|
793
|
+
/**
 * Rewrite an absolute path located inside `cwd` as a cwd-relative path.
 *
 * A leading "/private" segment is stripped from both inputs before they are
 * compared. Relative inputs, paths outside `cwd`, and `cwd` itself are
 * returned unchanged.
 *
 * @param path - path reported by a tool call.
 * @param cwd - working directory the path is made relative to.
 * @returns the relative path when `path` lies inside `cwd`, else `path`.
 */
function normalizePathFromCwd(path, cwd) {
  if (!isAbsolute(path)) return path;
  const stripPrivate = (value) => value.startsWith("/private/") ? value.slice("/private".length) : value;
  const rel = relative(stripPrivate(cwd), stripPrivate(path));
  // Only a leading ".." *segment* leaves cwd; an entry whose name merely
  // begins with two dots (e.g. "..cache/x") is still inside it.
  const escapesCwd = /^\.\.(?:[\\/]|$)/.test(rel);
  return rel && !escapesCwd && !isAbsolute(rel) ? rel : path;
}
|
|
799
|
+
/**
 * Copy a tool call's argument object, making string `path` / `file_path`
 * values relative to `cwd`. A string `file_path` is also mirrored into `path`
 * when `path` has not been set at that point of the iteration.
 *
 * @param args - tool arguments; anything that is not a non-array object is
 *   returned unchanged.
 * @param cwd - working directory used for path normalization.
 * @returns a new normalized object, or `args` itself when it is not a record.
 */
function normalizeToolCallArgs(args, cwd) {
  const isRecord = Boolean(args) && typeof args === "object" && !Array.isArray(args);
  if (!isRecord) return args;
  const result = {};
  Object.entries(args).forEach(([key, value]) => {
    const isPathKey = key === "path" || key === "file_path";
    const next = isPathKey && typeof value === "string" ? normalizePathFromCwd(value, cwd) : value;
    result[key] = next;
    if (key === "file_path" && typeof next === "string" && result.path === void 0) result.path = next;
  });
  return result;
}
|
|
809
|
+
/**
 * Split a tool name of the form `mcp__<server>__<tool>` into its parts.
 *
 * @param event - tool-call event with a `tool` name and optional `server`.
 * @returns `event` itself when the name does not match; otherwise a copy
 *   whose `tool` is the bare tool name and whose `server` is the existing
 *   `server` or, failing that, the server taken from the name.
 */
function normalizeMcpToolCall(event) {
  const parts = /^mcp__(.+?)__(.+)$/.exec(event.tool);
  if (parts === null) return event;
  const [, serverName, toolName] = parts;
  return {
    ...event,
    server: event.server ?? serverName,
    tool: toolName ?? event.tool
  };
}
|
|
818
|
+
/**
 * Decode a string that holds a JSON object or array.
 *
 * @param value - any value.
 * @returns the parsed object/array when `value` is a string containing one;
 *   otherwise `value` unchanged (non-strings, invalid JSON, JSON primitives
 *   and `null`).
 */
function parseJsonObjectString(value) {
  if (typeof value !== "string") return value;
  let parsed;
  try {
    parsed = JSON.parse(value);
  } catch {
    return value;
  }
  const isObjectLike = parsed !== null && typeof parsed === "object";
  return isObjectLike ? parsed : value;
}
|
|
827
|
+
/**
 * Wrap an evidence sink so that tool calls are normalized before recording:
 * MCP-style tool names are split and path arguments are made cwd-relative.
 * Commands and usage are forwarded untouched.
 *
 * @param evidence - underlying sink (`addCommand`, `addToolCall`,
 *   `setUsage`, `addUsage`).
 * @param cwd - working directory used to relativize tool paths.
 * @param now - clock function exposed as-is on the wrapper.
 * @returns a sink with the same methods plus `now`.
 */
function evidenceWithRelativeToolPaths(evidence, cwd, now) {
  const addToolCall = (event) => {
    const withServer = normalizeMcpToolCall(event);
    const args = normalizeToolCallArgs(withServer.args, cwd);
    evidence.addToolCall({ ...withServer, args });
  };
  return {
    addCommand: (event) => evidence.addCommand(event),
    addToolCall,
    setUsage: (usage) => evidence.setUsage(usage),
    addUsage: (usage) => evidence.addUsage(usage),
    now
  };
}
|
|
842
|
+
/**
 * Map a usage record with snake_case, camelCase or short field names onto a
 * single camelCase shape.
 *
 * `totalTokens` comes from `total_tokens` / `totalTokens` when present and is
 * otherwise the sum of the known input and output counts.
 *
 * @param usage - raw usage object from an agent event.
 * @returns `{ inputTokens, outputTokens, totalTokens, cacheReadTokens,
 *   cacheWriteTokens }`; fields without a source value are left unset.
 */
function normalizeUsage(usage) {
  // First non-nullish value among the keys; the last key is returned as-is.
  const pick = (...keys) => {
    const lastKey = keys.pop();
    for (const key of keys) {
      const value = usage[key];
      if (value !== null && value !== void 0) return value;
    }
    return usage[lastKey];
  };
  const inputTokens = pick("input_tokens", "inputTokens", "input");
  const outputTokens = pick("output_tokens", "outputTokens", "output");
  const declaredTotal = usage.total_tokens ?? usage.totalTokens;
  return {
    inputTokens,
    outputTokens,
    totalTokens: declaredTotal ?? addKnownNumbers(inputTokens, outputTokens),
    cacheReadTokens: pick("cache_read_input_tokens", "cacheReadTokens", "cacheRead"),
    cacheWriteTokens: pick("cache_creation_input_tokens", "cacheWriteTokens", "cacheWrite")
  };
}
|
|
855
|
+
/**
 * Sum the arguments that are numbers, ignoring everything else.
 *
 * @param values - candidate values.
 * @returns the sum, or `undefined` when no argument is a number.
 */
function addKnownNumbers(...values) {
  let total = 0;
  let sawNumber = false;
  for (const value of values) {
    if (typeof value !== "number") continue;
    total += value;
    sawNumber = true;
  }
  return sawNumber ? total : void 0;
}
|
|
859
|
+
/**
 * Map the tool names bash/edit/read/write (in any letter case) to their
 * capitalized form; every other name is returned unchanged.
 *
 * @param tool - tool name taken from an event.
 * @returns the canonical tool name.
 */
function normalizePiToolName(tool) {
  const canonicalNames = new Map([
    ["bash", "Bash"],
    ["edit", "Edit"],
    ["read", "Read"],
    ["write", "Write"]
  ]);
  return canonicalNames.get(tool.toLowerCase()) ?? tool;
}
|
|
868
|
+
/**
 * Read the timestamp to stamp on an evidence entry.
 *
 * @param evidence - sink that may expose a `now()` method.
 * @returns the value of `evidence.now()`, or 0 when `now` is absent or
 *   returns a nullish value.
 */
function evidenceStartedAt(evidence) {
  const clock = evidence.now;
  if (clock === void 0 || clock === null) return 0;
  return clock.call(evidence) ?? 0;
}
|
|
871
|
+
/**
 * Run an agent CLI that emits JSONL on stdout and feed each parsed event to
 * `onEvent`.
 *
 * The prompt is delivered on stdin by default, or as an argument when
 * `options.promptDelivery` is "arg". Stdout is not retained
 * (`stdoutLimit: 0`); it is only streamed through the JSONL parser. After the
 * command ends, one command entry is recorded on `evidence`. If the run
 * failed and no final text was collected, the error message becomes the
 * output.
 *
 * @param input - `{ command, args, cwd, prompt, evidence, timeoutMs, env }`.
 * @param onEvent - `(event, evidence, appendFinal) => void` event handler.
 * @param options - optional `{ promptDelivery: "stdin" | "arg" }`.
 * @returns an Effect yielding `{ output, exitCode, timedOut, error?, durationMs }`.
 */
function runJsonlAdapter(input, onEvent, options = {}) {
  const { command, args, cwd, prompt, evidence, timeoutMs, env } = input;
  const promptDelivery = options.promptDelivery ?? "stdin";
  const spawnArgs = promptDelivery === "arg" ? argsWithPrompt(args, prompt) : [...args];
  return Effect.gen(function* () {
    const runStartedAt = yield* Clock.currentTimeMillis;
    const adapterEvidence = evidenceWithRelativeToolPaths(evidence, cwd, () => runStartedAt);
    const parser = createJsonlEventParser();
    let finalText = "";
    const appendFinal = (text) => finalText += text;
    const dispatch = (events) => {
      for (const evt of events) onEvent(evt, adapterEvidence, appendFinal);
    };
    const result = yield* runCommandEffect(command, spawnArgs, {
      cwd,
      env,
      stdin: promptDelivery === "stdin" ? prompt : void 0,
      timeoutMs,
      stdoutLimit: 0,
      onStdout: (chunk) => dispatch(parser.push(chunk))
    });
    // Flush a trailing line that had no terminating newline.
    dispatch(parser.finish());
    evidence.addCommand({
      command,
      args: [...spawnArgs],
      exitCode: result.exitCode,
      stderr: result.stderr.slice(0, 4096),
      startedAt: result.startedAt,
      durationMs: result.durationMs
    });
    // Precedence: spawn failure, then timeout, then non-zero exit.
    let error;
    if (result.error) {
      error = `adapter error: failed to start "${command}": ${result.error.message}`;
    } else if (result.timedOut) {
      error = `${command} timed out after ${timeoutMs ?? 0}ms`;
    } else if (result.exitCode !== 0) {
      const detail = result.stderr.trim();
      error = `${command} exited ${result.exitCode}${detail ? `: ${detail}` : ""}`;
    }
    if (error && !finalText.trim()) finalText = error;
    return {
      output: finalText.trim(),
      exitCode: result.exitCode,
      timedOut: result.timedOut,
      ...error ? { error } : {},
      durationMs: result.durationMs
    };
  });
}
|
|
911
|
+
/**
 * Adapter for Claude Code's stream-json output, as produced by
 * `claude -p ... --output-format stream-json`. Events are projected into
 * evidence by `handleClaudeEvent`.
 *
 * Relevant events (see the Claude Code docs):
 * - { type: "system" | "assistant" | "user" | "result", ... }
 * - tool_use blocks inside assistant content (Bash, Edit, Write, etc.)
 */
const claudeCodeJsonAdapter = {
  id: "claude-code-json",
  run: (input) => runJsonlAdapter(input, handleClaudeEvent)
};
/** Codex JSON adapter; the prompt is passed as a command-line argument. */
const codexJsonAdapter = {
  id: "codex-json",
  run: (input) => runJsonlAdapter(input, handleCodexEvent, { promptDelivery: "arg" })
};
/** Pi JSON adapter; the prompt is delivered on stdin. */
const piJsonAdapter = {
  id: "pi-json",
  run: (input) => runJsonlAdapter(input, handlePiEvent)
};
/** Internal test adapter: Codex event handling with the prompt on stdin. */
const internalTestJsonAdapter = {
  id: "internal-test-json",
  run: (input) => runJsonlAdapter(input, handleCodexEvent)
};
|
|
943
|
+
/**
 * Project one Claude Code stream-json event into evidence.
 *
 * - `result` events: the `result` string is appended to the final text and
 *   `usage` replaces the recorded usage.
 * - `assistant` events: message `usage` is added, text blocks are appended
 *   to the final text, and each `tool_use` block is recorded as a tool call.
 *
 * @param evt - parsed event; non-objects are ignored.
 * @param evidence - sink receiving usage and tool calls.
 * @param appendFinal - callback that appends to the final output text.
 */
function handleClaudeEvent(evt, evidence, appendFinal) {
  if (!evt || typeof evt !== "object") return;
  const isResult = evt.type === "result";
  if (isResult && typeof evt.result === "string") appendFinal(evt.result);
  const message = evt.type === "assistant" ? evt.message : void 0;
  if (message && typeof message === "object") {
    if (message.usage && typeof message.usage === "object") {
      evidence.addUsage(normalizeUsage(message.usage));
    }
    const blocks = Array.isArray(message.content) ? message.content : [];
    for (const block of blocks) {
      if (!block || typeof block !== "object") continue;
      if (block.type === "text" && typeof block.text === "string") appendFinal(block.text);
      if (block.type === "tool_use" && typeof block.name === "string") {
        evidence.addToolCall({
          tool: block.name,
          provider: "claude-code-json",
          args: block.input,
          startedAt: evidenceStartedAt(evidence),
          durationMs: 0
        });
      }
    }
  }
  if (isResult && evt.usage && typeof evt.usage === "object") {
    evidence.setUsage(normalizeUsage(evt.usage));
  }
}
|
|
965
|
+
/**
 * Project one Codex JSON event into evidence.
 *
 * - Text (`message`/`text`/`content`/`output`) is appended to the final text
 *   when the event type mentions message, final, result, response or output.
 * - A tool call is recorded when the event or item type mentions tool, call
 *   or command and a tool name is available.
 * - Each entry of a `file_change` item is recorded as an "Edit" tool call.
 * - Items carrying a string `command` on exec/command events are recorded
 *   as commands.
 * - A `usage` object replaces the recorded usage.
 *
 * @param evt - parsed event; non-objects are ignored.
 * @param evidence - sink receiving tool calls, commands and usage.
 * @param appendFinal - callback that appends to the final output text.
 */
function handleCodexEvent(evt, evidence, appendFinal) {
  if (!evt || typeof evt !== "object") return;
  const firstString = (...candidates) => candidates.find((candidate) => typeof candidate === "string");
  const firstNumber = (...candidates) => candidates.find((candidate) => typeof candidate === "number");
  const type = firstString(evt.type) ?? "";
  const text = firstString(evt.message, evt.text, evt.content, evt.output);
  if (text && /message|final|result|response|output/.test(type)) appendFinal(text);
  // Details may be nested under `item`; otherwise the event itself is the item.
  const item = evt.item && typeof evt.item === "object" ? evt.item : evt;
  const itemType = firstString(item.type) ?? "";
  const looksLikeToolEvent = /tool|call|command/.test(`${type}:${itemType}`);
  let toolName;
  if (typeof item.tool === "string") toolName = item.tool;
  else if (looksLikeToolEvent && typeof item.name === "string") toolName = item.name;
  if (toolName && looksLikeToolEvent) {
    evidence.addToolCall({
      tool: toolName,
      provider: "codex-json",
      server: firstString(item.server),
      args: parseJsonObjectString(item.args ?? item.input ?? item.arguments),
      result: item.result ?? item.output,
      startedAt: evidenceStartedAt(evidence),
      durationMs: 0
    });
  }
  if (itemType === "file_change" && Array.isArray(item.changes)) {
    for (const change of item.changes) {
      if (!change || typeof change !== "object") continue;
      if (typeof change.path !== "string") continue;
      evidence.addToolCall({
        tool: "Edit",
        provider: "codex-json",
        args: {
          path: change.path,
          kind: firstString(change.kind)
        },
        startedAt: evidenceStartedAt(evidence),
        durationMs: 0
      });
    }
  }
  const mentionsCommand = type.includes("exec") || type.includes("command") || itemType.includes("command");
  if (mentionsCommand && typeof item.command === "string") {
    const args = Array.isArray(item.args) ? item.args.map(String) : [];
    evidence.addCommand({
      command: item.command,
      args,
      exitCode: firstNumber(item.exit_code, item.exitCode) ?? 0,
      stdout: firstString(item.stdout, item.aggregated_output)?.slice(0, 4096),
      stderr: firstString(item.stderr)?.slice(0, 4096),
      startedAt: evidenceStartedAt(evidence),
      durationMs: firstNumber(item.durationMs) ?? 0
    });
  }
  if (evt.usage && typeof evt.usage === "object") evidence.setUsage(normalizeUsage(evt.usage));
}
|
|
1013
|
+
/**
 * Project one Pi JSON event into evidence.
 *
 * - Usage nested in `message.usage` replaces the recorded usage on
 *   message_end / turn_end / agent_end events.
 * - Text (`message`/`text`/`content`/`output`/`result`) is appended to the
 *   final text when the type mentions assistant, message, final, result,
 *   response or output.
 * - tool_execution_start and tool_execution_end each record a tool call;
 *   only the end event carries a result.
 * - tool_execution_end also records a command when the tool name looks like
 *   a shell tool and a command string is present.
 * - A top-level `usage`, or a "usage" event, replaces the recorded usage.
 *
 * @param evt - parsed event; non-objects are ignored.
 * @param evidence - sink receiving tool calls, commands and usage.
 * @param appendFinal - callback that appends to the final output text.
 */
function handlePiEvent(evt, evidence, appendFinal) {
  if (!evt || typeof evt !== "object") return;
  const firstString = (...candidates) => candidates.find((candidate) => typeof candidate === "string");
  const firstNumber = (...candidates) => candidates.find((candidate) => typeof candidate === "number");
  const type = firstString(evt.type) ?? "";
  const message = evt.message && typeof evt.message === "object" ? evt.message : void 0;
  const messageUsage = message?.usage && typeof message.usage === "object" ? message.usage : void 0;
  const closesMessage = type === "message_end" || type === "turn_end" || type === "agent_end";
  if (messageUsage && closesMessage) evidence.setUsage(normalizeUsage(messageUsage));
  const text = firstString(evt.message, evt.text, evt.content, evt.output, evt.result);
  if (text && /assistant|message|final|result|response|output/.test(type)) appendFinal(text);
  const isToolEnd = type === "tool_execution_end";
  const rawTool = firstString(evt.tool, evt.name, evt.tool_name, evt.toolName);
  if (type === "tool_execution_start" || isToolEnd) {
    const tool = rawTool ? normalizePiToolName(rawTool) : void 0;
    if (tool) {
      evidence.addToolCall({
        tool,
        provider: "pi-json",
        args: evt.args ?? evt.input ?? evt.arguments,
        result: isToolEnd ? evt.result ?? evt.output : void 0,
        startedAt: evidenceStartedAt(evidence),
        durationMs: firstNumber(evt.duration_ms, evt.durationMs) ?? 0
      });
    }
  }
  if (isToolEnd) {
    let commandText;
    if (typeof evt.command === "string") {
      commandText = evt.command;
    } else if (evt.args && typeof evt.args === "object" && !Array.isArray(evt.args) && typeof evt.args.command === "string") {
      commandText = evt.args.command;
    }
    if (commandText && /bash|shell|command|exec/i.test(rawTool ?? "")) {
      evidence.addCommand({
        command: commandText,
        args: [],
        exitCode: firstNumber(evt.exit_code, evt.exitCode) ?? 0,
        stdout: firstString(evt.stdout, evt.output)?.slice(0, 4096),
        stderr: firstString(evt.stderr)?.slice(0, 4096),
        startedAt: evidenceStartedAt(evidence),
        durationMs: firstNumber(evt.duration_ms, evt.durationMs) ?? 0
      });
    }
  }
  if (type === "usage" || evt.usage) {
    const usage = evt.usage && typeof evt.usage === "object" ? evt.usage : evt;
    evidence.setUsage(normalizeUsage(usage));
  }
}
|
|
1050
|
+
/** Adapters keyed by their `id`, in registration order. */
const adapterRegistry = new Map(
  [claudeCodeJsonAdapter, codexJsonAdapter, piJsonAdapter, internalTestJsonAdapter]
    .map((adapter) => [adapter.id, adapter])
);
|
|
1056
|
+
//#endregion
|
|
1057
|
+
//#region src/agent/adapter-catalog.ts
|
|
1058
|
+
/** Context tag under which the adapter lookup service is provided. */
var AdapterCatalog = class extends Context.Tag("agent-skill-evals/promptfoo/AdapterCatalog")() {
};
/** Layer that serves adapter lookups from the built-in `adapterRegistry`. */
const AdapterCatalogLive = Layer.succeed(AdapterCatalog, {
  get: (id) => {
    return adapterRegistry.get(id);
  }
});
|
|
1060
|
+
/**
 * Look up an adapter by id in the provided `AdapterCatalog`.
 *
 * @param id - adapter identifier.
 * @returns an Effect yielding whatever the catalog's `get(id)` returns.
 */
function getAdapter(id) {
  return Effect.gen(function* () {
    const catalog = yield* AdapterCatalog;
    return catalog.get(id);
  });
}
|
|
1063
|
+
//#endregion
|
|
1064
|
+
//#region src/runtime-checks/_files.ts
|
|
1065
|
+
/** Directory names that are never descended into. */
const SKIP_DIRS = new Set(["node_modules", ".git"]);
/**
 * Depth-first walk over the regular files below `root`.
 *
 * Directories named in `SKIP_DIRS` are not entered, directories that cannot
 * be listed are treated as empty, and nothing is visited when `root` is not
 * a directory.
 *
 * @param root - directory to walk.
 * @param visit - `(relativePath, absolutePath) => Effect` run for each file.
 * @returns an Effect (requiring `FileSystem`) that completes after the walk.
 */
function walkFilesEffect(root, visit) {
  return Effect.gen(function* () {
    const fs = yield* FileSystem;
    const listEntries = (dir) => fs.readDirectory(dir).pipe(Effect.catchAll(() => Effect.succeed([])));
    const walk = (dir) => Effect.gen(function* () {
      const entries = yield* listEntries(dir);
      for (const entry of entries) {
        const absolutePath = join(dir, entry.name);
        if (entry.isDirectory()) {
          if (!SKIP_DIRS.has(entry.name)) yield* walk(absolutePath);
        } else if (entry.isFile()) {
          yield* visit(relative(root, absolutePath), absolutePath);
        }
      }
    });
    const rootStat = yield* fs.stat(root).pipe(Effect.catchAll(() => Effect.succeed(null)));
    if (!rootStat?.isDirectory()) return;
    yield* walk(root);
  });
}
|
|
1087
|
+
/**
 * Collect the root-relative paths of all files below `root` that match `glob`.
 *
 * @param root - directory to search.
 * @param glob - pattern evaluated by `matchesGlob`.
 * @returns an Effect yielding the matching relative paths in walk order.
 */
function listMatchingFilesEffect(root, glob) {
  return Effect.suspend(() => {
    const matches = [];
    const collect = (relativePath) => Effect.sync(() => {
      if (matchesGlob(relativePath, glob)) matches.push(relativePath);
    });
    return Effect.as(walkFilesEffect(root, collect), matches);
  });
}
|
|
1096
|
+
/**
 * Test a relative path against a glob.
 *
 * A glob without "/" is matched against the file name only when the path
 * contains directories; otherwise the whole path is matched.
 *
 * @param relativePath - "/"-separated path relative to the search root.
 * @param glob - glob pattern.
 * @returns whether the path matches.
 */
function matchesGlob(relativePath, glob) {
  const matcher = globToRegExp(glob);
  const fileNameOnly = !glob.includes("/") && relativePath.includes("/");
  const subject = fileNameOnly ? relativePath.split("/").at(-1) ?? relativePath : relativePath;
  return matcher.test(subject);
}
|
|
1100
|
+
/**
 * Compile a glob pattern into an anchored regular expression.
 *
 * Supported syntax (a leading "./" is ignored):
 * - `**` followed by "/" matches zero or more whole directories;
 * - a bare `**` matches anything, including "/" (so "src/**" matches every
 *   file below src);
 * - `*` matches any run of characters except "/";
 * - `?` matches exactly one character except "/";
 * - every other character matches literally.
 *
 * @param glob - glob pattern.
 * @returns a RegExp matching complete "/"-separated relative paths.
 */
function globToRegExp(glob) {
  // One pass over the pattern: wildcard tokens are translated and regex
  // metacharacters escaped together, so no placeholder strings are needed.
  const source = glob.replace(/^\.\//, "").replace(/\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g, (token) => {
    switch (token) {
      case "**/": return "(?:.*/)?";
      case "**": return ".*";
      case "*": return "[^/]*";
      case "?": return "[^/]";
      default: return `\\${token}`;
    }
  });
  return new RegExp(`^${source}$`);
}
|
|
1106
|
+
//#endregion
|
|
1107
|
+
//#region src/agent/world.ts
|
|
1108
|
+
/**
 * Create a fresh temporary run directory containing an empty `world`
 * subdirectory.
 *
 * @returns an Effect (requiring `FileSystem`) yielding `{ runDir, worldPath }`.
 */
function createRunDirEffect() {
  return Effect.flatMap(FileSystem, (fs) =>
    fs.makeTempDirectory("agent-skill-evals-run-").pipe(
      Effect.flatMap((runDir) => {
        const worldPath = join(runDir, "world");
        return Effect.as(fs.makeDirectory(worldPath), { runDir, worldPath });
      })
    )
  );
}
|
|
1120
|
+
/**
 * Copy a fixture directory into the isolated world.
 *
 * A relative `fixturePath` is resolved against `testFileDir`, then
 * `baseDir`, then the environment's current working directory.
 *
 * @param input - `{ fixturePath, testFileDir?, baseDir? }`.
 * @param worldPath - destination directory.
 * @returns an Effect requiring `FileSystem` and `Environment`.
 */
function copyFixtureEffect(input, worldPath) {
  return Effect.gen(function* () {
    const fs = yield* FileSystem;
    const environment = yield* Environment;
    const cwd = yield* environment.cwd;
    const anchor = input.testFileDir ?? input.baseDir ?? cwd;
    const source = isAbsolute(input.fixturePath) ? input.fixturePath : resolve(anchor, input.fixturePath);
    yield* fs.copyDirectory(source, worldPath);
  });
}
|
|
1128
|
+
/**
 * Build the handle that runtime checks use to inspect and act on the
 * isolated world directory. Every operation returns an Effect with its Node
 * services already provided.
 *
 * @param worldPath - root of the isolated world.
 * @param recordCommand - optional callback receiving an evidence entry for
 *   every command run through `exec` (output capped at 4096 characters).
 * @returns `{ path, readFile, listFiles, exec, diff }`.
 */
function makeWorldHandle(worldPath, recordCommand) {
  // Text content of a world-relative file, or null when it cannot be read.
  const readFile = (rel) => Effect.gen(function* () {
    const fs = yield* FileSystem;
    return yield* fs.readText(join(worldPath, rel)).pipe(Effect.catchAll(() => Effect.succeed(null)));
  }).pipe(Effect.provide(NodeServicesLive));
  // World-relative paths of the files matching the glob.
  const listFiles = (glob) => listMatchingFilesEffect(worldPath, glob).pipe(Effect.provide(NodeServicesLive));
  // Run a command inside the world; `opts.env` overrides the inherited env.
  const exec = (command, args = [], opts = {}) => Effect.gen(function* () {
    const environment = yield* Environment;
    const inheritedEnv = yield* environment.env;
    const result = yield* runCommandEffect(command, args, {
      cwd: worldPath,
      env: {
        ...inheritedEnv,
        ...opts.env ?? {}
      },
      timeoutMs: opts.timeoutMs
    });
    recordCommand?.({
      command,
      args: [...args],
      exitCode: result.exitCode,
      stdout: result.stdout.slice(0, 4096),
      stderr: result.stderr.slice(0, 4096),
      startedAt: result.startedAt,
      durationMs: result.durationMs
    });
    const { exitCode, stdout, stderr } = result;
    return { exitCode, stdout, stderr };
  }).pipe(Effect.provide(ProcessRunnerLive), Effect.provide(NodeServicesLive));
  // Currently always yields an empty string.
  const diff = () => Effect.succeed("");
  return {
    path: worldPath,
    readFile,
    listFiles,
    exec,
    diff
  };
}
|
|
1170
|
+
//#endregion
|
|
1171
|
+
//#region src/agent/file-watch.ts
|
|
1172
|
+
/**
 * Hash every regular file below `root`.
 *
 * Directories named in `SKIP_DIRS` are not entered, and directories or files
 * that cannot be read are skipped. When `root` is missing or is not a
 * directory the snapshot is empty.
 *
 * @param root - directory to snapshot.
 * @returns an Effect (requiring `FileSystem`) yielding a Map from
 *   root-relative path to the file's SHA-256 hex digest.
 */
function snapshotTreeEffect(root) {
  return Effect.gen(function* () {
    const out = /* @__PURE__ */ new Map();
    const fs = yield* FileSystem;
    function walk(dir) {
      return Effect.gen(function* () {
        const entries = yield* fs.readDirectory(dir).pipe(Effect.catchAll(() => Effect.succeed([])));
        for (const entry of entries) {
          const entryPath = join(dir, entry.name);
          if (entry.isDirectory()) {
            if (SKIP_DIRS.has(entry.name)) continue;
            yield* walk(entryPath);
          } else if (entry.isFile()) {
            const contents = yield* fs.readFile(entryPath).pipe(Effect.catchAll(() => Effect.succeed(null)));
            if (!contents) continue;
            out.set(relative(root, entryPath), createHash("sha256").update(contents).digest("hex"));
          }
        }
      });
    }
    // Use the stat result (it used to be discarded) so a missing or
    // non-directory root short-circuits, exactly as walkFilesEffect does.
    const rootStat = yield* fs.stat(root).pipe(Effect.catchAll(() => Effect.succeed(null)));
    if (!rootStat?.isDirectory()) return out;
    yield* walk(root);
    return out;
  });
}
|
|
1198
|
+
/**
 * Compare two tree snapshots (path to hash).
 *
 * @param before - snapshot taken first.
 * @param after - snapshot taken later.
 * @returns `{ path, op }` events: "create" and "modify" entries in `after`
 *   order, followed by "delete" entries in `before` order.
 */
function diffTrees(before, after) {
  const createdOrModified = [...after].flatMap(([path, hash]) => {
    const previous = before.get(path);
    if (previous === void 0) return [{ path, op: "create" }];
    return previous === hash ? [] : [{ path, op: "modify" }];
  });
  const deleted = [...before.keys()]
    .filter((path) => !after.has(path))
    .map((path) => ({ path, op: "delete" }));
  return [...createdOrModified, ...deleted];
}
|
|
1217
|
+
//#endregion
|
|
1218
|
+
//#region src/agent/index.ts
|
|
1219
|
+
/**
 * Optional rules describing how skill usage is recognized, grouped by
 * delivery mechanism. Every field is optional.
 */
const SkillEvidenceConfigSchema = Schema.Struct({
  mcpResource: Schema.optional(
    Schema.Struct({
      uriArgPaths: Schema.optional(Schema.Array(Schema.String)),
      uriPatterns: Schema.optional(Schema.Array(Schema.String))
    })
  ),
  mcpTool: Schema.optional(
    Schema.Struct({
      toolPatterns: Schema.optional(Schema.Array(Schema.String))
    })
  ),
  nativeArgs: Schema.optional(
    Schema.Struct({
      whenArgs: Schema.optional(Schema.Array(Schema.String)),
      whenAnyArgs: Schema.optional(Schema.Array(Schema.String)),
      skillPathFlags: Schema.optional(Schema.Array(Schema.String)),
      provider: Schema.optional(Schema.String),
      source: Schema.optional(Schema.String)
    })
  )
});
/** Shape of the provider configuration; every field is optional. */
const ProviderConfigSchema = Schema.Struct({
  adapter: Schema.optional(Schema.String),
  command: Schema.optional(Schema.String),
  args: Schema.optional(Schema.Array(Schema.String)),
  timeoutMs: Schema.optional(Schema.Number),
  baseDir: Schema.optional(Schema.String),
  isolatedHome: Schema.optional(Schema.Boolean),
  skillEvidence: Schema.optional(SkillEvidenceConfigSchema)
});
/** Adapter ids listed as documented (the internal test adapter is not included). */
const DOCUMENTED_ADAPTERS = ["codex-json", "claude-code-json", "pi-json"];
/** All live service layers merged into one. */
const AgentLiveLayer = Layer.mergeAll(
  RuntimeCheckCatalogLive,
  AdapterCatalogLive,
  NodeServicesLive,
  ProcessRunnerLive
);
|
|
1248
|
+
/**
 * Accept a value as a vars record only when it is a non-array object.
 *
 * @param value - candidate value.
 * @returns `value` itself when it is a non-null, non-array object; otherwise
 *   `undefined`.
 */
function asVars(value) {
  const isRecord = value && typeof value === "object" && !Array.isArray(value);
  return isRecord ? value : void 0;
}
|
|
1251
|
+
/**
 * Pick the vars record from a provider call context.
 *
 * @param context - context that may carry `vars` and/or `test.vars`.
 * @returns `context.vars` when it is a record, else `context.test.vars` when
 *   that is a record, else a new empty object.
 */
function varsFromContext(context) {
  const direct = asVars(context.vars);
  if (direct !== void 0) return direct;
  return asVars(context.test?.vars) ?? {};
}
|
|
1254
|
+
/**
 * Validate raw provider configuration against `ProviderConfigSchema`,
 * collecting all schema errors rather than only the first.
 *
 * @param input - raw config; a nullish value is treated as `{}`.
 * @returns the decoded config, or `{ error }` with a formatted description
 *   of every validation failure.
 */
function decodeProviderConfig(input) {
  const decode = Schema.decodeUnknownEither(ProviderConfigSchema, { errors: "all" });
  const decoded = decode(input ?? {});
  if (Either.isLeft(decoded)) {
    const details = ParseResult.TreeFormatter.formatErrorSync(decoded.left);
    return { error: `agent-skill-evals-provider: invalid config: ${details}` };
  }
  return decoded.right;
}
|
|
1259
|
+
/**
 * Set up an isolated run: create the run directory, copy the fixture into
 * its world, and create the evidence collector and world handle.
 *
 * Failures are returned as `{ error }` values rather than failing the Effect.
 *
 * @param fixture - fixture path taken from `vars.fixture`.
 * @param config - decoded provider config (`baseDir`, `skillEvidence`).
 * @returns an Effect yielding `{ runDir, worldPath, evidenceCollector, world }`
 *   or `{ error }`.
 */
function prepareRunEffect(fixture, config) {
  const describe = (cause) => cause instanceof Error ? cause.message : String(cause);
  return Effect.gen(function* () {
    const created = yield* Effect.either(createRunDirEffect());
    if (Either.isLeft(created)) {
      return { error: `agent-skill-evals-provider: failed to create isolated world: ${describe(created.left)}` };
    }
    const { runDir, worldPath } = created.right;
    const copied = yield* Effect.either(copyFixtureEffect({
      fixturePath: fixture,
      baseDir: config.baseDir
    }, worldPath));
    if (Either.isLeft(copied)) {
      return { error: `agent-skill-evals-provider: failed to copy vars.fixture "${fixture}" into isolated world: ${describe(copied.left)}` };
    }
    const evidenceCollector = new EvidenceCollector(config.skillEvidence);
    // Commands run through the world handle are recorded as evidence.
    const world = makeWorldHandle(worldPath, (event) => evidenceCollector.addCommand(event));
    return {
      runDir,
      worldPath,
      evidenceCollector,
      world
    };
  });
}
|
|
1278
|
+
/**
 * Evaluate the `preconditions` declared in a test's vars.
 *
 * Parse errors on the preconditions field and unknown check types are
 * reported as failed results. Every known check is verified in "precondition"
 * mode against the world and a fresh evidence snapshot.
 *
 * @param vars - test vars containing the runtime test fields.
 * @param run - prepared run (`world`, `evidenceCollector`).
 * @returns an Effect yielding `{ results, passed }`, where `passed` is false
 *   as soon as any result fails.
 */
function runPreconditionsEffect(vars, run) {
  return Effect.gen(function* () {
    const results = [];
    let passed = true;
    const recordFailure = (reason) => {
      results.push({
        pass: false,
        score: 0,
        reason
      });
      passed = false;
    };
    const parsed = parseRuntimeTestFields(vars);
    const fieldErrors = parsed.errors.filter((issue) => issue.field === "preconditions");
    for (const issue of fieldErrors) {
      const at = issue.index === void 0 ? issue.field : `${issue.field}[${issue.index}]`;
      recordFailure(`precondition: ${at}: ${issue.reason}`);
    }
    for (const entry of parsed.preconditions) {
      const plugin = yield* getRuntimeCheck(entry.type);
      if (!plugin) {
        recordFailure(`precondition: unknown effect type "${entry.type}"`);
        continue;
      }
      const result = yield* plugin.verify({
        assertion: entry.args,
        world: run.world,
        evidence: evidenceFromSnapshot(run.evidenceCollector.toSnapshot()),
        mode: "precondition"
      });
      results.push(result);
      if (!result.pass) passed = false;
    }
    return { results, passed };
  });
}
|
|
1318
|
+
/**
 * Resolves a configured command or argument against `baseDir` when it looks
 * like a relative filesystem path; anything else is returned untouched.
 *
 * - Values containing "=" are never rewritten.
 * - "./x", "../x" and non-absolute values containing "/" are resolved.
 * - Bare words and absolute paths pass through unchanged.
 *
 * NOTE(review): the "contains a slash" heuristic also matches non-path values
 * such as URLs — confirm callers never pass those as args.
 *
 * @param {string} baseDir Directory relative paths are resolved from.
 * @param {string} path    Configured value to inspect.
 * @returns {string} The resolved path or the original value.
 */
function resolveConfiguredPath(baseDir, path) {
	if (path.includes("=")) return path;
	const isDotRelative = path.startsWith("./") || path.startsWith("../");
	const isNestedRelative = !isAbsolute(path) && path.includes("/");
	if (isDotRelative || isNestedRelative) return resolve(baseDir, path);
	return path;
}
|
|
1322
|
+
/**
 * Expands `${NAME}` and `${NAME:-fallback}` references in `value`.
 *
 * A reference is replaced by the variable's value when `env` defines it; otherwise
 * by the fallback text, or by the empty string when no fallback is given. A
 * variable that is defined but empty expands to the empty string (the fallback
 * only applies to undefined/null values).
 *
 * @param {string} value Text that may contain variable references.
 * @param {Record<string, string | undefined>} env Variable lookup table.
 * @returns {string} `value` with every reference substituted.
 */
function expandEnvVars(value, env) {
	return value.replace(/\$\{([A-Za-z_][A-Za-z0-9_]*)(:-([^}]*))?\}/g, (_match, name, _fallbackPart, fallback) => {
		// Only own keys count as variables: a plain-object env would otherwise
		// resolve names like "constructor" or "toString" to inherited functions.
		const defined = Object.prototype.hasOwnProperty.call(env, name) ? env[name] : undefined;
		return defined ?? fallback ?? "";
	});
}
|
|
1325
|
+
/**
 * Derives a skill name from a path passed to a native skill flag.
 *
 * Trailing slashes are ignored. A path ending in "SKILL.md" is named after its
 * containing directory; any other path is named after its last segment with a
 * trailing ".md" (case-insensitive) removed.
 *
 * @param {string} path Skill file or directory path.
 * @returns {string | undefined} The skill name, or undefined when the path is
 *          empty once trailing slashes are stripped.
 */
function skillNameFromNativePath(path) {
	const trimmed = path.replace(/\/+$/, "");
	if (trimmed === "") return undefined;
	const fileName = basename(trimmed);
	if (fileName === "SKILL.md") return basename(dirname(trimmed));
	return fileName.replace(/\.md$/i, "");
}
|
|
1331
|
+
/**
 * Records a "native" skill-load evidence entry for every skill path found in
 * the configured command-line arguments.
 *
 * Nothing is recorded unless a native-evidence configuration applies, all of
 * its `whenArgs` are present in `config.args`, and (when `whenAnyArgs` is
 * non-empty) at least one of those is present too. Each occurrence of a
 * skill-path flag is paired with the argument that follows it.
 *
 * @param run       Prepared run whose `evidenceCollector` receives the entries.
 * @param config    Provider configuration (`args`, `adapter`, `skillEvidence`).
 * @param startedAt Timestamp forwarded unchanged on every recorded entry.
 */
function addNativeSkillEvidenceFromConfig(run, config, startedAt) {
	const nativeConfig = nativeSkillEvidenceConfig(config);
	if (!nativeConfig) return;
	const args = config.args ?? [];
	const isPresent = (flag) => args.includes(flag);
	if (!nativeConfig.whenArgs.every(isPresent)) return;
	if (nativeConfig.whenAnyArgs.length > 0 && !nativeConfig.whenAnyArgs.some(isPresent)) return;
	for (const [position, flag] of args.entries()) {
		if (!flag) continue;
		if (!nativeConfig.skillPathFlags.includes(flag)) continue;
		// The flag's value is the next argument; a trailing flag has none.
		const skillPath = args[position + 1];
		if (!skillPath) continue;
		const skill = skillNameFromNativePath(skillPath);
		if (!skill) continue;
		run.evidenceCollector.addSkillLoad({
			skill,
			delivery: "native",
			provider: nativeConfig.provider,
			// An explicitly configured source label wins over the matched flag.
			source: nativeConfig.source ?? flag,
			startedAt
		});
	}
}
|
|
1355
|
+
/**
 * Produces the settings that decide when and how native skill loads are
 * inferred from command-line arguments.
 *
 * An explicit `config.skillEvidence.nativeArgs` object takes precedence and has
 * its missing fields defaulted. Without one, only the "pi-json" adapter gets
 * built-in settings; every other configuration yields undefined.
 *
 * @param config Provider configuration.
 * @returns `{ whenArgs, whenAnyArgs, skillPathFlags, provider, source }` or
 *          undefined when native skill evidence does not apply.
 */
function nativeSkillEvidenceConfig(config) {
	const overrides = config.skillEvidence?.nativeArgs;
	if (!overrides) {
		if (config.adapter !== "pi-json") return undefined;
		// Built-in defaults for the pi-json adapter.
		return {
			whenArgs: [],
			whenAnyArgs: ["--no-skills", "-ns"],
			skillPathFlags: ["--skill"],
			provider: "pi-json",
			source: "--skill"
		};
	}
	const provider = overrides.provider ?? config.adapter ?? "agent";
	return {
		whenArgs: overrides.whenArgs ?? [],
		whenAnyArgs: overrides.whenAnyArgs ?? [],
		skillPathFlags: overrides.skillPathFlags ?? ["--skill"],
		provider,
		// Deliberately left as-is (possibly undefined); no default is applied here.
		source: overrides.source
	};
}
|
|
1372
|
+
/**
 * Renames collected usage counters to `total` / `prompt` / `completion` /
 * `cached`, keeping only the counters that are not undefined.
 *
 * @param usage Object with optional `totalTokens`, `inputTokens`,
 *              `outputTokens` and `cacheReadTokens` fields.
 * @returns The renamed counters, or undefined when none were present.
 */
function promptfooTokenUsage(usage) {
	const counters = [
		["total", usage.totalTokens],
		["prompt", usage.inputTokens],
		["completion", usage.outputTokens],
		["cached", usage.cacheReadTokens]
	];
	const tokenUsage = {};
	for (const [key, count] of counters) {
		// Zero is a legitimate count, so only a strictly undefined field is skipped.
		if (count !== undefined) tokenUsage[key] = count;
	}
	return Object.keys(tokenUsage).length > 0 ? tokenUsage : undefined;
}
|
|
1381
|
+
/**
 * Runs the configured agent command through the adapter named in the provider
 * configuration.
 *
 * Configuration problems (missing adapter id, unknown adapter, missing
 * command) are reported as `{ output: "", error }` values instead of failing
 * the effect. The command and every string argument have `${VAR}` references
 * expanded and relative paths resolved against `config.baseDir` (falling back
 * to the environment's cwd). The adapter itself is started with the run's
 * world directory as its working directory.
 *
 * @param input `{ prompt, run, config }` — the prompt text, the prepared run
 *              and the decoded provider configuration.
 * @returns Effect yielding `{ output }`, plus `error` when the adapter
 *          reported one.
 */
function runConfiguredAdapterEffect(input) {
	return Effect.gen(function* () {
		const fail = (error) => ({
			output: "",
			error
		});
		const { config, run } = input;
		const adapterId = config.adapter;
		if (!adapterId) return fail("agent-skill-evals-provider: config.adapter is required. Use codex-json, claude-code-json, or pi-json.");
		const adapter = yield* getAdapter(adapterId);
		if (!adapter) return fail(`agent-skill-evals-provider: unknown adapter "${adapterId}". Supported adapters: ${DOCUMENTED_ADAPTERS.join(", ")}`);
		if (!config.command) return fail("agent-skill-evals-provider: config.command is required for dynamic agent runs");
		const command = config.command;
		const environment = yield* Environment;
		const cwd = yield* environment.cwd;
		const env = yield* environment.env;
		const baseDir = config.baseDir ?? cwd;
		// Expand variables first so a path supplied through ${VAR} is resolved too.
		const materialize = (raw) => resolveConfiguredPath(baseDir, expandEnvVars(raw, env));
		const childEnv = { ...env };
		// Optionally point HOME at a per-run directory instead of the real one.
		if (config.isolatedHome) childEnv.HOME = join(run.runDir, "agent-home");
		const defaultTimeoutMs = 5 * 60 * 1000;
		const result = yield* adapter.run({
			command: materialize(command),
			// Non-string arguments are forwarded untouched.
			args: (config.args ?? []).map((arg) => typeof arg === "string" ? materialize(arg) : arg),
			cwd: run.worldPath,
			prompt: input.prompt,
			evidence: run.evidenceCollector,
			timeoutMs: config.timeoutMs ?? defaultTimeoutMs,
			env: childEnv
		});
		const response = { output: result.output };
		if (result.error) response.error = result.error;
		return response;
	});
}
|
|
1420
|
+
/**
 * Snapshots the run's world directory again and records every difference from
 * `preTree` as file-write evidence.
 *
 * @param run     Prepared run; `worldPath` is snapshotted and
 *                `evidenceCollector` receives the events.
 * @param preTree Earlier snapshot of the same directory to diff against.
 * @returns Effect that completes once all diff events have been recorded.
 */
function collectFileEvidenceEffect(run, preTree) {
	return Effect.gen(function* () {
		const postTree = yield* snapshotTreeEffect(run.worldPath);
		const changes = diffTrees(preTree, postTree);
		for (const change of changes) {
			run.evidenceCollector.addFileWrite(change);
		}
	});
}
|
|
1426
|
+
/**
 * Finalises a run: stores the output and run details on the evidence
 * collector, writes the evidence to the run directory, and saves a metadata
 * summary as "agent-skill-evals-meta.json" next to it.
 *
 * @param input `{ run, fixture, vars, output, preconditionResults,
 *              preconditionsPassed, startedAt }`; `startedAt` is a millisecond
 *              timestamp used to compute the run duration.
 * @returns Effect yielding the metadata object that was written.
 */
function persistMetadataEffect(input) {
	return Effect.gen(function* () {
		const fs = yield* FileSystem;
		const now = yield* Clock.currentTimeMillis;
		const durationMs = now - input.startedAt;
		const { run, fixture } = input;
		const collector = run.evidenceCollector;
		collector.setOutput(input.output);
		collector.setRun({
			runDir: run.runDir,
			worldPath: run.worldPath,
			fixture,
			durationMs
		});
		const evidencePath = yield* writeEvidenceToEffect(collector, run.runDir);
		// Key order is kept stable because this object is serialised verbatim.
		const metadata = {
			runDir: run.runDir,
			worldPath: run.worldPath,
			evidencePath,
			fixture,
			skill: input.vars.skill,
			kind: input.vars.kind,
			preconditionResults: input.preconditionResults,
			preconditionsPassed: input.preconditionsPassed,
			durationMs
		};
		const metadataPath = join(run.runDir, "agent-skill-evals-meta.json");
		yield* fs.writeText(metadataPath, JSON.stringify(metadata, null, 2));
		return metadata;
	});
}
|
|
1453
|
+
/**
 * Provider that runs an agent command inside an isolated copy of a fixture and
 * returns its output together with run metadata and token usage.
 *
 * NOTE(review): the `id()` / `callApi()` shape and the `tokenUsage` field look
 * like promptfoo's custom-provider contract — confirm against the caller.
 */
var AgentSkillEvalsProvider = class {
	/** Decoded provider configuration; an empty object when decoding failed. */
	config;
	/** Decoding error message, reported on every call when set. */
	configError;
	/** Function returning this provider's label. */
	id;
	/**
	 * @param options Optional `{ config, id }`; `id` defaults to
	 *                "agent-skill-evals".
	 */
	constructor(options = {}) {
		const decoded = decodeProviderConfig(options.config ?? {});
		const failed = "error" in decoded;
		this.config = failed ? {} : decoded;
		// A bad config is remembered rather than thrown, so it surfaces per call.
		if (failed) this.configError = decoded.error;
		const label = options.id ?? "agent-skill-evals";
		this.id = () => label;
	}
	/**
	 * Promise-based entry point: runs `callApiEffect` with the live agent layer.
	 *
	 * @param prompt  Prompt text handed to the agent.
	 * @param context Call context from which test variables are read.
	 * @returns Promise of the provider response.
	 */
	async callApi(prompt, context = {}) {
		const program = this.callApiEffect(prompt, context).pipe(Effect.provide(AgentLiveLayer));
		return Effect.runPromise(program);
	}
	/**
	 * Effect-based implementation of a single provider call.
	 *
	 * Steps: prepare an isolated run from `vars.fixture`, evaluate preconditions,
	 * snapshot the world, run the agent only when the preconditions passed,
	 * record file changes, then persist evidence and metadata. Configuration,
	 * fixture and preparation problems are returned as `{ output: "", error }`.
	 *
	 * @param prompt  Prompt text handed to the agent.
	 * @param context Call context from which test variables are read.
	 * @returns Effect yielding `{ output, error?, metadata, tokenUsage? }`.
	 */
	callApiEffect(prompt, context = {}) {
		// Generator functions have their own `this`, so capture the instance.
		const provider = this;
		return Effect.gen(function* () {
			const fail = (error) => ({
				output: "",
				error
			});
			if (provider.configError) return fail(provider.configError);
			const startedAt = yield* Clock.currentTimeMillis;
			const vars = varsFromContext(context);
			const fixture = vars.fixture;
			if (!fixture) return fail("agent-skill-evals-provider: vars.fixture is required. Set vars.fixture to the fixture directory for this test case.");
			const run = yield* prepareRunEffect(fixture, provider.config);
			if ("error" in run) return fail(run.error);
			const preconditions = yield* runPreconditionsEffect(vars, run);
			// Baseline for the file diff; taken after preconditions have run.
			const preTree = yield* snapshotTreeEffect(run.worldPath);
			let agentResult = {
				output: "",
				error: undefined
			};
			if (preconditions.passed) {
				addNativeSkillEvidenceFromConfig(run, provider.config, startedAt);
				agentResult = yield* runConfiguredAdapterEffect({
					prompt,
					run,
					config: provider.config
				});
			}
			const output = agentResult.output;
			const error = agentResult.error;
			yield* collectFileEvidenceEffect(run, preTree);
			// Failing to persist evidence/metadata is treated as a defect (orDie).
			const metadata = yield* persistMetadataEffect({
				run,
				fixture,
				vars,
				output,
				preconditionResults: preconditions.results,
				preconditionsPassed: preconditions.passed,
				startedAt
			}).pipe(Effect.orDie);
			const usage = promptfooTokenUsage(run.evidenceCollector.toSnapshot().usage);
			const response = { output };
			if (error) response.error = error;
			response.metadata = metadata;
			if (usage) response.tokenUsage = usage;
			return response;
		});
	}
};
|
|
1522
|
+
//#endregion
|
|
1523
|
+
export { writeEvidenceToEffect as a, getRuntimeCheck as c, evidenceFromSnapshot as i, makeWorldHandle as n, decodeEvidenceSnapshotEither as o, EvidenceCollector as r, RuntimeCheckCatalogLive as s, AgentSkillEvalsProvider as t };
|
|
1524
|
+
|
|
1525
|
+
//# sourceMappingURL=agent-CM7fIL_C.mjs.map
|