agent-skill-evals 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,574 @@
1
+ import { c as parseRuntimeTestFields, n as FileSystem, r as NodeServicesLive, t as Environment } from "../internal-services-5-mRgNls.mjs";
2
+ import { a as writeEvidenceToEffect, c as getRuntimeCheck, i as evidenceFromSnapshot, n as makeWorldHandle, o as decodeEvidenceSnapshotEither, r as EvidenceCollector, s as RuntimeCheckCatalogLive } from "../agent-CM7fIL_C.mjs";
3
+ import { join } from "node:path";
4
+ import * as Either from "effect/Either";
5
+ import * as Effect from "effect/Effect";
6
+ import * as Schema from "effect/Schema";
7
+ //#region src/assertions/_shared.ts
8
/**
 * Resolves the run metadata for an assertion.
 *
 * Uses `context.providerResponse.metadata` when it is an object carrying a
 * `worldPath` key. Otherwise falls back to `agent-skill-evals-meta.json` in
 * the directory named by the `AGENT_SKILL_EVALS_RUN_DIR` environment entry.
 * A failed read or unparsable JSON is treated as "no metadata".
 *
 * @param context assertion context supplied by the caller
 * @returns an Effect yielding the metadata value, or `null` when none is found
 */
function loadMetadataEffect(context) {
  // Best-effort parse: malformed JSON is reported as `null`, never thrown.
  const parseJsonOrNull = (text) => {
    try {
      return JSON.parse(text);
    } catch {
      return null;
    }
  };
  return Effect.gen(function* () {
    const inline = context.providerResponse?.metadata;
    const inlineUsable = Boolean(inline) && typeof inline === "object" && "worldPath" in inline;
    if (inlineUsable) return inline;

    const environment = yield* Environment;
    const variables = yield* environment.env;
    const runDir = variables.AGENT_SKILL_EVALS_RUN_DIR;
    if (!runDir) return null;

    const fs = yield* FileSystem;
    const metaFile = join(runDir, "agent-skill-evals-meta.json");
    const fromDisk = yield* fs.readText(metaFile).pipe(
      Effect.map((text) => parseJsonOrNull(text)),
      Effect.catchAll(() => Effect.succeed(null))
    );
    return fromDisk ? fromDisk : null;
  });
}
26
/**
 * Reads the evidence file at `meta.evidencePath`, parses it as JSON, decodes
 * it as an evidence snapshot and wraps the snapshot in an EvidenceCollector.
 *
 * @param meta run metadata; only `evidencePath` is read here
 * @returns an Effect yielding the collector, failing with an `Error` whose
 *   message starts with `evidence:` when reading, parsing or decoding fails
 */
function loadEvidenceEffect(meta) {
  const describe = (cause) => (cause instanceof Error ? cause.message : String(cause));
  return Effect.gen(function* () {
    const fs = yield* FileSystem;
    const raw = yield* fs.readText(meta.evidencePath).pipe(
      Effect.mapError((cause) => new Error(`evidence: failed to read ${meta.evidencePath}: ${describe(cause)}`))
    );
    const json = yield* Effect.try({
      try: () => JSON.parse(raw),
      catch: (cause) => new Error(`evidence: invalid JSON in ${meta.evidencePath}: ${describe(cause)}`)
    });
    const decoded = decodeEvidenceSnapshotEither(json);
    if (Either.isLeft(decoded)) {
      return yield* Effect.fail(new Error(`evidence: invalid agent-skill-evals.evidence.v1 payload: ${decoded.left.message}`));
    }
    return EvidenceCollector.fromSnapshot(decoded.right);
  });
}
38
/**
 * Builds a world handle for `meta.worldPath`; every event the handle reports
 * is forwarded to `evidenceCollector.addCommand`.
 */
function loadWorld(meta, evidenceCollector) {
  const recordCommand = (event) => evidenceCollector.addCommand(event);
  return makeWorldHandle(meta.worldPath, recordCommand);
}
41
/**
 * Verifies each entry in order with the runtime check registered for its type.
 *
 * @param entries list of `{ type, args }` entries
 * @param world world handle passed through to each check
 * @param evidenceCollector collector whose snapshot is re-read for every entry
 * @param mode passed through to each check unchanged
 * @returns an Effect yielding one result per entry; an entry with no
 *   registered check yields a failing "unknown effect type" result
 */
function runEntriesEffect(entries, world, evidenceCollector, mode) {
  return Effect.gen(function* () {
    const outcomes = [];
    for (const entry of entries) {
      const check = yield* getRuntimeCheck(entry.type);
      if (check) {
        // The snapshot is taken per entry rather than once up front.
        const outcome = yield* check.verify({
          assertion: entry.args,
          world,
          evidence: evidenceFromSnapshot(evidenceCollector.toSnapshot()),
          mode
        });
        outcomes.push(outcome);
      } else {
        outcomes.push({
          pass: false,
          score: 0,
          reason: `unknown effect type: ${entry.type}`
        });
      }
    }
    return outcomes;
  });
}
65
/**
 * Folds a list of check results into a single all-or-nothing result.
 *
 * @param results list of `{ pass, score, reason }` results
 * @param emptyReason reason reported when `results` is empty
 * @param options `emptyPass` decides the verdict for an empty list (default true)
 * @returns `{ pass, score, reason }` for an empty list; otherwise the same
 *   plus `componentResults`, with the failing reasons joined by "; "
 */
function aggregate(results, emptyReason, options = {}) {
  if (results.length === 0) {
    const emptyVerdict = options.emptyPass ?? true;
    return { pass: emptyVerdict, score: emptyVerdict ? 1 : 0, reason: emptyReason };
  }
  const failureReasons = [];
  const componentResults = [];
  for (const result of results) {
    componentResults.push({ pass: result.pass, score: result.score, reason: result.reason });
    if (!result.pass) failureReasons.push(result.reason);
  }
  const everyonePassed = failureReasons.length === 0;
  return {
    pass: everyonePassed,
    score: everyonePassed ? 1 : 0,
    reason: everyonePassed ? `${results.length} check(s) passed` : failureReasons.join("; "),
    componentResults
  };
}
87
+ //#endregion
88
+ //#region src/assertions/skill-test.ts
89
/**
 * Promise entry point for the `skill.test` assertion: runs `skillTestEffect`
 * with the runtime check catalog and the Node services provided.
 */
async function skillTest(_output, context) {
  const program = skillTestEffect(_output, context).pipe(
    Effect.provide(RuntimeCheckCatalogLive),
    Effect.provide(NodeServicesLive)
  );
  return Effect.runPromise(program);
}
92
/**
 * Effect implementation of the `skill.test` assertion.
 *
 * Steps: load run metadata, stop early if preconditions did not pass, load
 * the evidence file, run the `should` / `should_not` entries parsed from the
 * test vars, write the evidence back to `meta.runDir`, and aggregate every
 * result into one verdict.
 *
 * @param _output unused here; forwarded by the caller
 * @param context assertion context (`vars`, `test.vars`, `providerResponse`)
 * @returns an Effect yielding `{ pass, score, reason, componentResults? }`
 */
function skillTestEffect(_output, context) {
  return Effect.gen(function* () {
    const meta = yield* loadMetadataEffect(context);
    if (!meta) return {
      pass: false,
      score: 0,
      reason: "skill.test: provider metadata missing"
    };
    const preconditionResults = meta.preconditionResults ?? [];
    if (!meta.preconditionsPassed) {
      // Failed preconditions must never surface as a pass. `aggregate` would
      // pass for an empty list (its default) or an all-passing list, so force
      // a failing verdict in those cases.
      const summary = aggregate(preconditionResults, "skill.test: preconditions failed", { emptyPass: false });
      if (!summary.pass) return summary;
      return {
        ...summary,
        pass: false,
        score: 0,
        reason: "skill.test: preconditions failed"
      };
    }
    const vars = context.vars ?? context.test?.vars ?? {};
    const loadedEvidence = yield* Effect.either(loadEvidenceEffect(meta));
    if (Either.isLeft(loadedEvidence)) {
      const err = loadedEvidence.left;
      return {
        pass: false,
        score: 0,
        reason: err instanceof Error ? err.message : String(err)
      };
    }
    const evidenceCollector = loadedEvidence.right;
    const world = loadWorld(meta, evidenceCollector);
    const parsed = parseRuntimeTestFields(vars);
    // Parse errors on the `preconditions` field are left out of the verdict.
    const parseResults = parsed.errors.filter((error) => error.field !== "preconditions").map((error) => ({
      pass: false,
      score: 0,
      reason: `runtime test field ${error.index === void 0 ? error.field : `${error.field}[${error.index}]`}: ${error.reason}`
    }));
    const shouldResults = yield* runEntriesEffect(parsed.should, world, evidenceCollector, "should");
    const shouldNotResults = yield* runEntriesEffect(parsed.should_not, world, evidenceCollector, "should_not");
    // A failed evidence write is escalated to a defect (`Effect.orDie`).
    yield* writeEvidenceToEffect(evidenceCollector, meta.runDir).pipe(Effect.orDie);
    return aggregate([
      ...preconditionResults,
      ...parseResults,
      ...shouldResults,
      ...shouldNotResults
    ], "skill.test: no Runtime Test Fields checks declared", { emptyPass: false });
  });
}
131
+ //#endregion
132
+ //#region src/assertions/skill-budget.ts
133
/** Pairs of [key in `tokenUsage`, key of the matching limit in the config]. */
const budgetFields = [
  ["total", "maxTotalTokens"],
  ["prompt", "maxPromptTokens"],
  ["completion", "maxCompletionTokens"],
  ["cached", "maxCachedTokens"]
];

/** Returns `value` when it is a non-array object, otherwise a fresh `{}`. */
function configObject$1(value) {
  return value && typeof value === "object" && !Array.isArray(value) ? value : {};
}

/** Returns `config[key]` when it is a finite number, otherwise `undefined`. */
function numberSetting$1(config, key) {
  const value = config[key];
  return typeof value === "number" && Number.isFinite(value) ? value : void 0;
}

/** Reads the four optional token limits from `context.config.agentSkillEvals`. */
function budgetSettings(context) {
  const config = configObject$1(context.config?.agentSkillEvals);
  return {
    maxTotalTokens: numberSetting$1(config, "maxTotalTokens"),
    maxPromptTokens: numberSetting$1(config, "maxPromptTokens"),
    maxCompletionTokens: numberSetting$1(config, "maxCompletionTokens"),
    maxCachedTokens: numberSetting$1(config, "maxCachedTokens")
  };
}

/** Returns `context.providerResponse.tokenUsage` when it is a non-array object. */
function tokenUsage(context) {
  const usage = context.providerResponse?.tokenUsage;
  return usage && typeof usage === "object" && !Array.isArray(usage) ? usage : void 0;
}

/**
 * `skill.budget` assertion: compares the reported token usage against every
 * configured limit.
 *
 * @param _output unused
 * @param context assertion context (`providerResponse.tokenUsage`,
 *   `config.agentSkillEvals`)
 * @returns a failing result when usage is missing, no limit is configured, a
 *   configured field is absent from usage, or a limit is exceeded; otherwise
 *   a passing result. Per-limit results are in `componentResults`.
 */
async function skillBudget(_output, context) {
  const usage = tokenUsage(context);
  if (!usage) return {
    pass: false,
    score: 0,
    reason: "skill.budget: provider tokenUsage missing"
  };
  const settings = budgetSettings(context);
  const configured = budgetFields.filter(([, limitKey]) => settings[limitKey] !== void 0);
  if (configured.length === 0) return {
    pass: false,
    score: 0,
    reason: "skill.budget: configure at least one token limit"
  };
  const components = configured.map(([usageKey, limitKey]) => {
    const actual = usage[usageKey];
    // Defined by construction: `configured` only keeps limits that are set.
    const limit = settings[limitKey];
    if (typeof actual !== "number" || !Number.isFinite(actual)) return {
      pass: false,
      score: 0,
      reason: `${usageKey} tokens missing`
    };
    const withinBudget = actual <= limit;
    return {
      pass: withinBudget,
      score: withinBudget ? 1 : 0,
      // Show the relation that actually holds, so a failure does not read "500 <= 100".
      reason: `${usageKey} tokens ${actual} ${withinBudget ? "<=" : ">"} ${limit}`
    };
  });
  const failed = components.filter((component) => !component.pass);
  return {
    pass: failed.length === 0,
    score: failed.length === 0 ? 1 : 0,
    reason: failed.length === 0 ? `skill.budget: ${components.length} budget(s) passed` : failed.map((component) => component.reason).join("; "),
    componentResults: components
  };
}
195
+ //#endregion
196
+ //#region src/skill-checks/assertions-static/_shared.ts
197
/**
 * Returns `context.providerResponse.metadata` when it is an object that has
 * a `skill` or `tests` key; otherwise `null`.
 */
function getStaticMeta(context) {
  const metadata = context.providerResponse?.metadata;
  if (!metadata || typeof metadata !== "object") return null;
  return "skill" in metadata || "tests" in metadata ? metadata : null;
}
202
/** Builds a passing result (score 1) with the given reason and components. */
function pass(reason, components) {
  const verdict = { pass: true, score: 1, reason, componentResults: components };
  return verdict;
}
210
/** Builds a failing result (score 0) with the given reason and components. */
function fail(reason, components) {
  const verdict = { pass: false, score: 0, reason, componentResults: components };
  return verdict;
}
218
+ //#endregion
219
+ //#region src/skill-checks/assertions-static/settings.ts
220
/** Fallback values for the skill-check settings. */
const DEFAULT_SETTINGS = {
  maxSkillLines: 200,
  destructiveEffects: [
    "file.changes_outside_scope",
    "tool.called"
  ],
  requireTokenBudget: false,
  riskyEffects: [
    "file.changes_outside_scope",
    "tool.called"
  ]
};
/** Schema for an array of strings. */
const StringArraySchema = Schema.Array(Schema.String);
227
/** Returns `value` when it is a non-array object, otherwise a fresh `{}`. */
function configObject(value) {
  const isRecord = value && typeof value === "object" && !Array.isArray(value);
  return isRecord ? value : {};
}
230
/** Returns `config[key]` when it satisfies `Schema.Number`, otherwise `fallback`. */
function numberSetting(config, key, fallback) {
  const isNumber = Schema.is(Schema.Number);
  if (isNumber(config[key])) return config[key];
  return fallback;
}
233
/** Returns `config[key]` when it satisfies `StringArraySchema`, otherwise `fallback`. */
function stringArraySetting(config, key, fallback) {
  const isStringArray = Schema.is(StringArraySchema);
  if (isStringArray(config[key])) return config[key];
  return fallback;
}
236
/** Returns `config[key]` when it is a primitive boolean, otherwise `fallback`. */
function booleanSetting(config, key, fallback) {
  const candidate = config[key];
  if (candidate === true || candidate === false) return candidate;
  return fallback;
}
239
/**
 * Reads the skill-check settings from `context.config.agentSkillEvals`,
 * substituting the `DEFAULT_SETTINGS` entry for any value of the wrong type.
 */
function skillCheckSettings(context) {
  const overrides = configObject(context.config?.agentSkillEvals);
  const maxSkillLines = numberSetting(overrides, "maxSkillLines", DEFAULT_SETTINGS.maxSkillLines);
  const destructiveEffects = stringArraySetting(overrides, "destructiveEffects", DEFAULT_SETTINGS.destructiveEffects);
  const requireTokenBudget = booleanSetting(overrides, "requireTokenBudget", DEFAULT_SETTINGS.requireTokenBudget);
  const riskyEffects = stringArraySetting(overrides, "riskyEffects", DEFAULT_SETTINGS.riskyEffects);
  return { maxSkillLines, destructiveEffects, requireTokenBudget, riskyEffects };
}
248
+ //#endregion
249
+ //#region src/skill-checks/assertions-static/context-economy.ts
250
/**
 * SPEC §7.2 — context economy.
 *
 * Hard check: `skill.missingReferences` must be empty, otherwise the
 * assertion fails. Soft check: a SKILL.md longer than the `maxSkillLines`
 * setting still passes, but with score 0.5 and the size reason.
 */
async function contextEconomy(_output, context) {
  const meta = getStaticMeta(context);
  if (!meta) return fail("context-economy: provider metadata missing");
  const { skill } = meta;
  if (!skill) return fail("context-economy: skill not parsed");
  const { maxSkillLines } = skillCheckSettings(context);

  const withinLimit = skill.totalLines <= maxSkillLines;
  const sizeCheck = {
    pass: withinLimit,
    score: withinLimit ? 1 : .5,
    reason: `SKILL.md ${skill.totalLines} line(s) (limit ${maxSkillLines})`
  };

  const allResolved = skill.missingReferences.length === 0;
  const referenceCheck = {
    pass: allResolved,
    score: allResolved ? 1 : 0,
    reason: allResolved ? `${skill.references.length} reference(s) all resolved` : `missing references: ${skill.missingReferences.slice(0, 3).join(", ")}`
  };

  const components = [sizeCheck, referenceCheck];
  // Only unresolved references fail the assertion.
  if (!referenceCheck.pass) return fail(`✗ ${referenceCheck.reason}`, components);
  // Oversize is a warning: pass, but with a reduced score.
  if (sizeCheck.score < 1) {
    return {
      pass: true,
      score: .5,
      reason: sizeCheck.reason,
      componentResults: components
    };
  }
  return pass(`context-economy: ok`, components);
}
287
+ //#endregion
288
+ //#region src/skill-checks/assertions-static/executable-helper.ts
289
/**
 * SPEC §7.4 — verifier scripts and fixtures referenced by tests.
 *
 * Fails when `meta.tests` lists any missing verifier script, non-executable
 * verifier script, or missing fixture path. Failure reasons show at most the
 * first three offending paths.
 */
async function executableHelper(_output, context) {
  const meta = getStaticMeta(context);
  if (!meta) return fail("executable-helper: provider metadata missing");
  const { tests } = meta;
  if (!tests) return fail("executable-helper: tests not parsed");

  // The reasons are thunks so that only the one actually reported is evaluated.
  const toComponent = (offenders, okReason, failReason) => {
    const clean = offenders.length === 0;
    return {
      pass: clean,
      score: clean ? 1 : 0,
      reason: clean ? okReason() : failReason(offenders.slice(0, 3).join(", "))
    };
  };
  const components = [
    toComponent(
      tests.missingVerifierScripts,
      () => `${tests.verifierScripts.length} verifier script(s) all present`,
      (shown) => `missing verifier scripts: ${shown}`
    ),
    toComponent(
      tests.nonExecutableVerifierScripts,
      () => `${tests.verifierScripts.length} verifier script(s) executable`,
      (shown) => `non-executable verifier scripts: ${shown}`
    ),
    toComponent(
      tests.missingFixturePaths,
      () => `${tests.fixturePaths.length} fixture(s) all present`,
      (shown) => `missing fixtures: ${shown}`
    )
  ];

  const failures = components.filter((component) => !component.pass);
  if (failures.length > 0) {
    return fail(failures.map((failure) => `✗ ${failure.reason}`).join("; "), components);
  }
  return pass("executable-helper: ok", components);
}
318
+ //#endregion
319
+ //#region src/skill-checks/assertions-static/instruction-calibration.ts
320
+ /**
321
+ * SPEC §7.3 — for skills whose test pack uses destructive effects, the
322
+ * SKILL.md must contain confirmation/clarification language and at least
323
+ * one negative test must declare forbidden effects.
324
+ *
325
+ * `assert.config.agentSkillEvals.destructiveEffects` overrides the default list.
326
+ */
327
// Phrases taken as confirmation language. The `confirm` stem also accepts its
// common inflections (confirms, confirmed, confirming, confirmation); a bare
// `\bconfirm\b` would reject text such as "ask for confirmation".
const CONFIRMATION_RE = /\b(confirm(?:s|ed|ing|ation)?|ask first|do not.*without|before.*push|require.*approval|do not.*destructive)\b/i;
// Phrases taken as plan-before-act language.
const PLAN_BEFORE_ACT_RE = /\b(plan first|plan before|read.*before.*write|validate.*before|dry.run)\b/i;
329
/**
 * `skill.instructions` check. Passes outright when no parsed test uses an
 * effect type from the `destructiveEffects` setting. Otherwise requires that
 * `skill.body` matches the confirmation or plan-before-act patterns and that
 * at least one test has a non-empty `vars.should_not` array.
 */
async function instructionCalibration(_output, context) {
  const meta = getStaticMeta(context);
  if (!meta) return fail("skill.instructions: provider metadata missing");
  const { skill, tests } = meta;
  if (!skill) return fail("skill.instructions: skill not parsed");

  const destructive = new Set(skillCheckSettings(context).destructiveEffects);
  const testCases = tests ? tests.tests : [];
  const usesDestructive = testCases.some((testCase) => testCase.effectTypes.some((effect) => destructive.has(effect)));
  if (!usesDestructive) return pass("skill.instructions: no destructive effects in test pack");

  const hasConfirm = CONFIRMATION_RE.test(skill.body) || PLAN_BEFORE_ACT_RE.test(skill.body);
  const confirmCheck = {
    pass: hasConfirm,
    score: hasConfirm ? 1 : 0,
    reason: hasConfirm ? "SKILL.md describes confirmation / plan-before-act" : "SKILL.md uses destructive effects but lacks confirmation / plan-before-act language"
  };

  const declaresForbidden = testCases.some((testCase) => Array.isArray(testCase.vars.should_not) && testCase.vars.should_not.length > 0);
  const forbiddenCheck = {
    pass: declaresForbidden,
    score: declaresForbidden ? 1 : 0,
    reason: declaresForbidden ? "test pack declares forbidden effects (should_not)" : "no should_not declared in any test, despite destructive effects"
  };

  const components = [confirmCheck, forbiddenCheck];
  const failures = components.filter((component) => !component.pass);
  if (failures.length > 0) {
    return fail(failures.map((failure) => `✗ ${failure.reason}`).join("; "), components);
  }
  return pass("skill.instructions: ok", components);
}
354
+ //#endregion
355
+ //#region src/skill-checks/assertions-static/negative-coverage.ts
356
/**
 * SPEC §7.6 — negative coverage. When any parsed test uses an effect type
 * from the `riskyEffects` setting, at least one test must be flagged
 * `isNegative`; otherwise the check passes without requiring one.
 *
 * `assert.config.agentSkillEvals.riskyEffects` overrides the default list.
 */
async function negativeCoverage(_output, context) {
  const meta = getStaticMeta(context);
  if (!meta) return fail("skill.tests: provider metadata missing");
  const { tests } = meta;
  if (!tests) return fail("skill.tests: tests not parsed");

  const risky = new Set(skillCheckSettings(context).riskyEffects);
  const usesRisky = tests.tests.some((testCase) => testCase.effectTypes.some((effect) => risky.has(effect)));
  if (!usesRisky) return pass("skill.tests: no risky effects, negative test not required");

  const negativeCount = tests.tests.filter((testCase) => testCase.isNegative).length;
  if (negativeCount > 0) return pass(`skill.tests: ${negativeCount} negative test(s)`);
  return fail(`skill.tests: skill uses risky effects (${[...risky].join(", ")}) but no negative test (kind: negative) is declared`);
}
373
+ //#endregion
374
+ //#region src/skill-checks/assertions-static/routing-metadata.ts
375
// Phrases that mark a description as generic.
const GENERIC_PHRASES = [
  /\bhelp(s|ing)?\b/i,
  /\bgithub workflows?\b/i,
  /\bvarious\b/i,
  /\bany kind of\b/i
];
// Phrases that state when to use the skill.
const WHEN_TO_USE_RE = /\b(use when|use this|when (?:the )?(?:user|you))\b/i;
// Phrases that state when NOT to use the skill. The alternatives are grouped so
// both `\b` anchors apply to every one of them; ungrouped, "not for" would
// match inside unrelated text such as "cannot format".
const WHEN_NOT_RE = /\b(?:do not use|do not invoke|don'?t use|not for|avoid using)\b/i;
383
/**
 * SPEC §7.1 — routing metadata hard checks on the skill frontmatter.
 *
 * Five checks: `name` present, `description` present, description says when
 * to use, description says when not to use, and description is not generic
 * (a generic phrase in a description shorter than 80 characters).
 */
async function routingMetadata(_output, context) {
  const meta = getStaticMeta(context);
  if (!meta) return fail("routing-metadata: provider metadata missing");
  const { skill } = meta;
  if (!skill) return fail("routing-metadata: skill not parsed (vars.skillPath missing?)");

  const frontmatter = skill.frontmatter;
  const trimmedString = (value) => (typeof value === "string" ? value.trim() : "");
  const name = trimmedString(frontmatter.name);
  const desc = trimmedString(frontmatter.description);

  const components = [];
  const check = (ok, okReason, failReason) => {
    components.push({ pass: ok, score: ok ? 1 : 0, reason: ok ? okReason : failReason });
  };

  check(name.length > 0, `name: ${name}`, "missing `name` frontmatter");
  check(desc.length > 0, `description present (${desc.length} chars)`, "missing `description` frontmatter");
  check(WHEN_TO_USE_RE.test(desc), "description says when to use", "description does not say when to use (e.g. 'Use when …')");
  check(WHEN_NOT_RE.test(desc), "description says when not to use", "description does not say when not to use (e.g. 'Do not use for …')");
  const generic = GENERIC_PHRASES.some((pattern) => pattern.test(desc)) && desc.length < 80;
  check(!generic, "description is specific enough", "description is too generic");

  const failures = components.filter((component) => !component.pass);
  if (failures.length > 0) {
    return fail(failures.map((failure) => `✗ ${failure.reason}`).join("; "), components);
  }
  return pass(`routing-metadata: ${components.length}/${components.length} checks ok`, components);
}
427
+ //#endregion
428
+ //#region src/skill-checks/assertions-static/scenario-validity.ts
429
/**
 * SPEC §7.6 — scenario validity.
 *
 * Every parsed test needs a non-empty `vars.prompt`, a fixture (`hasFixture`),
 * at least one non-empty `should` / `should_not` array and, when the
 * `requireTokenBudget` setting is on, a token budget. Parse errors, a glob
 * that matched no files, unresolved effect types and missing referenced
 * files each add a failing component.
 */
async function scenarioValidity(_output, context) {
  const meta = getStaticMeta(context);
  if (!meta) return fail("skill.tests: provider metadata missing");
  const { tests } = meta;
  if (!tests) return fail("skill.tests: tests not parsed (vars.testsGlob missing?)");
  const { requireTokenBudget } = skillCheckSettings(context);

  const failure = (reason) => ({ pass: false, score: 0, reason });
  const hasEntries = (value) => Array.isArray(value) && value.length > 0;
  const components = [];

  if (tests.parseErrors.length > 0) {
    const details = tests.parseErrors.map((e) => `${e.filePath}: ${e.error}`).join("; ");
    components.push(failure(`parse errors: ${details}`));
  }
  if (tests.matchedFiles.length === 0) {
    components.push(failure("testsGlob matched no test files"));
  }

  for (const testCase of tests.tests) {
    const issues = [];
    for (const entryError of testCase.entryErrors) {
      const location = entryError.index === void 0 ? entryError.field : `${entryError.field}[${entryError.index}]`;
      issues.push(`${location}: ${entryError.reason}`);
    }
    const prompt = testCase.vars.prompt;
    if (typeof prompt !== "string" || prompt.length === 0) issues.push("missing vars.prompt");
    if (!testCase.hasFixture) issues.push("missing vars.fixture (or vars.fixtureless: true)");
    if (!hasEntries(testCase.vars.should) && !hasEntries(testCase.vars.should_not)) issues.push("no should / should_not");
    if (requireTokenBudget && !testCase.hasTokenBudget) issues.push("missing skill.budget assertion");

    const clean = issues.length === 0;
    components.push({
      pass: clean,
      score: clean ? 1 : 0,
      reason: `${testCase.description ?? "(no description)"}: ${clean ? "ok" : issues.join(", ")}`
    });
  }

  if (meta.unresolvedEffectTypes.length > 0) {
    components.push(failure(`unsupported effect types: ${meta.unresolvedEffectTypes.join(", ")}`));
  }
  if (meta.missingFiles.length > 0) {
    // Only the first five missing files are listed.
    components.push(failure(`missing referenced files: ${meta.missingFiles.slice(0, 5).join(", ")}`));
  }

  const failures = components.filter((component) => !component.pass);
  if (failures.length > 0) {
    return fail(failures.map((item) => `✗ ${item.reason}`).join("; "), components);
  }
  return pass(`skill.tests: ${tests.tests.length} test(s) ok`, components);
}
481
+ //#endregion
482
+ //#region src/skill-checks/assertions-static/promptfoo.ts
483
// Metric name -> static checks run for it. "skill.checks" runs all of them.
const metricChecks = {
  "skill.activation": [routingMetadata],
  "skill.budgets": [scenarioValidity],
  "skill.context": [contextEconomy],
  "skill.instructions": [instructionCalibration],
  "skill.tests": [scenarioValidity, negativeCoverage],
  "skill.verifiers": [executableHelper],
  "skill.checks": [
    routingMetadata,
    contextEconomy,
    instructionCalibration,
    scenarioValidity,
    negativeCoverage,
    executableHelper
  ]
};
// Sorted, comma-separated metric names, used in the unknown-metric message.
const availableMetrics$1 = Object.keys(metricChecks).sort().join(", ");
500
/**
 * Returns the first string among `context.assertion.metric`,
 * `context.assert.metric`, `context.config.metric` and `context.metric`,
 * or `undefined` when none of them is a string.
 */
function metricFrom$1(context) {
  const candidates = [
    context.assertion?.metric,
    context.assert?.metric,
    context.config?.metric,
    context.metric
  ];
  for (const candidate of candidates) {
    if (typeof candidate === "string") return candidate;
  }
  return undefined;
}
508
/**
 * Runs every check with unbounded concurrency and merges the results.
 *
 * The merged result passes only when every check passes; its score is then
 * the lowest individual score. Passing checks with a score below 1 supply
 * the reason when there are no failures.
 */
function runChecksEffect(output, context, checks) {
  return Effect.gen(function* () {
    const results = yield* Effect.forEach(
      checks,
      (check) => Effect.promise(() => check(output, context)),
      { concurrency: "unbounded" }
    );
    const joinReasons = (items) => items.map((item) => item.reason).join("; ");
    const failures = results.filter((result) => !result.pass);
    const warnings = results.filter((result) => result.pass && result.score < 1);

    if (failures.length > 0) {
      return {
        pass: false,
        score: 0,
        reason: joinReasons(failures),
        componentResults: results
      };
    }
    return {
      pass: true,
      score: Math.min(...results.map((result) => result.score)),
      reason: warnings.length > 0 ? joinReasons(warnings) : `skill checks: ${results.length} check(s) passed`,
      componentResults: results
    };
  });
}
521
/** Promise entry point for the static skill checks. */
async function agentSkillEvalsStaticAssertions(output, context) {
  const program = agentSkillEvalsStaticAssertionsEffect(output, context);
  return Effect.runPromise(program);
}
524
/**
 * Selects the static checks for the metric named in `context` and runs them.
 *
 * @param output forwarded to each check
 * @param context assertion context; the metric is read via `metricFrom$1`
 * @returns an Effect yielding the merged check result, or a failing
 *   "unknown metric" result when the metric is missing or not registered
 */
function agentSkillEvalsStaticAssertionsEffect(output, context) {
  return Effect.gen(function* () {
    const metric = metricFrom$1(context);
    // Own-property lookup only: a plain `metricChecks[metric]` would resolve
    // inherited keys such as "constructor" and skip the unknown-metric result.
    const checks = typeof metric === "string" && Object.hasOwn(metricChecks, metric) ? metricChecks[metric] : void 0;
    if (!checks) return {
      pass: false,
      score: 0,
      reason: `agent-skill-evals skill checks: unknown metric "${metric ?? "missing"}". Available metrics: ${availableMetrics$1}`
    };
    return yield* runChecksEffect(output, context, checks);
  });
}
536
+ //#endregion
537
+ //#region src/assertions/promptfoo.ts
538
// Metrics handled by a dedicated assertion function.
const routes = {
  "skill.budget": skillBudget,
  "skill.test": skillTest
};
// Metrics delegated to the static skill checks.
const staticMetrics = [
  "skill.checks",
  "skill.activation",
  "skill.budgets",
  "skill.context",
  "skill.instructions",
  "skill.tests",
  "skill.verifiers"
];
// Sorted, comma-separated list of every metric, used in the unknown-metric message.
const availableMetrics = Object.keys(routes).concat(staticMetrics).sort().join(", ");
552
/**
 * Returns the first string among `context.assertion.metric`,
 * `context.assert.metric`, `context.config.metric` and `context.metric`,
 * or `undefined` when none of them is a string.
 */
function metricFrom(context) {
  const candidates = [
    context.assertion?.metric,
    context.assert?.metric,
    context.config?.metric,
    context.metric
  ];
  for (const candidate of candidates) {
    if (typeof candidate === "string") return candidate;
  }
  return undefined;
}
560
/**
 * Top-level assertion dispatcher.
 *
 * Routes to the dedicated assertion for metrics listed in `routes`, to the
 * static skill checks for metrics listed in `staticMetrics`, and otherwise
 * returns a failing "unknown metric" result.
 *
 * @param output forwarded to the selected assertion
 * @param context assertion context; the metric is read via `metricFrom`
 * @returns a promise of `{ pass, score, reason, ... }`
 */
async function agentSkillEvalsAssertions(output, context) {
  const metric = metricFrom(context);
  if (typeof metric === "string") {
    // Own-property lookup only: a plain `routes[metric]` would resolve
    // inherited keys such as "toString" and call them as an assertion.
    if (Object.hasOwn(routes, metric)) return routes[metric](output, context);
    if (staticMetrics.includes(metric)) return agentSkillEvalsStaticAssertions(output, context);
  }
  return {
    pass: false,
    score: 0,
    reason: `agent-skill-evals assertions: unknown metric "${metric ?? "missing"}". Available metrics: ${availableMetrics}`
  };
}
571
+ //#endregion
572
+ export { agentSkillEvalsAssertions as default, agentSkillEvalsAssertions as promptfooAssertions, skillTest };
573
+
574
+ //# sourceMappingURL=index.mjs.map