@sanity/ailf 3.8.0 → 3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/canary-tasks.ts +64 -0
- package/config/test-budgets.ts +24 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +19 -0
- package/dist/_vendor/ailf-core/config-helpers.js +27 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
- package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +2 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
- package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
- package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
- package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
- package/dist/_vendor/ailf-shared/index.d.ts +1 -0
- package/dist/_vendor/ailf-shared/index.js +1 -0
- package/dist/adapters/config-sources/file-config-adapter.js +4 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
- package/dist/cli-program.d.ts +39 -0
- package/dist/cli-program.js +137 -0
- package/dist/cli.d.ts +6 -0
- package/dist/cli.js +12 -122
- package/dist/config/canary-tasks.ts +64 -0
- package/dist/config/test-budgets.ts +24 -0
- package/dist/pipeline/calculate-scores.d.ts +17 -2
- package/dist/pipeline/calculate-scores.js +99 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
- package/package.json +6 -3
- package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +0 -10
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +0 -366
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +0 -145
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +0 -10
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +0 -314
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +0 -10
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +0 -486
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +0 -10
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +0 -425
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +0 -332
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +0 -12
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +0 -210
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +0 -7
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +0 -404
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +0 -10
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +0 -184
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +0 -8
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +0 -301
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +0 -503
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +0 -10
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +0 -509
|
@@ -1,404 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* scoring-and-presets.test.ts — Tests for 4-tier scoring engine,
|
|
3
|
-
* storage schema, and plugin registry / presets.
|
|
4
|
-
*
|
|
5
|
-
* Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-and-presets.test.ts
|
|
6
|
-
*/
|
|
7
|
-
import assert from "node:assert/strict";
|
|
8
|
-
import { dirname, resolve } from "node:path";
|
|
9
|
-
import { describe, it } from "node:test";
|
|
10
|
-
import { fileURLToPath } from "node:url";
|
|
11
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
12
|
-
import { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "../../../_vendor/ailf-core/index.js";
|
|
13
|
-
import { CURRENT_SCHEMA_VERSION, InMemoryPluginRegistry, isSchemaVersioned, migrateDocument, } from "../../../_vendor/ailf-core/index.js";
|
|
14
|
-
import { createSanityLiteracyPreset, sanityLiteracyPreset, } from "../presets/sanity-literacy.js";
|
|
15
|
-
// ---------------------------------------------------------------------------
|
|
16
|
-
// Helpers
|
|
17
|
-
// ---------------------------------------------------------------------------
|
|
18
|
-
/**
 * Build an assertion-result fixture for the scoring tests.
 *
 * @param {object} [overrides] - Fields that replace (or extend) the defaults.
 * @returns {object} A fresh object: the defaults below with `overrides` applied last.
 */
function makeAssertion(overrides) {
    const defaults = {
        pass: true,
        score: 0.8,
        reason: "Good",
        assertionType: "llm-rubric",
        dimension: "task-completion",
        latencyMs: 100,
        weight: 1.0,
    };
    // Spreading `undefined` is a no-op, so calling with no argument is fine.
    return { ...defaults, ...overrides };
}
|
|
30
|
-
/**
 * Build a dimension-score fixture for the scoring tests.
 *
 * @param {object} [overrides] - Fields that replace (or extend) the defaults.
 * @returns {object} A fresh object: the defaults below with `overrides` applied last.
 */
function makeDimension(overrides) {
    const defaults = {
        dimensionId: "task-completion",
        label: "Task Completion",
        score: 0.8,
        assertionCount: 2,
        passCount: 2,
        aggregation: "weighted-mean",
        // A new array on every call, so fixtures never share state.
        assertions: [],
    };
    return { ...defaults, ...overrides };
}
|
|
42
|
-
// ---------------------------------------------------------------------------
|
|
43
|
-
// Tier 1 → Tier 2: Assertion → Dimension aggregation
|
|
44
|
-
// ---------------------------------------------------------------------------
|
|
45
|
-
describe("aggregateDimensions", () => {
    it("groups assertions by dimension", () => {
        // Two assertions share "code-correctness"; one belongs to "task-completion".
        const dims = aggregateDimensions([
            makeAssertion({ dimension: "code-correctness", score: 0.9 }),
            makeAssertion({ dimension: "code-correctness", score: 0.7 }),
            makeAssertion({ dimension: "task-completion", score: 0.8 }),
        ]);
        assert.equal(dims.length, 2);
        const codeCorrectness = dims.find((entry) => entry.dimensionId === "code-correctness");
        assert.ok(codeCorrectness);
        assert.equal(codeCorrectness.assertionCount, 2);
    });
    it("uses weighted-mean by default", () => {
        const light = makeAssertion({ score: 0.6, weight: 1.0 });
        const heavy = makeAssertion({ score: 0.8, weight: 3.0 });
        const dims = aggregateDimensions([light, heavy]);
        // Expected weighted mean: (0.6*1 + 0.8*3) / (1+3) = 3.0/4 = 0.75
        const distance = Math.abs(dims[0].score - 0.75);
        assert.ok(distance < 0.01);
    });
    it("falls back to pass rate when no numeric scores", () => {
        // One pass and one fail, neither with a numeric score → expected 0.5.
        const passed = makeAssertion({ score: null, pass: true });
        const failed = makeAssertion({ score: null, pass: false });
        const dims = aggregateDimensions([passed, failed]);
        assert.equal(dims[0].score, 0.5);
    });
    it("applies custom dimension labels", () => {
        const options = {
            dimensionLabels: { tc: "Task Completion" },
        };
        const dims = aggregateDimensions([makeAssertion({ dimension: "tc" })], options);
        assert.equal(dims[0].label, "Task Completion");
    });
});
|
|
83
|
-
// ---------------------------------------------------------------------------
|
|
84
|
-
// Tier 2 → Tier 3: Dimension → Task scoring
|
|
85
|
-
// ---------------------------------------------------------------------------
|
|
86
|
-
describe("computeTaskScore", () => {
    it("computes weighted score from dimensions", () => {
        const dims = [
            makeDimension({ dimensionId: "tc", score: 0.8 }),
            makeDimension({ dimensionId: "cc", score: 0.6 }),
        ];
        const options = {
            taskId: "test-task",
            weights: { tc: 0.6, cc: 0.4 },
        };
        const task = computeTaskScore(dims, options);
        // Expected: 0.8*0.6 + 0.6*0.4 = 0.48 + 0.24 = 0.72
        const distance = Math.abs(task.score - 0.72);
        assert.ok(distance < 0.01);
    });
    it("normalizes weights that don't sum to 1", () => {
        const dims = [
            makeDimension({ dimensionId: "tc", score: 1.0 }),
            makeDimension({ dimensionId: "cc", score: 0.0 }),
        ];
        const options = {
            taskId: "test-task",
            weights: { tc: 2, cc: 2 },
        };
        const task = computeTaskScore(dims, options);
        // Expected: (1.0*2 + 0.0*2) / (2+2) = 2/4 = 0.5
        const distance = Math.abs(task.score - 0.5);
        assert.ok(distance < 0.01);
    });
    it("checks against threshold", () => {
        // A single dimension scored 0.6, evaluated against a lower and a higher threshold.
        const dims = [makeDimension({ dimensionId: "tc", score: 0.6 })];
        const scoreWithThreshold = (taskId, threshold) => computeTaskScore(dims, {
            taskId,
            weights: { tc: 1.0 },
            threshold,
        });
        const passing = scoreWithThreshold("t1", 0.5);
        assert.equal(passing.passesThreshold, true);
        const failing = scoreWithThreshold("t2", 0.7);
        assert.equal(failing.passesThreshold, false);
    });
    it("records weight source", () => {
        const options = {
            taskId: "t1",
            weights: { "task-completion": 1.0 },
            weightSource: "rubrics.yaml:default",
        };
        const task = computeTaskScore([makeDimension()], options);
        assert.equal(task.weightSource, "rubrics.yaml:default");
    });
});
|
|
135
|
-
// ---------------------------------------------------------------------------
|
|
136
|
-
// Tier 3 → Tier 4: Task → Area aggregation
|
|
137
|
-
// ---------------------------------------------------------------------------
|
|
138
|
-
describe("aggregateAreas", () => {
    // Score a task made of one default ("task-completion") dimension with the given score.
    const scoreSingleDimensionTask = (taskId, score) => computeTaskScore([makeDimension({ score })], {
        taskId,
        weights: { "task-completion": 1.0 },
    });
    it("groups tasks by area prefix", () => {
        const tasks = [
            scoreSingleDimensionTask("groq-basic", 0.8),
            scoreSingleDimensionTask("groq-advanced", 0.6),
            scoreSingleDimensionTask("studio-schema", 0.9),
        ];
        const areas = aggregateAreas(tasks);
        assert.equal(areas.length, 2);
        const groqArea = areas.find((area) => area.areaId === "groq");
        assert.ok(groqArea);
        assert.equal(groqArea.taskCount, 2);
        // Expected area score: (0.8+0.6)/2
        assert.ok(Math.abs(groqArea.score - 0.7) < 0.01);
        const studioArea = areas.find((area) => area.areaId === "studio");
        assert.ok(studioArea);
        assert.equal(studioArea.taskCount, 1);
    });
    it("computes delta from previous scores", () => {
        const previousScores = { groq: 0.6 };
        const areas = aggregateAreas([scoreSingleDimensionTask("groq-basic", 0.8)], previousScores);
        assert.ok(areas[0].delta !== null);
        // Current 0.8 against previous 0.6 → expected delta of 0.2.
        assert.ok(Math.abs(areas[0].delta - 0.2) < 0.01);
    });
});
|
|
176
|
-
// ---------------------------------------------------------------------------
|
|
177
|
-
// Score normalization
|
|
178
|
-
// ---------------------------------------------------------------------------
|
|
179
|
-
describe("normalizeScore", () => {
    // Tolerance-based comparison used by the floating-point expectations below.
    const isWithinTolerance = (actual, expected) => Math.abs(actual - expected) < 0.01;
    it("normalizes LLM rubric scores (0-100 → 0-1)", () => {
        const normalized = normalizeScore(75, "llm-rubric");
        assert.ok(isWithinTolerance(normalized, 0.75));
    });
    it("passes through already-normalized scores", () => {
        const normalized = normalizeScore(0.75, "llm-rubric");
        assert.ok(isWithinTolerance(normalized, 0.75));
    });
    it("normalizes boolean assertions to 0 or 1", () => {
        const whenPassing = normalizeScore(1, "contains");
        assert.equal(whenPassing, 1);
        const whenFailing = normalizeScore(0, "contains");
        assert.equal(whenFailing, 0);
    });
    it("clamps similarity scores to [0, 1]", () => {
        const aboveRange = normalizeScore(1.5, "similar");
        assert.equal(aboveRange, 1);
        const belowRange = normalizeScore(-0.1, "similar");
        assert.equal(belowRange, 0);
    });
});
|
|
195
|
-
// ---------------------------------------------------------------------------
|
|
196
|
-
// Ensemble grading
|
|
197
|
-
// ---------------------------------------------------------------------------
|
|
198
|
-
describe("computeEnsembleScore", () => {
    it("computes mean ensemble score", () => {
        const result = computeEnsembleScore([0.8, 0.6, 0.7], "mean");
        assert.ok(Math.abs(result.score - 0.7) < 0.01);
        assert.ok(result.agreement > 0);
    });
    it("computes median ensemble score", () => {
        const result = computeEnsembleScore([0.9, 0.5, 0.7], "median");
        assert.ok(Math.abs(result.score - 0.7) < 0.01);
    });
    it("computes max ensemble score", () => {
        const result = computeEnsembleScore([0.9, 0.5, 0.7], "max");
        assert.ok(Math.abs(result.score - 0.9) < 0.01);
    });
    it("agreement is 1 for identical scores", () => {
        // Called without a strategy argument on purpose: exercises the default.
        const result = computeEnsembleScore([0.8, 0.8, 0.8]);
        assert.ok(Math.abs(result.agreement - 1.0) < 0.01);
    });
    it("agreement decreases with divergent scores", () => {
        const result = computeEnsembleScore([0.0, 1.0]);
        assert.ok(result.agreement < 0.6);
    });
});
|
|
221
|
-
// ---------------------------------------------------------------------------
|
|
222
|
-
// Storage schema
|
|
223
|
-
// ---------------------------------------------------------------------------
|
|
224
|
-
describe("storage schema", () => {
    it("CURRENT_SCHEMA_VERSION is 1", () => {
        assert.equal(CURRENT_SCHEMA_VERSION, 1);
    });
    it("isSchemaVersioned detects versioned docs", () => {
        const versionedDoc = { schemaVersion: 1 };
        assert.equal(isSchemaVersioned(versionedDoc), true);
        // Neither an object without the field nor `null` counts as versioned.
        const unversionedDoc = {};
        assert.equal(isSchemaVersioned(unversionedDoc), false);
        assert.equal(isSchemaVersioned(null), false);
    });
    it("migrateDocument is no-op for current version", () => {
        const migrated = migrateDocument({ schemaVersion: 1, _type: "ailf.run" });
        assert.equal(migrated.schemaVersion, 1);
    });
});
|
|
239
|
-
// ---------------------------------------------------------------------------
|
|
240
|
-
// Plugin registry
|
|
241
|
-
// ---------------------------------------------------------------------------
|
|
242
|
-
describe("InMemoryPluginRegistry", () => {
    it("registers and retrieves modes", () => {
        const registry = new InMemoryPluginRegistry();
        registry.registerMode({
            id: "custom",
            label: "Custom Mode",
            validProviderPatterns: [".*"],
            rubricTemplateIds: [],
            handlerModule: "./custom.js",
        });
        assert.equal(registry.getModes().length, 1);
        assert.equal(registry.getMode("custom")?.label, "Custom Mode");
    });
    it("registers and retrieves assertions", () => {
        const registry = new InMemoryPluginRegistry();
        registry.registerAssertion({
            type: "api-match",
            label: "API Match",
            compatibleModes: ["custom"],
            handlerModule: "./api-match.js",
        });
        assert.equal(registry.getAssertions().length, 1);
    });
    it("registers a complete preset with mode base", async () => {
        const registry = new InMemoryPluginRegistry();
        // Must register mode base first.
        // This file is an ES module (static `import`, `import.meta.url`), where
        // `require` is not defined, so the mode base is loaded with a dynamic
        // `import()` instead.
        const { createLiteracyModeBase } = await import("../mode-bases/literacy.js");
        registry.registerModeBase(createLiteracyModeBase());
        registry.registerPreset(sanityLiteracyPreset);
        // Mode + rubrics from mode base, domain config from preset
        assert.ok(registry.getMode("literacy"));
        assert.ok(registry.getRubricTemplates().length > 0);
        assert.ok(registry.getPresets().length === 1);
    });
});
|
|
277
|
-
// ---------------------------------------------------------------------------
|
|
278
|
-
// sanity-literacy preset
|
|
279
|
-
// ---------------------------------------------------------------------------
|
|
280
|
-
describe("sanityLiteracyPreset", () => {
    it("has correct manifest", () => {
        const { name, manifest } = sanityLiteracyPreset;
        assert.equal(name, "sanity-literacy");
        assert.equal(manifest.pluginApiVersion, 1);
    });
    it("targets literacy mode base", () => {
        assert.equal(sanityLiteracyPreset.mode, "literacy");
    });
    it("does not bundle assertions (now framework built-ins)", () => {
        assert.equal(sanityLiteracyPreset.assertions, undefined);
    });
    it("does not bundle rubrics/scoring/prompts (now in literacy mode base)", () => {
        // Evaluation methodology moved to mode-bases/literacy.ts
        assert.equal(sanityLiteracyPreset.rubricTemplates, undefined);
        assert.equal(sanityLiteracyPreset.scoringProfiles, undefined);
        assert.equal(sanityLiteracyPreset.promptTemplates, undefined);
    });
    it("includes sanity:// fixture resolver", () => {
        const hasSanityResolver = sanityLiteracyPreset.fixtureResolvers?.some((resolver) => resolver.scheme === "sanity://");
        assert.ok(hasSanityResolver);
    });
    it("includes 3 source definitions", () => {
        const sources = sanityLiteracyPreset.sourceDefs;
        assert.ok(sources);
        assert.equal(sources.length, 3);
        const sourceNames = sources.map((source) => source.name);
        assert.ok(sourceNames.includes("production"));
        assert.ok(sourceNames.includes("branch"));
        assert.ok(sourceNames.includes("local"));
    });
    it("production source has correct baseUrl", () => {
        const production = sanityLiteracyPreset.sourceDefs.find((source) => source.name === "production");
        assert.ok(production);
        assert.equal(production.baseUrl, "https://www.sanity.io/docs");
    });
    it("includes feature registry with all features", () => {
        const registry = sanityLiteracyPreset.featureDefs;
        assert.ok(registry);
        assert.equal(registry.features.length, 14);
        // Spot-check a handful of feature ids rather than the full list.
        const featureIds = registry.features.map((feature) => feature.id);
        assert.ok(featureIds.includes("groq"));
        assert.ok(featureIds.includes("visual-editing"));
        assert.ok(featureIds.includes("portable-text"));
        assert.ok(featureIds.includes("ai-assist"));
    });
    it("includes a docFetcher factory", () => {
        assert.equal(typeof sanityLiteracyPreset.docFetcher, "function");
        // Invoked as a method on the preset, exactly as a consumer would.
        const docFetcher = sanityLiteracyPreset.docFetcher();
        assert.ok(docFetcher);
        assert.equal(typeof docFetcher.fetch, "function");
    });
});
|
|
331
|
-
// ---------------------------------------------------------------------------
|
|
332
|
-
// createSanityLiteracyPreset factory
|
|
333
|
-
// ---------------------------------------------------------------------------
|
|
334
|
-
describe("createSanityLiteracyPreset", () => {
    it("returns a domain-only preset targeting literacy mode", () => {
        const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
        assert.equal(preset.name, "sanity-literacy");
        assert.equal(preset.mode, "literacy");
        // Domain config present
        assert.ok(preset.fixtureResolvers);
        assert.ok(preset.docFetcher);
        assert.ok(preset.sourceDefs);
        assert.ok(preset.featureDefs);
        // Methodology inherited from mode base, not on preset
        assert.equal(preset.rubricTemplates, undefined);
        assert.equal(preset.scoringProfiles, undefined);
        assert.equal(preset.promptTemplates, undefined);
    });
    it("registers all extension points via mode base + domain config", async () => {
        const registry = new InMemoryPluginRegistry();
        // Must register mode base first (composition root does this).
        // This file is an ES module (static `import`, `import.meta.url`), where
        // `require` is not defined, so the mode base is loaded with a dynamic
        // `import()` instead.
        const { createLiteracyModeBase } = await import("../mode-bases/literacy.js");
        registry.registerModeBase(createLiteracyModeBase());
        const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
        registry.registerPreset(preset);
        // Mode from mode base
        assert.ok(registry.getMode("literacy"));
        // Rubrics, scoring, prompts inherited from mode base
        assert.equal(registry.getRubricTemplates().length, 3);
        assert.equal(Object.keys(registry.getPromptTemplates()).length, 3);
        assert.equal(Object.keys(registry.getScoringProfiles()).length, 2);
        // Domain config from preset
        assert.ok(registry.getDocFetcherFactory());
        assert.equal(registry.getSourceDefs().length, 3);
        assert.ok(registry.getFeatureDefs());
        assert.equal(registry.getFeatureDefs().features.length, 14);
    });
});
|
|
369
|
-
// ---------------------------------------------------------------------------
|
|
370
|
-
// Preset is single source of truth for sources and features
|
|
371
|
-
// ---------------------------------------------------------------------------
|
|
372
|
-
describe("preset is single source of truth for Sanity config", () => {
    // Directory four levels above this test file; passed to the config loader as its root.
    const resolveRoot = () => resolve(__dirname, "..", "..", "..", "..");
    it("config/sources.ts exports an empty array", async () => {
        const { tryLoadConfigFile } = await import("../../compiler/config-loader.js");
        const loaded = tryLoadConfigFile("sources", resolveRoot());
        assert.ok(loaded, "config/sources.ts should exist");
        const { data: sources } = loaded;
        assert.ok(Array.isArray(sources), "should export an array");
        assert.equal(sources.length, 0, "config/sources should be empty (preset provides sources)");
    });
    it("config/features.ts exports an empty features array", async () => {
        const { tryLoadConfigFile } = await import("../../compiler/config-loader.js");
        const loaded = tryLoadConfigFile("features", resolveRoot());
        assert.ok(loaded, "config/features.ts should exist");
        assert.ok(Array.isArray(loaded.data.features), "should have a features array");
        assert.equal(loaded.data.features.length, 0, "config/features should be empty (preset provides features)");
    });
    it("preset contains all 3 source entries", () => {
        const { sourceDefs } = sanityLiteracyPreset;
        assert.equal(sourceDefs.length, 3);
        // Sort so the comparison does not depend on declaration order.
        const sortedNames = sourceDefs.map((source) => source.name).sort();
        assert.deepEqual(sortedNames, ["branch", "local", "production"]);
    });
    it("preset contains all 14 feature entries", () => {
        const { features } = sanityLiteracyPreset.featureDefs;
        assert.equal(features.length, 14);
        const countWithStatus = (status) => features.filter((feature) => feature.status === status).length;
        assert.equal(countWithStatus("covered"), 6, "should have 6 covered features");
        assert.equal(countWithStatus("uncovered"), 8, "should have 8 uncovered features");
    });
});
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
|
|
3
|
-
*
|
|
4
|
-
* Verifies that `scoreTestGroup` produces the same 0–100 output as the
|
|
5
|
-
* legacy `accumulateDimensions → averageDimensions → weightedComposite`
|
|
6
|
-
* chain when given identical inputs.
|
|
7
|
-
*
|
|
8
|
-
* Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
|
|
9
|
-
*/
|
|
10
|
-
export {};
|
|
@@ -1,184 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
|
|
3
|
-
*
|
|
4
|
-
* Verifies that `scoreTestGroup` produces the same 0–100 output as the
|
|
5
|
-
* legacy `accumulateDimensions → averageDimensions → weightedComposite`
|
|
6
|
-
* chain when given identical inputs.
|
|
7
|
-
*
|
|
8
|
-
* Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
|
|
9
|
-
*/
|
|
10
|
-
import assert from "node:assert/strict";
|
|
11
|
-
import { describe, it } from "node:test";
|
|
12
|
-
import { scoreTestGroup } from "../scoring-bridge.js";
|
|
13
|
-
// ---------------------------------------------------------------------------
|
|
14
|
-
// Helpers
|
|
15
|
-
// ---------------------------------------------------------------------------
|
|
16
|
-
/**
 * Maps each camelCase key accepted in `overrides.dimensions` to the
 * kebab-case dimension id written into the assertion metadata. Array order
 * is the order in which components are emitted.
 */
const RUBRIC_DIMENSION_IDS = [
    ["taskCompletion", "task-completion"],
    ["codeCorrectness", "code-correctness"],
    ["docCoverage", "doc-coverage"],
];
/**
 * Build one passing `llm-rubric` component result for a dimension.
 *
 * @param {string} dimension - Kebab-case dimension id stored in the metadata.
 * @param {number} score - Score as supplied by the caller; it is serialized
 *   unchanged into `reason` and divided by 100 for the `score` field.
 * @returns {object} The component-result fixture.
 */
function makeRubricComponent(dimension, score) {
    return {
        assertion: {
            type: "llm-rubric",
            metadata: { dimension },
        },
        pass: true,
        reason: JSON.stringify({ score }),
        score: score / 100,
    };
}
/**
 * Build a test-result fixture to feed into `scoreTestGroup`.
 *
 * @param {object} [overrides] - Optional customizations.
 * @param {object} [overrides.dimensions] - Per-dimension scores keyed by
 *   `taskCompletion`, `codeCorrectness` and/or `docCoverage`. A component is
 *   emitted only for keys whose value is not `undefined` (so `0` is kept).
 * @param {number} [overrides.cost] - Defaults to 0.01.
 * @param {string} [overrides.description] - Defaults to "test".
 * @param {object} [overrides.vars] - Defaults to `{ task: "test", docs: "" }`.
 * @returns {object} The assembled fixture with a passing `gradingResult`.
 */
function makeTestResult(overrides) {
    const dims = overrides?.dimensions ?? {};
    const componentResults = [];
    for (const [key, dimension] of RUBRIC_DIMENSION_IDS) {
        const score = dims[key];
        if (score !== undefined) {
            componentResults.push(makeRubricComponent(dimension, score));
        }
    }
    return {
        cost: overrides?.cost ?? 0.01,
        description: overrides?.description ?? "test",
        gradingResult: {
            componentResults,
            pass: true,
        },
        response: { output: "mock output" },
        vars: overrides?.vars ?? { task: "test", docs: "" },
    };
}
|
|
63
|
-
/** Scoring profile that weights all three rubric dimensions; the weights sum to 1. */
const DEFAULT_PROFILE = {
    ["code-correctness"]: 0.35,
    ["doc-coverage"]: 0.25,
    ["task-completion"]: 0.40,
};
|
|
68
|
-
/** Scoring profile with no "doc-coverage" weight; the two remaining weights sum to 1. */
const OUTPUT_ONLY_PROFILE = {
    ["code-correctness"]: 0.55,
    ["task-completion"]: 0.45,
};
|
|
72
|
-
// ---------------------------------------------------------------------------
|
|
73
|
-
// Tests
|
|
74
|
-
// ---------------------------------------------------------------------------
|
|
75
|
-
describe("scoreTestGroup — basic scoring", () => {
    it("returns zeroes for empty test array", () => {
        const outcome = scoreTestGroup([], DEFAULT_PROFILE);
        assert.equal(outcome.composite, 0);
        assert.equal(outcome.totalCost, 0);
        assert.deepEqual(outcome.dimensions, {});
    });
    it("scores a single test with all dimensions", () => {
        const onlyTest = makeTestResult({
            dimensions: {
                taskCompletion: 80,
                codeCorrectness: 70,
                docCoverage: 60,
            },
        });
        const outcome = scoreTestGroup([onlyTest], DEFAULT_PROFILE);
        // Expected: 80*0.4 + 70*0.35 + 60*0.25 = 32 + 24.5 + 15 = 71.5 → 72
        assert.equal(outcome.dimensions.taskCompletion, 80);
        assert.equal(outcome.dimensions.codeCorrectness, 70);
        assert.equal(outcome.dimensions.docCoverage, 60);
        assert.equal(outcome.composite, 72);
    });
    it("averages across multiple tests", () => {
        const first = makeTestResult({
            dimensions: { taskCompletion: 80, codeCorrectness: 60 },
        });
        const second = makeTestResult({
            dimensions: { taskCompletion: 60, codeCorrectness: 80 },
        });
        const outcome = scoreTestGroup([first, second], OUTPUT_ONLY_PROFILE);
        // taskCompletion avg = 70, codeCorrectness avg = 70
        // Expected: 70*0.45 + 70*0.55 = 31.5 + 38.5 = 70
        assert.equal(outcome.dimensions.taskCompletion, 70);
        assert.equal(outcome.dimensions.codeCorrectness, 70);
        assert.equal(outcome.composite, 70);
    });
    it("accumulates cost across tests", () => {
        const pricier = makeTestResult({ cost: 0.05, dimensions: { taskCompletion: 80 } });
        const cheaper = makeTestResult({ cost: 0.03, dimensions: { taskCompletion: 70 } });
        const outcome = scoreTestGroup([pricier, cheaper], DEFAULT_PROFILE);
        // Compared with a tolerance because 0.05 + 0.03 is not exact in floating point.
        const distance = Math.abs(outcome.totalCost - 0.08);
        assert.ok(distance < 0.001);
    });
});
|
|
124
|
-
describe("scoreTestGroup — profile handling", () => {
    it("uses output-only profile (excludes doc-coverage)", () => {
        const onlyTest = makeTestResult({
            dimensions: {
                taskCompletion: 80,
                codeCorrectness: 60,
                docCoverage: 100,
            },
        });
        const outcome = scoreTestGroup([onlyTest], OUTPUT_ONLY_PROFILE);
        // doc-coverage should be present in dimensions but NOT affect composite
        // Expected: 80*0.45 + 60*0.55 = 36 + 33 = 69
        assert.equal(outcome.dimensions.docCoverage, 100);
        assert.equal(outcome.composite, 69);
    });
    it("handles profile with only one dimension", () => {
        const onlyTest = makeTestResult({
            dimensions: { taskCompletion: 90, codeCorrectness: 50 },
        });
        const singleDimensionProfile = { "task-completion": 1.0 };
        const outcome = scoreTestGroup([onlyTest], singleDimensionProfile);
        // Only taskCompletion should count
        assert.equal(outcome.composite, 90);
    });
});
|
|
152
|
-
describe("scoreTestGroup — edge cases", () => {
    it("handles tests with no rubric components", () => {
        // Hand-built fixture: its only component is a "javascript" assertion with no dimension metadata.
        const javascriptOnly = { assertion: { type: "javascript" }, pass: true, score: 1 };
        const test = {
            cost: 0.01,
            description: "no rubrics",
            gradingResult: {
                componentResults: [javascriptOnly],
                pass: true,
            },
            response: { output: "mock" },
            vars: { task: "test", docs: "" },
        };
        const outcome = scoreTestGroup([test], DEFAULT_PROFILE);
        // No llm-rubric components → 0 composite
        assert.equal(outcome.composite, 0);
        assert.equal(outcome.totalCost, 0.01);
    });
    it("provides raw DimensionScore objects for advanced consumers", () => {
        const onlyTest = makeTestResult({
            dimensions: { taskCompletion: 80, codeCorrectness: 60 },
        });
        const { rawDimensions } = scoreTestGroup([onlyTest], DEFAULT_PROFILE);
        assert.ok(rawDimensions.length >= 2);
        const taskCompletion = rawDimensions.find((dim) => dim.dimensionId === "task-completion");
        assert.ok(taskCompletion);
        // Raw scores are expected on the 0–1 scale.
        assert.ok(taskCompletion.score >= 0 && taskCompletion.score <= 1);
        assert.equal(taskCompletion.assertionCount, 1);
    });
});
|