@sanity/ailf 3.8.0 → 3.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/dist/adapters/config-sources/file-config-adapter.js +4 -5
  2. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
  3. package/dist/cli-program.d.ts +39 -0
  4. package/dist/cli-program.js +137 -0
  5. package/dist/cli.d.ts +6 -0
  6. package/dist/cli.js +12 -122
  7. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  8. package/package.json +5 -3
  9. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +0 -10
  10. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +0 -366
  11. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +0 -9
  12. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +0 -145
  13. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +0 -10
  14. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +0 -314
  15. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +0 -10
  16. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +0 -486
  17. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +0 -10
  18. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +0 -425
  19. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +0 -9
  20. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +0 -332
  21. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +0 -12
  22. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +0 -210
  23. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +0 -7
  24. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +0 -404
  25. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +0 -10
  26. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +0 -184
  27. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +0 -8
  28. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +0 -301
  29. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +0 -9
  30. package/dist/pipeline/compiler/__tests__/telemetry.test.js +0 -503
  31. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +0 -10
  32. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +0 -509
@@ -1,404 +0,0 @@
1
- /**
2
- * scoring-and-presets.test.ts — Tests for 4-tier scoring engine,
3
- * storage schema, and plugin registry / presets.
4
- *
5
- * Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-and-presets.test.ts
6
- */
7
- import assert from "node:assert/strict";
8
- import { dirname, resolve } from "node:path";
9
- import { describe, it } from "node:test";
10
- import { fileURLToPath } from "node:url";
11
- const __dirname = dirname(fileURLToPath(import.meta.url));
12
- import { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "../../../_vendor/ailf-core/index.js";
13
- import { CURRENT_SCHEMA_VERSION, InMemoryPluginRegistry, isSchemaVersioned, migrateDocument, } from "../../../_vendor/ailf-core/index.js";
14
- import { createSanityLiteracyPreset, sanityLiteracyPreset, } from "../presets/sanity-literacy.js";
15
- // ---------------------------------------------------------------------------
16
- // Helpers
17
- // ---------------------------------------------------------------------------
18
- function makeAssertion(overrides) {
19
- return {
20
- pass: true,
21
- score: 0.8,
22
- reason: "Good",
23
- assertionType: "llm-rubric",
24
- dimension: "task-completion",
25
- latencyMs: 100,
26
- weight: 1.0,
27
- ...overrides,
28
- };
29
- }
30
- function makeDimension(overrides) {
31
- return {
32
- dimensionId: "task-completion",
33
- label: "Task Completion",
34
- score: 0.8,
35
- assertionCount: 2,
36
- passCount: 2,
37
- aggregation: "weighted-mean",
38
- assertions: [],
39
- ...overrides,
40
- };
41
- }
42
- // ---------------------------------------------------------------------------
43
- // Tier 1 → Tier 2: Assertion → Dimension aggregation
44
- // ---------------------------------------------------------------------------
45
- describe("aggregateDimensions", () => {
46
- it("groups assertions by dimension", () => {
47
- const assertions = [
48
- makeAssertion({ dimension: "code-correctness", score: 0.9 }),
49
- makeAssertion({ dimension: "code-correctness", score: 0.7 }),
50
- makeAssertion({ dimension: "task-completion", score: 0.8 }),
51
- ];
52
- const dims = aggregateDimensions(assertions);
53
- assert.equal(dims.length, 2);
54
- const cc = dims.find((d) => d.dimensionId === "code-correctness");
55
- assert.ok(cc);
56
- assert.equal(cc.assertionCount, 2);
57
- });
58
- it("uses weighted-mean by default", () => {
59
- const assertions = [
60
- makeAssertion({ score: 0.6, weight: 1.0 }),
61
- makeAssertion({ score: 0.8, weight: 3.0 }),
62
- ];
63
- const dims = aggregateDimensions(assertions);
64
- // Weighted mean: (0.6*1 + 0.8*3) / (1+3) = 3.0/4 = 0.75
65
- assert.ok(Math.abs(dims[0].score - 0.75) < 0.01);
66
- });
67
- it("falls back to pass rate when no numeric scores", () => {
68
- const assertions = [
69
- makeAssertion({ score: null, pass: true }),
70
- makeAssertion({ score: null, pass: false }),
71
- ];
72
- const dims = aggregateDimensions(assertions);
73
- assert.equal(dims[0].score, 0.5);
74
- });
75
- it("applies custom dimension labels", () => {
76
- const assertions = [makeAssertion({ dimension: "tc" })];
77
- const dims = aggregateDimensions(assertions, {
78
- dimensionLabels: { tc: "Task Completion" },
79
- });
80
- assert.equal(dims[0].label, "Task Completion");
81
- });
82
- });
83
- // ---------------------------------------------------------------------------
84
- // Tier 2 → Tier 3: Dimension → Task scoring
85
- // ---------------------------------------------------------------------------
86
- describe("computeTaskScore", () => {
87
- it("computes weighted score from dimensions", () => {
88
- const dims = [
89
- makeDimension({ dimensionId: "tc", score: 0.8 }),
90
- makeDimension({ dimensionId: "cc", score: 0.6 }),
91
- ];
92
- const task = computeTaskScore(dims, {
93
- taskId: "test-task",
94
- weights: { tc: 0.6, cc: 0.4 },
95
- });
96
- // 0.8*0.6 + 0.6*0.4 = 0.48 + 0.24 = 0.72
97
- assert.ok(Math.abs(task.score - 0.72) < 0.01);
98
- });
99
- it("normalizes weights that don't sum to 1", () => {
100
- const dims = [
101
- makeDimension({ dimensionId: "tc", score: 1.0 }),
102
- makeDimension({ dimensionId: "cc", score: 0.0 }),
103
- ];
104
- const task = computeTaskScore(dims, {
105
- taskId: "test-task",
106
- weights: { tc: 2, cc: 2 },
107
- });
108
- // (1.0*2 + 0.0*2) / (2+2) = 2/4 = 0.5
109
- assert.ok(Math.abs(task.score - 0.5) < 0.01);
110
- });
111
- it("checks against threshold", () => {
112
- const dims = [makeDimension({ dimensionId: "tc", score: 0.6 })];
113
- const passing = computeTaskScore(dims, {
114
- taskId: "t1",
115
- weights: { tc: 1.0 },
116
- threshold: 0.5,
117
- });
118
- assert.equal(passing.passesThreshold, true);
119
- const failing = computeTaskScore(dims, {
120
- taskId: "t2",
121
- weights: { tc: 1.0 },
122
- threshold: 0.7,
123
- });
124
- assert.equal(failing.passesThreshold, false);
125
- });
126
- it("records weight source", () => {
127
- const task = computeTaskScore([makeDimension()], {
128
- taskId: "t1",
129
- weights: { "task-completion": 1.0 },
130
- weightSource: "rubrics.yaml:default",
131
- });
132
- assert.equal(task.weightSource, "rubrics.yaml:default");
133
- });
134
- });
135
- // ---------------------------------------------------------------------------
136
- // Tier 3 → Tier 4: Task → Area aggregation
137
- // ---------------------------------------------------------------------------
138
- describe("aggregateAreas", () => {
139
- it("groups tasks by area prefix", () => {
140
- const tasks = [
141
- computeTaskScore([makeDimension({ score: 0.8 })], {
142
- taskId: "groq-basic",
143
- weights: { "task-completion": 1.0 },
144
- }),
145
- computeTaskScore([makeDimension({ score: 0.6 })], {
146
- taskId: "groq-advanced",
147
- weights: { "task-completion": 1.0 },
148
- }),
149
- computeTaskScore([makeDimension({ score: 0.9 })], {
150
- taskId: "studio-schema",
151
- weights: { "task-completion": 1.0 },
152
- }),
153
- ];
154
- const areas = aggregateAreas(tasks);
155
- assert.equal(areas.length, 2);
156
- const groq = areas.find((a) => a.areaId === "groq");
157
- assert.ok(groq);
158
- assert.equal(groq.taskCount, 2);
159
- assert.ok(Math.abs(groq.score - 0.7) < 0.01); // (0.8+0.6)/2
160
- const studio = areas.find((a) => a.areaId === "studio");
161
- assert.ok(studio);
162
- assert.equal(studio.taskCount, 1);
163
- });
164
- it("computes delta from previous scores", () => {
165
- const tasks = [
166
- computeTaskScore([makeDimension({ score: 0.8 })], {
167
- taskId: "groq-basic",
168
- weights: { "task-completion": 1.0 },
169
- }),
170
- ];
171
- const areas = aggregateAreas(tasks, { groq: 0.6 });
172
- assert.ok(areas[0].delta !== null);
173
- assert.ok(Math.abs(areas[0].delta - 0.2) < 0.01);
174
- });
175
- });
176
- // ---------------------------------------------------------------------------
177
- // Score normalization
178
- // ---------------------------------------------------------------------------
179
- describe("normalizeScore", () => {
180
- it("normalizes LLM rubric scores (0-100 → 0-1)", () => {
181
- assert.ok(Math.abs(normalizeScore(75, "llm-rubric") - 0.75) < 0.01);
182
- });
183
- it("passes through already-normalized scores", () => {
184
- assert.ok(Math.abs(normalizeScore(0.75, "llm-rubric") - 0.75) < 0.01);
185
- });
186
- it("normalizes boolean assertions to 0 or 1", () => {
187
- assert.equal(normalizeScore(1, "contains"), 1);
188
- assert.equal(normalizeScore(0, "contains"), 0);
189
- });
190
- it("clamps similarity scores to [0, 1]", () => {
191
- assert.equal(normalizeScore(1.5, "similar"), 1);
192
- assert.equal(normalizeScore(-0.1, "similar"), 0);
193
- });
194
- });
195
- // ---------------------------------------------------------------------------
196
- // Ensemble grading
197
- // ---------------------------------------------------------------------------
198
- describe("computeEnsembleScore", () => {
199
- it("computes mean ensemble score", () => {
200
- const { score, agreement } = computeEnsembleScore([0.8, 0.6, 0.7], "mean");
201
- assert.ok(Math.abs(score - 0.7) < 0.01);
202
- assert.ok(agreement > 0);
203
- });
204
- it("computes median ensemble score", () => {
205
- const { score } = computeEnsembleScore([0.9, 0.5, 0.7], "median");
206
- assert.ok(Math.abs(score - 0.7) < 0.01);
207
- });
208
- it("computes max ensemble score", () => {
209
- const { score } = computeEnsembleScore([0.9, 0.5, 0.7], "max");
210
- assert.ok(Math.abs(score - 0.9) < 0.01);
211
- });
212
- it("agreement is 1 for identical scores", () => {
213
- const { agreement } = computeEnsembleScore([0.8, 0.8, 0.8]);
214
- assert.ok(Math.abs(agreement - 1.0) < 0.01);
215
- });
216
- it("agreement decreases with divergent scores", () => {
217
- const { agreement } = computeEnsembleScore([0.0, 1.0]);
218
- assert.ok(agreement < 0.6);
219
- });
220
- });
221
- // ---------------------------------------------------------------------------
222
- // Storage schema
223
- // ---------------------------------------------------------------------------
224
- describe("storage schema", () => {
225
- it("CURRENT_SCHEMA_VERSION is 1", () => {
226
- assert.equal(CURRENT_SCHEMA_VERSION, 1);
227
- });
228
- it("isSchemaVersioned detects versioned docs", () => {
229
- assert.equal(isSchemaVersioned({ schemaVersion: 1 }), true);
230
- assert.equal(isSchemaVersioned({}), false);
231
- assert.equal(isSchemaVersioned(null), false);
232
- });
233
- it("migrateDocument is no-op for current version", () => {
234
- const doc = { schemaVersion: 1, _type: "ailf.run" };
235
- const migrated = migrateDocument(doc);
236
- assert.equal(migrated.schemaVersion, 1);
237
- });
238
- });
239
- // ---------------------------------------------------------------------------
240
- // Plugin registry
241
- // ---------------------------------------------------------------------------
242
- describe("InMemoryPluginRegistry", () => {
243
- it("registers and retrieves modes", () => {
244
- const registry = new InMemoryPluginRegistry();
245
- registry.registerMode({
246
- id: "custom",
247
- label: "Custom Mode",
248
- validProviderPatterns: [".*"],
249
- rubricTemplateIds: [],
250
- handlerModule: "./custom.js",
251
- });
252
- assert.equal(registry.getModes().length, 1);
253
- assert.equal(registry.getMode("custom")?.label, "Custom Mode");
254
- });
255
- it("registers and retrieves assertions", () => {
256
- const registry = new InMemoryPluginRegistry();
257
- registry.registerAssertion({
258
- type: "api-match",
259
- label: "API Match",
260
- compatibleModes: ["custom"],
261
- handlerModule: "./api-match.js",
262
- });
263
- assert.equal(registry.getAssertions().length, 1);
264
- });
265
- it("registers a complete preset with mode base", () => {
266
- const registry = new InMemoryPluginRegistry();
267
- // Must register mode base first
268
- const { createLiteracyModeBase } = require("../mode-bases/literacy.js");
269
- registry.registerModeBase(createLiteracyModeBase());
270
- registry.registerPreset(sanityLiteracyPreset);
271
- // Mode + rubrics from mode base, domain config from preset
272
- assert.ok(registry.getMode("literacy"));
273
- assert.ok(registry.getRubricTemplates().length > 0);
274
- assert.ok(registry.getPresets().length === 1);
275
- });
276
- });
277
- // ---------------------------------------------------------------------------
278
- // sanity-literacy preset
279
- // ---------------------------------------------------------------------------
280
- describe("sanityLiteracyPreset", () => {
281
- it("has correct manifest", () => {
282
- assert.equal(sanityLiteracyPreset.name, "sanity-literacy");
283
- assert.equal(sanityLiteracyPreset.manifest.pluginApiVersion, 1);
284
- });
285
- it("targets literacy mode base", () => {
286
- assert.equal(sanityLiteracyPreset.mode, "literacy");
287
- });
288
- it("does not bundle assertions (now framework built-ins)", () => {
289
- assert.equal(sanityLiteracyPreset.assertions, undefined);
290
- });
291
- it("does not bundle rubrics/scoring/prompts (now in literacy mode base)", () => {
292
- // Evaluation methodology moved to mode-bases/literacy.ts
293
- assert.equal(sanityLiteracyPreset.rubricTemplates, undefined);
294
- assert.equal(sanityLiteracyPreset.scoringProfiles, undefined);
295
- assert.equal(sanityLiteracyPreset.promptTemplates, undefined);
296
- });
297
- it("includes sanity:// fixture resolver", () => {
298
- assert.ok(sanityLiteracyPreset.fixtureResolvers?.some((r) => r.scheme === "sanity://"));
299
- });
300
- it("includes 3 source definitions", () => {
301
- const sources = sanityLiteracyPreset.sourceDefs;
302
- assert.ok(sources);
303
- assert.equal(sources.length, 3);
304
- const names = sources.map((s) => s.name);
305
- assert.ok(names.includes("production"));
306
- assert.ok(names.includes("branch"));
307
- assert.ok(names.includes("local"));
308
- });
309
- it("production source has correct baseUrl", () => {
310
- const prod = sanityLiteracyPreset.sourceDefs.find((s) => s.name === "production");
311
- assert.ok(prod);
312
- assert.equal(prod.baseUrl, "https://www.sanity.io/docs");
313
- });
314
- it("includes feature registry with all features", () => {
315
- const features = sanityLiteracyPreset.featureDefs;
316
- assert.ok(features);
317
- assert.equal(features.features.length, 14);
318
- const ids = features.features.map((f) => f.id);
319
- assert.ok(ids.includes("groq"));
320
- assert.ok(ids.includes("visual-editing"));
321
- assert.ok(ids.includes("portable-text"));
322
- assert.ok(ids.includes("ai-assist"));
323
- });
324
- it("includes a docFetcher factory", () => {
325
- assert.equal(typeof sanityLiteracyPreset.docFetcher, "function");
326
- const fetcher = sanityLiteracyPreset.docFetcher();
327
- assert.ok(fetcher);
328
- assert.equal(typeof fetcher.fetch, "function");
329
- });
330
- });
331
- // ---------------------------------------------------------------------------
332
- // createSanityLiteracyPreset factory
333
- // ---------------------------------------------------------------------------
334
- describe("createSanityLiteracyPreset", () => {
335
- it("returns a domain-only preset targeting literacy mode", () => {
336
- const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
337
- assert.equal(preset.name, "sanity-literacy");
338
- assert.equal(preset.mode, "literacy");
339
- // Domain config present
340
- assert.ok(preset.fixtureResolvers);
341
- assert.ok(preset.docFetcher);
342
- assert.ok(preset.sourceDefs);
343
- assert.ok(preset.featureDefs);
344
- // Methodology inherited from mode base, not on preset
345
- assert.equal(preset.rubricTemplates, undefined);
346
- assert.equal(preset.scoringProfiles, undefined);
347
- assert.equal(preset.promptTemplates, undefined);
348
- });
349
- it("registers all extension points via mode base + domain config", () => {
350
- const registry = new InMemoryPluginRegistry();
351
- // Must register mode base first (composition root does this)
352
- const { createLiteracyModeBase } = require("../mode-bases/literacy.js");
353
- registry.registerModeBase(createLiteracyModeBase());
354
- const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
355
- registry.registerPreset(preset);
356
- // Mode from mode base
357
- assert.ok(registry.getMode("literacy"));
358
- // Rubrics, scoring, prompts inherited from mode base
359
- assert.equal(registry.getRubricTemplates().length, 3);
360
- assert.equal(Object.keys(registry.getPromptTemplates()).length, 3);
361
- assert.equal(Object.keys(registry.getScoringProfiles()).length, 2);
362
- // Domain config from preset
363
- assert.ok(registry.getDocFetcherFactory());
364
- assert.equal(registry.getSourceDefs().length, 3);
365
- assert.ok(registry.getFeatureDefs());
366
- assert.equal(registry.getFeatureDefs().features.length, 14);
367
- });
368
- });
369
- // ---------------------------------------------------------------------------
370
- // Preset is single source of truth for sources and features
371
- // ---------------------------------------------------------------------------
372
- describe("preset is single source of truth for Sanity config", () => {
373
- it("config/sources.ts exports an empty array", async () => {
374
- const { tryLoadConfigFile } = await import("../../compiler/config-loader.js");
375
- const ROOT = resolve(__dirname, "..", "..", "..", "..");
376
- const loaded = tryLoadConfigFile("sources", ROOT);
377
- assert.ok(loaded, "config/sources.ts should exist");
378
- const sources = loaded.data;
379
- assert.ok(Array.isArray(sources), "should export an array");
380
- assert.equal(sources.length, 0, "config/sources should be empty (preset provides sources)");
381
- });
382
- it("config/features.ts exports an empty features array", async () => {
383
- const { tryLoadConfigFile } = await import("../../compiler/config-loader.js");
384
- const ROOT = resolve(__dirname, "..", "..", "..", "..");
385
- const loaded = tryLoadConfigFile("features", ROOT);
386
- assert.ok(loaded, "config/features.ts should exist");
387
- assert.ok(Array.isArray(loaded.data.features), "should have a features array");
388
- assert.equal(loaded.data.features.length, 0, "config/features should be empty (preset provides features)");
389
- });
390
- it("preset contains all 3 source entries", () => {
391
- const sources = sanityLiteracyPreset.sourceDefs;
392
- assert.equal(sources.length, 3);
393
- const names = sources.map((s) => s.name).sort();
394
- assert.deepEqual(names, ["branch", "local", "production"]);
395
- });
396
- it("preset contains all 14 feature entries", () => {
397
- const features = sanityLiteracyPreset.featureDefs.features;
398
- assert.equal(features.length, 14);
399
- const covered = features.filter((f) => f.status === "covered");
400
- const uncovered = features.filter((f) => f.status === "uncovered");
401
- assert.equal(covered.length, 6, "should have 6 covered features");
402
- assert.equal(uncovered.length, 8, "should have 8 uncovered features");
403
- });
404
- });
@@ -1,10 +0,0 @@
1
- /**
2
- * scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
3
- *
4
- * Verifies that `scoreTestGroup` produces the same 0–100 output as the
5
- * legacy `accumulateDimensions → averageDimensions → weightedComposite`
6
- * chain when given identical inputs.
7
- *
8
- * Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
9
- */
10
- export {};
@@ -1,184 +0,0 @@
1
- /**
2
- * scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
3
- *
4
- * Verifies that `scoreTestGroup` produces the same 0–100 output as the
5
- * legacy `accumulateDimensions → averageDimensions → weightedComposite`
6
- * chain when given identical inputs.
7
- *
8
- * Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
9
- */
10
- import assert from "node:assert/strict";
11
- import { describe, it } from "node:test";
12
- import { scoreTestGroup } from "../scoring-bridge.js";
13
- // ---------------------------------------------------------------------------
14
- // Helpers
15
- // ---------------------------------------------------------------------------
16
- function makeTestResult(overrides) {
17
- const dims = overrides?.dimensions ?? {};
18
- const componentResults = [];
19
- if (dims.taskCompletion !== undefined) {
20
- componentResults.push({
21
- assertion: {
22
- type: "llm-rubric",
23
- metadata: { dimension: "task-completion" },
24
- },
25
- pass: true,
26
- reason: JSON.stringify({ score: dims.taskCompletion }),
27
- score: dims.taskCompletion / 100,
28
- });
29
- }
30
- if (dims.codeCorrectness !== undefined) {
31
- componentResults.push({
32
- assertion: {
33
- type: "llm-rubric",
34
- metadata: { dimension: "code-correctness" },
35
- },
36
- pass: true,
37
- reason: JSON.stringify({ score: dims.codeCorrectness }),
38
- score: dims.codeCorrectness / 100,
39
- });
40
- }
41
- if (dims.docCoverage !== undefined) {
42
- componentResults.push({
43
- assertion: {
44
- type: "llm-rubric",
45
- metadata: { dimension: "doc-coverage" },
46
- },
47
- pass: true,
48
- reason: JSON.stringify({ score: dims.docCoverage }),
49
- score: dims.docCoverage / 100,
50
- });
51
- }
52
- return {
53
- cost: overrides?.cost ?? 0.01,
54
- description: overrides?.description ?? "test",
55
- gradingResult: {
56
- componentResults,
57
- pass: true,
58
- },
59
- response: { output: "mock output" },
60
- vars: overrides?.vars ?? { task: "test", docs: "" },
61
- };
62
- }
63
- const DEFAULT_PROFILE = {
64
- "code-correctness": 0.35,
65
- "doc-coverage": 0.25,
66
- "task-completion": 0.4,
67
- };
68
- const OUTPUT_ONLY_PROFILE = {
69
- "code-correctness": 0.55,
70
- "task-completion": 0.45,
71
- };
72
- // ---------------------------------------------------------------------------
73
- // Tests
74
- // ---------------------------------------------------------------------------
75
- describe("scoreTestGroup — basic scoring", () => {
76
- it("returns zeroes for empty test array", () => {
77
- const result = scoreTestGroup([], DEFAULT_PROFILE);
78
- assert.equal(result.composite, 0);
79
- assert.equal(result.totalCost, 0);
80
- assert.deepEqual(result.dimensions, {});
81
- });
82
- it("scores a single test with all dimensions", () => {
83
- const tests = [
84
- makeTestResult({
85
- dimensions: {
86
- taskCompletion: 80,
87
- codeCorrectness: 70,
88
- docCoverage: 60,
89
- },
90
- }),
91
- ];
92
- const result = scoreTestGroup(tests, DEFAULT_PROFILE);
93
- // Expected: 80*0.4 + 70*0.35 + 60*0.25 = 32 + 24.5 + 15 = 71.5 → 72
94
- assert.equal(result.dimensions.taskCompletion, 80);
95
- assert.equal(result.dimensions.codeCorrectness, 70);
96
- assert.equal(result.dimensions.docCoverage, 60);
97
- assert.equal(result.composite, 72);
98
- });
99
- it("averages across multiple tests", () => {
100
- const tests = [
101
- makeTestResult({
102
- dimensions: { taskCompletion: 80, codeCorrectness: 60 },
103
- }),
104
- makeTestResult({
105
- dimensions: { taskCompletion: 60, codeCorrectness: 80 },
106
- }),
107
- ];
108
- const result = scoreTestGroup(tests, OUTPUT_ONLY_PROFILE);
109
- // taskCompletion avg = 70, codeCorrectness avg = 70
110
- // Expected: 70*0.45 + 70*0.55 = 31.5 + 38.5 = 70
111
- assert.equal(result.dimensions.taskCompletion, 70);
112
- assert.equal(result.dimensions.codeCorrectness, 70);
113
- assert.equal(result.composite, 70);
114
- });
115
- it("accumulates cost across tests", () => {
116
- const tests = [
117
- makeTestResult({ cost: 0.05, dimensions: { taskCompletion: 80 } }),
118
- makeTestResult({ cost: 0.03, dimensions: { taskCompletion: 70 } }),
119
- ];
120
- const result = scoreTestGroup(tests, DEFAULT_PROFILE);
121
- assert.ok(Math.abs(result.totalCost - 0.08) < 0.001);
122
- });
123
- });
124
- describe("scoreTestGroup — profile handling", () => {
125
- it("uses output-only profile (excludes doc-coverage)", () => {
126
- const tests = [
127
- makeTestResult({
128
- dimensions: {
129
- taskCompletion: 80,
130
- codeCorrectness: 60,
131
- docCoverage: 100,
132
- },
133
- }),
134
- ];
135
- const result = scoreTestGroup(tests, OUTPUT_ONLY_PROFILE);
136
- // doc-coverage should be present in dimensions but NOT affect composite
137
- // Expected: 80*0.45 + 60*0.55 = 36 + 33 = 69
138
- assert.equal(result.dimensions.docCoverage, 100);
139
- assert.equal(result.composite, 69);
140
- });
141
- it("handles profile with only one dimension", () => {
142
- const tests = [
143
- makeTestResult({
144
- dimensions: { taskCompletion: 90, codeCorrectness: 50 },
145
- }),
146
- ];
147
- const result = scoreTestGroup(tests, { "task-completion": 1.0 });
148
- // Only taskCompletion should count
149
- assert.equal(result.composite, 90);
150
- });
151
- });
152
- describe("scoreTestGroup — edge cases", () => {
153
- it("handles tests with no rubric components", () => {
154
- const test = {
155
- cost: 0.01,
156
- description: "no rubrics",
157
- gradingResult: {
158
- componentResults: [
159
- { assertion: { type: "javascript" }, pass: true, score: 1 },
160
- ],
161
- pass: true,
162
- },
163
- response: { output: "mock" },
164
- vars: { task: "test", docs: "" },
165
- };
166
- const result = scoreTestGroup([test], DEFAULT_PROFILE);
167
- // No llm-rubric components → 0 composite
168
- assert.equal(result.composite, 0);
169
- assert.equal(result.totalCost, 0.01);
170
- });
171
- it("provides raw DimensionScore objects for advanced consumers", () => {
172
- const tests = [
173
- makeTestResult({
174
- dimensions: { taskCompletion: 80, codeCorrectness: 60 },
175
- }),
176
- ];
177
- const result = scoreTestGroup(tests, DEFAULT_PROFILE);
178
- assert.ok(result.rawDimensions.length >= 2);
179
- const tcDim = result.rawDimensions.find((d) => d.dimensionId === "task-completion");
180
- assert.ok(tcDim);
181
- assert.ok(tcDim.score >= 0 && tcDim.score <= 1); // 0–1 scale
182
- assert.equal(tcDim.assertionCount, 1);
183
- });
184
- });
@@ -1,8 +0,0 @@
1
- /**
2
- * task-graph-builder.test.ts — Unit tests for TaskGraphBuilder.
3
- *
4
- * Tests DAG construction, cycle detection, filtering, and priority assignment.
5
- *
6
- * Run: npx tsx --test src/pipeline/compiler/__tests__/task-graph-builder.test.ts
7
- */
8
- export {};