@sanity/ailf 0.4.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/features.ts +23 -0
- package/config/models.ts +83 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
- package/dist/_vendor/ailf-core/config-helpers.js +150 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
- package/dist/_vendor/ailf-core/examples/index.js +10 -10
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +133 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
- package/dist/adapters/task-sources/index.d.ts +1 -0
- package/dist/adapters/task-sources/index.js +1 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
- package/dist/adapters/task-sources/repo-task-source.js +69 -16
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +7 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/composition-root.d.ts +1 -1
- package/dist/composition-root.js +67 -4
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +24 -6
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +6 -4
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +245 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +5 -7
- package/dist/pipeline/calculate-scores.js +74 -153
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +23 -14
- package/dist/pipeline/expand-tasks.js +37 -31
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +18 -21
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +6 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
- package/dist/pipeline/mirror-repo-tasks.js +16 -15
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +47 -0
- package/dist/pipeline/profile-resolution.js +91 -0
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +6 -3
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -62
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scoring-and-presets.test.ts — Tests for 4-tier scoring engine,
|
|
3
|
+
* storage schema, and plugin registry / presets.
|
|
4
|
+
*
|
|
5
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-and-presets.test.ts
|
|
6
|
+
*/
|
|
7
|
+
import assert from "node:assert/strict";
|
|
8
|
+
import { dirname, resolve } from "node:path";
|
|
9
|
+
import { describe, it } from "node:test";
|
|
10
|
+
import { fileURLToPath } from "node:url";
|
|
11
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
12
|
+
import { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "../../../_vendor/ailf-core/index.js";
|
|
13
|
+
import { CURRENT_SCHEMA_VERSION, InMemoryPluginRegistry, isSchemaVersioned, migrateDocument, } from "../../../_vendor/ailf-core/index.js";
|
|
14
|
+
import { createSanityLiteracyPreset, sanityLiteracyPreset, } from "../presets/sanity-literacy.js";
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Helpers
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
/**
 * Build an assertion-result fixture for the scoring tests.
 *
 * The default fixture is a passing `llm-rubric` assertion on the
 * `task-completion` dimension (score 0.8, weight 1.0, latency 100 ms).
 * `overrides` is shallow-merged last, so any key it supplies replaces the
 * default value (including explicit `null`, e.g. `{ score: null }`).
 *
 * @param {object} [overrides] - Fields to replace on the default fixture.
 * @returns {object} A fresh assertion-result object on every call.
 */
function makeAssertion(overrides = {}) {
  return {
    pass: true,
    score: 0.8,
    reason: "Good",
    assertionType: "llm-rubric",
    dimension: "task-completion",
    latencyMs: 100,
    weight: 1.0,
    // Spread last so caller-supplied fields win over the defaults above.
    ...overrides,
  };
}
|
|
30
|
+
/**
 * Build a dimension-score fixture for the scoring tests.
 *
 * The default fixture is a `task-completion` dimension scored 0.8 via
 * `weighted-mean`, with 2 of 2 assertions passing and an empty `assertions`
 * list. `overrides` is shallow-merged last, so caller-supplied keys win.
 *
 * @param {object} [overrides] - Fields to replace on the default fixture.
 * @returns {object} A fresh dimension-score object on every call.
 */
function makeDimension(overrides = {}) {
  return {
    dimensionId: "task-completion",
    label: "Task Completion",
    score: 0.8,
    assertionCount: 2,
    passCount: 2,
    aggregation: "weighted-mean",
    // New array per call so tests never share a mutable list.
    assertions: [],
    // Spread last so caller-supplied fields win over the defaults above.
    ...overrides,
  };
}
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
// Tier 1 → Tier 2: Assertion → Dimension aggregation
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
// Suite: rolling individual assertion results up into per-dimension scores.
describe("aggregateDimensions", () => {
  it("groups assertions by dimension", () => {
    // Three assertions across two distinct dimension ids.
    const assertions = [
      makeAssertion({ dimension: "code-correctness", score: 0.9 }),
      makeAssertion({ dimension: "code-correctness", score: 0.7 }),
      makeAssertion({ dimension: "task-completion", score: 0.8 }),
    ];
    const dims = aggregateDimensions(assertions);
    assert.equal(dims.length, 2);
    const cc = dims.find((d) => d.dimensionId === "code-correctness");
    assert.ok(cc);
    assert.equal(cc.assertionCount, 2);
  });

  it("uses weighted-mean by default", () => {
    const assertions = [
      makeAssertion({ score: 0.6, weight: 1.0 }),
      makeAssertion({ score: 0.8, weight: 3.0 }),
    ];
    const dims = aggregateDimensions(assertions);
    // Weighted mean: (0.6*1 + 0.8*3) / (1+3) = 3.0/4 = 0.75
    // Compared with a tolerance because the result is a float.
    assert.ok(Math.abs(dims[0].score - 0.75) < 0.01);
  });

  it("falls back to pass rate when no numeric scores", () => {
    // Both scores are null; one of two assertions passes → expected 0.5.
    const assertions = [
      makeAssertion({ score: null, pass: true }),
      makeAssertion({ score: null, pass: false }),
    ];
    const dims = aggregateDimensions(assertions);
    assert.equal(dims[0].score, 0.5);
  });

  it("applies custom dimension labels", () => {
    const assertions = [makeAssertion({ dimension: "tc" })];
    const dims = aggregateDimensions(assertions, {
      dimensionLabels: { tc: "Task Completion" },
    });
    assert.equal(dims[0].label, "Task Completion");
  });
});
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// Tier 2 → Tier 3: Dimension → Task scoring
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
// Suite: combining dimension scores into a single task score.
describe("computeTaskScore", () => {
  it("computes weighted score from dimensions", () => {
    const dims = [
      makeDimension({ dimensionId: "tc", score: 0.8 }),
      makeDimension({ dimensionId: "cc", score: 0.6 }),
    ];
    const task = computeTaskScore(dims, {
      taskId: "test-task",
      weights: { tc: 0.6, cc: 0.4 },
    });
    // 0.8*0.6 + 0.6*0.4 = 0.48 + 0.24 = 0.72
    assert.ok(Math.abs(task.score - 0.72) < 0.01);
  });

  it("normalizes weights that don't sum to 1", () => {
    const dims = [
      makeDimension({ dimensionId: "tc", score: 1.0 }),
      makeDimension({ dimensionId: "cc", score: 0.0 }),
    ];
    const task = computeTaskScore(dims, {
      taskId: "test-task",
      weights: { tc: 2, cc: 2 },
    });
    // (1.0*2 + 0.0*2) / (2+2) = 2/4 = 0.5
    assert.ok(Math.abs(task.score - 0.5) < 0.01);
  });

  it("checks against threshold", () => {
    // Same single dimension (score 0.6) evaluated against two thresholds.
    const dims = [makeDimension({ dimensionId: "tc", score: 0.6 })];

    const passing = computeTaskScore(dims, {
      taskId: "t1",
      weights: { tc: 1.0 },
      threshold: 0.5,
    });
    assert.equal(passing.passesThreshold, true);

    const failing = computeTaskScore(dims, {
      taskId: "t2",
      weights: { tc: 1.0 },
      threshold: 0.7,
    });
    assert.equal(failing.passesThreshold, false);
  });

  it("records weight source", () => {
    // The weightSource option is expected to be echoed on the result.
    const task = computeTaskScore([makeDimension()], {
      taskId: "t1",
      weights: { "task-completion": 1.0 },
      weightSource: "rubrics.yaml:default",
    });
    assert.equal(task.weightSource, "rubrics.yaml:default");
  });
});
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
// Tier 3 → Tier 4: Task → Area aggregation
|
|
137
|
+
// ---------------------------------------------------------------------------
|
|
138
|
+
// Suite: grouping task scores into area-level scores.
describe("aggregateAreas", () => {
  it("groups tasks by area prefix", () => {
    // Two "groq-*" task ids and one "studio-*" task id.
    const tasks = [
      computeTaskScore([makeDimension({ score: 0.8 })], {
        taskId: "groq-basic",
        weights: { "task-completion": 1.0 },
      }),
      computeTaskScore([makeDimension({ score: 0.6 })], {
        taskId: "groq-advanced",
        weights: { "task-completion": 1.0 },
      }),
      computeTaskScore([makeDimension({ score: 0.9 })], {
        taskId: "studio-schema",
        weights: { "task-completion": 1.0 },
      }),
    ];
    const areas = aggregateAreas(tasks);
    assert.equal(areas.length, 2);

    const groq = areas.find((a) => a.areaId === "groq");
    assert.ok(groq);
    assert.equal(groq.taskCount, 2);
    assert.ok(Math.abs(groq.score - 0.7) < 0.01); // (0.8+0.6)/2

    const studio = areas.find((a) => a.areaId === "studio");
    assert.ok(studio);
    assert.equal(studio.taskCount, 1);
  });

  it("computes delta from previous scores", () => {
    const tasks = [
      computeTaskScore([makeDimension({ score: 0.8 })], {
        taskId: "groq-basic",
        weights: { "task-completion": 1.0 },
      }),
    ];
    // Second argument supplies the previous per-area score (groq: 0.6).
    const areas = aggregateAreas(tasks, { groq: 0.6 });
    // Strict inequality: only a literal null fails, as in the original check.
    assert.notEqual(areas[0].delta, null);
    // Current 0.8 against previous 0.6 → expected delta of 0.2.
    assert.ok(Math.abs(areas[0].delta - 0.2) < 0.01);
  });
});
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
// Score normalization
|
|
178
|
+
// ---------------------------------------------------------------------------
|
|
179
|
+
// Suite: mapping raw assertion scores onto the 0–1 range per assertion type.
describe("normalizeScore", () => {
  it("normalizes LLM rubric scores (0-100 → 0-1)", () => {
    assert.ok(Math.abs(normalizeScore(75, "llm-rubric") - 0.75) < 0.01);
  });

  it("passes through already-normalized scores", () => {
    // A value already inside 0–1 must not be rescaled a second time.
    assert.ok(Math.abs(normalizeScore(0.75, "llm-rubric") - 0.75) < 0.01);
  });

  it("normalizes boolean assertions to 0 or 1", () => {
    assert.equal(normalizeScore(1, "contains"), 1);
    assert.equal(normalizeScore(0, "contains"), 0);
  });

  it("clamps similarity scores to [0, 1]", () => {
    // Out-of-range inputs on both sides of the interval.
    assert.equal(normalizeScore(1.5, "similar"), 1);
    assert.equal(normalizeScore(-0.1, "similar"), 0);
  });
});
|
|
195
|
+
// ---------------------------------------------------------------------------
|
|
196
|
+
// Ensemble grading
|
|
197
|
+
// ---------------------------------------------------------------------------
|
|
198
|
+
// Suite: combining several grader scores into one ensemble score plus an
// agreement figure.
describe("computeEnsembleScore", () => {
  it("computes mean ensemble score", () => {
    const { score, agreement } = computeEnsembleScore([0.8, 0.6, 0.7], "mean");
    // (0.8 + 0.6 + 0.7) / 3 = 0.7
    assert.ok(Math.abs(score - 0.7) < 0.01);
    assert.ok(agreement > 0);
  });

  it("computes median ensemble score", () => {
    // Middle value of the sorted inputs [0.5, 0.7, 0.9].
    const { score } = computeEnsembleScore([0.9, 0.5, 0.7], "median");
    assert.ok(Math.abs(score - 0.7) < 0.01);
  });

  it("computes max ensemble score", () => {
    const { score } = computeEnsembleScore([0.9, 0.5, 0.7], "max");
    assert.ok(Math.abs(score - 0.9) < 0.01);
  });

  it("agreement is 1 for identical scores", () => {
    // Strategy argument omitted on purpose: exercises the default.
    const { agreement } = computeEnsembleScore([0.8, 0.8, 0.8]);
    assert.ok(Math.abs(agreement - 1.0) < 0.01);
  });

  it("agreement decreases with divergent scores", () => {
    // Maximally divergent pair on the 0–1 scale.
    const { agreement } = computeEnsembleScore([0.0, 1.0]);
    assert.ok(agreement < 0.6);
  });
});
|
|
221
|
+
// ---------------------------------------------------------------------------
|
|
222
|
+
// Storage schema
|
|
223
|
+
// ---------------------------------------------------------------------------
|
|
224
|
+
// Suite: schema-version constant, version detection, and document migration.
describe("storage schema", () => {
  it("CURRENT_SCHEMA_VERSION is 1", () => {
    assert.equal(CURRENT_SCHEMA_VERSION, 1);
  });

  it("isSchemaVersioned detects versioned docs", () => {
    assert.equal(isSchemaVersioned({ schemaVersion: 1 }), true);
    // Missing field and a null document must both be rejected.
    assert.equal(isSchemaVersioned({}), false);
    assert.equal(isSchemaVersioned(null), false);
  });

  it("migrateDocument is no-op for current version", () => {
    const doc = { schemaVersion: 1, _type: "ailf.run" };
    const migrated = migrateDocument(doc);
    // Only the version field is checked here.
    assert.equal(migrated.schemaVersion, 1);
  });
});
|
|
239
|
+
// ---------------------------------------------------------------------------
|
|
240
|
+
// Plugin registry
|
|
241
|
+
// ---------------------------------------------------------------------------
|
|
242
|
+
// Suite: registering modes, assertions, and whole presets on a fresh
// in-memory registry. Each test builds its own registry instance.
describe("InMemoryPluginRegistry", () => {
  it("registers and retrieves modes", () => {
    const registry = new InMemoryPluginRegistry();
    registry.registerMode({
      id: "custom",
      label: "Custom Mode",
      validProviderPatterns: [".*"],
      rubricTemplateIds: [],
      handlerModule: "./custom.js",
    });
    assert.equal(registry.getModes().length, 1);
    assert.equal(registry.getMode("custom")?.label, "Custom Mode");
  });

  it("registers and retrieves assertions", () => {
    const registry = new InMemoryPluginRegistry();
    registry.registerAssertion({
      type: "api-match",
      label: "API Match",
      compatibleModes: ["custom"],
      handlerModule: "./api-match.js",
    });
    assert.equal(registry.getAssertions().length, 1);
  });

  it("registers a complete preset", () => {
    const registry = new InMemoryPluginRegistry();
    registry.registerPreset(sanityLiteracyPreset);
    // Preset should register its modes, assertions, rubric templates
    assert.ok(registry.getMode("literacy"));
    assert.ok(registry.getAssertions().length > 0);
    assert.ok(registry.getRubricTemplates().length > 0);
    // equal() reports the actual count on failure, unlike ok(len === 1).
    assert.equal(registry.getPresets().length, 1);
  });
});
|
|
275
|
+
// ---------------------------------------------------------------------------
|
|
276
|
+
// sanity-literacy preset
|
|
277
|
+
// ---------------------------------------------------------------------------
|
|
278
|
+
/**
 * Contract tests for the statically exported `sanityLiteracyPreset`.
 *
 * Each test pins one extension point of the preset (modes, assertions,
 * rubric templates, fixture resolvers, prompt templates, scoring profiles,
 * source definitions, feature definitions, doc fetcher factory).
 *
 * Lookups are asserted to exist before they are dereferenced so that a
 * missing entry is reported as a named assertion failure instead of a bare
 * TypeError, matching the guard already used by the "production source" test.
 */
describe("sanityLiteracyPreset", () => {
  it("has correct manifest", () => {
    assert.equal(sanityLiteracyPreset.name, "sanity-literacy");
    assert.equal(sanityLiteracyPreset.manifest.pluginApiVersion, 1);
  });

  it("registers literacy mode", () => {
    assert.equal(sanityLiteracyPreset.modes?.length, 1);
    assert.equal(sanityLiteracyPreset.modes[0].id, "literacy");
  });

  it("includes core assertion types", () => {
    assert.ok(sanityLiteracyPreset.assertions, "preset should define assertions");
    const types = sanityLiteracyPreset.assertions.map((a) => a.type);
    for (const expected of ["contains", "llm-rubric", "javascript"]) {
      assert.ok(types.includes(expected), `missing assertion type "${expected}"`);
    }
  });

  it("includes 3 rubric templates", () => {
    assert.equal(sanityLiteracyPreset.rubricTemplates?.length, 3);
    const ids = sanityLiteracyPreset.rubricTemplates.map((t) => t.id);
    for (const expected of ["task-completion", "code-correctness", "doc-coverage"]) {
      assert.ok(ids.includes(expected), `missing rubric template "${expected}"`);
    }
  });

  it("rubric template scales match config/rubrics.ts authoritative source", () => {
    const templates = sanityLiteracyPreset.rubricTemplates;
    assert.ok(templates, "preset should define rubricTemplates");

    // Fail with the template id in the message rather than a TypeError on `.scale`.
    const requireTemplate = (id) => {
      const template = templates.find((t) => t.id === id);
      assert.ok(template, `rubric template "${id}" should exist`);
      return template;
    };

    const tc = requireTemplate("task-completion");
    assert.deepEqual(tc.scale, [
      "0: Couldn't attempt — missing critical information",
      "20: Attempted but fundamentally wrong approach",
      "50: Partial implementation — major functional gaps",
      "80: Mostly complete — minor issues or missing edge cases",
      "100: Fully functional code — works as expected",
    ]);
    assert.equal(tc.criteriaLabel, "Must demonstrate:");

    const cc = requireTemplate("code-correctness");
    assert.deepEqual(cc.scale, [
      "0: Broken code, syntax errors, or deprecated APIs",
      "30: Works but uses anti-patterns or inefficient approaches",
      "50: Works but not idiomatic",
      "80: Follows most best practices",
      "100: Follows all best practices, idiomatic implementation",
    ]);
    assert.equal(cc.criteriaLabel, "Check for:");

    const dc = requireTemplate("doc-coverage");
    assert.deepEqual(dc.scale, [
      "0: Had to hallucinate/guess most implementation details",
      "30: Significant gaps — filled with assumptions",
      "50: Some gaps — inferred from partial information",
      "80: Minor gaps — almost everything was documented",
      "100: Complete coverage — all necessary info was in docs",
    ]);
  });

  it("includes sanity:// fixture resolver", () => {
    assert.ok(sanityLiteracyPreset.fixtureResolvers?.some((r) => r.scheme === "sanity://"));
  });

  it("includes 3 prompt templates", () => {
    const templates = sanityLiteracyPreset.promptTemplates;
    assert.ok(templates);
    assert.ok(templates["with-docs"]);
    assert.ok(templates["without-docs"]);
    assert.ok(templates["agentic"]);
    assert.equal(Object.keys(templates).length, 3);
  });

  it("prompt template content matches literacy handler", () => {
    const templates = sanityLiteracyPreset.promptTemplates;
    assert.ok(templates, "preset should define promptTemplates");

    // [template key, placeholder the template text must contain]
    const expectedPlaceholders = [
      ["with-docs", "{{docs}}"],
      ["with-docs", "{{task}}"],
      ["without-docs", "{{task}}"],
      ["agentic", "{{task}}"],
    ];
    for (const [key, placeholder] of expectedPlaceholders) {
      assert.ok(templates[key], `prompt template "${key}" should exist`);
      assert.ok(
        templates[key].template.includes(placeholder),
        `prompt template "${key}" should contain ${placeholder}`,
      );
    }
  });

  it("includes default and output-only scoring profiles", () => {
    const profiles = sanityLiteracyPreset.scoringProfiles;
    assert.ok(profiles);
    assert.deepEqual(profiles["default"], {
      "task-completion": 0.5,
      "code-correctness": 0.25,
      "doc-coverage": 0.25,
    });
    assert.deepEqual(profiles["output-only"], {
      "task-completion": 0.6,
      "code-correctness": 0.4,
    });
  });

  it("includes 3 source definitions", () => {
    const sources = sanityLiteracyPreset.sourceDefs;
    assert.ok(sources);
    assert.equal(sources.length, 3);
    const names = sources.map((s) => s.name);
    for (const expected of ["production", "branch", "local"]) {
      assert.ok(names.includes(expected), `missing source "${expected}"`);
    }
  });

  it("production source has correct baseUrl", () => {
    assert.ok(sanityLiteracyPreset.sourceDefs, "preset should define sourceDefs");
    const prod = sanityLiteracyPreset.sourceDefs.find((s) => s.name === "production");
    assert.ok(prod);
    assert.equal(prod.baseUrl, "https://www.sanity.io/docs");
  });

  it("includes feature registry with all features", () => {
    const features = sanityLiteracyPreset.featureDefs;
    assert.ok(features);
    assert.equal(features.features.length, 14);
    const ids = features.features.map((f) => f.id);

    // Covered features
    const covered = [
      "groq",
      "visual-editing",
      "nextjs-live",
      "functions",
      "studio-setup",
      "frameworks",
    ];
    // Uncovered features
    const uncovered = [
      "portable-text",
      "image-assets",
      "mutations",
      "schemas",
      "authentication",
      "webhooks",
      "realtime",
      "ai-assist",
    ];
    for (const expected of [...covered, ...uncovered]) {
      assert.ok(ids.includes(expected), `missing feature "${expected}"`);
    }
  });

  it("includes a docFetcher factory", () => {
    assert.equal(typeof sanityLiteracyPreset.docFetcher, "function");
    // The factory should return an object exposing a fetch() method
    const fetcher = sanityLiteracyPreset.docFetcher();
    assert.ok(fetcher);
    assert.equal(typeof fetcher.fetch, "function");
  });
});
|
|
404
|
+
// ---------------------------------------------------------------------------
|
|
405
|
+
// createSanityLiteracyPreset factory
|
|
406
|
+
// ---------------------------------------------------------------------------
|
|
407
|
+
/**
 * Tests for the `createSanityLiteracyPreset` factory.
 *
 * Verifies that a preset built with an explicit `rootDir` exposes every
 * extension point, and that registering it into a fresh
 * `InMemoryPluginRegistry` makes each of them retrievable.
 *
 * Count checks use `assert.equal` (not `assert.ok(a === b)`) so that a
 * failure reports the actual value, consistent with the other length
 * assertions in this suite.
 */
describe("createSanityLiteracyPreset", () => {
  it("returns a preset with all extension points populated", () => {
    const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
    assert.equal(preset.name, "sanity-literacy");

    const extensionPoints = [
      "modes",
      "assertions",
      "rubricTemplates",
      "fixtureResolvers",
      "promptTemplates",
      "scoringProfiles",
      "docFetcher",
      "sourceDefs",
      "featureDefs",
    ];
    for (const key of extensionPoints) {
      assert.ok(preset[key], `preset.${key} should be populated`);
    }
  });

  it("registers all extension points into the registry", () => {
    const registry = new InMemoryPluginRegistry();
    const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
    registry.registerPreset(preset);

    assert.ok(registry.getMode("literacy"));
    assert.ok(registry.getAssertions().length > 0);
    assert.equal(registry.getRubricTemplates().length, 3);
    assert.equal(Object.keys(registry.getPromptTemplates()).length, 3);
    assert.equal(Object.keys(registry.getScoringProfiles()).length, 2);
    assert.ok(registry.getDocFetcherFactory());
    assert.equal(registry.getSourceDefs().length, 3);
    assert.ok(registry.getFeatureDefs());
    assert.equal(registry.getFeatureDefs().features.length, 14);
  });
});
|
|
436
|
+
// ---------------------------------------------------------------------------
|
|
437
|
+
// Preset is single source of truth for sources and features
|
|
438
|
+
// ---------------------------------------------------------------------------
|
|
439
|
+
/**
 * Guards the split of responsibilities between the on-disk config files and
 * the preset: `config/sources` and `config/features` must load but be empty,
 * while the preset itself carries the 3 sources and 14 features.
 */
describe("preset is single source of truth for Sanity config", () => {
  // Loads a named config file relative to the directory four levels above
  // this test file. The loader is imported lazily, at test time.
  const loadFromRoot = async (configName) => {
    const loaderModule = await import("../../compiler/config-loader.js");
    const rootDir = resolve(__dirname, "..", "..", "..", "..");
    return loaderModule.tryLoadConfigFile(configName, rootDir);
  };

  it("config/sources.ts exports an empty array", async () => {
    const sourcesFile = await loadFromRoot("sources");
    assert.ok(sourcesFile, "config/sources.ts should exist");
    const exported = sourcesFile.data;
    assert.ok(Array.isArray(exported), "should export an array");
    assert.equal(exported.length, 0, "config/sources should be empty (preset provides sources)");
  });

  it("config/features.ts exports an empty features array", async () => {
    const featuresFile = await loadFromRoot("features");
    assert.ok(featuresFile, "config/features.ts should exist");
    assert.ok(Array.isArray(featuresFile.data.features), "should have a features array");
    assert.equal(
      featuresFile.data.features.length,
      0,
      "config/features should be empty (preset provides features)",
    );
  });

  it("preset contains all 3 source entries", () => {
    const presetSources = sanityLiteracyPreset.sourceDefs;
    assert.equal(presetSources.length, 3);
    const sortedNames = presetSources.map((entry) => entry.name);
    sortedNames.sort();
    assert.deepEqual(sortedNames, ["branch", "local", "production"]);
  });

  it("preset contains all 14 feature entries", () => {
    const presetFeatures = sanityLiteracyPreset.featureDefs.features;
    assert.equal(presetFeatures.length, 14);
    const withStatus = (status) => presetFeatures.filter((entry) => entry.status === status);
    assert.equal(withStatus("covered").length, 6, "should have 6 covered features");
    assert.equal(withStatus("uncovered").length, 8, "should have 8 uncovered features");
  });
});
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
|
|
3
|
+
*
|
|
4
|
+
* Verifies that `scoreTestGroup` produces the same 0–100 output as the
|
|
5
|
+
* legacy `accumulateDimensions → averageDimensions → weightedComposite`
|
|
6
|
+
* chain when given identical inputs.
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
|
|
9
|
+
*/
|
|
10
|
+
export {};
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scoring-bridge.test.ts — Tests for the 4-tier scoring engine bridge.
|
|
3
|
+
*
|
|
4
|
+
* Verifies that `scoreTestGroup` produces the same 0–100 output as the
|
|
5
|
+
* legacy `accumulateDimensions → averageDimensions → weightedComposite`
|
|
6
|
+
* chain when given identical inputs.
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/scoring-bridge.test.ts
|
|
9
|
+
*/
|
|
10
|
+
import assert from "node:assert/strict";
|
|
11
|
+
import { describe, it } from "node:test";
|
|
12
|
+
import { scoreTestGroup } from "../scoring-bridge.js";
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Helpers
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
/**
 * Builds a mock test result for feeding into `scoreTestGroup`.
 *
 * For every dimension supplied in `overrides.dimensions`, one passing
 * `llm-rubric` component result is emitted whose `reason` is the JSON string
 * `{"score": <value>}` and whose `score` is `<value> / 100`. Components are
 * always emitted in the order task-completion, code-correctness, doc-coverage.
 *
 * A dimension that is `undefined` or `null` is treated as absent, in line
 * with the `??` fallbacks used for the other overrides; `0` is a real score
 * and still produces a component.
 *
 * @param {object} [overrides]
 * @param {{taskCompletion?: number, codeCorrectness?: number, docCoverage?: number}} [overrides.dimensions]
 *   Per-dimension scores as passed by the tests (e.g. 80, 70, 60).
 * @param {number} [overrides.cost] Defaults to 0.01.
 * @param {string} [overrides.description] Defaults to "test".
 * @param {object} [overrides.vars] Defaults to `{ task: "test", docs: "" }`.
 * @returns {object} Object with `cost`, `description`, `gradingResult`,
 *   `response` and `vars` fields.
 */
function makeTestResult(overrides) {
  const dims = overrides?.dimensions ?? {};

  // [key in overrides.dimensions, dimension id stored in assertion metadata].
  // Array order fixes the order of the emitted component results.
  const rubricDimensions = [
    ["taskCompletion", "task-completion"],
    ["codeCorrectness", "code-correctness"],
    ["docCoverage", "doc-coverage"],
  ];

  const componentResults = rubricDimensions
    .filter(([key]) => dims[key] !== undefined && dims[key] !== null)
    .map(([key, dimension]) => ({
      assertion: {
        type: "llm-rubric",
        metadata: { dimension },
      },
      pass: true,
      reason: JSON.stringify({ score: dims[key] }),
      score: dims[key] / 100,
    }));

  return {
    cost: overrides?.cost ?? 0.01,
    description: overrides?.description ?? "test",
    gradingResult: {
      componentResults,
      pass: true,
    },
    response: { output: "mock output" },
    vars: overrides?.vars ?? { task: "test", docs: "" },
  };
}
|
|
63
|
+
/**
 * Scoring profile used by most tests in this file: per-dimension weights
 * keyed by dimension id. The three weights add up to 1.
 */
const DEFAULT_PROFILE = {
  "code-correctness": 0.35, // weight applied to the code-correctness score
  "doc-coverage": 0.25, // weight applied to the doc-coverage score
  "task-completion": 0.4, // weight applied to the task-completion score
};
|
|
68
|
+
/**
 * Scoring profile without a "doc-coverage" entry: only code-correctness and
 * task-completion carry a weight. The two weights add up to 1.
 */
const OUTPUT_ONLY_PROFILE = {
  "code-correctness": 0.55, // weight applied to the code-correctness score
  "task-completion": 0.45, // weight applied to the task-completion score
};
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
// Tests
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
/**
 * Baseline behaviour of `scoreTestGroup`: empty input, a single fully scored
 * test, averaging over several tests, and cost accumulation.
 */
describe("scoreTestGroup — basic scoring", () => {
  it("returns zeroes for empty test array", () => {
    const { composite, totalCost, dimensions } = scoreTestGroup([], DEFAULT_PROFILE);
    assert.equal(composite, 0);
    assert.equal(totalCost, 0);
    assert.deepEqual(dimensions, {});
  });

  it("scores a single test with all dimensions", () => {
    const fullyScored = makeTestResult({
      dimensions: {
        taskCompletion: 80,
        codeCorrectness: 70,
        docCoverage: 60,
      },
    });
    const { dimensions, composite } = scoreTestGroup([fullyScored], DEFAULT_PROFILE);
    // Weighted sum: 80*0.4 + 70*0.35 + 60*0.25 = 32 + 24.5 + 15 = 71.5, expected as 72
    assert.equal(dimensions.taskCompletion, 80);
    assert.equal(dimensions.codeCorrectness, 70);
    assert.equal(dimensions.docCoverage, 60);
    assert.equal(composite, 72);
  });

  it("averages across multiple tests", () => {
    const first = makeTestResult({
      dimensions: { taskCompletion: 80, codeCorrectness: 60 },
    });
    const second = makeTestResult({
      dimensions: { taskCompletion: 60, codeCorrectness: 80 },
    });
    const { dimensions, composite } = scoreTestGroup([first, second], OUTPUT_ONLY_PROFILE);
    // Both dimensions average to 70, so 70*0.45 + 70*0.55 = 31.5 + 38.5 = 70
    assert.equal(dimensions.taskCompletion, 70);
    assert.equal(dimensions.codeCorrectness, 70);
    assert.equal(composite, 70);
  });

  it("accumulates cost across tests", () => {
    const priced = [
      { cost: 0.05, dimensions: { taskCompletion: 80 } },
      { cost: 0.03, dimensions: { taskCompletion: 70 } },
    ].map((overrides) => makeTestResult(overrides));
    const { totalCost } = scoreTestGroup(priced, DEFAULT_PROFILE);
    // Compared with a tolerance because 0.05 + 0.03 is not exactly 0.08 in floating point
    assert.ok(Math.abs(totalCost - 0.08) < 0.001);
  });
});
|
|
124
|
+
/**
 * How the scoring profile passed to `scoreTestGroup` shapes the composite:
 * dimensions without a weight are still reported but must not contribute.
 */
describe("scoreTestGroup — profile handling", () => {
  it("uses output-only profile (excludes doc-coverage)", () => {
    const scored = makeTestResult({
      dimensions: {
        taskCompletion: 80,
        codeCorrectness: 60,
        docCoverage: 100,
      },
    });
    const { dimensions, composite } = scoreTestGroup([scored], OUTPUT_ONLY_PROFILE);
    // docCoverage is still reported under dimensions, but carries no weight:
    // 80*0.45 + 60*0.55 = 36 + 33 = 69
    assert.equal(dimensions.docCoverage, 100);
    assert.equal(composite, 69);
  });

  it("handles profile with only one dimension", () => {
    const scored = makeTestResult({
      dimensions: { taskCompletion: 90, codeCorrectness: 50 },
    });
    const singleDimensionProfile = { "task-completion": 1.0 };
    const { composite } = scoreTestGroup([scored], singleDimensionProfile);
    // With task-completion as the only weighted dimension, composite equals its score
    assert.equal(composite, 90);
  });
});
|
|
152
|
+
/**
 * Edge cases for `scoreTestGroup`: results without any rubric component, and
 * the `rawDimensions` list exposed alongside the scaled dimension scores.
 */
describe("scoreTestGroup — edge cases", () => {
  it("handles tests with no rubric components", () => {
    // Hand-built result whose only component is a non-rubric assertion.
    const rubricless = {
      cost: 0.01,
      description: "no rubrics",
      gradingResult: {
        componentResults: [{ assertion: { type: "javascript" }, pass: true, score: 1 }],
        pass: true,
      },
      response: { output: "mock" },
      vars: { task: "test", docs: "" },
    };
    const { composite, totalCost } = scoreTestGroup([rubricless], DEFAULT_PROFILE);
    // Without llm-rubric components the composite is expected to be 0
    assert.equal(composite, 0);
    assert.equal(totalCost, 0.01);
  });

  it("provides raw DimensionScore objects for advanced consumers", () => {
    const scored = makeTestResult({
      dimensions: { taskCompletion: 80, codeCorrectness: 60 },
    });
    const { rawDimensions } = scoreTestGroup([scored], DEFAULT_PROFILE);
    assert.ok(rawDimensions.length >= 2);
    const taskCompletionRaw = rawDimensions.find(
      (entry) => entry.dimensionId === "task-completion",
    );
    assert.ok(taskCompletionRaw);
    // Raw scores are on a 0–1 scale
    assert.ok(taskCompletionRaw.score >= 0 && taskCompletionRaw.score <= 1);
    assert.equal(taskCompletionRaw.assertionCount, 1);
  });
});
|