@sanity/ailf 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/features.ts +23 -0
- package/config/models.ts +83 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
- package/dist/_vendor/ailf-core/config-helpers.js +150 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +133 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
- package/dist/adapters/task-sources/index.d.ts +1 -0
- package/dist/adapters/task-sources/index.js +1 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
- package/dist/adapters/task-sources/repo-task-source.js +69 -16
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +7 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/composition-root.d.ts +1 -1
- package/dist/composition-root.js +67 -4
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +24 -6
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +6 -4
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +245 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +6 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
- package/dist/pipeline/mirror-repo-tasks.js +16 -15
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +6 -3
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -1,6 +1,21 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/generate-configs.ts
|
|
3
3
|
*
|
|
4
|
+
* @deprecated This is the LEGACY compilation path. New code should use the
|
|
5
|
+
* config compiler pipeline instead:
|
|
6
|
+
*
|
|
7
|
+
* import { compileLiteracyTasks } from "./compiler/literacy-bridge.js"
|
|
8
|
+
* import { buildTaskGraph, compileToPromptfoo } from "./compiler/index.js"
|
|
9
|
+
*
|
|
10
|
+
* This file is retained behind the `--legacy-compiler` CLI flag as an
|
|
11
|
+
* emergency fallback during the migration period. It will be removed once
|
|
12
|
+
* the new compiler has been validated in production.
|
|
13
|
+
*
|
|
14
|
+
* @see packages/eval/src/pipeline/compiler/ — the new compiler pipeline
|
|
15
|
+
* @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
16
|
+
*
|
|
17
|
+
* ---
|
|
18
|
+
*
|
|
4
19
|
* Reads config/models.yaml (the central model registry) and generates all
|
|
5
20
|
* promptfoo config files with the correct provider entries.
|
|
6
21
|
*
|
|
@@ -19,12 +34,15 @@
|
|
|
19
34
|
* @see docs/exec-plans/eliminate-lib-layer.md
|
|
20
35
|
*/
|
|
21
36
|
import { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "../_vendor/ailf-core/index.js";
|
|
22
|
-
import { existsSync,
|
|
37
|
+
import { existsSync, readdirSync, writeFileSync } from "fs";
|
|
23
38
|
import { resolve } from "path";
|
|
24
|
-
import { dump
|
|
39
|
+
import { dump } from "js-yaml";
|
|
25
40
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
41
|
+
import { loadConfigFile } from "./compiler/config-loader.js";
|
|
42
|
+
import { LITERACY_PROMPT_TEMPLATES } from "./compiler/mode-handlers/literacy-handler.js";
|
|
26
43
|
import { expandTaskDefinitions, loadAndExpandTasks } from "./expand-tasks.js";
|
|
27
44
|
import { validateModelsYaml } from "./validate.js";
|
|
45
|
+
import { LiteracyVariant } from "./normalize-mode.js";
|
|
28
46
|
import { loadSource } from "../sources.js";
|
|
29
47
|
// Re-export pure functions from core for backward compatibility.
|
|
30
48
|
// Tests and other modules that previously imported from lib/generate-configs
|
|
@@ -44,32 +62,43 @@ export function discoverTaskFiles(rootDir) {
|
|
|
44
62
|
.sort()
|
|
45
63
|
.map((f) => `file://tasks/${f}`);
|
|
46
64
|
}
|
|
47
|
-
/**
|
|
65
|
+
/**
|
|
66
|
+
* Load prompt templates. Uses handler-owned literacy templates as defaults,
|
|
67
|
+
* with config/prompts.ts as an override layer for user customization.
|
|
68
|
+
*/
|
|
48
69
|
export function loadPrompts(rootDir) {
|
|
49
|
-
const promptsPath = resolve(rootDir, "config", "prompts.yaml");
|
|
50
|
-
if (!existsSync(promptsPath)) {
|
|
51
|
-
throw new Error(`config/prompts.yaml not found at ${promptsPath}. This file is required — it defines the prompt templates for all evaluation modes.`);
|
|
52
|
-
}
|
|
53
|
-
const raw = readFileSync(promptsPath, "utf-8");
|
|
54
|
-
const data = load(raw);
|
|
55
70
|
const toPrompt = (entry) => ({
|
|
56
71
|
id: entry.id,
|
|
57
72
|
label: entry.label,
|
|
58
73
|
raw: entry.template,
|
|
59
74
|
});
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
75
|
+
// Load user overrides from config/prompts (may be empty after Wave 4)
|
|
76
|
+
let overrides = {};
|
|
77
|
+
try {
|
|
78
|
+
const loaded = loadConfigFile("prompts", rootDir).data;
|
|
79
|
+
// config/prompts.ts may export a Record (legacy) or an empty array (post-Wave 4)
|
|
80
|
+
if (loaded && !Array.isArray(loaded)) {
|
|
81
|
+
overrides = loaded;
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
catch {
|
|
85
|
+
// No config/prompts file — use handler defaults only
|
|
63
86
|
}
|
|
87
|
+
// Handler-owned templates are the canonical source; overrides take precedence
|
|
64
88
|
return {
|
|
65
|
-
agentic:
|
|
66
|
-
|
|
67
|
-
|
|
89
|
+
agentic: overrides[LiteracyVariant.AGENTIC]
|
|
90
|
+
? toPrompt(overrides[LiteracyVariant.AGENTIC])
|
|
91
|
+
: toPrompt(LITERACY_PROMPT_TEMPLATES[LiteracyVariant.AGENTIC]),
|
|
92
|
+
withDocs: overrides["with-docs"]
|
|
93
|
+
? toPrompt(overrides["with-docs"])
|
|
94
|
+
: toPrompt(LITERACY_PROMPT_TEMPLATES["with-docs"]),
|
|
95
|
+
withoutDocs: overrides["without-docs"]
|
|
96
|
+
? toPrompt(overrides["without-docs"])
|
|
97
|
+
: toPrompt(LITERACY_PROMPT_TEMPLATES["without-docs"]),
|
|
68
98
|
};
|
|
69
99
|
}
|
|
70
100
|
function loadModels(rootDir) {
|
|
71
|
-
|
|
72
|
-
return load(raw);
|
|
101
|
+
return loadConfigFile("models", rootDir).data;
|
|
73
102
|
}
|
|
74
103
|
// ---------------------------------------------------------------------------
|
|
75
104
|
// Shared components
|
|
@@ -191,7 +220,7 @@ function generateAgenticConfig(models, tests, prompts, source, searchMode, allow
|
|
|
191
220
|
};
|
|
192
221
|
}
|
|
193
222
|
function generateBaselineConfig(models, tests, prompts) {
|
|
194
|
-
const baselineModels = models.models.filter((m) => modelMatchesMode(m,
|
|
223
|
+
const baselineModels = models.models.filter((m) => modelMatchesMode(m, LiteracyVariant.STANDARD));
|
|
195
224
|
const providers = baselineModels.map((model) => ({
|
|
196
225
|
config: mergeConfig(models.defaults, model.config),
|
|
197
226
|
id: model.id,
|
|
@@ -217,7 +246,7 @@ function generateBaselineConfig(models, tests, prompts) {
|
|
|
217
246
|
};
|
|
218
247
|
}
|
|
219
248
|
function generateObservedConfig(models, tests, prompts) {
|
|
220
|
-
const observedModels = models.models.filter((m) => modelMatchesMode(m,
|
|
249
|
+
const observedModels = models.models.filter((m) => modelMatchesMode(m, LiteracyVariant.OBSERVED));
|
|
221
250
|
const providers = observedModels.map((model) => {
|
|
222
251
|
const modelName = extractModelName(model.id);
|
|
223
252
|
return {
|
|
@@ -293,7 +322,7 @@ export function generateConfigs(options) {
|
|
|
293
322
|
const filter = options.filter?.areas || options.filter?.taskIds
|
|
294
323
|
? options.filter
|
|
295
324
|
: undefined;
|
|
296
|
-
// Expand tasks — use
|
|
325
|
+
// Expand tasks — use GeneralizedTaskDefinition[] from TaskSource when provided,
|
|
297
326
|
// otherwise fall back to loading from tasks/*.yaml files.
|
|
298
327
|
let entries;
|
|
299
328
|
let agenticEntries;
|
|
@@ -303,16 +332,16 @@ export function generateConfigs(options) {
|
|
|
303
332
|
taskCount: options.tasks.length,
|
|
304
333
|
taskIds: options.tasks.map((t) => t.id),
|
|
305
334
|
});
|
|
306
|
-
const baselineResult = expandTaskDefinitions(options.tasks, rootDir,
|
|
335
|
+
const baselineResult = expandTaskDefinitions(options.tasks, rootDir, LiteracyVariant.STANDARD);
|
|
307
336
|
entries = baselineResult.entries;
|
|
308
337
|
log.info(` Expanded ${baselineResult.stats.totalTasks} task(s) → ${baselineResult.stats.expandedTotal} test entries (from TaskSource)`);
|
|
309
|
-
const agenticResult = expandTaskDefinitions(options.tasks, rootDir,
|
|
338
|
+
const agenticResult = expandTaskDefinitions(options.tasks, rootDir, LiteracyVariant.AGENTIC);
|
|
310
339
|
agenticEntries = agenticResult.entries;
|
|
311
340
|
log.info(` Agentic: ${agenticResult.stats.expandedTotal} entries (gold only, no baseline)`);
|
|
312
341
|
}
|
|
313
342
|
else {
|
|
314
343
|
// Legacy path — read from tasks/*.yaml files
|
|
315
|
-
const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter,
|
|
344
|
+
const { entries: baselineEntries, stats } = loadAndExpandTasks(rootDir, filter, LiteracyVariant.STANDARD, log);
|
|
316
345
|
entries = baselineEntries;
|
|
317
346
|
log.info(` Expanded ${stats.singleDefinitions} task(s) → ${stats.expandedTotal} test entries`);
|
|
318
347
|
if (stats.legacyEntries > 0) {
|
|
@@ -328,7 +357,7 @@ export function generateConfigs(options) {
|
|
|
328
357
|
}
|
|
329
358
|
log.info(` Scoped to: ${parts.join("; ")}`);
|
|
330
359
|
}
|
|
331
|
-
const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter,
|
|
360
|
+
const { entries: agenticFromYaml, stats: agenticStats } = loadAndExpandTasks(rootDir, filter, LiteracyVariant.AGENTIC, log);
|
|
332
361
|
agenticEntries = agenticFromYaml;
|
|
333
362
|
log.info(` Agentic: ${agenticStats.expandedTotal} entries (gold only, no baseline)`);
|
|
334
363
|
}
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
* grader model prefix. Reads the appropriate API key from environment.
|
|
8
8
|
*
|
|
9
9
|
* Also exports `loadGraderModel()` to resolve the grader from
|
|
10
|
-
* `config/models
|
|
10
|
+
* `config/models`.
|
|
11
11
|
*
|
|
12
12
|
* Migrated from lib/grader-api.ts — no module-level side effects, no
|
|
13
13
|
* process.exit(), accepts rootDir as parameter for file-based operations.
|
|
@@ -26,11 +26,11 @@ interface ProviderConfig {
|
|
|
26
26
|
*/
|
|
27
27
|
export declare function gradeOnce(graderModel: string, responseText: string, rubricText: string, logger?: Logger): Promise<null | number>;
|
|
28
28
|
/**
|
|
29
|
-
* Load the grader model from `config/models
|
|
29
|
+
* Load the grader model from `config/models`.
|
|
30
30
|
* Returns both the model ID and human-readable label.
|
|
31
31
|
* Falls back to `openai:gpt-5` if not configured.
|
|
32
32
|
*
|
|
33
|
-
* @throws Error if config/models
|
|
33
|
+
* @throws Error if config/models is not found
|
|
34
34
|
*/
|
|
35
35
|
export declare function loadGraderModel(rootDir: string): {
|
|
36
36
|
id: string;
|
|
@@ -7,15 +7,13 @@
|
|
|
7
7
|
* grader model prefix. Reads the appropriate API key from environment.
|
|
8
8
|
*
|
|
9
9
|
* Also exports `loadGraderModel()` to resolve the grader from
|
|
10
|
-
* `config/models
|
|
10
|
+
* `config/models`.
|
|
11
11
|
*
|
|
12
12
|
* Migrated from lib/grader-api.ts — no module-level side effects, no
|
|
13
13
|
* process.exit(), accepts rootDir as parameter for file-based operations.
|
|
14
14
|
*/
|
|
15
|
-
import { existsSync, readFileSync } from "fs";
|
|
16
|
-
import { join } from "path";
|
|
17
|
-
import { load } from "js-yaml";
|
|
18
15
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
16
|
+
import { loadConfigFile } from "./compiler/config-loader.js";
|
|
19
17
|
// ---------------------------------------------------------------------------
|
|
20
18
|
// Public API
|
|
21
19
|
// ---------------------------------------------------------------------------
|
|
@@ -63,19 +61,14 @@ ${rubricText}
|
|
|
63
61
|
}
|
|
64
62
|
}
|
|
65
63
|
/**
|
|
66
|
-
* Load the grader model from `config/models
|
|
64
|
+
* Load the grader model from `config/models`.
|
|
67
65
|
* Returns both the model ID and human-readable label.
|
|
68
66
|
* Falls back to `openai:gpt-5` if not configured.
|
|
69
67
|
*
|
|
70
|
-
* @throws Error if config/models
|
|
68
|
+
* @throws Error if config/models is not found
|
|
71
69
|
*/
|
|
72
70
|
export function loadGraderModel(rootDir) {
|
|
73
|
-
const
|
|
74
|
-
if (!existsSync(modelsPath)) {
|
|
75
|
-
throw new Error(`config/models.yaml not found at ${modelsPath}`);
|
|
76
|
-
}
|
|
77
|
-
const raw = readFileSync(modelsPath, "utf-8");
|
|
78
|
-
const data = load(raw);
|
|
71
|
+
const data = loadConfigFile("models", rootDir).data;
|
|
79
72
|
return {
|
|
80
73
|
id: data?.grader?.id ?? "openai:gpt-5",
|
|
81
74
|
label: data?.grader?.label ?? "GPT-5 (grader)",
|
|
@@ -14,8 +14,8 @@
|
|
|
14
14
|
*/
|
|
15
15
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
16
16
|
import { join } from "path";
|
|
17
|
-
import { load } from "js-yaml";
|
|
18
17
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
18
|
+
import { loadConfigFile } from "./compiler/config-loader.js";
|
|
19
19
|
import { compareGraders, } from "./grader-comparison.js";
|
|
20
20
|
import { classifyCorrelation } from "./grader-validation.js";
|
|
21
21
|
import { gradeOnce } from "./grader-api.js";
|
|
@@ -23,26 +23,20 @@ import { gradeOnce } from "./grader-api.js";
|
|
|
23
23
|
// Internal helpers
|
|
24
24
|
// ---------------------------------------------------------------------------
|
|
25
25
|
function classifyDimension(component) {
|
|
26
|
+
// Prefer structured metadata — pass through any dimension name directly,
|
|
27
|
+
// enabling non-literacy profiles (MCP, agent, knowledge-probe)
|
|
26
28
|
const metadata = component.assertion?.metadata;
|
|
27
29
|
if (metadata?.dimension) {
|
|
28
|
-
|
|
29
|
-
case "code-correctness":
|
|
30
|
-
return "codeCorrectness";
|
|
31
|
-
case "doc-coverage":
|
|
32
|
-
return "docCoverage";
|
|
33
|
-
case "task-completion":
|
|
34
|
-
return "taskCompletion";
|
|
35
|
-
default:
|
|
36
|
-
return null;
|
|
37
|
-
}
|
|
30
|
+
return metadata.dimension;
|
|
38
31
|
}
|
|
32
|
+
// Fallback: heuristic name matching (returns kebab-case)
|
|
39
33
|
const value = (component.assertion?.value ?? "").toLowerCase();
|
|
40
34
|
if (value.includes("task completion"))
|
|
41
|
-
return "
|
|
35
|
+
return "task-completion";
|
|
42
36
|
if (value.includes("code correctness"))
|
|
43
|
-
return "
|
|
37
|
+
return "code-correctness";
|
|
44
38
|
if (value.includes("documentation coverage") || value.includes("hallucinate"))
|
|
45
|
-
return "
|
|
39
|
+
return "doc-coverage";
|
|
46
40
|
return null;
|
|
47
41
|
}
|
|
48
42
|
function detectFeatureArea(description) {
|
|
@@ -101,15 +95,10 @@ function extractJudgments(file) {
|
|
|
101
95
|
}
|
|
102
96
|
/**
|
|
103
97
|
* Load config: resolve baseline grader and candidate graders.
|
|
104
|
-
* Candidate overrides take precedence over config/models.
|
|
98
|
+
* Candidate overrides take precedence over config/models.
|
|
105
99
|
*/
|
|
106
100
|
function loadConfig(rootDir, candidateOverrides) {
|
|
107
|
-
const
|
|
108
|
-
if (!existsSync(modelsPath)) {
|
|
109
|
-
throw new Error(`config/models.yaml not found at ${modelsPath}`);
|
|
110
|
-
}
|
|
111
|
-
const raw = readFileSync(modelsPath, "utf-8");
|
|
112
|
-
const data = load(raw);
|
|
101
|
+
const data = loadConfigFile("models", rootDir).data;
|
|
113
102
|
const baseline = {
|
|
114
103
|
id: data?.grader?.id ?? "openai:gpt-5",
|
|
115
104
|
label: data?.grader?.label ?? "GPT-5 (grader)",
|
|
@@ -158,11 +147,15 @@ export function formatComparisonReport(result) {
|
|
|
158
147
|
const sep = "|------------------|-------------|--------|---------|-------|";
|
|
159
148
|
lines.push(h);
|
|
160
149
|
lines.push(sep);
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
150
|
+
// Derive display rows dynamically from whatever dimensions are present
|
|
151
|
+
const dims = Object.entries(pair.perDimension).map(([key, data]) => ({
|
|
152
|
+
data,
|
|
153
|
+
// kebab-case → Title Case (e.g. 'task-completion' → 'Task Completion')
|
|
154
|
+
name: key
|
|
155
|
+
.split("-")
|
|
156
|
+
.map((w) => w.charAt(0).toUpperCase() + w.slice(1))
|
|
157
|
+
.join(" "),
|
|
158
|
+
}));
|
|
166
159
|
for (const { data, name } of dims) {
|
|
167
160
|
const biasStr = data.bias > 0 ? `+${data.bias}` : `${data.bias}`;
|
|
168
161
|
lines.push(`| ${name.padEnd(16)} | r=${String(data.correlation).padStart(9)} | ${biasStr.padStart(6)} | ${String(data.meanAbsDiff).padStart(7)} | ${String(data.count).padStart(5)} |`);
|
|
@@ -208,7 +201,7 @@ export async function runGraderCompare(options) {
|
|
|
208
201
|
const { baseline, candidates } = loadConfig(rootDir, options.candidates);
|
|
209
202
|
if (candidates.length === 0) {
|
|
210
203
|
throw new Error("No candidate graders configured. " +
|
|
211
|
-
"Add grader-candidates to config/models
|
|
204
|
+
"Add grader-candidates to config/models or pass --candidate.");
|
|
212
205
|
}
|
|
213
206
|
// Load eval results
|
|
214
207
|
if (!existsSync(resultsPath)) {
|
|
@@ -51,12 +51,8 @@ export interface GraderPairComparison {
|
|
|
51
51
|
graderB: string;
|
|
52
52
|
/** Mean absolute difference between scores */
|
|
53
53
|
meanAbsDiff: number;
|
|
54
|
-
/** Per-dimension comparisons */
|
|
55
|
-
perDimension:
|
|
56
|
-
taskCompletion: DimensionPairComparison;
|
|
57
|
-
codeCorrectness: DimensionPairComparison;
|
|
58
|
-
docCoverage: DimensionPairComparison;
|
|
59
|
-
};
|
|
54
|
+
/** Per-dimension comparisons (keyed by kebab-case dimension name) */
|
|
55
|
+
perDimension: Record<string, DimensionPairComparison>;
|
|
60
56
|
}
|
|
61
57
|
/** Recommendation for a candidate grader */
|
|
62
58
|
export interface GraderRecommendation {
|
|
@@ -71,8 +67,8 @@ export interface GraderRecommendation {
|
|
|
71
67
|
export interface GraderScore {
|
|
72
68
|
/** Feature area (e.g., "groq") */
|
|
73
69
|
area: string;
|
|
74
|
-
/** Which scoring dimension */
|
|
75
|
-
dimension:
|
|
70
|
+
/** Which scoring dimension (kebab-case, e.g. 'task-completion') */
|
|
71
|
+
dimension: string;
|
|
76
72
|
/** Score assigned by this grader (0–100) */
|
|
77
73
|
score: number;
|
|
78
74
|
/** Task ID (e.g., "groq-blog-queries") */
|
|
@@ -68,16 +68,9 @@ function comparePair(a, b) {
|
|
|
68
68
|
// Find paired observations (present in both graders)
|
|
69
69
|
const pairedA = [];
|
|
70
70
|
const pairedB = [];
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
taskCompletion: [],
|
|
75
|
-
};
|
|
76
|
-
const dimPairsB = {
|
|
77
|
-
codeCorrectness: [],
|
|
78
|
-
docCoverage: [],
|
|
79
|
-
taskCompletion: [],
|
|
80
|
-
};
|
|
71
|
+
// Group by dimension dynamically — works with any dimension names
|
|
72
|
+
const dimPairsA = {};
|
|
73
|
+
const dimPairsB = {};
|
|
81
74
|
for (const sA of a.scores) {
|
|
82
75
|
const key = `${sA.taskId}::${sA.dimension}`;
|
|
83
76
|
const scoreB = bScoreMap.get(key);
|
|
@@ -85,8 +78,13 @@ function comparePair(a, b) {
|
|
|
85
78
|
continue;
|
|
86
79
|
pairedA.push(sA.score);
|
|
87
80
|
pairedB.push(scoreB);
|
|
88
|
-
dimPairsA[sA.dimension].push(sA.score);
|
|
89
|
-
dimPairsB[sA.dimension].push(scoreB);
|
|
81
|
+
(dimPairsA[sA.dimension] ??= []).push(sA.score);
|
|
82
|
+
(dimPairsB[sA.dimension] ??= []).push(scoreB);
|
|
83
|
+
}
|
|
84
|
+
// Build perDimension from all dimensions observed in paired data
|
|
85
|
+
const perDimension = {};
|
|
86
|
+
for (const dim of Object.keys(dimPairsA)) {
|
|
87
|
+
perDimension[dim] = computeDimensionPair(dimPairsA[dim], dimPairsB[dim]);
|
|
90
88
|
}
|
|
91
89
|
return {
|
|
92
90
|
bias: computeBias(pairedA, pairedB),
|
|
@@ -94,11 +92,7 @@ function comparePair(a, b) {
|
|
|
94
92
|
graderA: a.modelId,
|
|
95
93
|
graderB: b.modelId,
|
|
96
94
|
meanAbsDiff: computeMeanAbsDiff(pairedA, pairedB),
|
|
97
|
-
perDimension
|
|
98
|
-
codeCorrectness: computeDimensionPair(dimPairsA.codeCorrectness, dimPairsB.codeCorrectness),
|
|
99
|
-
docCoverage: computeDimensionPair(dimPairsA.docCoverage, dimPairsB.docCoverage),
|
|
100
|
-
taskCompletion: computeDimensionPair(dimPairsA.taskCompletion, dimPairsB.taskCompletion),
|
|
101
|
-
},
|
|
95
|
+
perDimension,
|
|
102
96
|
};
|
|
103
97
|
}
|
|
104
98
|
/** Mean signed difference (B - A). Positive = B scores higher. */
|
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
import { type Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
18
18
|
import type { RawPromptfooFile } from "./calculate-scores.js";
|
|
19
19
|
import { type GraderConsistency } from "./grader-consistency.js";
|
|
20
|
-
import type { DimensionName } from "./types.js";
|
|
21
20
|
/** Options for the grader consistency runner. */
|
|
22
21
|
export interface GraderConsistencyRunnerOptions {
|
|
23
22
|
/** Logger for structured output. Falls back to ConsoleLogger if omitted. */
|
|
@@ -34,8 +33,8 @@ interface GradingJudgment {
|
|
|
34
33
|
area: string;
|
|
35
34
|
/** Task description */
|
|
36
35
|
description: string;
|
|
37
|
-
/** Scoring dimension */
|
|
38
|
-
dimension:
|
|
36
|
+
/** Scoring dimension (kebab-case, e.g. 'task-completion') */
|
|
37
|
+
dimension: string;
|
|
39
38
|
/** The original score from the eval run */
|
|
40
39
|
originalScore: number;
|
|
41
40
|
/** Provider (model under test) */
|
|
@@ -23,28 +23,20 @@ import { analyzeConsistency, } from "./grader-consistency.js";
|
|
|
23
23
|
// Rubric dimension classification (similar to calculate-scores)
|
|
24
24
|
// ---------------------------------------------------------------------------
|
|
25
25
|
function classifyDimension(component) {
|
|
26
|
-
// Prefer structured metadata
|
|
26
|
+
// Prefer structured metadata — pass through any dimension name directly,
|
|
27
|
+
// enabling non-literacy profiles (MCP, agent, knowledge-probe)
|
|
27
28
|
const metadata = component.assertion?.metadata;
|
|
28
29
|
if (metadata?.dimension) {
|
|
29
|
-
|
|
30
|
-
case "code-correctness":
|
|
31
|
-
return "codeCorrectness";
|
|
32
|
-
case "doc-coverage":
|
|
33
|
-
return "docCoverage";
|
|
34
|
-
case "task-completion":
|
|
35
|
-
return "taskCompletion";
|
|
36
|
-
default:
|
|
37
|
-
return null;
|
|
38
|
-
}
|
|
30
|
+
return metadata.dimension;
|
|
39
31
|
}
|
|
40
|
-
// Fallback: heuristic name matching
|
|
32
|
+
// Fallback: heuristic name matching (returns kebab-case)
|
|
41
33
|
const value = (component.assertion?.value ?? "").toLowerCase();
|
|
42
34
|
if (value.includes("task completion"))
|
|
43
|
-
return "
|
|
35
|
+
return "task-completion";
|
|
44
36
|
if (value.includes("code correctness"))
|
|
45
|
-
return "
|
|
37
|
+
return "code-correctness";
|
|
46
38
|
if (value.includes("documentation coverage") || value.includes("hallucinate"))
|
|
47
|
-
return "
|
|
39
|
+
return "doc-coverage";
|
|
48
40
|
return null;
|
|
49
41
|
}
|
|
50
42
|
// ---------------------------------------------------------------------------
|
|
@@ -140,11 +132,15 @@ export function formatConsistencyReport(result, graderModel) {
|
|
|
140
132
|
const sep = "|------------------|-------|-------|-----------|-----------| ";
|
|
141
133
|
lines.push(h);
|
|
142
134
|
lines.push(sep);
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
135
|
+
// Derive display rows dynamically from whatever dimensions are present
|
|
136
|
+
const dims = Object.entries(result.perDimension).map(([key, data]) => ({
|
|
137
|
+
data,
|
|
138
|
+
// kebab-case → Title Case (e.g. 'task-completion' → 'Task Completion')
|
|
139
|
+
name: key
|
|
140
|
+
.split("-")
|
|
141
|
+
.map((w) => w.charAt(0).toUpperCase() + w.slice(1))
|
|
142
|
+
.join(" "),
|
|
143
|
+
}));
|
|
148
144
|
for (const { data, name } of dims) {
|
|
149
145
|
lines.push(`| ${name.padEnd(16)} | ${String(data.avgStdDev).padStart(5)} | ${String(data.maxStdDev).padStart(5)} | ${String(data.avgRange).padStart(9)} | ${String(data.judgmentCount).padStart(9)} |`);
|
|
150
146
|
}
|
|
@@ -35,12 +35,8 @@ export interface GraderConsistency {
|
|
|
35
35
|
judgments: JudgmentConsistency[];
|
|
36
36
|
/** Maximum standard deviation observed (worst-case noise) */
|
|
37
37
|
maxStdDev: number;
|
|
38
|
-
/** Per-dimension consistency */
|
|
39
|
-
perDimension:
|
|
40
|
-
taskCompletion: DimensionConsistency;
|
|
41
|
-
codeCorrectness: DimensionConsistency;
|
|
42
|
-
docCoverage: DimensionConsistency;
|
|
43
|
-
};
|
|
38
|
+
/** Per-dimension consistency (keyed by kebab-case dimension name) */
|
|
39
|
+
perDimension: Record<string, DimensionConsistency>;
|
|
44
40
|
/** Recommended noise threshold for comparisons (2× max dimension avgStdDev) */
|
|
45
41
|
recommendedThreshold: number;
|
|
46
42
|
/** Number of replications per judgment */
|
|
@@ -52,8 +48,8 @@ export interface GraderConsistency {
|
|
|
52
48
|
export interface JudgmentConsistency {
|
|
53
49
|
/** Feature area */
|
|
54
50
|
area: string;
|
|
55
|
-
/** Scoring dimension */
|
|
56
|
-
dimension:
|
|
51
|
+
/** Scoring dimension (kebab-case, e.g. 'task-completion') */
|
|
52
|
+
dimension: string;
|
|
57
53
|
/** Max score observed */
|
|
58
54
|
max: number;
|
|
59
55
|
/** Mean score across replications */
|
|
@@ -75,8 +71,8 @@ export interface JudgmentConsistency {
|
|
|
75
71
|
export interface ReplicatedGrading {
|
|
76
72
|
/** Feature area (derived from task description) */
|
|
77
73
|
area: string;
|
|
78
|
-
/** Which scoring dimension this rubric measures */
|
|
79
|
-
dimension:
|
|
74
|
+
/** Which scoring dimension this rubric measures (kebab-case, e.g. 'task-completion') */
|
|
75
|
+
dimension: string;
|
|
80
76
|
/** Provider (model under test) that produced the original response */
|
|
81
77
|
providerId?: string;
|
|
82
78
|
/** The scores from each replication (length = N replications) */
|
|
@@ -31,26 +31,7 @@ export function analyzeConsistency(gradings) {
|
|
|
31
31
|
generatedAt: new Date().toISOString(),
|
|
32
32
|
judgments: [],
|
|
33
33
|
maxStdDev: 0,
|
|
34
|
-
perDimension: {
|
|
35
|
-
codeCorrectness: {
|
|
36
|
-
avgRange: 0,
|
|
37
|
-
avgStdDev: 0,
|
|
38
|
-
judgmentCount: 0,
|
|
39
|
-
maxStdDev: 0,
|
|
40
|
-
},
|
|
41
|
-
docCoverage: {
|
|
42
|
-
avgRange: 0,
|
|
43
|
-
avgStdDev: 0,
|
|
44
|
-
judgmentCount: 0,
|
|
45
|
-
maxStdDev: 0,
|
|
46
|
-
},
|
|
47
|
-
taskCompletion: {
|
|
48
|
-
avgRange: 0,
|
|
49
|
-
avgStdDev: 0,
|
|
50
|
-
judgmentCount: 0,
|
|
51
|
-
maxStdDev: 0,
|
|
52
|
-
},
|
|
53
|
-
},
|
|
34
|
+
perDimension: {},
|
|
54
35
|
recommendedThreshold: 0,
|
|
55
36
|
replications: 0,
|
|
56
37
|
totalJudgments: 0,
|
|
@@ -58,17 +39,16 @@ export function analyzeConsistency(gradings) {
|
|
|
58
39
|
}
|
|
59
40
|
// Analyze each judgment
|
|
60
41
|
const judgments = gradings.map(analyzeJudgment);
|
|
61
|
-
// Group by dimension
|
|
62
|
-
const byDimension = {
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
}
|
|
67
|
-
const perDimension = {
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
};
|
|
42
|
+
// Group by dimension dynamically — works with any dimension names
|
|
43
|
+
const byDimension = {};
|
|
44
|
+
for (const j of judgments) {
|
|
45
|
+
;
|
|
46
|
+
(byDimension[j.dimension] ??= []).push(j);
|
|
47
|
+
}
|
|
48
|
+
const perDimension = {};
|
|
49
|
+
for (const [dim, dimJudgments] of Object.entries(byDimension)) {
|
|
50
|
+
perDimension[dim] = aggregateDimension(dimJudgments);
|
|
51
|
+
}
|
|
72
52
|
// Overall stats
|
|
73
53
|
const allStdDevs = judgments.map((j) => j.stdDev);
|
|
74
54
|
const allRanges = judgments.map((j) => j.range);
|
|
@@ -76,7 +56,8 @@ export function analyzeConsistency(gradings) {
|
|
|
76
56
|
// Recommended threshold: 2× the worst (highest) per-dimension avgStdDev.
|
|
77
57
|
// This means a comparison delta must exceed 2σ of the noisiest dimension
|
|
78
58
|
// to be classified as a real change rather than grader variance.
|
|
79
|
-
const
|
|
59
|
+
const dimAvgStdDevs = Object.values(perDimension).map((d) => d.avgStdDev);
|
|
60
|
+
const maxDimensionAvgStdDev = dimAvgStdDevs.length > 0 ? Math.max(...dimAvgStdDevs) : 0;
|
|
80
61
|
const recommendedThreshold = Math.ceil(maxDimensionAvgStdDev * 2);
|
|
81
62
|
// Sort judgments by stdDev descending (noisiest first)
|
|
82
63
|
const sortedJudgments = [...judgments].sort((a, b) => b.stdDev - a.stdDev);
|
|
@@ -119,11 +119,13 @@ export function formatSensitivityReport(result) {
|
|
|
119
119
|
const sep = "|------------------|-------------|---------|-------|-------|";
|
|
120
120
|
lines.push(h);
|
|
121
121
|
lines.push(sep);
|
|
122
|
-
const dims = [
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
122
|
+
const dims = Object.entries(result.perDimension).map(([key, data]) => ({
|
|
123
|
+
data,
|
|
124
|
+
name: key
|
|
125
|
+
.split(/[-_]/)
|
|
126
|
+
.map((w) => w.charAt(0).toUpperCase() + w.slice(1))
|
|
127
|
+
.join(" "),
|
|
128
|
+
}));
|
|
127
129
|
for (const { data, name } of dims) {
|
|
128
130
|
lines.push(`| ${name.padEnd(16)} | ${String(data.concordanceRate + "%").padStart(11)} | ${String(data.avgSeparation).padStart(7)} | ${String(data.tiedRate + "%").padStart(5)} | ${String(data.pairCount).padStart(5)} |`);
|
|
129
131
|
}
|
|
@@ -58,12 +58,8 @@ export interface GraderSensitivityResult {
|
|
|
58
58
|
generatedAt: string;
|
|
59
59
|
/** Grader model used */
|
|
60
60
|
graderModel: string;
|
|
61
|
-
/** Per-dimension sensitivity metrics */
|
|
62
|
-
perDimension:
|
|
63
|
-
taskCompletion: DimensionSensitivity;
|
|
64
|
-
codeCorrectness: DimensionSensitivity;
|
|
65
|
-
docCoverage: DimensionSensitivity;
|
|
66
|
-
};
|
|
61
|
+
/** Per-dimension sensitivity metrics (keyed by dimension name) */
|
|
62
|
+
perDimension: Record<string, DimensionSensitivity>;
|
|
67
63
|
/** Total paired comparisons analyzed */
|
|
68
64
|
totalPairs: number;
|
|
69
65
|
}
|
|
@@ -30,11 +30,15 @@ export function analyzeSensitivity(pairs, graderModel) {
|
|
|
30
30
|
// Overall concordance and separation
|
|
31
31
|
const { avgSeparation, concordanceRate, tiedRate: _tiedRate, } = computeMetrics(pairs);
|
|
32
32
|
// Per-dimension (based on the grading dimension, not the target dimension)
|
|
33
|
-
const
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
}
|
|
33
|
+
const dimGroups = {};
|
|
34
|
+
for (const p of pairs) {
|
|
35
|
+
;
|
|
36
|
+
(dimGroups[p.dimension] ??= []).push(p);
|
|
37
|
+
}
|
|
38
|
+
const perDimension = {};
|
|
39
|
+
for (const [dim, dimPairs] of Object.entries(dimGroups)) {
|
|
40
|
+
perDimension[dim] = computeMetrics(dimPairs);
|
|
41
|
+
}
|
|
38
42
|
// Cross-dimension: on-target (dimension matches targetDimension) vs off-target
|
|
39
43
|
const onTargetPairs = pairs.filter((p) => p.dimension === p.targetDimension);
|
|
40
44
|
const offTargetPairs = pairs.filter((p) => p.dimension !== p.targetDimension);
|
|
@@ -130,11 +134,7 @@ function emptyResult(graderModel) {
|
|
|
130
134
|
failedPairs: [],
|
|
131
135
|
generatedAt: new Date().toISOString(),
|
|
132
136
|
graderModel,
|
|
133
|
-
perDimension: {
|
|
134
|
-
codeCorrectness: emptyDim,
|
|
135
|
-
docCoverage: emptyDim,
|
|
136
|
-
taskCompletion: emptyDim,
|
|
137
|
-
},
|
|
137
|
+
perDimension: {},
|
|
138
138
|
totalPairs: 0,
|
|
139
139
|
};
|
|
140
140
|
}
|