@sanity/ailf 0.4.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/features.ts +23 -0
- package/config/models.ts +83 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
- package/dist/_vendor/ailf-core/config-helpers.js +150 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
- package/dist/_vendor/ailf-core/examples/index.js +10 -10
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +133 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
- package/dist/adapters/task-sources/index.d.ts +1 -0
- package/dist/adapters/task-sources/index.js +1 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
- package/dist/adapters/task-sources/repo-task-source.js +69 -16
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +7 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/composition-root.d.ts +1 -1
- package/dist/composition-root.js +67 -4
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +24 -6
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +6 -4
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +245 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +5 -7
- package/dist/pipeline/calculate-scores.js +74 -153
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +23 -14
- package/dist/pipeline/expand-tasks.js +37 -31
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +18 -21
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +6 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
- package/dist/pipeline/mirror-repo-tasks.js +16 -15
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +47 -0
- package/dist/pipeline/profile-resolution.js +91 -0
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +6 -3
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -62
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
package/dist/pipeline/cache.d.ts
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
*
|
|
11
11
|
* Cache invalidation triggers:
|
|
12
12
|
* - Content change: any input file's content changes → hash changes → miss
|
|
13
|
-
* - Config change: config/models
|
|
13
|
+
* - Config change: config/models, config/sources, tasks/*.yaml changes → miss
|
|
14
14
|
* - Manual bypass: --no-cache flag skips all cache lookups
|
|
15
15
|
* - Cache clear: delete results/cache/ to start fresh
|
|
16
16
|
*/
|
package/dist/pipeline/cache.js
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
*
|
|
11
11
|
* Cache invalidation triggers:
|
|
12
12
|
* - Content change: any input file's content changes → hash changes → miss
|
|
13
|
-
* - Config change: config/models
|
|
13
|
+
* - Config change: config/models, config/sources, tasks/*.yaml changes → miss
|
|
14
14
|
* - Manual bypass: --no-cache flag skips all cache lookups
|
|
15
15
|
* - Cache clear: delete results/cache/ to start fresh
|
|
16
16
|
*/
|
|
@@ -18,6 +18,19 @@ import { createHash } from "crypto";
|
|
|
18
18
|
import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync, } from "fs";
|
|
19
19
|
import { join, resolve } from "path";
|
|
20
20
|
// ---------------------------------------------------------------------------
|
|
21
|
+
// Helpers
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
/** Resolve first existing config file (matches loadConfigFile priority chain) */
|
|
24
|
+
function resolveConfig(rootDir, name) {
|
|
25
|
+
const r = (f) => resolve(rootDir, f);
|
|
26
|
+
for (const ext of [".ts", ".js", ".yaml", ".yml", ".json"]) {
|
|
27
|
+
const p = r(`config/${name}${ext}`);
|
|
28
|
+
if (existsSync(p))
|
|
29
|
+
return p;
|
|
30
|
+
}
|
|
31
|
+
return undefined;
|
|
32
|
+
}
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
21
34
|
// Constants
|
|
22
35
|
// ---------------------------------------------------------------------------
|
|
23
36
|
const CACHE_DIR_NAME = "cache";
|
|
@@ -79,7 +92,10 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
79
92
|
const isBaseline = step === "eval-baseline" || step === "eval";
|
|
80
93
|
const isAgentic = step === "eval-agentic" || step === "eval";
|
|
81
94
|
const isObserved = step === "eval-observed" || step === "eval";
|
|
82
|
-
const paths = [
|
|
95
|
+
const paths = [];
|
|
96
|
+
const modelsPath = resolveConfig(rootDir, "models");
|
|
97
|
+
if (modelsPath)
|
|
98
|
+
paths.push(modelsPath);
|
|
83
99
|
// Config files — only the relevant ones for this mode
|
|
84
100
|
if (isBaseline) {
|
|
85
101
|
paths.push(r("promptfooconfig.yaml"));
|
|
@@ -130,25 +146,37 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
130
146
|
return paths;
|
|
131
147
|
}
|
|
132
148
|
case "fetch-docs": {
|
|
133
|
-
// Inputs: config
|
|
134
|
-
const paths = [
|
|
149
|
+
// Inputs: config sources + models, task files
|
|
150
|
+
const paths = [];
|
|
151
|
+
const sourcesPath = resolveConfig(rootDir, "sources");
|
|
152
|
+
const modelsPath2 = resolveConfig(rootDir, "models");
|
|
153
|
+
if (sourcesPath)
|
|
154
|
+
paths.push(sourcesPath);
|
|
155
|
+
if (modelsPath2)
|
|
156
|
+
paths.push(modelsPath2);
|
|
135
157
|
// Include all task files (they define feature areas)
|
|
136
158
|
const tasksDir = r("tasks");
|
|
137
159
|
if (existsSync(tasksDir)) {
|
|
138
160
|
const taskFiles = readdirSync(tasksDir)
|
|
139
|
-
.filter((f) =>
|
|
161
|
+
.filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
|
|
140
162
|
.map((f) => join(tasksDir, f));
|
|
141
163
|
paths.push(...taskFiles);
|
|
142
164
|
}
|
|
143
165
|
return paths;
|
|
144
166
|
}
|
|
145
167
|
case "generate-configs": {
|
|
146
|
-
// Inputs: config
|
|
147
|
-
const paths = [
|
|
168
|
+
// Inputs: config models + sources, all task files
|
|
169
|
+
const paths = [];
|
|
170
|
+
const modelsPath3 = resolveConfig(rootDir, "models");
|
|
171
|
+
const sourcesPath2 = resolveConfig(rootDir, "sources");
|
|
172
|
+
if (modelsPath3)
|
|
173
|
+
paths.push(modelsPath3);
|
|
174
|
+
if (sourcesPath2)
|
|
175
|
+
paths.push(sourcesPath2);
|
|
148
176
|
const tasksDir = r("tasks");
|
|
149
177
|
if (existsSync(tasksDir)) {
|
|
150
178
|
const taskFiles = readdirSync(tasksDir)
|
|
151
|
-
.filter((f) =>
|
|
179
|
+
.filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
|
|
152
180
|
.map((f) => join(tasksDir, f));
|
|
153
181
|
paths.push(...taskFiles);
|
|
154
182
|
}
|
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
2
2
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
3
|
-
import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
|
|
4
3
|
import type { GraderJudgment, PerModelEntry } from "./types.js";
|
|
5
|
-
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.d.ts";
|
|
6
|
-
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
4
|
+
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
7
5
|
export interface PromptfooResultsWrapper {
|
|
8
6
|
results: RawTestResult[];
|
|
9
7
|
stats: {
|
|
@@ -64,7 +62,7 @@ export interface RawTestResult {
|
|
|
64
62
|
* @returns Record keyed by model ID, or null if only one model was used
|
|
65
63
|
* (per-model breakdown is redundant when there's only one model).
|
|
66
64
|
*/
|
|
67
|
-
export declare function calculateScoresPerModel(resultsPath: string,
|
|
65
|
+
export declare function calculateScoresPerModel(resultsPath: string, goldProfile: Record<string, number>, baselineProfile: Record<string, number>): null | PerModelEntry[];
|
|
68
66
|
/**
|
|
69
67
|
* Extract grader judgments (reason text + scores) from evaluation results.
|
|
70
68
|
*
|
|
@@ -82,7 +80,7 @@ export declare function extractGraderJudgments(resultsPath: string): GraderJudgm
|
|
|
82
80
|
*
|
|
83
81
|
* Returns a record keyed by feature area with the composite actual score.
|
|
84
82
|
*/
|
|
85
|
-
export declare function scoreAgenticResults(resultsPath: string,
|
|
83
|
+
export declare function scoreAgenticResults(resultsPath: string, profile: Record<string, number>): Record<string, ActualScoreEntry>;
|
|
86
84
|
/**
|
|
87
85
|
* Score agentic results broken down by model.
|
|
88
86
|
*
|
|
@@ -90,7 +88,7 @@ export declare function scoreAgenticResults(resultsPath: string, weights: Record
|
|
|
90
88
|
* producing a map of model → feature → ActualScoreEntry.
|
|
91
89
|
* Used to enrich the per-model breakdown with actual scores in full mode.
|
|
92
90
|
*/
|
|
93
|
-
export declare function scoreAgenticResultsPerModel(resultsPath: string,
|
|
91
|
+
export declare function scoreAgenticResultsPerModel(resultsPath: string, profile: Record<string, number>): Record<string, Record<string, ActualScoreEntry>>;
|
|
94
92
|
/** Options for the calculate-scores main() function. */
|
|
95
93
|
export interface CalculateScoresOptions {
|
|
96
94
|
/** Allowed origins for source isolation reporting */
|
|
@@ -8,8 +8,11 @@
|
|
|
8
8
|
* Code Correctness (0–100) — Is the code idiomatic and correct?
|
|
9
9
|
* Doc Coverage (0–100) — Did docs provide the needed info?
|
|
10
10
|
*
|
|
11
|
-
* Dimensions are combined into a weighted composite (0–100) using
|
|
12
|
-
* from config/rubrics.
|
|
11
|
+
* Dimensions are combined into a weighted composite (0–100) using named
|
|
12
|
+
* scoring profiles from config/rubrics. Gold (with-docs) entries use
|
|
13
|
+
* the "default" profile; baseline (without-docs) entries use "output-only"
|
|
14
|
+
* which excludes doc-coverage (undefined without docs).
|
|
15
|
+
* See docs/design-docs/named-scoring-profiles.md.
|
|
13
16
|
*
|
|
14
17
|
* Additionally compares with-docs vs without-docs scores to calculate
|
|
15
18
|
* the "Doc Lift" — how much documentation helps vs parametric knowledge.
|
|
@@ -26,14 +29,17 @@
|
|
|
26
29
|
*/
|
|
27
30
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
28
31
|
import { join } from "path";
|
|
32
|
+
import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
29
33
|
import { calculateCost } from "../agent-observer/pricing.js";
|
|
30
34
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
35
|
+
import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
|
|
31
36
|
import { checkResultsExist } from "./checks.js";
|
|
32
|
-
import { loadRubricTemplates } from "./
|
|
37
|
+
import { loadRubricTemplates } from "./rubric-loader.js";
|
|
38
|
+
import { resolveProfile } from "./profile-resolution.js";
|
|
33
39
|
import { loadSource } from "../sources.js";
|
|
34
|
-
import {
|
|
35
|
-
import {
|
|
36
|
-
// Re-export
|
|
40
|
+
import { LiteracyVariant } from "./normalize-mode.js";
|
|
41
|
+
import { scoreTestGroup } from "./compiler/scoring-bridge.js";
|
|
42
|
+
// Re-export from core for backward compatibility.
|
|
37
43
|
// Existing imports from this file continue to work unchanged.
|
|
38
44
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
39
45
|
/**
|
|
@@ -46,7 +52,7 @@ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, par
|
|
|
46
52
|
* @returns Record keyed by model ID, or null if only one model was used
|
|
47
53
|
* (per-model breakdown is redundant when there's only one model).
|
|
48
54
|
*/
|
|
49
|
-
export function calculateScoresPerModel(resultsPath,
|
|
55
|
+
export function calculateScoresPerModel(resultsPath, goldProfile, baselineProfile) {
|
|
50
56
|
const results = readAndNormalizeResults(resultsPath);
|
|
51
57
|
// Group results by provider
|
|
52
58
|
const byModel = {};
|
|
@@ -66,7 +72,7 @@ export function calculateScoresPerModel(resultsPath, weights) {
|
|
|
66
72
|
}
|
|
67
73
|
const perModel = [];
|
|
68
74
|
for (const [modelId, { label, results: modelResults }] of Object.entries(byModel)) {
|
|
69
|
-
const scores = scoreResults(modelResults,
|
|
75
|
+
const scores = scoreResults(modelResults, goldProfile, baselineProfile, modelId);
|
|
70
76
|
const totalTests = scores.reduce((s, sc) => s + sc.testCount, 0);
|
|
71
77
|
const totalCost = scores.reduce((s, sc) => s + sc.totalCost, 0);
|
|
72
78
|
const avgScore = scores.length > 0
|
|
@@ -133,14 +139,8 @@ export function extractGraderJudgments(resultsPath) {
|
|
|
133
139
|
// Not JSON — use raw reason string
|
|
134
140
|
}
|
|
135
141
|
}
|
|
136
|
-
// Map internal dimension names to hyphenated form
|
|
137
|
-
const dimensionMap = {
|
|
138
|
-
codeCorrectness: "code-correctness",
|
|
139
|
-
docCoverage: "doc-coverage",
|
|
140
|
-
taskCompletion: "task-completion",
|
|
141
|
-
};
|
|
142
142
|
judgments.push({
|
|
143
|
-
dimension:
|
|
143
|
+
dimension: kind,
|
|
144
144
|
modelId,
|
|
145
145
|
reason,
|
|
146
146
|
score,
|
|
@@ -277,7 +277,7 @@ function aggregateUrlReferences(resultsPath) {
|
|
|
277
277
|
* verification report.
|
|
278
278
|
*/
|
|
279
279
|
function buildSourceVerification(root, source, verificationCtx) {
|
|
280
|
-
const mode = verificationCtx?.mode ??
|
|
280
|
+
const mode = verificationCtx?.mode ?? LiteracyVariant.STANDARD;
|
|
281
281
|
const sourceUrl = source?.baseUrl ?? "default";
|
|
282
282
|
const searchMode = verificationCtx?.searchMode;
|
|
283
283
|
const allowedOrigins = verificationCtx?.allowedOrigins;
|
|
@@ -318,9 +318,9 @@ function buildSourceVerification(root, source, verificationCtx) {
|
|
|
318
318
|
* Calculate overall scores (all models combined).
|
|
319
319
|
* This is the original scoring path — backward compatible.
|
|
320
320
|
*/
|
|
321
|
-
function calculateScores(resultsPath,
|
|
321
|
+
function calculateScores(resultsPath, goldProfile, baselineProfile) {
|
|
322
322
|
const results = readAndNormalizeResults(resultsPath);
|
|
323
|
-
return scoreResults(results,
|
|
323
|
+
return scoreResults(results, goldProfile, baselineProfile);
|
|
324
324
|
}
|
|
325
325
|
/**
|
|
326
326
|
* Extracts agent behavior summary from a test result's metadata.
|
|
@@ -495,13 +495,11 @@ function readAndNormalizeResults(resultsPath, log) {
|
|
|
495
495
|
* used by both the overall scoring and per-model scoring paths.
|
|
496
496
|
*
|
|
497
497
|
* @param results Pre-filtered (valid) test results
|
|
498
|
-
* @param
|
|
499
|
-
* @param
|
|
498
|
+
* @param goldProfile Weight profile for gold (with-docs) entries
|
|
499
|
+
* @param baselineProfile Weight profile for baseline (without-docs) entries
|
|
500
|
+
* @param modelId Optional model identifier to tag each FeatureScore
|
|
500
501
|
*/
|
|
501
|
-
function scoreResults(results,
|
|
502
|
-
const wTask = weights["task-completion"] ?? 0.5;
|
|
503
|
-
const wCode = weights["code-correctness"] ?? 0.25;
|
|
504
|
-
const wDoc = weights["doc-coverage"] ?? 0.25;
|
|
502
|
+
function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
505
503
|
// Group by feature + docs/no-docs
|
|
506
504
|
const byFeature = {};
|
|
507
505
|
for (const result of results) {
|
|
@@ -519,65 +517,28 @@ function scoreResults(results, weights, modelId) {
|
|
|
519
517
|
}
|
|
520
518
|
const scores = [];
|
|
521
519
|
for (const [feature, data] of Object.entries(byFeature)) {
|
|
522
|
-
// --- With docs ---
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
continue;
|
|
533
|
-
}
|
|
534
|
-
const score = parseRubricScore(comp);
|
|
535
|
-
const kind = classifyRubric(comp);
|
|
536
|
-
if (kind === "taskCompletion") {
|
|
537
|
-
totalTask += score;
|
|
538
|
-
}
|
|
539
|
-
else if (kind === "codeCorrectness") {
|
|
540
|
-
totalCode += score;
|
|
541
|
-
}
|
|
542
|
-
else if (kind === "docCoverage") {
|
|
543
|
-
totalDoc += score;
|
|
544
|
-
}
|
|
545
|
-
}
|
|
546
|
-
}
|
|
547
|
-
// Per-dimension averages (each 0–100)
|
|
548
|
-
const avgTask = totalTask / countWithDocs;
|
|
549
|
-
const avgCode = totalCode / countWithDocs;
|
|
550
|
-
const avgDoc = totalDoc / countWithDocs;
|
|
551
|
-
// Weighted composite (0–100)
|
|
552
|
-
const withDocsTotal = avgTask * wTask + avgCode * wCode + avgDoc * wDoc;
|
|
553
|
-
// --- Without docs (baseline) ---
|
|
554
|
-
let baselineTotal = 0;
|
|
555
|
-
let baselineCount = 0;
|
|
556
|
-
for (const test of data.withoutDocs) {
|
|
557
|
-
featureCost += test.cost;
|
|
558
|
-
for (const comp of test.gradingResult.componentResults) {
|
|
559
|
-
if (comp.assertion?.type !== "llm-rubric") {
|
|
560
|
-
continue;
|
|
561
|
-
}
|
|
562
|
-
baselineTotal += parseRubricScore(comp);
|
|
563
|
-
baselineCount++;
|
|
564
|
-
}
|
|
565
|
-
}
|
|
566
|
-
const withoutDocsScore = baselineCount > 0 ? baselineTotal / baselineCount : 0;
|
|
567
|
-
const ceilingScore = Math.round(withDocsTotal);
|
|
568
|
-
const floorScore = Math.round(withoutDocsScore);
|
|
520
|
+
// --- With docs (gold / ceiling) — scored via 4-tier engine ---
|
|
521
|
+
const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
|
|
522
|
+
// --- Without docs (baseline / floor) ---
|
|
523
|
+
// Uses the baseline profile (e.g. "output-only") which may exclude
|
|
524
|
+
// dimensions like doc-coverage that are undefined without docs.
|
|
525
|
+
// See docs/design-docs/named-scoring-profiles.md.
|
|
526
|
+
const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
|
|
527
|
+
const featureCost = gold.totalCost + baseline.totalCost;
|
|
528
|
+
const ceilingScore = gold.composite;
|
|
529
|
+
const floorScore = baseline.composite;
|
|
569
530
|
const docLift = ceilingScore - floorScore;
|
|
570
531
|
const featureScore = {
|
|
571
532
|
ceilingScore,
|
|
572
|
-
codeCorrectness:
|
|
573
|
-
docCoverage:
|
|
533
|
+
codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
|
|
534
|
+
docCoverage: gold.dimensions.docCoverage ?? 0,
|
|
574
535
|
docLift,
|
|
575
536
|
docQualityGap: 100 - ceilingScore,
|
|
576
537
|
feature,
|
|
577
538
|
floorScore,
|
|
578
539
|
...(modelId && { modelId }),
|
|
579
540
|
negativeDocLift: docLift < 0,
|
|
580
|
-
taskCompletion:
|
|
541
|
+
taskCompletion: gold.dimensions.taskCompletion ?? 0,
|
|
581
542
|
testCount: data.withDocs.length,
|
|
582
543
|
totalCost: featureCost,
|
|
583
544
|
totalScore: ceilingScore,
|
|
@@ -597,11 +558,8 @@ function scoreResults(results, weights, modelId) {
|
|
|
597
558
|
* Returns a record keyed by feature area with the composite actual score.
|
|
598
559
|
*/
|
|
599
560
|
// ActualScoreEntry — imported from @sanity/ailf-core via pipeline/types.js
|
|
600
|
-
export function scoreAgenticResults(resultsPath,
|
|
561
|
+
export function scoreAgenticResults(resultsPath, profile) {
|
|
601
562
|
const results = readAndNormalizeResults(resultsPath);
|
|
602
|
-
const wTask = weights["task-completion"] ?? 0.5;
|
|
603
|
-
const wCode = weights["code-correctness"] ?? 0.25;
|
|
604
|
-
const wDoc = weights["doc-coverage"] ?? 0.25;
|
|
605
563
|
// Group by feature area
|
|
606
564
|
const byFeature = {};
|
|
607
565
|
for (const result of results) {
|
|
@@ -613,37 +571,14 @@ export function scoreAgenticResults(resultsPath, weights) {
|
|
|
613
571
|
}
|
|
614
572
|
const entries = {};
|
|
615
573
|
for (const [feature, featureResults] of Object.entries(byFeature)) {
|
|
616
|
-
|
|
617
|
-
let totalCode = 0;
|
|
618
|
-
let totalDoc = 0;
|
|
619
|
-
let featureCost = 0;
|
|
620
|
-
const count = featureResults.length || 1;
|
|
621
|
-
for (const test of featureResults) {
|
|
622
|
-
featureCost += test.cost;
|
|
623
|
-
for (const comp of test.gradingResult.componentResults) {
|
|
624
|
-
if (comp.assertion?.type !== "llm-rubric")
|
|
625
|
-
continue;
|
|
626
|
-
const score = parseRubricScore(comp);
|
|
627
|
-
const kind = classifyRubric(comp);
|
|
628
|
-
if (kind === "taskCompletion")
|
|
629
|
-
totalTask += score;
|
|
630
|
-
else if (kind === "codeCorrectness")
|
|
631
|
-
totalCode += score;
|
|
632
|
-
else if (kind === "docCoverage")
|
|
633
|
-
totalDoc += score;
|
|
634
|
-
}
|
|
635
|
-
}
|
|
636
|
-
const avgTask = totalTask / count;
|
|
637
|
-
const avgCode = totalCode / count;
|
|
638
|
-
const avgDoc = totalDoc / count;
|
|
639
|
-
const actualScore = Math.round(avgTask * wTask + avgCode * wCode + avgDoc * wDoc);
|
|
574
|
+
const scored = scoreTestGroup(featureResults, profile, feature);
|
|
640
575
|
entries[feature] = {
|
|
641
|
-
actualScore,
|
|
642
|
-
codeCorrectness:
|
|
643
|
-
docCoverage:
|
|
644
|
-
taskCompletion:
|
|
576
|
+
actualScore: scored.composite,
|
|
577
|
+
codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
|
|
578
|
+
docCoverage: scored.dimensions.docCoverage ?? 0,
|
|
579
|
+
taskCompletion: scored.dimensions.taskCompletion ?? 0,
|
|
645
580
|
testCount: featureResults.length,
|
|
646
|
-
totalCost:
|
|
581
|
+
totalCost: scored.totalCost,
|
|
647
582
|
};
|
|
648
583
|
}
|
|
649
584
|
return entries;
|
|
@@ -655,11 +590,8 @@ export function scoreAgenticResults(resultsPath, weights) {
|
|
|
655
590
|
* producing a map of model → feature → ActualScoreEntry.
|
|
656
591
|
* Used to enrich the per-model breakdown with actual scores in full mode.
|
|
657
592
|
*/
|
|
658
|
-
export function scoreAgenticResultsPerModel(resultsPath,
|
|
593
|
+
export function scoreAgenticResultsPerModel(resultsPath, profile) {
|
|
659
594
|
const results = readAndNormalizeResults(resultsPath);
|
|
660
|
-
const wTask = weights["task-completion"] ?? 0.5;
|
|
661
|
-
const wCode = weights["code-correctness"] ?? 0.25;
|
|
662
|
-
const wDoc = weights["doc-coverage"] ?? 0.25;
|
|
663
595
|
// Group by model, then feature
|
|
664
596
|
const byModel = {};
|
|
665
597
|
for (const result of results) {
|
|
@@ -675,37 +607,14 @@ export function scoreAgenticResultsPerModel(resultsPath, weights) {
|
|
|
675
607
|
for (const [modelId, features] of Object.entries(byModel)) {
|
|
676
608
|
perModel[modelId] = {};
|
|
677
609
|
for (const [feature, featureResults] of Object.entries(features)) {
|
|
678
|
-
|
|
679
|
-
let totalCode = 0;
|
|
680
|
-
let totalDoc = 0;
|
|
681
|
-
let featureCost = 0;
|
|
682
|
-
const count = featureResults.length || 1;
|
|
683
|
-
for (const test of featureResults) {
|
|
684
|
-
featureCost += test.cost;
|
|
685
|
-
for (const comp of test.gradingResult.componentResults) {
|
|
686
|
-
if (comp.assertion?.type !== "llm-rubric")
|
|
687
|
-
continue;
|
|
688
|
-
const score = parseRubricScore(comp);
|
|
689
|
-
const kind = classifyRubric(comp);
|
|
690
|
-
if (kind === "taskCompletion")
|
|
691
|
-
totalTask += score;
|
|
692
|
-
else if (kind === "codeCorrectness")
|
|
693
|
-
totalCode += score;
|
|
694
|
-
else if (kind === "docCoverage")
|
|
695
|
-
totalDoc += score;
|
|
696
|
-
}
|
|
697
|
-
}
|
|
698
|
-
const avgTask = totalTask / count;
|
|
699
|
-
const avgCode = totalCode / count;
|
|
700
|
-
const avgDoc = totalDoc / count;
|
|
701
|
-
const actualScore = Math.round(avgTask * wTask + avgCode * wCode + avgDoc * wDoc);
|
|
610
|
+
const scored = scoreTestGroup(featureResults, profile, feature);
|
|
702
611
|
perModel[modelId][feature] = {
|
|
703
|
-
actualScore,
|
|
704
|
-
codeCorrectness:
|
|
705
|
-
docCoverage:
|
|
706
|
-
taskCompletion:
|
|
612
|
+
actualScore: scored.composite,
|
|
613
|
+
codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
|
|
614
|
+
docCoverage: scored.dimensions.docCoverage ?? 0,
|
|
615
|
+
taskCompletion: scored.dimensions.taskCompletion ?? 0,
|
|
707
616
|
testCount: featureResults.length,
|
|
708
|
-
totalCost:
|
|
617
|
+
totalCost: scored.totalCost,
|
|
709
618
|
};
|
|
710
619
|
}
|
|
711
620
|
}
|
|
@@ -743,7 +652,7 @@ export function calculateAndWriteScores(options) {
|
|
|
743
652
|
}
|
|
744
653
|
}
|
|
745
654
|
// Determine mode — controls which result files are read
|
|
746
|
-
const mode = options.mode ??
|
|
655
|
+
const mode = options.mode ?? LiteracyVariant.STANDARD;
|
|
747
656
|
const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
|
|
748
657
|
// Agentic results path (only used in full mode)
|
|
749
658
|
const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
|
|
@@ -760,10 +669,18 @@ export function calculateAndWriteScores(options) {
|
|
|
760
669
|
if (source) {
|
|
761
670
|
log.info(`Source: ${sourceName} (${source.baseUrl})`);
|
|
762
671
|
}
|
|
763
|
-
// Load
|
|
672
|
+
// Load rubric config and resolve scoring profiles per variant.
|
|
673
|
+
// Gold (with-docs) entries use the "default" profile (3 dimensions).
|
|
674
|
+
// Baseline (without-docs) entries use "output-only" (2 dimensions,
|
|
675
|
+
// doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
|
|
764
676
|
const rubricConfig = loadRubricTemplates(ROOT);
|
|
765
|
-
|
|
766
|
-
const
|
|
677
|
+
const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
|
|
678
|
+
const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
|
|
679
|
+
log.debug("Loaded scoring profiles", {
|
|
680
|
+
gold: goldProfile,
|
|
681
|
+
baseline: baselineProfileWeights,
|
|
682
|
+
});
|
|
683
|
+
const baselineScores = calculateScores(baselineResultsPath, goldProfile, baselineProfileWeights);
|
|
767
684
|
log.debug("Baseline scores calculated", {
|
|
768
685
|
featureCount: baselineScores.length,
|
|
769
686
|
features: baselineScores.map((s) => ({
|
|
@@ -773,7 +690,7 @@ export function calculateAndWriteScores(options) {
|
|
|
773
690
|
docLift: s.docLift,
|
|
774
691
|
})),
|
|
775
692
|
});
|
|
776
|
-
const perModel = calculateScoresPerModel(baselineResultsPath,
|
|
693
|
+
const perModel = calculateScoresPerModel(baselineResultsPath, goldProfile, baselineProfileWeights);
|
|
777
694
|
const urlRefs = aggregateUrlReferences(baselineResultsPath);
|
|
778
695
|
const sourceVerification = buildSourceVerification(ROOT, source, {
|
|
779
696
|
allowedOrigins: options.allowedOrigins,
|
|
@@ -786,9 +703,10 @@ export function calculateAndWriteScores(options) {
|
|
|
786
703
|
let agentBehavior = null;
|
|
787
704
|
let sourceIsolation = null;
|
|
788
705
|
let evaluationMode;
|
|
789
|
-
if (mode ===
|
|
706
|
+
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
790
707
|
log.info(`\nReading agentic results from: ${agenticResultsPath}`);
|
|
791
|
-
const
|
|
708
|
+
const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
|
|
709
|
+
const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
|
|
792
710
|
log.debug("Agentic scores calculated", {
|
|
793
711
|
featureCount: Object.keys(agenticScores).length,
|
|
794
712
|
features: Object.entries(agenticScores).map(([f, s]) => ({
|
|
@@ -798,10 +716,10 @@ export function calculateAndWriteScores(options) {
|
|
|
798
716
|
})),
|
|
799
717
|
});
|
|
800
718
|
scores = mergeScores(baselineScores, agenticScores);
|
|
801
|
-
evaluationMode =
|
|
719
|
+
evaluationMode = LiteracyVariant.FULL;
|
|
802
720
|
// Merge agentic actual scores into the per-model breakdown
|
|
803
721
|
if (perModel) {
|
|
804
|
-
const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath,
|
|
722
|
+
const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
|
|
805
723
|
for (const entry of perModel) {
|
|
806
724
|
const modelAgentic = agenticPerModel[entry.modelId];
|
|
807
725
|
if (modelAgentic) {
|
|
@@ -821,17 +739,20 @@ export function calculateAndWriteScores(options) {
|
|
|
821
739
|
graderCost.completionTokens += agenticGraderCost.completionTokens;
|
|
822
740
|
}
|
|
823
741
|
}
|
|
824
|
-
else if (mode ===
|
|
742
|
+
else if (mode === LiteracyVariant.AGENTIC) {
|
|
825
743
|
scores = baselineScores;
|
|
826
744
|
agentBehavior = aggregateAgentBehavior(baselineResultsPath);
|
|
827
745
|
sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
|
|
828
|
-
evaluationMode =
|
|
746
|
+
evaluationMode = LiteracyVariant.AGENTIC;
|
|
829
747
|
}
|
|
830
748
|
else {
|
|
831
749
|
scores = baselineScores;
|
|
832
750
|
agentBehavior = aggregateAgentBehavior(baselineResultsPath);
|
|
833
751
|
sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
|
|
834
|
-
evaluationMode =
|
|
752
|
+
evaluationMode =
|
|
753
|
+
mode === LiteracyVariant.OBSERVED
|
|
754
|
+
? LiteracyVariant.OBSERVED
|
|
755
|
+
: LiteracyVariant.STANDARD;
|
|
835
756
|
}
|
|
836
757
|
const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
|
|
837
758
|
// Persist
|
|
@@ -842,7 +763,7 @@ export function calculateAndWriteScores(options) {
|
|
|
842
763
|
// Extract and persist grader judgments (Phase 3a: failure mode extraction)
|
|
843
764
|
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
844
765
|
// In full mode, also extract judgments from agentic results
|
|
845
|
-
if (mode ===
|
|
766
|
+
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
846
767
|
const agenticJudgments = extractGraderJudgments(agenticResultsPath);
|
|
847
768
|
judgments.push(...agenticJudgments);
|
|
848
769
|
}
|
package/dist/pipeline/checks.js
CHANGED
|
@@ -117,7 +117,7 @@ export function checkGeneratedConfigsExist(rootDir) {
|
|
|
117
117
|
const baselinePath = resolve(rootDir, "promptfooconfig.yaml");
|
|
118
118
|
if (!existsSync(baselinePath)) {
|
|
119
119
|
issues.push({
|
|
120
|
-
message: "Baseline config 'promptfooconfig.yaml' not found. Run
|
|
120
|
+
message: "Baseline config 'promptfooconfig.yaml' not found. Run the pipeline to generate it.",
|
|
121
121
|
path: baselinePath,
|
|
122
122
|
severity: "error",
|
|
123
123
|
source: "checkGeneratedConfigsExist",
|
|
@@ -131,7 +131,7 @@ export function checkGeneratedConfigsExist(rootDir) {
|
|
|
131
131
|
const configPath = resolve(rootDir, name);
|
|
132
132
|
if (!existsSync(configPath)) {
|
|
133
133
|
issues.push({
|
|
134
|
-
message: `Optional config \`${name}\` not found. Run
|
|
134
|
+
message: `Optional config \`${name}\` not found. Run the pipeline to generate it.`,
|
|
135
135
|
path: configPath,
|
|
136
136
|
severity: "warning",
|
|
137
137
|
source: "checkGeneratedConfigsExist",
|
package/dist/pipeline/compare.js
CHANGED
|
@@ -79,14 +79,14 @@ export function compare(baseline, experiment, options) {
|
|
|
79
79
|
// Per-dimension average deltas (only for areas present in both summaries)
|
|
80
80
|
const commonAreas = areas.filter((a) => baselineAreas.has(a.area) && experimentAreas.has(a.area));
|
|
81
81
|
const commonCount = commonAreas.length || 1;
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
}
|
|
82
|
+
// Collect all dimension keys from area deltas and average each
|
|
83
|
+
const allDimKeys = new Set(commonAreas.flatMap((a) => Object.keys(a.dimensions)));
|
|
84
|
+
const perDimension = {};
|
|
85
|
+
for (const dim of allDimKeys) {
|
|
86
|
+
perDimension[dim] =
|
|
87
|
+
commonAreas.reduce((s, a) => s + (a.dimensions[dim]?.delta ?? 0), 0) /
|
|
88
|
+
commonCount;
|
|
89
|
+
}
|
|
90
90
|
// Doc Lift average delta (common areas only)
|
|
91
91
|
const docLift = commonAreas.reduce((s, a) => s + a.docLiftDelta, 0) / commonCount;
|
|
92
92
|
// Cost delta (if both summaries have cost data)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent-harness-handler.test.ts — Tests for agent harness mode compilation.
|
|
3
|
+
*
|
|
4
|
+
* Tests validation, provider assembly, tool permission resolution,
|
|
5
|
+
* assertion mapping, sandbox config, lifecycle extensions, and
|
|
6
|
+
* end-to-end compilation of example tasks.
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/agent-harness-handler.test.ts
|
|
9
|
+
*/
|
|
10
|
+
export {};
|