@sanity/ailf 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/features.ts +23 -0
- package/config/models.ts +83 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
- package/dist/_vendor/ailf-core/config-helpers.js +150 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +133 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
- package/dist/adapters/task-sources/index.d.ts +1 -0
- package/dist/adapters/task-sources/index.js +1 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
- package/dist/adapters/task-sources/repo-task-source.js +69 -16
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +7 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/composition-root.d.ts +1 -1
- package/dist/composition-root.js +67 -4
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +24 -6
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +6 -4
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +245 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +6 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
- package/dist/pipeline/mirror-repo-tasks.js +16 -15
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +6 -3
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
package/dist/pipeline/cache.d.ts
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
*
|
|
11
11
|
* Cache invalidation triggers:
|
|
12
12
|
* - Content change: any input file's content changes → hash changes → miss
|
|
13
|
-
* - Config change: config/models
|
|
13
|
+
* - Config change: config/models, config/sources, tasks/*.yaml changes → miss
|
|
14
14
|
* - Manual bypass: --no-cache flag skips all cache lookups
|
|
15
15
|
* - Cache clear: delete results/cache/ to start fresh
|
|
16
16
|
*/
|
package/dist/pipeline/cache.js
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
*
|
|
11
11
|
* Cache invalidation triggers:
|
|
12
12
|
* - Content change: any input file's content changes → hash changes → miss
|
|
13
|
-
* - Config change: config/models
|
|
13
|
+
* - Config change: config/models, config/sources, tasks/*.yaml changes → miss
|
|
14
14
|
* - Manual bypass: --no-cache flag skips all cache lookups
|
|
15
15
|
* - Cache clear: delete results/cache/ to start fresh
|
|
16
16
|
*/
|
|
@@ -18,6 +18,19 @@ import { createHash } from "crypto";
|
|
|
18
18
|
import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync, } from "fs";
|
|
19
19
|
import { join, resolve } from "path";
|
|
20
20
|
// ---------------------------------------------------------------------------
|
|
21
|
+
// Helpers
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
/** Resolve first existing config file (matches loadConfigFile priority chain) */
|
|
24
|
+
function resolveConfig(rootDir, name) {
|
|
25
|
+
const r = (f) => resolve(rootDir, f);
|
|
26
|
+
for (const ext of [".ts", ".js", ".yaml", ".yml", ".json"]) {
|
|
27
|
+
const p = r(`config/${name}${ext}`);
|
|
28
|
+
if (existsSync(p))
|
|
29
|
+
return p;
|
|
30
|
+
}
|
|
31
|
+
return undefined;
|
|
32
|
+
}
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
21
34
|
// Constants
|
|
22
35
|
// ---------------------------------------------------------------------------
|
|
23
36
|
const CACHE_DIR_NAME = "cache";
|
|
@@ -79,7 +92,10 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
79
92
|
const isBaseline = step === "eval-baseline" || step === "eval";
|
|
80
93
|
const isAgentic = step === "eval-agentic" || step === "eval";
|
|
81
94
|
const isObserved = step === "eval-observed" || step === "eval";
|
|
82
|
-
const paths = [
|
|
95
|
+
const paths = [];
|
|
96
|
+
const modelsPath = resolveConfig(rootDir, "models");
|
|
97
|
+
if (modelsPath)
|
|
98
|
+
paths.push(modelsPath);
|
|
83
99
|
// Config files — only the relevant ones for this mode
|
|
84
100
|
if (isBaseline) {
|
|
85
101
|
paths.push(r("promptfooconfig.yaml"));
|
|
@@ -130,25 +146,37 @@ export function getStepInputPaths(rootDir, step) {
|
|
|
130
146
|
return paths;
|
|
131
147
|
}
|
|
132
148
|
case "fetch-docs": {
|
|
133
|
-
// Inputs: config
|
|
134
|
-
const paths = [
|
|
149
|
+
// Inputs: config sources + models, task files
|
|
150
|
+
const paths = [];
|
|
151
|
+
const sourcesPath = resolveConfig(rootDir, "sources");
|
|
152
|
+
const modelsPath2 = resolveConfig(rootDir, "models");
|
|
153
|
+
if (sourcesPath)
|
|
154
|
+
paths.push(sourcesPath);
|
|
155
|
+
if (modelsPath2)
|
|
156
|
+
paths.push(modelsPath2);
|
|
135
157
|
// Include all task files (they define feature areas)
|
|
136
158
|
const tasksDir = r("tasks");
|
|
137
159
|
if (existsSync(tasksDir)) {
|
|
138
160
|
const taskFiles = readdirSync(tasksDir)
|
|
139
|
-
.filter((f) =>
|
|
161
|
+
.filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
|
|
140
162
|
.map((f) => join(tasksDir, f));
|
|
141
163
|
paths.push(...taskFiles);
|
|
142
164
|
}
|
|
143
165
|
return paths;
|
|
144
166
|
}
|
|
145
167
|
case "generate-configs": {
|
|
146
|
-
// Inputs: config
|
|
147
|
-
const paths = [
|
|
168
|
+
// Inputs: config models + sources, all task files
|
|
169
|
+
const paths = [];
|
|
170
|
+
const modelsPath3 = resolveConfig(rootDir, "models");
|
|
171
|
+
const sourcesPath2 = resolveConfig(rootDir, "sources");
|
|
172
|
+
if (modelsPath3)
|
|
173
|
+
paths.push(modelsPath3);
|
|
174
|
+
if (sourcesPath2)
|
|
175
|
+
paths.push(sourcesPath2);
|
|
148
176
|
const tasksDir = r("tasks");
|
|
149
177
|
if (existsSync(tasksDir)) {
|
|
150
178
|
const taskFiles = readdirSync(tasksDir)
|
|
151
|
-
.filter((f) =>
|
|
179
|
+
.filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
|
|
152
180
|
.map((f) => join(tasksDir, f));
|
|
153
181
|
paths.push(...taskFiles);
|
|
154
182
|
}
|
|
@@ -1,9 +1,7 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
2
2
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
3
|
-
import { type ActualScoreEntry, type ComponentResult } from "../_vendor/ailf-core/index.d.ts";
|
|
4
3
|
import type { GraderJudgment, PerModelEntry } from "./types.js";
|
|
5
|
-
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.d.ts";
|
|
6
|
-
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
4
|
+
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
7
5
|
export interface PromptfooResultsWrapper {
|
|
8
6
|
results: RawTestResult[];
|
|
9
7
|
stats: {
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
* Doc Coverage (0–100) — Did docs provide the needed info?
|
|
10
10
|
*
|
|
11
11
|
* Dimensions are combined into a weighted composite (0–100) using named
|
|
12
|
-
* scoring profiles from config/rubrics.
|
|
12
|
+
* scoring profiles from config/rubrics. Gold (with-docs) entries use
|
|
13
13
|
* the "default" profile; baseline (without-docs) entries use "output-only"
|
|
14
14
|
* which excludes doc-coverage (undefined without docs).
|
|
15
15
|
* See docs/design-docs/named-scoring-profiles.md.
|
|
@@ -29,15 +29,17 @@
|
|
|
29
29
|
*/
|
|
30
30
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
31
31
|
import { join } from "path";
|
|
32
|
+
import { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
32
33
|
import { calculateCost } from "../agent-observer/pricing.js";
|
|
33
34
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
35
|
+
import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
|
|
34
36
|
import { checkResultsExist } from "./checks.js";
|
|
35
|
-
import { loadRubricTemplates } from "./
|
|
37
|
+
import { loadRubricTemplates } from "./rubric-loader.js";
|
|
36
38
|
import { resolveProfile } from "./profile-resolution.js";
|
|
37
39
|
import { loadSource } from "../sources.js";
|
|
38
|
-
import {
|
|
39
|
-
import {
|
|
40
|
-
// Re-export
|
|
40
|
+
import { LiteracyVariant } from "./normalize-mode.js";
|
|
41
|
+
import { scoreTestGroup } from "./compiler/scoring-bridge.js";
|
|
42
|
+
// Re-export from core for backward compatibility.
|
|
41
43
|
// Existing imports from this file continue to work unchanged.
|
|
42
44
|
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
43
45
|
/**
|
|
@@ -137,14 +139,8 @@ export function extractGraderJudgments(resultsPath) {
|
|
|
137
139
|
// Not JSON — use raw reason string
|
|
138
140
|
}
|
|
139
141
|
}
|
|
140
|
-
// Map internal dimension names to hyphenated form
|
|
141
|
-
const dimensionMap = {
|
|
142
|
-
codeCorrectness: "code-correctness",
|
|
143
|
-
docCoverage: "doc-coverage",
|
|
144
|
-
taskCompletion: "task-completion",
|
|
145
|
-
};
|
|
146
142
|
judgments.push({
|
|
147
|
-
dimension:
|
|
143
|
+
dimension: kind,
|
|
148
144
|
modelId,
|
|
149
145
|
reason,
|
|
150
146
|
score,
|
|
@@ -281,7 +277,7 @@ function aggregateUrlReferences(resultsPath) {
|
|
|
281
277
|
* verification report.
|
|
282
278
|
*/
|
|
283
279
|
function buildSourceVerification(root, source, verificationCtx) {
|
|
284
|
-
const mode = verificationCtx?.mode ??
|
|
280
|
+
const mode = verificationCtx?.mode ?? LiteracyVariant.STANDARD;
|
|
285
281
|
const sourceUrl = source?.baseUrl ?? "default";
|
|
286
282
|
const searchMode = verificationCtx?.searchMode;
|
|
287
283
|
const allowedOrigins = verificationCtx?.allowedOrigins;
|
|
@@ -493,62 +489,6 @@ function readAndNormalizeResults(resultsPath, log) {
|
|
|
493
489
|
}
|
|
494
490
|
return valid;
|
|
495
491
|
}
|
|
496
|
-
/**
|
|
497
|
-
* Accumulate raw dimension scores across an array of test results.
|
|
498
|
-
* Dimension-agnostic: any dimension returned by classifyRubric() is tracked.
|
|
499
|
-
*/
|
|
500
|
-
function accumulateDimensions(tests) {
|
|
501
|
-
const dimensions = {};
|
|
502
|
-
let totalCost = 0;
|
|
503
|
-
for (const test of tests) {
|
|
504
|
-
totalCost += test.cost;
|
|
505
|
-
for (const comp of test.gradingResult.componentResults) {
|
|
506
|
-
if (comp.assertion?.type !== "llm-rubric")
|
|
507
|
-
continue;
|
|
508
|
-
const score = parseRubricScore(comp);
|
|
509
|
-
const kind = classifyRubric(comp);
|
|
510
|
-
if (kind) {
|
|
511
|
-
dimensions[kind] = (dimensions[kind] ?? 0) + score;
|
|
512
|
-
}
|
|
513
|
-
}
|
|
514
|
-
}
|
|
515
|
-
return { dimensions, totalCost };
|
|
516
|
-
}
|
|
517
|
-
/**
|
|
518
|
-
* Average accumulated dimension scores by a count.
|
|
519
|
-
* Returns a dimension → average score map.
|
|
520
|
-
*/
|
|
521
|
-
function averageDimensions(accumulated, count) {
|
|
522
|
-
const avg = {};
|
|
523
|
-
for (const [dim, total] of Object.entries(accumulated.dimensions)) {
|
|
524
|
-
avg[dim] = total / count;
|
|
525
|
-
}
|
|
526
|
-
return avg;
|
|
527
|
-
}
|
|
528
|
-
/**
|
|
529
|
-
* Compute a weighted composite score from dimension averages and a profile.
|
|
530
|
-
* Only dimensions present in the profile contribute to the composite.
|
|
531
|
-
* Dimensions not in the profile are ignored (e.g., doc-coverage on baseline).
|
|
532
|
-
*
|
|
533
|
-
* The profile maps camelCase dimension names (as returned by classifyRubric)
|
|
534
|
-
* to kebab-case keys (as used in rubrics.yaml). This function handles the
|
|
535
|
-
* mapping internally.
|
|
536
|
-
*/
|
|
537
|
-
function weightedComposite(dimensionAverages, profile) {
|
|
538
|
-
// Map profile keys (kebab-case: "task-completion") to classifyRubric
|
|
539
|
-
// output (camelCase: "taskCompletion")
|
|
540
|
-
const kebabToCamel = {
|
|
541
|
-
"code-correctness": "codeCorrectness",
|
|
542
|
-
"doc-coverage": "docCoverage",
|
|
543
|
-
"task-completion": "taskCompletion",
|
|
544
|
-
};
|
|
545
|
-
let total = 0;
|
|
546
|
-
for (const [profileKey, weight] of Object.entries(profile)) {
|
|
547
|
-
const dimKey = kebabToCamel[profileKey] ?? profileKey;
|
|
548
|
-
total += (dimensionAverages[dimKey] ?? 0) * weight;
|
|
549
|
-
}
|
|
550
|
-
return total;
|
|
551
|
-
}
|
|
552
492
|
/**
|
|
553
493
|
* Core scoring logic: takes a pre-filtered array of TestResult and produces
|
|
554
494
|
* FeatureScore[] grouped by feature area. This is the shared implementation
|
|
@@ -577,35 +517,28 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
|
577
517
|
}
|
|
578
518
|
const scores = [];
|
|
579
519
|
for (const [feature, data] of Object.entries(byFeature)) {
|
|
580
|
-
// --- With docs (gold / ceiling) ---
|
|
581
|
-
const
|
|
582
|
-
let featureCost = goldDims.totalCost;
|
|
583
|
-
const countWithDocs = data.withDocs.length || 1;
|
|
584
|
-
const avgGold = averageDimensions(goldDims, countWithDocs);
|
|
585
|
-
const withDocsTotal = weightedComposite(avgGold, goldProfile);
|
|
520
|
+
// --- With docs (gold / ceiling) — scored via 4-tier engine ---
|
|
521
|
+
const gold = scoreTestGroup(data.withDocs, goldProfile, feature);
|
|
586
522
|
// --- Without docs (baseline / floor) ---
|
|
587
523
|
// Uses the baseline profile (e.g. "output-only") which may exclude
|
|
588
524
|
// dimensions like doc-coverage that are undefined without docs.
|
|
589
525
|
// See docs/design-docs/named-scoring-profiles.md.
|
|
590
|
-
const
|
|
591
|
-
featureCost
|
|
592
|
-
const
|
|
593
|
-
const
|
|
594
|
-
const withoutDocsScore = weightedComposite(avgBaseline, baselineProfile);
|
|
595
|
-
const ceilingScore = Math.round(withDocsTotal);
|
|
596
|
-
const floorScore = Math.round(withoutDocsScore);
|
|
526
|
+
const baseline = scoreTestGroup(data.withoutDocs, baselineProfile, feature);
|
|
527
|
+
const featureCost = gold.totalCost + baseline.totalCost;
|
|
528
|
+
const ceilingScore = gold.composite;
|
|
529
|
+
const floorScore = baseline.composite;
|
|
597
530
|
const docLift = ceilingScore - floorScore;
|
|
598
531
|
const featureScore = {
|
|
599
532
|
ceilingScore,
|
|
600
|
-
codeCorrectness:
|
|
601
|
-
docCoverage:
|
|
533
|
+
codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
|
|
534
|
+
docCoverage: gold.dimensions.docCoverage ?? 0,
|
|
602
535
|
docLift,
|
|
603
536
|
docQualityGap: 100 - ceilingScore,
|
|
604
537
|
feature,
|
|
605
538
|
floorScore,
|
|
606
539
|
...(modelId && { modelId }),
|
|
607
540
|
negativeDocLift: docLift < 0,
|
|
608
|
-
taskCompletion:
|
|
541
|
+
taskCompletion: gold.dimensions.taskCompletion ?? 0,
|
|
609
542
|
testCount: data.withDocs.length,
|
|
610
543
|
totalCost: featureCost,
|
|
611
544
|
totalScore: ceilingScore,
|
|
@@ -638,17 +571,14 @@ export function scoreAgenticResults(resultsPath, profile) {
|
|
|
638
571
|
}
|
|
639
572
|
const entries = {};
|
|
640
573
|
for (const [feature, featureResults] of Object.entries(byFeature)) {
|
|
641
|
-
const
|
|
642
|
-
const accumulated = accumulateDimensions(featureResults);
|
|
643
|
-
const avg = averageDimensions(accumulated, count);
|
|
644
|
-
const actualScore = Math.round(weightedComposite(avg, profile));
|
|
574
|
+
const scored = scoreTestGroup(featureResults, profile, feature);
|
|
645
575
|
entries[feature] = {
|
|
646
|
-
actualScore,
|
|
647
|
-
codeCorrectness:
|
|
648
|
-
docCoverage:
|
|
649
|
-
taskCompletion:
|
|
576
|
+
actualScore: scored.composite,
|
|
577
|
+
codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
|
|
578
|
+
docCoverage: scored.dimensions.docCoverage ?? 0,
|
|
579
|
+
taskCompletion: scored.dimensions.taskCompletion ?? 0,
|
|
650
580
|
testCount: featureResults.length,
|
|
651
|
-
totalCost:
|
|
581
|
+
totalCost: scored.totalCost,
|
|
652
582
|
};
|
|
653
583
|
}
|
|
654
584
|
return entries;
|
|
@@ -677,17 +607,14 @@ export function scoreAgenticResultsPerModel(resultsPath, profile) {
|
|
|
677
607
|
for (const [modelId, features] of Object.entries(byModel)) {
|
|
678
608
|
perModel[modelId] = {};
|
|
679
609
|
for (const [feature, featureResults] of Object.entries(features)) {
|
|
680
|
-
const
|
|
681
|
-
const accumulated = accumulateDimensions(featureResults);
|
|
682
|
-
const avg = averageDimensions(accumulated, count);
|
|
683
|
-
const actualScore = Math.round(weightedComposite(avg, profile));
|
|
610
|
+
const scored = scoreTestGroup(featureResults, profile, feature);
|
|
684
611
|
perModel[modelId][feature] = {
|
|
685
|
-
actualScore,
|
|
686
|
-
codeCorrectness:
|
|
687
|
-
docCoverage:
|
|
688
|
-
taskCompletion:
|
|
612
|
+
actualScore: scored.composite,
|
|
613
|
+
codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
|
|
614
|
+
docCoverage: scored.dimensions.docCoverage ?? 0,
|
|
615
|
+
taskCompletion: scored.dimensions.taskCompletion ?? 0,
|
|
689
616
|
testCount: featureResults.length,
|
|
690
|
-
totalCost:
|
|
617
|
+
totalCost: scored.totalCost,
|
|
691
618
|
};
|
|
692
619
|
}
|
|
693
620
|
}
|
|
@@ -725,7 +652,7 @@ export function calculateAndWriteScores(options) {
|
|
|
725
652
|
}
|
|
726
653
|
}
|
|
727
654
|
// Determine mode — controls which result files are read
|
|
728
|
-
const mode = options.mode ??
|
|
655
|
+
const mode = options.mode ?? LiteracyVariant.STANDARD;
|
|
729
656
|
const baselineResultsPath = options.resultsPath ?? join(ROOT, "results", "latest", "eval-results.json");
|
|
730
657
|
// Agentic results path (only used in full mode)
|
|
731
658
|
const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
|
|
@@ -747,8 +674,8 @@ export function calculateAndWriteScores(options) {
|
|
|
747
674
|
// Baseline (without-docs) entries use "output-only" (2 dimensions,
|
|
748
675
|
// doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
|
|
749
676
|
const rubricConfig = loadRubricTemplates(ROOT);
|
|
750
|
-
const goldProfile = resolveProfile("
|
|
751
|
-
const baselineProfileWeights = resolveProfile("
|
|
677
|
+
const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
|
|
678
|
+
const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
|
|
752
679
|
log.debug("Loaded scoring profiles", {
|
|
753
680
|
gold: goldProfile,
|
|
754
681
|
baseline: baselineProfileWeights,
|
|
@@ -776,9 +703,9 @@ export function calculateAndWriteScores(options) {
|
|
|
776
703
|
let agentBehavior = null;
|
|
777
704
|
let sourceIsolation = null;
|
|
778
705
|
let evaluationMode;
|
|
779
|
-
if (mode ===
|
|
706
|
+
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
780
707
|
log.info(`\nReading agentic results from: ${agenticResultsPath}`);
|
|
781
|
-
const agenticProfile = resolveProfile("
|
|
708
|
+
const agenticProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.AGENTIC);
|
|
782
709
|
const agenticScores = scoreAgenticResults(agenticResultsPath, agenticProfile);
|
|
783
710
|
log.debug("Agentic scores calculated", {
|
|
784
711
|
featureCount: Object.keys(agenticScores).length,
|
|
@@ -789,7 +716,7 @@ export function calculateAndWriteScores(options) {
|
|
|
789
716
|
})),
|
|
790
717
|
});
|
|
791
718
|
scores = mergeScores(baselineScores, agenticScores);
|
|
792
|
-
evaluationMode =
|
|
719
|
+
evaluationMode = LiteracyVariant.FULL;
|
|
793
720
|
// Merge agentic actual scores into the per-model breakdown
|
|
794
721
|
if (perModel) {
|
|
795
722
|
const agenticPerModel = scoreAgenticResultsPerModel(agenticResultsPath, agenticProfile);
|
|
@@ -812,17 +739,20 @@ export function calculateAndWriteScores(options) {
|
|
|
812
739
|
graderCost.completionTokens += agenticGraderCost.completionTokens;
|
|
813
740
|
}
|
|
814
741
|
}
|
|
815
|
-
else if (mode ===
|
|
742
|
+
else if (mode === LiteracyVariant.AGENTIC) {
|
|
816
743
|
scores = baselineScores;
|
|
817
744
|
agentBehavior = aggregateAgentBehavior(baselineResultsPath);
|
|
818
745
|
sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
|
|
819
|
-
evaluationMode =
|
|
746
|
+
evaluationMode = LiteracyVariant.AGENTIC;
|
|
820
747
|
}
|
|
821
748
|
else {
|
|
822
749
|
scores = baselineScores;
|
|
823
750
|
agentBehavior = aggregateAgentBehavior(baselineResultsPath);
|
|
824
751
|
sourceIsolation = aggregateSourceIsolation(baselineResultsPath, options?.allowedOrigins);
|
|
825
|
-
evaluationMode =
|
|
752
|
+
evaluationMode =
|
|
753
|
+
mode === LiteracyVariant.OBSERVED
|
|
754
|
+
? LiteracyVariant.OBSERVED
|
|
755
|
+
: LiteracyVariant.STANDARD;
|
|
826
756
|
}
|
|
827
757
|
const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode, log);
|
|
828
758
|
// Persist
|
|
@@ -833,7 +763,7 @@ export function calculateAndWriteScores(options) {
|
|
|
833
763
|
// Extract and persist grader judgments (Phase 3a: failure mode extraction)
|
|
834
764
|
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
835
765
|
// In full mode, also extract judgments from agentic results
|
|
836
|
-
if (mode ===
|
|
766
|
+
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
837
767
|
const agenticJudgments = extractGraderJudgments(agenticResultsPath);
|
|
838
768
|
judgments.push(...agenticJudgments);
|
|
839
769
|
}
|
package/dist/pipeline/checks.js
CHANGED
|
@@ -117,7 +117,7 @@ export function checkGeneratedConfigsExist(rootDir) {
|
|
|
117
117
|
const baselinePath = resolve(rootDir, "promptfooconfig.yaml");
|
|
118
118
|
if (!existsSync(baselinePath)) {
|
|
119
119
|
issues.push({
|
|
120
|
-
message: "Baseline config 'promptfooconfig.yaml' not found. Run
|
|
120
|
+
message: "Baseline config 'promptfooconfig.yaml' not found. Run the pipeline to generate it.",
|
|
121
121
|
path: baselinePath,
|
|
122
122
|
severity: "error",
|
|
123
123
|
source: "checkGeneratedConfigsExist",
|
|
@@ -131,7 +131,7 @@ export function checkGeneratedConfigsExist(rootDir) {
|
|
|
131
131
|
const configPath = resolve(rootDir, name);
|
|
132
132
|
if (!existsSync(configPath)) {
|
|
133
133
|
issues.push({
|
|
134
|
-
message: `Optional config \`${name}\` not found. Run
|
|
134
|
+
message: `Optional config \`${name}\` not found. Run the pipeline to generate it.`,
|
|
135
135
|
path: configPath,
|
|
136
136
|
severity: "warning",
|
|
137
137
|
source: "checkGeneratedConfigsExist",
|
package/dist/pipeline/compare.js
CHANGED
|
@@ -79,14 +79,14 @@ export function compare(baseline, experiment, options) {
|
|
|
79
79
|
// Per-dimension average deltas (only for areas present in both summaries)
|
|
80
80
|
const commonAreas = areas.filter((a) => baselineAreas.has(a.area) && experimentAreas.has(a.area));
|
|
81
81
|
const commonCount = commonAreas.length || 1;
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
}
|
|
82
|
+
// Collect all dimension keys from area deltas and average each
|
|
83
|
+
const allDimKeys = new Set(commonAreas.flatMap((a) => Object.keys(a.dimensions)));
|
|
84
|
+
const perDimension = {};
|
|
85
|
+
for (const dim of allDimKeys) {
|
|
86
|
+
perDimension[dim] =
|
|
87
|
+
commonAreas.reduce((s, a) => s + (a.dimensions[dim]?.delta ?? 0), 0) /
|
|
88
|
+
commonCount;
|
|
89
|
+
}
|
|
90
90
|
// Doc Lift average delta (common areas only)
|
|
91
91
|
const docLift = commonAreas.reduce((s, a) => s + a.docLiftDelta, 0) / commonCount;
|
|
92
92
|
// Cost delta (if both summaries have cost data)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent-harness-handler.test.ts — Tests for agent harness mode compilation.
|
|
3
|
+
*
|
|
4
|
+
* Tests validation, provider assembly, tool permission resolution,
|
|
5
|
+
* assertion mapping, sandbox config, lifecycle extensions, and
|
|
6
|
+
* end-to-end compilation of example tasks.
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/agent-harness-handler.test.ts
|
|
9
|
+
*/
|
|
10
|
+
export {};
|