@sanity/ailf 0.4.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/features.ts +23 -0
- package/config/models.ts +83 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
- package/dist/_vendor/ailf-core/config-helpers.js +150 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
- package/dist/_vendor/ailf-core/examples/index.js +10 -10
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +133 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
- package/dist/adapters/task-sources/index.d.ts +1 -0
- package/dist/adapters/task-sources/index.js +1 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
- package/dist/adapters/task-sources/repo-task-source.js +69 -16
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +7 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/composition-root.d.ts +1 -1
- package/dist/composition-root.js +67 -4
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +24 -6
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +6 -4
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +245 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +5 -7
- package/dist/pipeline/calculate-scores.js +74 -153
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +23 -14
- package/dist/pipeline/expand-tasks.js +37 -31
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +18 -21
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +6 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
- package/dist/pipeline/mirror-repo-tasks.js +16 -15
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +47 -0
- package/dist/pipeline/profile-resolution.js +91 -0
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +6 -3
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -62
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
* (studio-eval-config) so Content Lake documents validate identically.
|
|
11
11
|
*/
|
|
12
12
|
import { z } from "zod";
|
|
13
|
+
import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
13
14
|
export const EvalConfigSchema = z
|
|
14
15
|
.object({
|
|
15
16
|
/** Allowed origins for agentic mode */
|
|
@@ -46,8 +47,12 @@ export const EvalConfigSchema = z
|
|
|
46
47
|
graderReplications: z.number().int().positive().optional(),
|
|
47
48
|
/** Custom headers for doc fetching */
|
|
48
49
|
headers: z.record(z.string(), z.string()).optional(),
|
|
49
|
-
/**
|
|
50
|
-
|
|
50
|
+
/**
|
|
51
|
+
* Evaluation mode — accepts both canonical and legacy names.
|
|
52
|
+
* Legacy names ("baseline", "agentic", "observed", "full") must pass
|
|
53
|
+
* through normalizeMode() before entering typed pipeline code.
|
|
54
|
+
*/
|
|
55
|
+
mode: z.enum(RAW_EVAL_MODES).optional(),
|
|
51
56
|
/** Disable release-aware auto-scoping */
|
|
52
57
|
noAutoScope: z.boolean().optional(),
|
|
53
58
|
/** Disable local cache */
|
|
@@ -49,10 +49,15 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
49
49
|
inlineTasks: z.ZodOptional<z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
50
50
|
jobId: z.ZodOptional<z.ZodString>;
|
|
51
51
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
52
|
-
|
|
52
|
+
custom: "custom";
|
|
53
|
+
literacy: "literacy";
|
|
54
|
+
"mcp-server": "mcp-server";
|
|
55
|
+
"agent-harness": "agent-harness";
|
|
56
|
+
"knowledge-probe": "knowledge-probe";
|
|
53
57
|
baseline: "baseline";
|
|
54
|
-
|
|
58
|
+
agentic: "agentic";
|
|
55
59
|
observed: "observed";
|
|
60
|
+
full: "full";
|
|
56
61
|
}>>;
|
|
57
62
|
noAutoScope: z.ZodOptional<z.ZodBoolean>;
|
|
58
63
|
noCache: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -70,9 +75,9 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
70
75
|
source: z.ZodOptional<z.ZodString>;
|
|
71
76
|
sourceReportId: z.ZodOptional<z.ZodString>;
|
|
72
77
|
taskMode: z.ZodOptional<z.ZodEnum<{
|
|
78
|
+
inline: "inline";
|
|
73
79
|
"content-lake": "content-lake";
|
|
74
80
|
yaml: "yaml";
|
|
75
|
-
inline: "inline";
|
|
76
81
|
}>>;
|
|
77
82
|
tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
78
83
|
urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
* @see packages/eval/src/pipeline/map-request-to-config.ts — maps to ResolvedConfig
|
|
14
14
|
*/
|
|
15
15
|
import { z } from "zod";
|
|
16
|
+
import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
16
17
|
// ---------------------------------------------------------------------------
|
|
17
18
|
// Debug options — boolean shorthand or structured object
|
|
18
19
|
// ---------------------------------------------------------------------------
|
|
@@ -69,7 +70,11 @@ export const PipelineRequestSchema = z.object({
|
|
|
69
70
|
headers: z.record(z.string(), z.string()).optional(),
|
|
70
71
|
inlineTasks: z.array(z.record(z.string(), z.unknown())).optional(),
|
|
71
72
|
jobId: z.string().optional(),
|
|
72
|
-
|
|
73
|
+
/**
|
|
74
|
+
* Evaluation mode — accepts both canonical and legacy names.
|
|
75
|
+
* Legacy names must pass through normalizeMode() before entering typed pipeline code.
|
|
76
|
+
*/
|
|
77
|
+
mode: z.enum(RAW_EVAL_MODES).optional(),
|
|
73
78
|
noAutoScope: z.boolean().optional(),
|
|
74
79
|
noCache: z.boolean().optional(),
|
|
75
80
|
noRemoteCache: z.boolean().optional(),
|
|
@@ -25,21 +25,37 @@ export declare const RubricTemplateSchema: z.ZodObject<{
|
|
|
25
25
|
}, z.core.$strip>;
|
|
26
26
|
/** Inferred TypeScript type for a rubric template. */
|
|
27
27
|
export type RubricTemplate = z.infer<typeof RubricTemplateSchema>;
|
|
28
|
+
/**
|
|
29
|
+
* A named weight profile — maps dimension names to weights (must sum to 1.0).
|
|
30
|
+
* Each profile is a self-contained scoring formula used for a specific
|
|
31
|
+
* (mode, variant) pair.
|
|
32
|
+
*/
|
|
33
|
+
declare const WeightProfileSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
34
|
+
/** Inferred type for a single weight profile. */
|
|
35
|
+
export type WeightProfile = z.infer<typeof WeightProfileSchema>;
|
|
28
36
|
/**
|
|
29
37
|
* Schema for the full config/rubrics.yaml config file.
|
|
30
38
|
*
|
|
31
|
-
* Each dimension is scored on a uniform 0–100 scale.
|
|
32
|
-
*
|
|
39
|
+
* Each dimension is scored on a uniform 0–100 scale. Named scoring profiles
|
|
40
|
+
* define how dimensions are combined into composite scores. Mode-profile
|
|
41
|
+
* bindings declare which profile to use for each (mode, variant) pair.
|
|
42
|
+
*
|
|
43
|
+
* Supports both the new `profiles` format and the legacy flat `weights`
|
|
44
|
+
* format for backward compatibility.
|
|
45
|
+
*
|
|
46
|
+
* @see docs/design-docs/named-scoring-profiles.md
|
|
33
47
|
*/
|
|
34
48
|
export declare const RubricConfigSchema: z.ZodObject<{
|
|
35
49
|
footer: z.ZodString;
|
|
50
|
+
"mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>]>>>>;
|
|
51
|
+
profiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodNumber>>>;
|
|
36
52
|
templates: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
37
53
|
criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
38
54
|
dimension: z.ZodOptional<z.ZodString>;
|
|
39
55
|
header: z.ZodString;
|
|
40
56
|
scale: z.ZodArray<z.ZodString>;
|
|
41
57
|
}, z.core.$strip>>;
|
|
42
|
-
weights: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
58
|
+
weights: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
43
59
|
}, z.core.$strip>;
|
|
44
60
|
/** Inferred TypeScript type for the rubrics config. */
|
|
45
61
|
export type RubricConfig = z.infer<typeof RubricConfigSchema>;
|
|
@@ -51,17 +67,17 @@ export declare const FeatureSchema: z.ZodObject<{
|
|
|
51
67
|
id: z.ZodString;
|
|
52
68
|
name: z.ZodString;
|
|
53
69
|
priority: z.ZodEnum<{
|
|
70
|
+
critical: "critical";
|
|
54
71
|
high: "high";
|
|
55
|
-
low: "low";
|
|
56
72
|
medium: "medium";
|
|
57
|
-
|
|
73
|
+
low: "low";
|
|
58
74
|
}>;
|
|
59
75
|
sections: z.ZodArray<z.ZodString>;
|
|
60
76
|
status: z.ZodEnum<{
|
|
61
77
|
covered: "covered";
|
|
62
|
-
"out-of-scope": "out-of-scope";
|
|
63
|
-
planned: "planned";
|
|
64
78
|
uncovered: "uncovered";
|
|
79
|
+
planned: "planned";
|
|
80
|
+
"out-of-scope": "out-of-scope";
|
|
65
81
|
}>;
|
|
66
82
|
taskCount: z.ZodOptional<z.ZodNumber>;
|
|
67
83
|
}, z.core.$strip>;
|
|
@@ -76,17 +92,17 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
|
|
|
76
92
|
id: z.ZodString;
|
|
77
93
|
name: z.ZodString;
|
|
78
94
|
priority: z.ZodEnum<{
|
|
95
|
+
critical: "critical";
|
|
79
96
|
high: "high";
|
|
80
|
-
low: "low";
|
|
81
97
|
medium: "medium";
|
|
82
|
-
|
|
98
|
+
low: "low";
|
|
83
99
|
}>;
|
|
84
100
|
sections: z.ZodArray<z.ZodString>;
|
|
85
101
|
status: z.ZodEnum<{
|
|
86
102
|
covered: "covered";
|
|
87
|
-
"out-of-scope": "out-of-scope";
|
|
88
|
-
planned: "planned";
|
|
89
103
|
uncovered: "uncovered";
|
|
104
|
+
planned: "planned";
|
|
105
|
+
"out-of-scope": "out-of-scope";
|
|
90
106
|
}>;
|
|
91
107
|
taskCount: z.ZodOptional<z.ZodNumber>;
|
|
92
108
|
}, z.core.$strip>>;
|
|
@@ -424,14 +440,11 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject
|
|
|
424
440
|
export type TaskFile = z.infer<typeof TaskFileSchema>;
|
|
425
441
|
/**
|
|
426
442
|
* Schema for per-dimension threshold values.
|
|
443
|
+
* Uses a dynamic record to support all evaluation modes, not just literacy.
|
|
427
444
|
* Keys use kebab-case to match YAML convention; the threshold engine
|
|
428
445
|
* normalizes to camelCase for comparison against FeatureScore fields.
|
|
429
446
|
*/
|
|
430
|
-
export declare const ThresholdDimensionsSchema: z.ZodObject<{
|
|
431
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
432
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
433
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
434
|
-
}, z.core.$strip>;
|
|
447
|
+
export declare const ThresholdDimensionsSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
435
448
|
/** Inferred TypeScript type for threshold dimension overrides. */
|
|
436
449
|
export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
|
|
437
450
|
/**
|
|
@@ -441,11 +454,7 @@ export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
|
|
|
441
454
|
export declare const ThresholdDefaultsSchema: z.ZodObject<{
|
|
442
455
|
ceiling: z.ZodOptional<z.ZodNumber>;
|
|
443
456
|
composite: z.ZodNumber;
|
|
444
|
-
dimensions: z.ZodOptional<z.ZodObject<{
|
|
445
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
446
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
447
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
448
|
-
}, z.core.$strip>>;
|
|
457
|
+
dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
449
458
|
"doc-lift": z.ZodOptional<z.ZodNumber>;
|
|
450
459
|
}, z.core.$strip>;
|
|
451
460
|
/** Inferred TypeScript type for threshold defaults. */
|
|
@@ -485,21 +494,13 @@ export declare const ThresholdConfigSchema: z.ZodObject<{
|
|
|
485
494
|
areas: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
486
495
|
ceiling: z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
|
|
487
496
|
composite: z.ZodOptional<z.ZodNumber>;
|
|
488
|
-
dimensions: z.ZodOptional<z.ZodOptional<z.ZodObject<{
|
|
489
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
490
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
491
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
492
|
-
}, z.core.$strip>>>;
|
|
497
|
+
dimensions: z.ZodOptional<z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>>;
|
|
493
498
|
"doc-lift": z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
|
|
494
499
|
}, z.core.$strip>>>;
|
|
495
500
|
defaults: z.ZodObject<{
|
|
496
501
|
ceiling: z.ZodOptional<z.ZodNumber>;
|
|
497
502
|
composite: z.ZodNumber;
|
|
498
|
-
dimensions: z.ZodOptional<z.ZodObject<{
|
|
499
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
500
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
501
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
502
|
-
}, z.core.$strip>>;
|
|
503
|
+
dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
503
504
|
"doc-lift": z.ZodOptional<z.ZodNumber>;
|
|
504
505
|
}, z.core.$strip>;
|
|
505
506
|
regression: z.ZodOptional<z.ZodObject<{
|
|
@@ -31,23 +31,66 @@ export const RubricTemplateSchema = z.object({
|
|
|
31
31
|
.array(z.string().min(1))
|
|
32
32
|
.min(1, "scale must have at least one entry"),
|
|
33
33
|
});
|
|
34
|
+
/**
|
|
35
|
+
* A named weight profile — maps dimension names to weights (must sum to 1.0).
|
|
36
|
+
* Each profile is a self-contained scoring formula used for a specific
|
|
37
|
+
* (mode, variant) pair.
|
|
38
|
+
*/
|
|
39
|
+
const WeightProfileSchema = z
|
|
40
|
+
.record(z.string(), z.number().min(0).max(1))
|
|
41
|
+
.refine((w) => {
|
|
42
|
+
const sum = Object.values(w).reduce((s, v) => s + v, 0);
|
|
43
|
+
return Math.abs(sum - 1.0) < 0.001;
|
|
44
|
+
}, { message: "profile weights must sum to 1.0" });
|
|
45
|
+
/**
|
|
46
|
+
* Mode-to-profile bindings — maps (mode, perspective) pairs to profile names.
|
|
47
|
+
*
|
|
48
|
+
* Flat form (most modes):
|
|
49
|
+
* { "mcp-server": { gold: "mcp-behavior" } }
|
|
50
|
+
*
|
|
51
|
+
* Nested form (literacy mode with variant sub-keys):
|
|
52
|
+
* { literacy: { baseline: { gold: "default", baseline: "output-only" }, agentic: { gold: "default" } } }
|
|
53
|
+
*
|
|
54
|
+
* The nested form adds a variant level between mode and perspective,
|
|
55
|
+
* allowing a single canonical mode to host multiple scoring variants.
|
|
56
|
+
*/
|
|
57
|
+
const ModeProfileEntrySchema = z.union([
|
|
58
|
+
z.string(),
|
|
59
|
+
z.record(z.string(), z.string()),
|
|
60
|
+
]);
|
|
61
|
+
const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), ModeProfileEntrySchema));
|
|
34
62
|
/**
|
|
35
63
|
* Schema for the full config/rubrics.yaml config file.
|
|
36
64
|
*
|
|
37
|
-
* Each dimension is scored on a uniform 0–100 scale.
|
|
38
|
-
*
|
|
65
|
+
* Each dimension is scored on a uniform 0–100 scale. Named scoring profiles
|
|
66
|
+
* define how dimensions are combined into composite scores. Mode-profile
|
|
67
|
+
* bindings declare which profile to use for each (mode, variant) pair.
|
|
68
|
+
*
|
|
69
|
+
* Supports both the new `profiles` format and the legacy flat `weights`
|
|
70
|
+
* format for backward compatibility.
|
|
71
|
+
*
|
|
72
|
+
* @see docs/design-docs/named-scoring-profiles.md
|
|
39
73
|
*/
|
|
40
|
-
export const RubricConfigSchema = z.object({
|
|
74
|
+
export const RubricConfigSchema = z
|
|
75
|
+
.object({
|
|
41
76
|
footer: z.string().min(1, "footer must be a non-empty string"),
|
|
77
|
+
"mode-profiles": ModeProfilesSchema.optional(),
|
|
78
|
+
profiles: z
|
|
79
|
+
.record(z.string(), WeightProfileSchema)
|
|
80
|
+
.refine((p) => "default" in p, {
|
|
81
|
+
message: "profiles must include a 'default' profile",
|
|
82
|
+
})
|
|
83
|
+
.optional(),
|
|
42
84
|
templates: z
|
|
43
85
|
.record(z.string(), RubricTemplateSchema)
|
|
44
86
|
.refine((t) => Object.keys(t).length > 0, {
|
|
45
87
|
message: "templates must have at least one entry",
|
|
46
88
|
}),
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
89
|
+
// Legacy: flat weight map. Treated as a single profile named "default".
|
|
90
|
+
weights: WeightProfileSchema.optional(),
|
|
91
|
+
})
|
|
92
|
+
.refine((c) => c.profiles !== undefined || c.weights !== undefined, {
|
|
93
|
+
message: "rubrics.yaml must have either 'profiles' or 'weights'",
|
|
51
94
|
});
|
|
52
95
|
// ---------------------------------------------------------------------------
|
|
53
96
|
// Feature registry schema — validates config/features.yaml (Phase 3c)
|
|
@@ -246,14 +289,11 @@ export const TaskFileSchema = z
|
|
|
246
289
|
// ---------------------------------------------------------------------------
|
|
247
290
|
/**
|
|
248
291
|
* Schema for per-dimension threshold values.
|
|
292
|
+
* Uses a dynamic record to support all evaluation modes, not just literacy.
|
|
249
293
|
* Keys use kebab-case to match YAML convention; the threshold engine
|
|
250
294
|
* normalizes to camelCase for comparison against FeatureScore fields.
|
|
251
295
|
*/
|
|
252
|
-
export const ThresholdDimensionsSchema = z.
|
|
253
|
-
"code-correctness": z.number().min(0).max(100).optional(),
|
|
254
|
-
"doc-coverage": z.number().min(0).max(100).optional(),
|
|
255
|
-
"task-completion": z.number().min(0).max(100).optional(),
|
|
256
|
-
});
|
|
296
|
+
export const ThresholdDimensionsSchema = z.record(z.string(), z.number().min(0).max(100));
|
|
257
297
|
/**
|
|
258
298
|
* Schema for threshold defaults (and per-area overrides).
|
|
259
299
|
* All fields are optional in per-area overrides; defaults must have composite.
|
|
@@ -18,10 +18,15 @@ export declare const ScheduleEntrySchema: z.ZodObject<{
|
|
|
18
18
|
cron: z.ZodString;
|
|
19
19
|
enabled: z.ZodDefault<z.ZodBoolean>;
|
|
20
20
|
mode: z.ZodDefault<z.ZodEnum<{
|
|
21
|
-
|
|
21
|
+
custom: "custom";
|
|
22
|
+
literacy: "literacy";
|
|
23
|
+
"mcp-server": "mcp-server";
|
|
24
|
+
"agent-harness": "agent-harness";
|
|
25
|
+
"knowledge-probe": "knowledge-probe";
|
|
22
26
|
baseline: "baseline";
|
|
23
|
-
|
|
27
|
+
agentic: "agentic";
|
|
24
28
|
observed: "observed";
|
|
29
|
+
full: "full";
|
|
25
30
|
}>>;
|
|
26
31
|
name: z.ZodString;
|
|
27
32
|
publish: z.ZodDefault<z.ZodBoolean>;
|
|
@@ -53,10 +58,15 @@ export declare const SchedulesFileSchema: z.ZodObject<{
|
|
|
53
58
|
cron: z.ZodString;
|
|
54
59
|
enabled: z.ZodDefault<z.ZodBoolean>;
|
|
55
60
|
mode: z.ZodDefault<z.ZodEnum<{
|
|
56
|
-
|
|
61
|
+
custom: "custom";
|
|
62
|
+
literacy: "literacy";
|
|
63
|
+
"mcp-server": "mcp-server";
|
|
64
|
+
"agent-harness": "agent-harness";
|
|
65
|
+
"knowledge-probe": "knowledge-probe";
|
|
57
66
|
baseline: "baseline";
|
|
58
|
-
|
|
67
|
+
agentic: "agentic";
|
|
59
68
|
observed: "observed";
|
|
69
|
+
full: "full";
|
|
60
70
|
}>>;
|
|
61
71
|
name: z.ZodString;
|
|
62
72
|
publish: z.ZodDefault<z.ZodBoolean>;
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
* @see docs/design-docs/report-store/implementation.md — Phase 5
|
|
12
12
|
*/
|
|
13
13
|
import { z } from "zod";
|
|
14
|
+
import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
14
15
|
// ---------------------------------------------------------------------------
|
|
15
16
|
// Cron expression validation
|
|
16
17
|
// ---------------------------------------------------------------------------
|
|
@@ -34,8 +35,11 @@ export const ScheduleEntrySchema = z.object({
|
|
|
34
35
|
cron: CronSchema,
|
|
35
36
|
/** Whether this schedule is active */
|
|
36
37
|
enabled: z.boolean().default(true),
|
|
37
|
-
/**
|
|
38
|
-
|
|
38
|
+
/**
|
|
39
|
+
* Evaluation mode — accepts both canonical and legacy names.
|
|
40
|
+
* Legacy names must pass through normalizeMode() before entering typed pipeline code.
|
|
41
|
+
*/
|
|
42
|
+
mode: z.enum(RAW_EVAL_MODES).default("baseline"),
|
|
39
43
|
/** Human-readable schedule name (used as report tag) */
|
|
40
44
|
name: z
|
|
41
45
|
.string()
|
|
@@ -17,10 +17,10 @@
|
|
|
17
17
|
import { z } from "zod";
|
|
18
18
|
/** All supported sink types as a Zod union. */
|
|
19
19
|
export declare const SinkTypeSchema: z.ZodEnum<{
|
|
20
|
-
webhook: "webhook";
|
|
21
20
|
bigquery: "bigquery";
|
|
22
21
|
"github-comment": "github-comment";
|
|
23
22
|
slack: "slack";
|
|
23
|
+
webhook: "webhook";
|
|
24
24
|
}>;
|
|
25
25
|
/** Supported sink type string literal union. */
|
|
26
26
|
export type SinkType = z.infer<typeof SinkTypeSchema>;
|
|
@@ -25,12 +25,21 @@ export function formatComparisonMarkdown(report) {
|
|
|
25
25
|
lines.push("");
|
|
26
26
|
lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallIcon} ${deltaStr(overall)})`);
|
|
27
27
|
lines.push("");
|
|
28
|
-
//
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
// Derive dimension columns from the first area's keys (all areas share the
|
|
29
|
+
// same scoring profile, so the key set is uniform).
|
|
30
|
+
const dimKeys = report.areas.length > 0
|
|
31
|
+
? Object.keys(report.areas[0].dimensions)
|
|
32
|
+
: Object.keys(report.deltas.perDimension);
|
|
33
|
+
// Per-area table — columns are dynamic
|
|
34
|
+
const dimHeaders = dimKeys.map(kebabToTitleCase);
|
|
35
|
+
const headerRow = ["Feature", "Baseline", "Current", "Delta", ...dimHeaders];
|
|
36
|
+
const separatorRow = headerRow.map(() => "------");
|
|
37
|
+
lines.push(`| ${headerRow.join(" | ")} |`);
|
|
38
|
+
lines.push(`|${separatorRow.join("|")}|`);
|
|
31
39
|
for (const a of report.areas) {
|
|
32
40
|
const icon = changeIcon(a.change);
|
|
33
|
-
|
|
41
|
+
const dimCells = dimKeys.map((k) => deltaStr(a.dimensions[k]?.delta ?? 0));
|
|
42
|
+
lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${dimCells.join(" | ")} |`);
|
|
34
43
|
}
|
|
35
44
|
lines.push("");
|
|
36
45
|
// Summary
|
|
@@ -55,9 +64,9 @@ export function formatComparisonMarkdown(report) {
|
|
|
55
64
|
const dim = report.deltas.perDimension;
|
|
56
65
|
lines.push("| Dimension | Delta |");
|
|
57
66
|
lines.push("|-----------|-------|");
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
67
|
+
for (const k of Object.keys(dim)) {
|
|
68
|
+
lines.push(`| ${kebabToTitleCase(k)} | ${deltaStr(dim[k])} |`);
|
|
69
|
+
}
|
|
61
70
|
lines.push(`| Doc Lift | ${deltaStr(report.deltas.docLift)} |`);
|
|
62
71
|
if (report.deltas.cost !== undefined) {
|
|
63
72
|
const costStr = report.deltas.cost > 0
|
|
@@ -91,29 +100,51 @@ export function formatComparisonTable(report) {
|
|
|
91
100
|
: "unchanged");
|
|
92
101
|
lines.push(` Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)} (${overallIcon} ${deltaStr(overall)})`);
|
|
93
102
|
lines.push("");
|
|
94
|
-
// Per-dimension averages
|
|
103
|
+
// Per-dimension averages — derived dynamically from the report
|
|
95
104
|
const dim = report.deltas.perDimension;
|
|
105
|
+
const dimKeys = report.areas.length > 0
|
|
106
|
+
? Object.keys(report.areas[0].dimensions)
|
|
107
|
+
: Object.keys(dim);
|
|
96
108
|
lines.push(" Dimension averages:");
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
109
|
+
// Pad labels to the longest dimension label for alignment
|
|
110
|
+
const dimLabels = dimKeys.map(kebabToTitleCase);
|
|
111
|
+
// +1 for the colon appended to each label
|
|
112
|
+
const maxLabelLen = Math.max(...dimLabels.map((l) => l.length + 1), "Doc Lift:".length);
|
|
113
|
+
for (let i = 0; i < dimKeys.length; i++) {
|
|
114
|
+
lines.push(` ${(dimLabels[i] + ":").padEnd(maxLabelLen)} ${deltaStr(dim[dimKeys[i]] ?? 0)}`);
|
|
115
|
+
}
|
|
116
|
+
lines.push(` ${"Doc Lift:".padEnd(maxLabelLen)} ${deltaStr(report.deltas.docLift)}`);
|
|
101
117
|
if (report.deltas.cost !== undefined) {
|
|
102
|
-
lines.push(` Cost:
|
|
118
|
+
lines.push(` ${"Cost:".padEnd(maxLabelLen)} ${report.deltas.cost > 0 ? "+" : ""}$${report.deltas.cost.toFixed(4)}`);
|
|
103
119
|
}
|
|
104
120
|
lines.push("");
|
|
105
|
-
// Per-area table
|
|
121
|
+
// Per-area table — columns are dynamic
|
|
106
122
|
lines.push("-".repeat(80));
|
|
107
123
|
lines.push("PER-AREA BREAKDOWN");
|
|
108
124
|
lines.push("-".repeat(80));
|
|
109
125
|
lines.push("");
|
|
110
|
-
const
|
|
111
|
-
const
|
|
112
|
-
|
|
113
|
-
|
|
126
|
+
const dimHeaders = dimKeys.map(kebabToTitleCase);
|
|
127
|
+
const colWidths = dimHeaders.map((h) => Math.max(h.length, 4));
|
|
128
|
+
const hCols = [
|
|
129
|
+
"Feature Area".padEnd(19),
|
|
130
|
+
"Baseline".padStart(8),
|
|
131
|
+
"Experiment".padStart(10),
|
|
132
|
+
"Delta".padStart(5),
|
|
133
|
+
...dimHeaders.map((h, i) => h.padStart(colWidths[i])),
|
|
134
|
+
];
|
|
135
|
+
const sepCols = [
|
|
136
|
+
"-".repeat(21),
|
|
137
|
+
"-".repeat(10),
|
|
138
|
+
"-".repeat(12),
|
|
139
|
+
"-".repeat(7),
|
|
140
|
+
...colWidths.map((w) => "-".repeat(w + 2)),
|
|
141
|
+
];
|
|
142
|
+
lines.push(`| ${hCols.join(" | ")} |`);
|
|
143
|
+
lines.push(`|${sepCols.join("|")}|`);
|
|
114
144
|
for (const a of report.areas) {
|
|
115
145
|
const icon = changeIcon(a.change);
|
|
116
|
-
|
|
146
|
+
const dimCells = dimKeys.map((k, i) => deltaStr(a.dimensions[k]?.delta ?? 0).padStart(colWidths[i]));
|
|
147
|
+
lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${dimCells.join(" | ")} |`);
|
|
117
148
|
}
|
|
118
149
|
lines.push("");
|
|
119
150
|
// Classification summary
|
|
@@ -187,3 +218,10 @@ function deltaStr(d) {
|
|
|
187
218
|
return `${Math.round(d)}`;
|
|
188
219
|
return "0";
|
|
189
220
|
}
|
|
221
|
+
/** Convert kebab-case dimension name to title case (e.g. 'task-completion' → 'Task Completion') */
|
|
222
|
+
function kebabToTitleCase(name) {
|
|
223
|
+
return name
|
|
224
|
+
.split("-")
|
|
225
|
+
.map((w) => w.charAt(0).toUpperCase() + w.slice(1))
|
|
226
|
+
.join(" ");
|
|
227
|
+
}
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
* Extracted from packages/eval/src/lib/ during the Ports & Adapters
|
|
8
8
|
* migration (Phase 4e).
|
|
9
9
|
*/
|
|
10
|
-
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
10
|
+
export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
11
11
|
export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
|
|
12
|
+
export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
|
|
12
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
* Extracted from packages/eval/src/lib/ during the Ports & Adapters
|
|
8
8
|
* migration (Phase 4e).
|
|
9
9
|
*/
|
|
10
|
-
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
10
|
+
export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadata, mergeScores, parseRubricScore, } from "./scoring.js";
|
|
11
11
|
export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
|
|
12
|
+
export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
|
|
12
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, } from "./config-helpers.js";
|