@sanity/ailf 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/features.ts +23 -0
- package/config/models.ts +83 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
- package/dist/_vendor/ailf-core/config-helpers.js +150 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +133 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
- package/dist/adapters/task-sources/index.d.ts +1 -0
- package/dist/adapters/task-sources/index.js +1 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
- package/dist/adapters/task-sources/repo-task-source.js +69 -16
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +7 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/composition-root.d.ts +1 -1
- package/dist/composition-root.js +67 -4
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +24 -6
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +6 -4
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +245 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +6 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
- package/dist/pipeline/mirror-repo-tasks.js +16 -15
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +6 -3
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -7,150 +7,44 @@
|
|
|
7
7
|
* - RepoTaskSource (tasks-as-content Phase 4) — reads .ailf/tasks/
|
|
8
8
|
*
|
|
9
9
|
* The key invariant: the pipeline orchestrator and all downstream steps
|
|
10
|
-
* work with
|
|
10
|
+
* work with GeneralizedTaskDefinition[] regardless of where they came from.
|
|
11
11
|
*/
|
|
12
|
+
import type { GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, IdDocRef, PathDocRef, PerspectiveDocRef, SlugDocRef } from "../types/generalized-task.js";
|
|
12
13
|
import type { FilterOptions } from "../types/index.js";
|
|
13
|
-
/**
|
|
14
|
-
export interface TemplatedAssertion {
|
|
15
|
-
type: "llm-rubric";
|
|
16
|
-
template: string;
|
|
17
|
-
criteria: string[];
|
|
18
|
-
weight?: number;
|
|
19
|
-
}
|
|
20
|
-
/** A value-based assertion (contains, javascript, etc.) */
|
|
21
|
-
export interface ValueAssertion {
|
|
22
|
-
type: string;
|
|
23
|
-
value?: unknown;
|
|
24
|
-
weight?: number;
|
|
25
|
-
[key: string]: unknown;
|
|
26
|
-
}
|
|
27
|
-
/** Any assertion definition — either templated or value-based */
|
|
28
|
-
export type AssertionDefinition = TemplatedAssertion | ValueAssertion;
|
|
29
|
-
/** Baseline variant configuration */
|
|
30
|
-
export interface BaselineConfig {
|
|
31
|
-
/** Whether to generate a baseline variant. Default: true */
|
|
32
|
-
enabled?: boolean;
|
|
33
|
-
/** Rubric mode for baseline. Default: "full" */
|
|
34
|
-
rubric?: "abbreviated" | "full" | "none";
|
|
35
|
-
}
|
|
36
|
-
/**
|
|
37
|
-
* A canonical documentation reference. Each entry resolves docs through
|
|
38
|
-
* one of four strategies, discriminated by key presence (no explicit
|
|
39
|
-
* `type` field). All strategies carry an optional `reason` for context.
|
|
40
|
-
*
|
|
41
|
-
* Strategies:
|
|
42
|
-
* - `slug` — one article by slug field (legacy, may not be unique)
|
|
43
|
-
* - `path` — one article by URL path (unique across sections)
|
|
44
|
-
* - `id` — one document by Sanity `_id` (drafts, imports)
|
|
45
|
-
* - `perspective` — all articles in a content release (one-to-many)
|
|
46
|
-
*
|
|
47
|
-
* @see docs/design-docs/canonical-doc-resolution.md
|
|
48
|
-
*/
|
|
49
|
-
export type CanonicalDocRef = SlugDocRef | PathDocRef | IdDocRef | PerspectiveDocRef;
|
|
50
|
-
/** Resolve by article slug field. Legacy — prefer `path` for uniqueness. */
|
|
51
|
-
export interface SlugDocRef {
|
|
52
|
-
slug: string;
|
|
53
|
-
reason?: string;
|
|
54
|
-
}
|
|
55
|
-
/** Resolve by URL path (after /docs/). Unique across sections. */
|
|
56
|
-
export interface PathDocRef {
|
|
57
|
-
path: string;
|
|
58
|
-
reason?: string;
|
|
59
|
-
}
|
|
60
|
-
/** Resolve by Sanity document `_id`. The primary resolution strategy.
|
|
61
|
-
*
|
|
62
|
-
* Optional `slug` and `path` provide human-readable context — they are
|
|
63
|
-
* NOT used for resolution (the `_id` is authoritative) but help YAML
|
|
64
|
-
* authors understand which document is being referenced. The Content Lake
|
|
65
|
-
* adapter populates them from the dereferenced article.
|
|
66
|
-
*/
|
|
67
|
-
export interface IdDocRef {
|
|
68
|
-
id: string;
|
|
69
|
-
reason?: string;
|
|
70
|
-
/** Human-readable slug (informational only — not used for resolution) */
|
|
71
|
-
slug?: string;
|
|
72
|
-
/** Human-readable path (informational only — not used for resolution) */
|
|
73
|
-
path?: string;
|
|
74
|
-
}
|
|
75
|
-
/** Resolve all articles in a content release. One-to-many. */
|
|
76
|
-
export interface PerspectiveDocRef {
|
|
77
|
-
perspective: string;
|
|
78
|
-
reason?: string;
|
|
79
|
-
}
|
|
80
|
-
/**
|
|
81
|
-
* A loaded, validated task definition ready for expansion.
|
|
82
|
-
*
|
|
83
|
-
* This is the canonical intermediate representation — adapters produce
|
|
84
|
-
* this from YAML, Content Lake, or .ailf/ files. Downstream consumers
|
|
85
|
-
* (expansion, doc fetching, validation) work exclusively with this type.
|
|
86
|
-
*
|
|
87
|
-
* Design notes:
|
|
88
|
-
* - `taskPrompt` is extracted from `vars.task` in YAML format
|
|
89
|
-
* - `docsPath` is NOT included — it's an infrastructure detail derived
|
|
90
|
-
* from convention (`file://contexts/canonical/${id}.md`)
|
|
91
|
-
* - `featureArea` is derived by the adapter (filename stem, document
|
|
92
|
-
* field, directory structure — depends on the source)
|
|
93
|
-
*/
|
|
94
|
-
export interface TaskDefinition {
|
|
95
|
-
/** Unique task identifier */
|
|
96
|
-
id: string;
|
|
97
|
-
/** Human-readable description */
|
|
98
|
-
description: string;
|
|
99
|
-
/** Feature area this task belongs to */
|
|
100
|
-
featureArea: string;
|
|
101
|
-
/** The implementation task prompt (the user-facing request) */
|
|
102
|
-
taskPrompt: string;
|
|
103
|
-
/** Canonical doc references with reasons */
|
|
104
|
-
canonicalDocs: CanonicalDocRef[];
|
|
105
|
-
/** Path to the reference solution (relative to eval package root) */
|
|
106
|
-
referenceSolution: string;
|
|
107
|
-
/** Whether doc coverage rubric should be auto-generated */
|
|
108
|
-
docCoverage: boolean;
|
|
109
|
-
/** Assertion definitions (rubric templates + value assertions) */
|
|
110
|
-
assertions: AssertionDefinition[];
|
|
111
|
-
/** Baseline variant configuration */
|
|
112
|
-
baseline?: BaselineConfig;
|
|
113
|
-
/** Additional template variables beyond task (e.g., custom vars) */
|
|
114
|
-
extraVars?: Record<string, unknown>;
|
|
115
|
-
/** Lifecycle status — controls pipeline inclusion. Absent = "active". */
|
|
116
|
-
status?: "active" | "archived" | "draft" | "paused";
|
|
117
|
-
/** Freeform labels for filtering and organization */
|
|
118
|
-
tags?: string[];
|
|
119
|
-
}
|
|
120
|
-
/** Check if a canonical doc ref resolves by slug.
|
|
14
|
+
/** Check if a doc ref resolves by slug.
|
|
121
15
|
*
|
|
122
16
|
* Excludes IdDocRef (which may carry an optional `slug` for display).
|
|
123
17
|
* When both `id` and `slug` are present, it's an IdDocRef, not a SlugDocRef.
|
|
124
18
|
*/
|
|
125
|
-
export declare function isSlugRef(ref:
|
|
126
|
-
/** Check if a
|
|
19
|
+
export declare function isSlugRef(ref: GeneralizedDocRef): ref is SlugDocRef;
|
|
20
|
+
/** Check if a doc ref resolves by path.
|
|
127
21
|
*
|
|
128
22
|
* Excludes IdDocRef (which may carry an optional `path` for display).
|
|
129
23
|
* When both `id` and `path` are present, it's an IdDocRef, not a PathDocRef.
|
|
130
24
|
*/
|
|
131
|
-
export declare function isPathRef(ref:
|
|
132
|
-
/** Check if a
|
|
25
|
+
export declare function isPathRef(ref: GeneralizedDocRef): ref is PathDocRef;
|
|
26
|
+
/** Check if a doc ref resolves by document ID.
|
|
133
27
|
*
|
|
134
28
|
* Uses `"id" in ref` as the primary discriminator. IdDocRef may also carry
|
|
135
29
|
* optional `slug` and `path` for display purposes, so we cannot exclude
|
|
136
30
|
* on those keys. When both `id` and `slug` are present, `id` wins.
|
|
137
31
|
*/
|
|
138
|
-
export declare function isIdRef(ref:
|
|
139
|
-
/** Check if a
|
|
140
|
-
export declare function isPerspectiveRef(ref:
|
|
32
|
+
export declare function isIdRef(ref: GeneralizedDocRef): ref is IdDocRef;
|
|
33
|
+
/** Check if a doc ref resolves by content release perspective */
|
|
34
|
+
export declare function isPerspectiveRef(ref: GeneralizedDocRef): ref is PerspectiveDocRef;
|
|
141
35
|
/**
|
|
142
|
-
* Extract a display identifier from any
|
|
36
|
+
* Extract a display identifier from any doc ref.
|
|
143
37
|
* Useful for logging, error messages, and retrieval metrics.
|
|
144
38
|
*/
|
|
145
|
-
export declare function canonicalDocRefLabel(ref:
|
|
39
|
+
export declare function canonicalDocRefLabel(ref: GeneralizedDocRef): string;
|
|
146
40
|
/** Check if an assertion uses the templated format (template + criteria) */
|
|
147
|
-
export declare function isTemplatedAssertion(entry:
|
|
41
|
+
export declare function isTemplatedAssertion(entry: GeneralizedAssertionDefinition): entry is GeneralizedTemplatedAssertion;
|
|
148
42
|
/**
|
|
149
43
|
* Port: Where task definitions come from.
|
|
150
44
|
*
|
|
151
45
|
* The pipeline never knows HOW tasks are loaded — it only sees
|
|
152
|
-
*
|
|
153
|
-
* filesystem scanning, etc.
|
|
46
|
+
* GeneralizedTaskDefinition[]. The adapter handles YAML parsing, GROQ
|
|
47
|
+
* queries, filesystem scanning, etc.
|
|
154
48
|
*/
|
|
155
49
|
export interface TaskSource {
|
|
156
50
|
/**
|
|
@@ -159,5 +53,5 @@ export interface TaskSource {
|
|
|
159
53
|
* @param filter — Area, task ID, or changed-doc filters
|
|
160
54
|
* @returns Validated task definitions ready for expansion
|
|
161
55
|
*/
|
|
162
|
-
loadTasks(filter?: FilterOptions): Promise<
|
|
56
|
+
loadTasks(filter?: FilterOptions): Promise<GeneralizedTaskDefinition[]>;
|
|
163
57
|
}
|
|
@@ -7,12 +7,12 @@
|
|
|
7
7
|
* - RepoTaskSource (tasks-as-content Phase 4) — reads .ailf/tasks/
|
|
8
8
|
*
|
|
9
9
|
* The key invariant: the pipeline orchestrator and all downstream steps
|
|
10
|
-
* work with
|
|
10
|
+
* work with GeneralizedTaskDefinition[] regardless of where they came from.
|
|
11
11
|
*/
|
|
12
12
|
// ---------------------------------------------------------------------------
|
|
13
|
-
// Type guards —
|
|
13
|
+
// Type guards — doc refs
|
|
14
14
|
// ---------------------------------------------------------------------------
|
|
15
|
-
/** Check if a
|
|
15
|
+
/** Check if a doc ref resolves by slug.
|
|
16
16
|
*
|
|
17
17
|
* Excludes IdDocRef (which may carry an optional `slug` for display).
|
|
18
18
|
* When both `id` and `slug` are present, it's an IdDocRef, not a SlugDocRef.
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
export function isSlugRef(ref) {
|
|
21
21
|
return "slug" in ref && !("id" in ref);
|
|
22
22
|
}
|
|
23
|
-
/** Check if a
|
|
23
|
+
/** Check if a doc ref resolves by path.
|
|
24
24
|
*
|
|
25
25
|
* Excludes IdDocRef (which may carry an optional `path` for display).
|
|
26
26
|
* When both `id` and `path` are present, it's an IdDocRef, not a PathDocRef.
|
|
@@ -28,7 +28,7 @@ export function isSlugRef(ref) {
|
|
|
28
28
|
export function isPathRef(ref) {
|
|
29
29
|
return "path" in ref && !("id" in ref);
|
|
30
30
|
}
|
|
31
|
-
/** Check if a
|
|
31
|
+
/** Check if a doc ref resolves by document ID.
|
|
32
32
|
*
|
|
33
33
|
* Uses `"id" in ref` as the primary discriminator. IdDocRef may also carry
|
|
34
34
|
* optional `slug` and `path` for display purposes, so we cannot exclude
|
|
@@ -37,12 +37,12 @@ export function isPathRef(ref) {
|
|
|
37
37
|
export function isIdRef(ref) {
|
|
38
38
|
return "id" in ref;
|
|
39
39
|
}
|
|
40
|
-
/** Check if a
|
|
40
|
+
/** Check if a doc ref resolves by content release perspective */
|
|
41
41
|
export function isPerspectiveRef(ref) {
|
|
42
42
|
return "perspective" in ref;
|
|
43
43
|
}
|
|
44
44
|
/**
|
|
45
|
-
* Extract a display identifier from any
|
|
45
|
+
* Extract a display identifier from any doc ref.
|
|
46
46
|
* Useful for logging, error messages, and retrieval metrics.
|
|
47
47
|
*/
|
|
48
48
|
export function canonicalDocRefLabel(ref) {
|
|
@@ -29,10 +29,15 @@ export declare const EvalConfigSchema: z.ZodObject<{
|
|
|
29
29
|
graderReplications: z.ZodOptional<z.ZodNumber>;
|
|
30
30
|
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
31
31
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
32
|
-
|
|
32
|
+
custom: "custom";
|
|
33
|
+
literacy: "literacy";
|
|
34
|
+
"mcp-server": "mcp-server";
|
|
35
|
+
"agent-harness": "agent-harness";
|
|
36
|
+
"knowledge-probe": "knowledge-probe";
|
|
33
37
|
baseline: "baseline";
|
|
34
|
-
|
|
38
|
+
agentic: "agentic";
|
|
35
39
|
observed: "observed";
|
|
40
|
+
full: "full";
|
|
36
41
|
}>>;
|
|
37
42
|
noAutoScope: z.ZodOptional<z.ZodBoolean>;
|
|
38
43
|
noCache: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
* (studio-eval-config) so Content Lake documents validate identically.
|
|
11
11
|
*/
|
|
12
12
|
import { z } from "zod";
|
|
13
|
+
import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
13
14
|
export const EvalConfigSchema = z
|
|
14
15
|
.object({
|
|
15
16
|
/** Allowed origins for agentic mode */
|
|
@@ -46,8 +47,12 @@ export const EvalConfigSchema = z
|
|
|
46
47
|
graderReplications: z.number().int().positive().optional(),
|
|
47
48
|
/** Custom headers for doc fetching */
|
|
48
49
|
headers: z.record(z.string(), z.string()).optional(),
|
|
49
|
-
/**
|
|
50
|
-
|
|
50
|
+
/**
|
|
51
|
+
* Evaluation mode — accepts both canonical and legacy names.
|
|
52
|
+
* Legacy names ("baseline", "agentic", "observed", "full") must pass
|
|
53
|
+
* through normalizeMode() before entering typed pipeline code.
|
|
54
|
+
*/
|
|
55
|
+
mode: z.enum(RAW_EVAL_MODES).optional(),
|
|
51
56
|
/** Disable release-aware auto-scoping */
|
|
52
57
|
noAutoScope: z.boolean().optional(),
|
|
53
58
|
/** Disable local cache */
|
|
@@ -49,10 +49,15 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
49
49
|
inlineTasks: z.ZodOptional<z.ZodArray<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
50
50
|
jobId: z.ZodOptional<z.ZodString>;
|
|
51
51
|
mode: z.ZodOptional<z.ZodEnum<{
|
|
52
|
-
|
|
52
|
+
custom: "custom";
|
|
53
|
+
literacy: "literacy";
|
|
54
|
+
"mcp-server": "mcp-server";
|
|
55
|
+
"agent-harness": "agent-harness";
|
|
56
|
+
"knowledge-probe": "knowledge-probe";
|
|
53
57
|
baseline: "baseline";
|
|
54
|
-
|
|
58
|
+
agentic: "agentic";
|
|
55
59
|
observed: "observed";
|
|
60
|
+
full: "full";
|
|
56
61
|
}>>;
|
|
57
62
|
noAutoScope: z.ZodOptional<z.ZodBoolean>;
|
|
58
63
|
noCache: z.ZodOptional<z.ZodBoolean>;
|
|
@@ -70,9 +75,9 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
70
75
|
source: z.ZodOptional<z.ZodString>;
|
|
71
76
|
sourceReportId: z.ZodOptional<z.ZodString>;
|
|
72
77
|
taskMode: z.ZodOptional<z.ZodEnum<{
|
|
78
|
+
inline: "inline";
|
|
73
79
|
"content-lake": "content-lake";
|
|
74
80
|
yaml: "yaml";
|
|
75
|
-
inline: "inline";
|
|
76
81
|
}>>;
|
|
77
82
|
tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
78
83
|
urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
* @see packages/eval/src/pipeline/map-request-to-config.ts — maps to ResolvedConfig
|
|
14
14
|
*/
|
|
15
15
|
import { z } from "zod";
|
|
16
|
+
import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
16
17
|
// ---------------------------------------------------------------------------
|
|
17
18
|
// Debug options — boolean shorthand or structured object
|
|
18
19
|
// ---------------------------------------------------------------------------
|
|
@@ -69,7 +70,11 @@ export const PipelineRequestSchema = z.object({
|
|
|
69
70
|
headers: z.record(z.string(), z.string()).optional(),
|
|
70
71
|
inlineTasks: z.array(z.record(z.string(), z.unknown())).optional(),
|
|
71
72
|
jobId: z.string().optional(),
|
|
72
|
-
|
|
73
|
+
/**
|
|
74
|
+
* Evaluation mode — accepts both canonical and legacy names.
|
|
75
|
+
* Legacy names must pass through normalizeMode() before entering typed pipeline code.
|
|
76
|
+
*/
|
|
77
|
+
mode: z.enum(RAW_EVAL_MODES).optional(),
|
|
73
78
|
noAutoScope: z.boolean().optional(),
|
|
74
79
|
noCache: z.boolean().optional(),
|
|
75
80
|
noRemoteCache: z.boolean().optional(),
|
|
@@ -47,7 +47,7 @@ export type WeightProfile = z.infer<typeof WeightProfileSchema>;
|
|
|
47
47
|
*/
|
|
48
48
|
export declare const RubricConfigSchema: z.ZodObject<{
|
|
49
49
|
footer: z.ZodString;
|
|
50
|
-
"mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>>>;
|
|
50
|
+
"mode-profiles": z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodString, z.ZodRecord<z.ZodString, z.ZodString>]>>>>;
|
|
51
51
|
profiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodRecord<z.ZodString, z.ZodNumber>>>;
|
|
52
52
|
templates: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
53
53
|
criteria_label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
@@ -67,17 +67,17 @@ export declare const FeatureSchema: z.ZodObject<{
|
|
|
67
67
|
id: z.ZodString;
|
|
68
68
|
name: z.ZodString;
|
|
69
69
|
priority: z.ZodEnum<{
|
|
70
|
+
critical: "critical";
|
|
70
71
|
high: "high";
|
|
71
|
-
low: "low";
|
|
72
72
|
medium: "medium";
|
|
73
|
-
|
|
73
|
+
low: "low";
|
|
74
74
|
}>;
|
|
75
75
|
sections: z.ZodArray<z.ZodString>;
|
|
76
76
|
status: z.ZodEnum<{
|
|
77
77
|
covered: "covered";
|
|
78
|
-
"out-of-scope": "out-of-scope";
|
|
79
|
-
planned: "planned";
|
|
80
78
|
uncovered: "uncovered";
|
|
79
|
+
planned: "planned";
|
|
80
|
+
"out-of-scope": "out-of-scope";
|
|
81
81
|
}>;
|
|
82
82
|
taskCount: z.ZodOptional<z.ZodNumber>;
|
|
83
83
|
}, z.core.$strip>;
|
|
@@ -92,17 +92,17 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
|
|
|
92
92
|
id: z.ZodString;
|
|
93
93
|
name: z.ZodString;
|
|
94
94
|
priority: z.ZodEnum<{
|
|
95
|
+
critical: "critical";
|
|
95
96
|
high: "high";
|
|
96
|
-
low: "low";
|
|
97
97
|
medium: "medium";
|
|
98
|
-
|
|
98
|
+
low: "low";
|
|
99
99
|
}>;
|
|
100
100
|
sections: z.ZodArray<z.ZodString>;
|
|
101
101
|
status: z.ZodEnum<{
|
|
102
102
|
covered: "covered";
|
|
103
|
-
"out-of-scope": "out-of-scope";
|
|
104
|
-
planned: "planned";
|
|
105
103
|
uncovered: "uncovered";
|
|
104
|
+
planned: "planned";
|
|
105
|
+
"out-of-scope": "out-of-scope";
|
|
106
106
|
}>;
|
|
107
107
|
taskCount: z.ZodOptional<z.ZodNumber>;
|
|
108
108
|
}, z.core.$strip>>;
|
|
@@ -440,14 +440,11 @@ export declare const TaskFileSchema: z.ZodArray<z.ZodUnion<readonly [z.ZodObject
|
|
|
440
440
|
export type TaskFile = z.infer<typeof TaskFileSchema>;
|
|
441
441
|
/**
|
|
442
442
|
* Schema for per-dimension threshold values.
|
|
443
|
+
* Uses a dynamic record to support all evaluation modes, not just literacy.
|
|
443
444
|
* Keys use kebab-case to match YAML convention; the threshold engine
|
|
444
445
|
* normalizes to camelCase for comparison against FeatureScore fields.
|
|
445
446
|
*/
|
|
446
|
-
export declare const ThresholdDimensionsSchema: z.ZodObject<{
|
|
447
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
448
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
449
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
450
|
-
}, z.core.$strip>;
|
|
447
|
+
export declare const ThresholdDimensionsSchema: z.ZodRecord<z.ZodString, z.ZodNumber>;
|
|
451
448
|
/** Inferred TypeScript type for threshold dimension overrides. */
|
|
452
449
|
export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
|
|
453
450
|
/**
|
|
@@ -457,11 +454,7 @@ export type ThresholdDimensions = z.infer<typeof ThresholdDimensionsSchema>;
|
|
|
457
454
|
export declare const ThresholdDefaultsSchema: z.ZodObject<{
|
|
458
455
|
ceiling: z.ZodOptional<z.ZodNumber>;
|
|
459
456
|
composite: z.ZodNumber;
|
|
460
|
-
dimensions: z.ZodOptional<z.ZodObject<{
|
|
461
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
462
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
463
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
464
|
-
}, z.core.$strip>>;
|
|
457
|
+
dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
465
458
|
"doc-lift": z.ZodOptional<z.ZodNumber>;
|
|
466
459
|
}, z.core.$strip>;
|
|
467
460
|
/** Inferred TypeScript type for threshold defaults. */
|
|
@@ -501,21 +494,13 @@ export declare const ThresholdConfigSchema: z.ZodObject<{
|
|
|
501
494
|
areas: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
502
495
|
ceiling: z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
|
|
503
496
|
composite: z.ZodOptional<z.ZodNumber>;
|
|
504
|
-
dimensions: z.ZodOptional<z.ZodOptional<z.ZodObject<{
|
|
505
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
506
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
507
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
508
|
-
}, z.core.$strip>>>;
|
|
497
|
+
dimensions: z.ZodOptional<z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>>;
|
|
509
498
|
"doc-lift": z.ZodOptional<z.ZodOptional<z.ZodNumber>>;
|
|
510
499
|
}, z.core.$strip>>>;
|
|
511
500
|
defaults: z.ZodObject<{
|
|
512
501
|
ceiling: z.ZodOptional<z.ZodNumber>;
|
|
513
502
|
composite: z.ZodNumber;
|
|
514
|
-
dimensions: z.ZodOptional<z.ZodObject<{
|
|
515
|
-
"code-correctness": z.ZodOptional<z.ZodNumber>;
|
|
516
|
-
"doc-coverage": z.ZodOptional<z.ZodNumber>;
|
|
517
|
-
"task-completion": z.ZodOptional<z.ZodNumber>;
|
|
518
|
-
}, z.core.$strip>>;
|
|
503
|
+
dimensions: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodNumber>>;
|
|
519
504
|
"doc-lift": z.ZodOptional<z.ZodNumber>;
|
|
520
505
|
}, z.core.$strip>;
|
|
521
506
|
regression: z.ZodOptional<z.ZodObject<{
|
|
@@ -43,10 +43,22 @@ const WeightProfileSchema = z
|
|
|
43
43
|
return Math.abs(sum - 1.0) < 0.001;
|
|
44
44
|
}, { message: "profile weights must sum to 1.0" });
|
|
45
45
|
/**
|
|
46
|
-
* Mode-to-profile bindings — maps (mode,
|
|
47
|
-
*
|
|
46
|
+
* Mode-to-profile bindings — maps (mode, perspective) pairs to profile names.
|
|
47
|
+
*
|
|
48
|
+
* Flat form (most modes):
|
|
49
|
+
* { "mcp-server": { gold: "mcp-behavior" } }
|
|
50
|
+
*
|
|
51
|
+
* Nested form (literacy mode with variant sub-keys):
|
|
52
|
+
* { literacy: { baseline: { gold: "default", baseline: "output-only" }, agentic: { gold: "default" } } }
|
|
53
|
+
*
|
|
54
|
+
* The nested form adds a variant level between mode and perspective,
|
|
55
|
+
* allowing a single canonical mode to host multiple scoring variants.
|
|
48
56
|
*/
|
|
49
|
-
const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), z.string()));
|
|
57
|
+
const ModeProfileEntrySchema = z.union([
|
|
58
|
+
z.string(),
|
|
59
|
+
z.record(z.string(), z.string()),
|
|
60
|
+
]);
|
|
61
|
+
const ModeProfilesSchema = z.record(z.string(), z.record(z.string(), ModeProfileEntrySchema));
|
|
50
62
|
/**
|
|
51
63
|
* Schema for the full config/rubrics.yaml config file.
|
|
52
64
|
*
|
|
@@ -277,14 +289,11 @@ export const TaskFileSchema = z
|
|
|
277
289
|
// ---------------------------------------------------------------------------
|
|
278
290
|
/**
|
|
279
291
|
* Schema for per-dimension threshold values.
|
|
292
|
+
* Uses a dynamic record to support all evaluation modes, not just literacy.
|
|
280
293
|
* Keys use kebab-case to match YAML convention; the threshold engine
|
|
281
294
|
* normalizes to camelCase for comparison against FeatureScore fields.
|
|
282
295
|
*/
|
|
283
|
-
export const ThresholdDimensionsSchema = z.object({
|
|
284
|
-
"code-correctness": z.number().min(0).max(100).optional(),
|
|
285
|
-
"doc-coverage": z.number().min(0).max(100).optional(),
|
|
286
|
-
"task-completion": z.number().min(0).max(100).optional(),
|
|
287
|
-
});
|
|
296
|
+
export const ThresholdDimensionsSchema = z.record(z.string(), z.number().min(0).max(100));
|
|
288
297
|
/**
|
|
289
298
|
* Schema for threshold defaults (and per-area overrides).
|
|
290
299
|
* All fields are optional in per-area overrides; defaults must have composite.
|
|
@@ -18,10 +18,15 @@ export declare const ScheduleEntrySchema: z.ZodObject<{
|
|
|
18
18
|
cron: z.ZodString;
|
|
19
19
|
enabled: z.ZodDefault<z.ZodBoolean>;
|
|
20
20
|
mode: z.ZodDefault<z.ZodEnum<{
|
|
21
|
-
|
|
21
|
+
custom: "custom";
|
|
22
|
+
literacy: "literacy";
|
|
23
|
+
"mcp-server": "mcp-server";
|
|
24
|
+
"agent-harness": "agent-harness";
|
|
25
|
+
"knowledge-probe": "knowledge-probe";
|
|
22
26
|
baseline: "baseline";
|
|
23
|
-
|
|
27
|
+
agentic: "agentic";
|
|
24
28
|
observed: "observed";
|
|
29
|
+
full: "full";
|
|
25
30
|
}>>;
|
|
26
31
|
name: z.ZodString;
|
|
27
32
|
publish: z.ZodDefault<z.ZodBoolean>;
|
|
@@ -53,10 +58,15 @@ export declare const SchedulesFileSchema: z.ZodObject<{
|
|
|
53
58
|
cron: z.ZodString;
|
|
54
59
|
enabled: z.ZodDefault<z.ZodBoolean>;
|
|
55
60
|
mode: z.ZodDefault<z.ZodEnum<{
|
|
56
|
-
|
|
61
|
+
custom: "custom";
|
|
62
|
+
literacy: "literacy";
|
|
63
|
+
"mcp-server": "mcp-server";
|
|
64
|
+
"agent-harness": "agent-harness";
|
|
65
|
+
"knowledge-probe": "knowledge-probe";
|
|
57
66
|
baseline: "baseline";
|
|
58
|
-
|
|
67
|
+
agentic: "agentic";
|
|
59
68
|
observed: "observed";
|
|
69
|
+
full: "full";
|
|
60
70
|
}>>;
|
|
61
71
|
name: z.ZodString;
|
|
62
72
|
publish: z.ZodDefault<z.ZodBoolean>;
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
* @see docs/design-docs/report-store/implementation.md — Phase 5
|
|
12
12
|
*/
|
|
13
13
|
import { z } from "zod";
|
|
14
|
+
import { RAW_EVAL_MODES } from "../../ailf-shared/index.js";
|
|
14
15
|
// ---------------------------------------------------------------------------
|
|
15
16
|
// Cron expression validation
|
|
16
17
|
// ---------------------------------------------------------------------------
|
|
@@ -34,8 +35,11 @@ export const ScheduleEntrySchema = z.object({
|
|
|
34
35
|
cron: CronSchema,
|
|
35
36
|
/** Whether this schedule is active */
|
|
36
37
|
enabled: z.boolean().default(true),
|
|
37
|
-
/**
|
|
38
|
-
|
|
38
|
+
/**
|
|
39
|
+
* Evaluation mode — accepts both canonical and legacy names.
|
|
40
|
+
* Legacy names must pass through normalizeMode() before entering typed pipeline code.
|
|
41
|
+
*/
|
|
42
|
+
mode: z.enum(RAW_EVAL_MODES).default("baseline"),
|
|
39
43
|
/** Human-readable schedule name (used as report tag) */
|
|
40
44
|
name: z
|
|
41
45
|
.string()
|
|
@@ -17,10 +17,10 @@
|
|
|
17
17
|
import { z } from "zod";
|
|
18
18
|
/** All supported sink types as a Zod union. */
|
|
19
19
|
export declare const SinkTypeSchema: z.ZodEnum<{
|
|
20
|
-
webhook: "webhook";
|
|
21
20
|
bigquery: "bigquery";
|
|
22
21
|
"github-comment": "github-comment";
|
|
23
22
|
slack: "slack";
|
|
23
|
+
webhook: "webhook";
|
|
24
24
|
}>;
|
|
25
25
|
/** Supported sink type string literal union. */
|
|
26
26
|
export type SinkType = z.infer<typeof SinkTypeSchema>;
|
|
@@ -25,12 +25,21 @@ export function formatComparisonMarkdown(report) {
|
|
|
25
25
|
lines.push("");
|
|
26
26
|
lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallIcon} ${deltaStr(overall)})`);
|
|
27
27
|
lines.push("");
|
|
28
|
-
//
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
// Derive dimension columns from the first area's keys (all areas share the
|
|
29
|
+
// same scoring profile, so the key set is uniform).
|
|
30
|
+
const dimKeys = report.areas.length > 0
|
|
31
|
+
? Object.keys(report.areas[0].dimensions)
|
|
32
|
+
: Object.keys(report.deltas.perDimension);
|
|
33
|
+
// Per-area table — columns are dynamic
|
|
34
|
+
const dimHeaders = dimKeys.map(kebabToTitleCase);
|
|
35
|
+
const headerRow = ["Feature", "Baseline", "Current", "Delta", ...dimHeaders];
|
|
36
|
+
const separatorRow = headerRow.map(() => "------");
|
|
37
|
+
lines.push(`| ${headerRow.join(" | ")} |`);
|
|
38
|
+
lines.push(`|${separatorRow.join("|")}|`);
|
|
31
39
|
for (const a of report.areas) {
|
|
32
40
|
const icon = changeIcon(a.change);
|
|
33
|
-
|
|
41
|
+
const dimCells = dimKeys.map((k) => deltaStr(a.dimensions[k]?.delta ?? 0));
|
|
42
|
+
lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${icon} ${deltaStr(a.delta)} | ${dimCells.join(" | ")} |`);
|
|
34
43
|
}
|
|
35
44
|
lines.push("");
|
|
36
45
|
// Summary
|
|
@@ -55,9 +64,9 @@ export function formatComparisonMarkdown(report) {
|
|
|
55
64
|
const dim = report.deltas.perDimension;
|
|
56
65
|
lines.push("| Dimension | Delta |");
|
|
57
66
|
lines.push("|-----------|-------|");
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
67
|
+
for (const k of Object.keys(dim)) {
|
|
68
|
+
lines.push(`| ${kebabToTitleCase(k)} | ${deltaStr(dim[k])} |`);
|
|
69
|
+
}
|
|
61
70
|
lines.push(`| Doc Lift | ${deltaStr(report.deltas.docLift)} |`);
|
|
62
71
|
if (report.deltas.cost !== undefined) {
|
|
63
72
|
const costStr = report.deltas.cost > 0
|
|
@@ -91,29 +100,51 @@ export function formatComparisonTable(report) {
|
|
|
91
100
|
: "unchanged");
|
|
92
101
|
lines.push(` Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)} (${overallIcon} ${deltaStr(overall)})`);
|
|
93
102
|
lines.push("");
|
|
94
|
-
// Per-dimension averages
|
|
103
|
+
// Per-dimension averages — derived dynamically from the report
|
|
95
104
|
const dim = report.deltas.perDimension;
|
|
105
|
+
const dimKeys = report.areas.length > 0
|
|
106
|
+
? Object.keys(report.areas[0].dimensions)
|
|
107
|
+
: Object.keys(dim);
|
|
96
108
|
lines.push(" Dimension averages:");
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
109
|
+
// Pad labels to the longest dimension label for alignment
|
|
110
|
+
const dimLabels = dimKeys.map(kebabToTitleCase);
|
|
111
|
+
// +1 for the colon appended to each label
|
|
112
|
+
const maxLabelLen = Math.max(...dimLabels.map((l) => l.length + 1), "Doc Lift:".length);
|
|
113
|
+
for (let i = 0; i < dimKeys.length; i++) {
|
|
114
|
+
lines.push(` ${(dimLabels[i] + ":").padEnd(maxLabelLen)} ${deltaStr(dim[dimKeys[i]] ?? 0)}`);
|
|
115
|
+
}
|
|
116
|
+
lines.push(` ${"Doc Lift:".padEnd(maxLabelLen)} ${deltaStr(report.deltas.docLift)}`);
|
|
101
117
|
if (report.deltas.cost !== undefined) {
|
|
102
|
-
lines.push(` Cost:
|
|
118
|
+
lines.push(` ${"Cost:".padEnd(maxLabelLen)} ${report.deltas.cost > 0 ? "+" : ""}$${report.deltas.cost.toFixed(4)}`);
|
|
103
119
|
}
|
|
104
120
|
lines.push("");
|
|
105
|
-
// Per-area table
|
|
121
|
+
// Per-area table — columns are dynamic
|
|
106
122
|
lines.push("-".repeat(80));
|
|
107
123
|
lines.push("PER-AREA BREAKDOWN");
|
|
108
124
|
lines.push("-".repeat(80));
|
|
109
125
|
lines.push("");
|
|
110
|
-
const
|
|
111
|
-
const
|
|
112
|
-
|
|
113
|
-
|
|
126
|
+
const dimHeaders = dimKeys.map(kebabToTitleCase);
|
|
127
|
+
const colWidths = dimHeaders.map((h) => Math.max(h.length, 4));
|
|
128
|
+
const hCols = [
|
|
129
|
+
"Feature Area".padEnd(19),
|
|
130
|
+
"Baseline".padStart(8),
|
|
131
|
+
"Experiment".padStart(10),
|
|
132
|
+
"Delta".padStart(5),
|
|
133
|
+
...dimHeaders.map((h, i) => h.padStart(colWidths[i])),
|
|
134
|
+
];
|
|
135
|
+
const sepCols = [
|
|
136
|
+
"-".repeat(21),
|
|
137
|
+
"-".repeat(10),
|
|
138
|
+
"-".repeat(12),
|
|
139
|
+
"-".repeat(7),
|
|
140
|
+
...colWidths.map((w) => "-".repeat(w + 2)),
|
|
141
|
+
];
|
|
142
|
+
lines.push(`| ${hCols.join(" | ")} |`);
|
|
143
|
+
lines.push(`|${sepCols.join("|")}|`);
|
|
114
144
|
for (const a of report.areas) {
|
|
115
145
|
const icon = changeIcon(a.change);
|
|
116
|
-
|
|
146
|
+
const dimCells = dimKeys.map((k, i) => deltaStr(a.dimensions[k]?.delta ?? 0).padStart(colWidths[i]));
|
|
147
|
+
lines.push(`| ${icon} ${a.area.padEnd(17)} | ${String(a.baseline).padStart(8)} | ${String(a.experiment).padStart(10)} | ${deltaStr(a.delta).padStart(5)} | ${dimCells.join(" | ")} |`);
|
|
117
148
|
}
|
|
118
149
|
lines.push("");
|
|
119
150
|
// Classification summary
|
|
@@ -187,3 +218,10 @@ function deltaStr(d) {
|
|
|
187
218
|
return `${Math.round(d)}`;
|
|
188
219
|
return "0";
|
|
189
220
|
}
|
|
221
|
+
/** Convert kebab-case dimension name to title case (e.g. 'task-completion' → 'Task Completion') */
|
|
222
|
+
function kebabToTitleCase(name) {
|
|
223
|
+
return name
|
|
224
|
+
.split("-")
|
|
225
|
+
.map((w) => w.charAt(0).toUpperCase() + w.slice(1))
|
|
226
|
+
.join(" ");
|
|
227
|
+
}
|