@sanity/ailf 0.4.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/features.ts +23 -0
- package/config/models.ts +83 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
- package/dist/_vendor/ailf-core/config-helpers.js +150 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
- package/dist/_vendor/ailf-core/examples/index.js +10 -10
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +133 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
- package/dist/adapters/task-sources/index.d.ts +1 -0
- package/dist/adapters/task-sources/index.js +1 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
- package/dist/adapters/task-sources/repo-task-source.js +69 -16
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +7 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/composition-root.d.ts +1 -1
- package/dist/composition-root.js +67 -4
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +24 -6
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +6 -4
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +245 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +5 -7
- package/dist/pipeline/calculate-scores.js +74 -153
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +23 -14
- package/dist/pipeline/expand-tasks.js +37 -31
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +18 -21
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +6 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
- package/dist/pipeline/mirror-repo-tasks.js +16 -15
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +47 -0
- package/dist/pipeline/profile-resolution.js +91 -0
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +6 -3
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -62
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Branded ID types — nominal typing for entity identifiers.
|
|
3
|
+
*
|
|
4
|
+
* All entity IDs use branded types to prevent accidental misuse.
|
|
5
|
+
* A `TaskId` cannot be passed where a `RunId` is expected, even
|
|
6
|
+
* though both are strings at runtime.
|
|
7
|
+
*
|
|
8
|
+
* Constructor functions validate format and return `Result<T, E>` —
|
|
9
|
+
* parse-don't-validate at the boundary, then pass branded values
|
|
10
|
+
* through the pipeline.
|
|
11
|
+
*
|
|
12
|
+
* The `Brand` utility and `Result` type are defined here as the
|
|
13
|
+
* foundation. Existing branded types in the codebase (`ReportId`,
|
|
14
|
+
* `ISOTimestamp`) use inline branding — those will be migrated to
|
|
15
|
+
* use this utility in Phase 7.
|
|
16
|
+
*
|
|
17
|
+
* @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
|
|
18
|
+
* @see docs/design-docs/parse-dont-validate.md (design principle)
|
|
19
|
+
*/
|
|
20
|
+
/**
 * Wrap a value in the success variant of a result.
 *
 * @param value - Payload to carry.
 * @returns An object with `ok: true` and the payload under `value`.
 */
export function ok(value) {
    const success = { ok: true, value: value };
    return success;
}
|
|
24
|
+
/**
 * Wrap an error in the failure variant of a result.
 *
 * @param error - Error payload to carry.
 * @returns An object with `ok: false` and the payload under `error`.
 */
export function err(error) {
    const failure = { ok: false, error: error };
    return failure;
}
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
// Constructor functions — parse-don't-validate
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
/** Accepted `TaskId` shape: one lowercase letter or digit, then up to 127 lowercase letters, digits, or hyphens. */
const TASK_ID_PATTERN = /^[a-z0-9][a-z0-9-]{0,127}$/;
/**
 * Parse a raw string into a `TaskId`.
 *
 * Valid format: 1–128 characters drawn from lowercase letters, digits, and
 * hyphens; the first character must be a letter or digit.
 * Examples: `"groq-projection-basics"`, `"mcp-server-tools-list"`
 *
 * @param raw - Candidate identifier. Non-string values are rejected with a
 *   failure result instead of throwing.
 * @returns `ok(raw)` when the format matches; otherwise `err(...)` carrying
 *   the code `"INVALID_TASK_ID"`, the rejected input, and a message.
 */
export function taskId(raw) {
    // The typeof guard keeps untyped callers (JSON, CLI args) on the Result path.
    if (typeof raw !== "string" || !TASK_ID_PATTERN.test(raw)) {
        return err({
            code: "INVALID_TASK_ID",
            raw,
            message: `Invalid TaskId "${String(raw)}": must be 1–128 lowercase alphanumeric characters or hyphens, starting with a letter or digit`,
        });
    }
    return ok(raw);
}
|
|
47
|
+
/** Accepted `RunId` shape: the `run_` prefix followed by at least 8 ASCII letters or digits. */
const RUN_ID_PATTERN = /^run_[a-zA-Z0-9]{8,}$/;
/**
 * Parse a raw string into a `RunId`.
 *
 * Valid format: `run_` prefix followed by 8 or more alphanumeric characters
 * (upper- or lowercase).
 *
 * @param raw - Candidate identifier. Non-string values are rejected with a
 *   failure result instead of throwing.
 * @returns `ok(raw)` when the format matches; otherwise `err(...)` carrying
 *   the code `"INVALID_RUN_ID"`, the rejected input, and a message.
 */
export function runId(raw) {
    // The typeof guard keeps untyped callers on the Result path.
    if (typeof raw !== "string" || !RUN_ID_PATTERN.test(raw)) {
        return err({
            code: "INVALID_RUN_ID",
            raw,
            message: `Invalid RunId "${String(raw)}": must match run_[a-zA-Z0-9]{8,}`,
        });
    }
    return ok(raw);
}
|
|
62
|
+
/** Accepted `SuiteId` shape: the `suite_` prefix followed by at least 4 ASCII letters or digits. */
const SUITE_ID_PATTERN = /^suite_[a-zA-Z0-9]{4,}$/;
/**
 * Parse a raw string into a `SuiteId`.
 *
 * Valid format: `suite_` prefix followed by 4 or more alphanumeric
 * characters (upper- or lowercase).
 *
 * @param raw - Candidate identifier. Non-string values are rejected with a
 *   failure result instead of throwing.
 * @returns `ok(raw)` when the format matches; otherwise `err(...)` carrying
 *   the code `"INVALID_SUITE_ID"`, the rejected input, and a message.
 */
export function suiteId(raw) {
    // The typeof guard keeps untyped callers on the Result path.
    if (typeof raw !== "string" || !SUITE_ID_PATTERN.test(raw)) {
        return err({
            code: "INVALID_SUITE_ID",
            raw,
            message: `Invalid SuiteId "${String(raw)}": must match suite_[a-zA-Z0-9]{4,}`,
        });
    }
    return ok(raw);
}
|
|
77
|
+
/** Accepted `ResultId` shape: the `res_` prefix followed by at least 8 ASCII letters or digits. */
const RESULT_ID_PATTERN = /^res_[a-zA-Z0-9]{8,}$/;
/**
 * Parse a raw string into a `ResultId`.
 *
 * Valid format: `res_` prefix followed by 8 or more alphanumeric characters
 * (upper- or lowercase).
 *
 * @param raw - Candidate identifier. Non-string values are rejected with a
 *   failure result instead of throwing.
 * @returns `ok(raw)` when the format matches; otherwise `err(...)` carrying
 *   the code `"INVALID_RESULT_ID"`, the rejected input, and a message.
 */
export function resultId(raw) {
    // The typeof guard keeps untyped callers on the Result path.
    if (typeof raw !== "string" || !RESULT_ID_PATTERN.test(raw)) {
        return err({
            code: "INVALID_RESULT_ID",
            raw,
            message: `Invalid ResultId "${String(raw)}": must match res_[a-zA-Z0-9]{8,}`,
        });
    }
    return ok(raw);
}
|
|
92
|
+
/** Accepted `TraceId` shape: the `trace_` prefix followed by at least 8 ASCII letters or digits. */
const TRACE_ID_PATTERN = /^trace_[a-zA-Z0-9]{8,}$/;
/**
 * Parse a raw string into a `TraceId`.
 *
 * Valid format: `trace_` prefix followed by 8 or more alphanumeric
 * characters (upper- or lowercase).
 *
 * @param raw - Candidate identifier. Non-string values are rejected with a
 *   failure result instead of throwing.
 * @returns `ok(raw)` when the format matches; otherwise `err(...)` carrying
 *   the code `"INVALID_TRACE_ID"`, the rejected input, and a message.
 */
export function traceId(raw) {
    // The typeof guard keeps untyped callers on the Result path.
    if (typeof raw !== "string" || !TRACE_ID_PATTERN.test(raw)) {
        return err({
            code: "INVALID_TRACE_ID",
            raw,
            message: `Invalid TraceId "${String(raw)}": must match trace_[a-zA-Z0-9]{8,}`,
        });
    }
    return ok(raw);
}
|
|
107
|
+
/** Accepted `ProviderId` shape: one ASCII letter or digit, then up to 127 letters, digits, colons, dots, underscores, or hyphens. */
const PROVIDER_ID_PATTERN = /^[a-zA-Z0-9][a-zA-Z0-9:._-]{0,127}$/;
/**
 * Parse a raw string into a `ProviderId`.
 *
 * Valid format: 1–128 characters drawn from letters, digits, colons, dots,
 * underscores, and hyphens; the first character must be a letter or digit.
 * Colons are allowed as segment separators but not required
 * (e.g., `"openai:chat:gpt-4o"`).
 *
 * @param raw - Candidate identifier. Non-string values are rejected with a
 *   failure result instead of throwing.
 * @returns `ok(raw)` when the format matches; otherwise `err(...)` carrying
 *   the code `"INVALID_PROVIDER_ID"`, the rejected input, and a message.
 */
export function providerId(raw) {
    // The typeof guard keeps untyped callers on the Result path.
    if (typeof raw !== "string" || !PROVIDER_ID_PATTERN.test(raw)) {
        return err({
            code: "INVALID_PROVIDER_ID",
            raw,
            // The message states the first-character rule so inputs such as ":x" are explained.
            message: `Invalid ProviderId "${String(raw)}": must be 1–128 alphanumeric characters, colons, dots, underscores, or hyphens, starting with a letter or digit`,
        });
    }
    return ok(raw);
}
|
|
122
|
+
/** Accepted `FixtureId` shape: one lowercase letter or digit, then up to 127 lowercase letters, digits, or hyphens. */
const FIXTURE_ID_PATTERN = /^[a-z0-9][a-z0-9-]{0,127}$/;
/**
 * Parse a raw string into a `FixtureId`.
 *
 * Valid format: 1–128 characters drawn from lowercase letters, digits, and
 * hyphens; the first character must be a letter or digit.
 *
 * @param raw - Candidate identifier. Non-string values are rejected with a
 *   failure result instead of throwing.
 * @returns `ok(raw)` when the format matches; otherwise `err(...)` carrying
 *   the code `"INVALID_FIXTURE_ID"`, the rejected input, and a message.
 */
export function fixtureId(raw) {
    // The typeof guard keeps untyped callers on the Result path.
    if (typeof raw !== "string" || !FIXTURE_ID_PATTERN.test(raw)) {
        return err({
            code: "INVALID_FIXTURE_ID",
            raw,
            // The message states the first-character rule so inputs such as "-x" are explained.
            message: `Invalid FixtureId "${String(raw)}": must be 1–128 lowercase alphanumeric characters or hyphens, starting with a letter or digit`,
        });
    }
    return ok(raw);
}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvalModeConfig — Rich discriminated union for evaluation mode configuration.
|
|
3
|
+
*
|
|
4
|
+
* This is the NEW type system for evaluation modes, introduced in the
|
|
5
|
+
* architecture overhaul (Phase 0). It carries per-mode configuration
|
|
6
|
+
* payloads that the compiler and new evaluation modes use.
|
|
7
|
+
*
|
|
8
|
+
* The existing `EvalMode` string union in @sanity/ailf-shared remains
|
|
9
|
+
* unchanged — it is the lightweight identifier used throughout the
|
|
10
|
+
* existing pipeline. `EvalModeConfig` is the rich type that new code
|
|
11
|
+
* authors against.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
|
|
14
|
+
*/
|
|
15
|
+
import type { EvalMode } from "../../ailf-shared/index.d.ts";
|
|
16
|
+
/**
 * One documentation source that literacy mode resolves docs from.
 *
 * Only `name` is required; every other field is optional.
 */
export interface DocSourceConfig {
    /** Source name as declared in the sources config (`config/sources.ts`; previously `config/sources.yaml`) */
    name: string;
    /** Sanity project ID */
    projectId?: string;
    /** Sanity dataset to fetch from */
    dataset?: string;
    /** Content release perspective */
    perspective?: string;
    /** Override the base URL for doc fetching */
    baseUrl?: string;
}
|
|
29
|
+
/**
 * Connection settings for the MCP server under test.
 *
 * `transport` is the only required field. `command` is documented for the
 * stdio transport and `url` for the sse / streamable-http transports; the
 * type itself does not enforce that pairing.
 */
export interface MCPServerConfig {
    /** Transport protocol */
    transport:
        | "stdio"
        | "sse"
        | "streamable-http";
    /** Server URL (for sse/streamable-http transport) */
    url?: string;
    /** Command to start the server (for stdio transport) */
    command?: string;
    /** Startup timeout in milliseconds */
    startupTimeoutMs?: number;
    /** Environment variables to pass to the server process */
    env?: Record<string, string>;
}
|
|
42
|
+
/**
 * Settings for the agent harness under test.
 *
 * `runtime` and `entrypoint` are required; the timeout and sandbox settings
 * are optional.
 */
export interface AgentHarnessConfig {
    /** Entry point for the agent (file path or module specifier) */
    entrypoint: string;
    /** Runtime environment identifier */
    runtime: string;
    /** Sandbox configuration */
    sandbox?: SandboxConfig;
    /** Maximum execution time per test case */
    timeoutMs?: number;
}
|
|
53
|
+
/**
 * Isolation settings for running an agent in a sandbox.
 *
 * `type` selects the sandbox kind; `image` and `limits` are optional.
 */
export interface SandboxConfig {
    /** Sandbox type */
    type:
        | "docker"
        | "git-worktree"
        | "none"
        | "nsjail"
        | "tempdir";
    /** Docker image (when type is "docker") */
    image?: string;
    /** Resource limits */
    limits?: {
        /** CPU cores */
        cpus?: number;
        /** Memory in bytes */
        memoryBytes?: number;
        /** Disk space in bytes */
        diskBytes?: number;
        /** Network access allowed */
        networkAccess?: boolean;
    };
}
|
|
71
|
+
/**
 * A single tool entry used by agent harness mode.
 *
 * `name` and `description` are required; `inputSchema` is optional.
 */
export interface ToolDef {
    /** Tool name */
    name: string;
    /** Tool description (for the agent's tool manifest) */
    description: string;
    /** JSON Schema for the tool's input parameters */
    inputSchema?: Record<string, unknown>;
}
|
|
80
|
+
/**
 * Points knowledge-probe mode at a knowledge base.
 *
 * `type` selects the kind of knowledge base and `name` identifies it;
 * `config` is an optional free-form bag of extra settings.
 */
export interface KnowledgeBaseRef {
    /** Knowledge base type */
    type:
        | "sanity-dataset"
        | "embeddings-index"
        | "file-corpus";
    /** Dataset or index name */
    name: string;
    /** Additional configuration */
    config?: Record<string, unknown>;
}
|
|
89
|
+
/** Traversal strategy names accepted by knowledge-probe mode. */
export type ProbeStrategy =
    | "breadth-first"
    | "depth-first"
    | "random-sample"
    | "coverage-guided";
|
|
91
|
+
/**
 * Literacy mode — documentation quality evaluation (the original AILF mode).
 *
 * Discriminated by `type: "literacy"`.
 */
export interface LiteracyModeConfig {
    type: "literacy";
    /** Whether to compute retrieval metrics (precision/recall/F1) */
    retrievalMetrics: boolean;
    /** Documentation sources to evaluate against */
    docSources: DocSourceConfig[];
}
|
|
99
|
+
/**
 * MCP server mode — evaluates an MCP server's tool/resource capabilities.
 *
 * Discriminated by `type: "mcp-server"`.
 */
export interface MCPServerModeConfig {
    type: "mcp-server";
    /** Expected capabilities to verify (e.g., "tools/list", "resources/read") */
    capabilities: string[];
    /** The MCP server under test */
    serverConfig: MCPServerConfig;
}
|
|
107
|
+
/**
 * Agent harness mode — evaluates an autonomous agent in a sandboxed environment.
 *
 * Discriminated by `type: "agent-harness"`.
 */
export interface AgentHarnessModeConfig {
    type: "agent-harness";
    /** Tools available to the agent */
    toolManifest: ToolDef[];
    /** Agent configuration */
    agentConfig: AgentHarnessConfig;
}
|
|
115
|
+
/**
 * Knowledge probe mode — measures knowledge coverage across a corpus.
 *
 * Discriminated by `type: "knowledge-probe"`.
 */
export interface KnowledgeProbeModeConfig {
    type: "knowledge-probe";
    /** How to traverse the knowledge space */
    probeStrategy: ProbeStrategy;
    /** The knowledge base to probe */
    knowledgeBase: KnowledgeBaseRef;
}
|
|
123
|
+
/**
 * Custom mode — user-provided evaluation handler.
 *
 * Discriminated by `type: "custom"`.
 */
export interface CustomModeConfig {
    type: "custom";
    /** Module path to the custom handler implementing EvalModeHandler */
    handler: string;
    /** Freeform schema for mode-specific configuration */
    schema: Record<string, unknown>;
}
|
|
131
|
+
/**
 * Union of every per-mode configuration payload.
 *
 * The `type` field is the discriminant: narrowing on it selects exactly one
 * of the five member interfaces.
 *
 * New code should use `EvalModeConfig` for structured mode configuration.
 * The string-level `EvalMode` type remains for pipeline options, report
 * provenance, GROQ filters, and Studio UI.
 */
export type EvalModeConfig =
    | LiteracyModeConfig
    | MCPServerModeConfig
    | AgentHarnessModeConfig
    | KnowledgeProbeModeConfig
    | CustomModeConfig;
|
|
141
|
+
/** The set of `type` discriminant strings that appear across `EvalModeConfig` members. */
export type EvalModeType = EvalModeConfig["type"];
|
|
143
|
+
/**
 * Read the string-level `EvalMode` out of a rich `EvalModeConfig`.
 *
 * Bridges the discriminated union with the string union used throughout the
 * pipeline. The declared return type is `EvalMode`.
 *
 * @param config - The mode configuration to inspect.
 * @returns The mode identifier for `config`.
 */
export declare function evalModeType(config: EvalModeConfig): EvalMode;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvalModeConfig — Rich discriminated union for evaluation mode configuration.
|
|
3
|
+
*
|
|
4
|
+
* This is the NEW type system for evaluation modes, introduced in the
|
|
5
|
+
* architecture overhaul (Phase 0). It carries per-mode configuration
|
|
6
|
+
* payloads that the compiler and new evaluation modes use.
|
|
7
|
+
*
|
|
8
|
+
* The existing `EvalMode` string union in @sanity/ailf-shared remains
|
|
9
|
+
* unchanged — it is the lightweight identifier used throughout the
|
|
10
|
+
* existing pipeline. `EvalModeConfig` is the rich type that new code
|
|
11
|
+
* authors against.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Read the string-level mode identifier out of a rich mode configuration.
 *
 * Bridges the discriminated union with the string union used throughout the
 * pipeline by returning the configuration's `type` discriminant unchanged.
 *
 * @param config - Mode configuration object.
 * @returns The value of `config.type`.
 */
export function evalModeType(config) {
    const { type } = config;
    return type;
}
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GeneralizedTaskDefinition — Mode-discriminated task definitions.
|
|
3
|
+
*
|
|
4
|
+
* This is the canonical task type for the architecture overhaul. It supports
|
|
5
|
+
* all five evaluation modes via a discriminated union on the `mode`
|
|
6
|
+
* field. Common fields (id, title, description, area, etc.) are shared;
|
|
7
|
+
* mode-specific fields live only on the relevant variant.
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
|
|
10
|
+
* @see docs/design-docs/architecture-overhaul/test-definition.md (authoring surfaces)
|
|
11
|
+
* @see docs/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
|
|
12
|
+
*/
|
|
13
|
+
/** How hard a task is, from easiest (`"basic"`) to hardest (`"advanced"`). */
export type TaskDifficulty =
    | "basic"
    | "intermediate"
    | "advanced";
|
|
15
|
+
/** Where a task sits in its lifecycle. */
export type TaskStatus =
    | "active"
    | "draft"
    | "paused"
    | "archived";
|
|
17
|
+
/**
 * A canonical documentation reference — polymorphic resolution strategy.
 *
 * Each entry resolves docs through one of four strategies, discriminated
 * by key presence (no explicit `type` field): `slug`, `path`, `id`, or
 * `perspective`. All strategies carry an optional `reason` for context.
 *
 * @see docs/design-docs/canonical-doc-resolution.md
 */
export type GeneralizedDocRef =
    | SlugDocRef
    | PathDocRef
    | IdDocRef
    | PerspectiveDocRef;
|
|
27
|
+
/** Resolve by article slug field. Legacy — prefer `path` for uniqueness. */
export interface SlugDocRef {
    /** Article slug to resolve */
    slug: string;
    /** Optional free-text context for why this doc is referenced */
    reason?: string;
}
|
|
32
|
+
/** Resolve by URL path (after /docs/). Unique across sections. */
export interface PathDocRef {
    /** URL path, i.e. the portion after `/docs/` */
    path: string;
    /** Optional free-text context for why this doc is referenced */
    reason?: string;
}
|
|
37
|
+
/**
 * Resolve by Sanity document `_id`. The primary resolution strategy.
 *
 * Optional `slug` and `path` provide human-readable context — they are
 * NOT used for resolution (the `_id` is authoritative) but help YAML
 * authors understand which document is being referenced.
 *
 * NOTE(review): because `slug` and `path` may also appear on this variant,
 * code that discriminates doc refs by key presence presumably has to test
 * for `id` before `slug`/`path` — confirm against the resolver.
 */
export interface IdDocRef {
    /** Sanity document `_id` (authoritative for resolution) */
    id: string;
    /** Optional free-text context for why this doc is referenced */
    reason?: string;
    /** Human-readable slug (informational only — not used for resolution) */
    slug?: string;
    /** Human-readable path (informational only — not used for resolution) */
    path?: string;
}
|
|
51
|
+
/** Resolve all articles in a content release. One-to-many. */
export interface PerspectiveDocRef {
    /** Perspective identifying the content release to resolve */
    perspective: string;
    /** Optional free-text context for why these docs are referenced */
    reason?: string;
}
|
|
56
|
+
/** A templated assertion referencing a rubric template */
export interface GeneralizedTemplatedAssertion {
    /** Fixed tag for this variant — always the literal "llm-rubric" */
    type: "llm-rubric";
    /** Rubric template being referenced */
    template: string;
    /** Criteria strings supplied to the template */
    criteria: string[];
    /** Optional numeric weight for this assertion */
    weight?: number;
}
|
|
63
|
+
/**
 * A value-based assertion (contains, javascript, cost, latency, etc.)
 *
 * The index signature admits arbitrary additional keys, so any
 * assertion-specific options can sit alongside `type`, `value` and `weight`.
 */
export interface GeneralizedValueAssertion {
    /** Assertion kind (open-ended string, e.g. "contains") */
    type: string;
    /** Value the assertion compares against; shape depends on `type` */
    value?: unknown;
    /** Optional numeric weight for this assertion */
    weight?: number;
    /** Any further assertion-specific options */
    [key: string]: unknown;
}
|
|
70
|
+
/**
 * Any assertion definition — templated (rubric) or value-based.
 *
 * Note: the value-based variant types `type` as a plain `string`, so a
 * `type === "llm-rubric"` check alone does not let the compiler narrow
 * this union to the templated variant.
 */
export type GeneralizedAssertionDefinition =
    | GeneralizedTemplatedAssertion
    | GeneralizedValueAssertion;
|
|
72
|
+
/**
 * Rubric reference — either a named template or an inline rubric.
 *
 * The two variants are told apart by which key is present: `ref` or `inline`.
 */
export type RubricRef =
    | {
        /** Named rubric template */
        ref: string;
    }
    | {
        /** Inline rubric */
        inline: string;
        /** Optional dimensions, each a key paired with a numeric weight */
        dimensions?: {
            key: string;
            weight: number;
        }[];
    };
|
|
82
|
+
/** Provider override for a specific task */
export interface TaskProviderConfig {
    /** Provider ID (e.g., "openai:chat:gpt-4o", "anthropic:messages:claude-sonnet-4-6") */
    id: string;
    /** Provider-specific configuration overrides (free-form key/value map) */
    config?: Record<string, unknown>;
}
|
|
89
|
+
/** Task-level options. Every field is optional. */
export interface TaskOptions {
    /** Execution timeout in milliseconds */
    timeout?: number;
    /** Whether to cache results for this task */
    cache?: boolean;
    /** Output transform expression */
    transformOutput?: string;
    /** Arbitrary Promptfoo overrides (escape hatch) */
    promptfooOverrides?: Record<string, unknown>;
}
|
|
100
|
+
/**
 * Fields shared by all task modes.
 *
 * Only `id` and `title` are required; everything else is optional.
 */
export interface TaskCommonFields {
    /** Unique task identifier */
    id: string;
    /** Human-readable title */
    title: string;
    /** Detailed description of what this task evaluates */
    description?: string;
    /** Feature area this task belongs to (e.g., "groq", "studio", "mutations") */
    area?: string;
    /** Difficulty level */
    difficulty?: TaskDifficulty;
    /** Freeform labels for filtering and organization */
    tags?: string[];
    /** Lifecycle status — controls pipeline inclusion */
    status?: TaskStatus;
    /** Assertion definitions (rubric templates + value assertions) */
    assertions?: GeneralizedAssertionDefinition[];
    /** Rubric template or inline rubric for grading */
    rubric?: RubricRef;
    /** Provider overrides for this task */
    providers?: TaskProviderConfig[];
    /** Task-level execution options */
    options?: TaskOptions;
    /**
     * Prompt configuration.
     *
     * `template` and `text` are documented as mutually exclusive, but the
     * type does not enforce it — both are independently optional.
     */
    prompt?: {
        /** Named prompt template */
        template?: string;
        /** Inline prompt text (mutually exclusive with template) */
        text?: string;
        /** System message override */
        systemMessage?: string;
        /** Variables for template interpolation */
        vars?: Record<string, unknown>;
    };
    /** Arbitrary metadata */
    metadata?: Record<string, unknown>;
}
|
|
138
|
+
/**
 * Literacy mode — documentation quality evaluation.
 *
 * The original AILF evaluation mode. Tests whether AI coding tools
 * can find the right docs and produce correct code.
 */
export interface LiteracyTaskDefinition extends TaskCommonFields {
    /** Discriminant tag selecting this variant */
    mode: "literacy";
    /** Documentation context configuration */
    context?: {
        /** Canonical doc references for this task */
        docs?: GeneralizedDocRef[];
        /** Fixture references */
        fixtures?: string[];
    };
    /** Path to the reference solution (relative to eval package root) */
    referenceSolution?: string;
    /** Whether doc coverage rubric should be auto-generated */
    docCoverage?: boolean;
    /** Baseline variant configuration */
    baseline?: {
        /** Whether the baseline variant is enabled */
        enabled?: boolean;
        /** Which rubric form the baseline uses */
        rubric?: "abbreviated" | "full" | "none";
    };
}
|
|
163
|
+
/**
 * MCP server mode — evaluates an MCP server's capabilities.
 *
 * Tests whether an MCP server correctly implements tools, resources,
 * and prompts according to the MCP specification.
 */
export interface MCPServerTaskDefinition extends TaskCommonFields {
    /** Discriminant tag selecting this variant */
    mode: "mcp-server";
    /**
     * Server under test.
     *
     * `command` and `url` are both optional regardless of `transport`; the
     * type does not tie either one to a particular transport.
     */
    serverConfig?: {
        /** Transport protocol */
        transport: "stdio" | "sse" | "streamable-http";
        /** Command to start the server (stdio) */
        command?: string;
        /** Server URL (sse/streamable-http) */
        url?: string;
        /** Environment variables for the server process */
        env?: Record<string, string>;
        /** Startup timeout in milliseconds */
        startupTimeoutMs?: number;
        /**
         * Authentication config for the MCP server.
         * Maps directly to Promptfoo's auth config.
         *
         * Only `type` is required; which of the remaining fields apply
         * depends on `type`, and the type does not enforce that pairing.
         *
         * @see https://www.promptfoo.dev/docs/providers/mcp/#authentication
         */
        auth?: {
            type: "bearer" | "basic" | "api_key" | "oauth";
            /** Bearer token (for type: "bearer") */
            token?: string;
            /** Username (for type: "basic" or "oauth" password grant) */
            username?: string;
            /** Password (for type: "basic" or "oauth" password grant) */
            password?: string;
            /** API key value (for type: "api_key") */
            value?: string;
            /** Header/query param name for API key (default: "X-API-Key") */
            keyName?: string;
            /** "header" or "query" (for type: "api_key", default: "header") */
            placement?: "header" | "query";
            /** OAuth grant type */
            grantType?: "client_credentials" | "password";
            /** OAuth token endpoint URL */
            tokenUrl?: string;
            /** OAuth client ID */
            clientId?: string;
            /** OAuth client secret */
            clientSecret?: string;
            /** OAuth scopes */
            scopes?: string[];
        };
    };
    /** Expected capabilities to verify */
    capabilities?: string[];
    /** Documentation context (MCP servers can also be evaluated against docs) */
    context?: {
        /** Canonical doc references */
        docs?: GeneralizedDocRef[];
        /** Tool names */
        tools?: string[];
        /** Fixture references */
        fixtures?: string[];
    };
    /** Multi-turn conversation definition */
    multiTurn?: {
        /** Conversation turns in order, each with a role and text content */
        turns: {
            role: "user" | "assistant";
            content: string;
        }[];
    };
}
|
|
230
|
+
/**
 * Agent harness mode — evaluates autonomous agents in a sandbox.
 *
 * Tests whether an AI agent can complete tasks correctly using a
 * defined set of tools in an isolated environment.
 */
export interface AgentHarnessTaskDefinition extends TaskCommonFields {
    /** Discriminant tag selecting this variant */
    mode: "agent-harness";
    /** Sandbox configuration for isolated execution */
    sandbox?: {
        /** Isolation mechanism */
        type: "docker" | "git-worktree" | "none" | "nsjail" | "tempdir";
        /** Image name — presumably only meaningful for "docker"; confirm */
        image?: string;
        /** Optional resource limits */
        limits?: {
            cpus?: number;
            memoryBytes?: number;
            diskBytes?: number;
            networkAccess?: boolean;
        };
    };
    /** Tools available to the agent */
    tools?: string[];
    /**
     * Fixture references for test data.
     *
     * NOTE(review): `context.fixtures` also exists on this variant; how the
     * two lists relate is not visible from this declaration.
     */
    fixtures?: string[];
    /** Documentation context */
    context?: {
        /** Canonical doc references */
        docs?: GeneralizedDocRef[];
        /** Fixture references */
        fixtures?: string[];
    };
    /** Multi-turn conversation definition */
    multiTurn?: {
        /** Conversation turns in order, each with a role and text content */
        turns: {
            role: "user" | "assistant";
            content: string;
        }[];
    };
}
|
|
266
|
+
/**
 * Knowledge probe mode — measures knowledge coverage across a corpus.
 *
 * Tests breadth and depth of an LLM's knowledge about a specific
 * domain by systematically probing different aspects.
 */
export interface KnowledgeProbeTaskDefinition extends TaskCommonFields {
    /** Discriminant tag selecting this variant */
    mode: "knowledge-probe";
    /** How to traverse the knowledge space */
    probeStrategy?: "breadth-first" | "depth-first" | "random-sample" | "coverage-guided";
    /** Knowledge base reference */
    knowledgeBase?: {
        /** Kind of backing store */
        type: "sanity-dataset" | "embeddings-index" | "file-corpus";
        /** Name of the knowledge base */
        name: string;
        /** Free-form configuration for the knowledge base */
        config?: Record<string, unknown>;
    };
    /** Documentation context */
    context?: {
        /** Canonical doc references */
        docs?: GeneralizedDocRef[];
        /** Fixture references */
        fixtures?: string[];
    };
}
|
|
288
|
+
/**
 * Custom mode — user-provided evaluation handler.
 *
 * The escape hatch for evaluation types not covered by the built-in
 * modes. Users provide a handler module implementing `EvalModeHandler`.
 * Unlike the other variants, this one has a required mode-specific
 * field: `handler`.
 */
export interface CustomTaskDefinition extends TaskCommonFields {
    /** Discriminant tag selecting this variant */
    mode: "custom";
    /** Module path to the custom handler */
    handler: string;
    /** Freeform schema for mode-specific configuration */
    schema?: Record<string, unknown>;
    /** Documentation context */
    context?: {
        /** Canonical doc references */
        docs?: GeneralizedDocRef[];
        /** Fixture references */
        fixtures?: string[];
    };
}
|
|
306
|
+
/**
 * Generalized task definition — a mode-discriminated union.
 *
 * The `mode` field determines which fields are available in addition to
 * the common ones:
 * - `"literacy"` → context, referenceSolution, docCoverage, baseline
 * - `"mcp-server"` → serverConfig, capabilities, context, multiTurn
 * - `"agent-harness"` → sandbox, tools, fixtures, context, multiTurn
 * - `"knowledge-probe"` → probeStrategy, knowledgeBase, context
 * - `"custom"` → handler, schema, context
 *
 * Use `defineTask()` from `@sanity/ailf` for full IDE autocomplete
 * when authoring tasks.
 */
export type GeneralizedTaskDefinition =
    | LiteracyTaskDefinition
    | MCPServerTaskDefinition
    | AgentHarnessTaskDefinition
    | KnowledgeProbeTaskDefinition
    | CustomTaskDefinition;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GeneralizedTaskDefinition — Mode-discriminated task definitions.
|
|
3
|
+
*
|
|
4
|
+
* This is the canonical task type for the architecture overhaul. It supports
|
|
5
|
+
* all five evaluation modes via a discriminated union on the `mode`
|
|
6
|
+
* field. Common fields (id, title, description, area, etc.) are shared;
|
|
7
|
+
* mode-specific fields live only on the relevant variant.
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/architecture-overhaul/domain-model.md (canonical)
|
|
10
|
+
* @see docs/design-docs/architecture-overhaul/test-definition.md (authoring surfaces)
|
|
11
|
+
* @see docs/exec-plans/architecture-overhaul/phase-0-foundation-types.md (task 0h)
|
|
12
|
+
*/
|
|
13
|
+
// Empty export list: this file is an ES module that exports no runtime bindings.
export {};
|