@sanity/ailf 0.5.0 → 1.0.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/config/features.ts +23 -0
- package/config/models.ts +83 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
- package/dist/_vendor/ailf-core/config-helpers.js +150 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +38 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +133 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
- package/dist/adapters/task-sources/index.d.ts +1 -0
- package/dist/adapters/task-sources/index.js +1 -0
- package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
- package/dist/adapters/task-sources/repo-task-source.js +69 -16
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +7 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/composition-root.d.ts +1 -1
- package/dist/composition-root.js +67 -4
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +24 -6
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +6 -4
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +245 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +6 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
- package/dist/pipeline/mirror-repo-tasks.js +16 -15
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +6 -3
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -23,6 +23,7 @@ import { TASK_FILE_NAMES } from "../_vendor/ailf-core/index.js";
|
|
|
23
23
|
import { buildPipelinePlan, buildSimpleCommandPlan, } from "../pipeline/plan.js";
|
|
24
24
|
import { formatPlanConsole, formatPlanJson } from "../pipeline/plan-format.js";
|
|
25
25
|
import { computeResolvedOptions } from "./pipeline-action.js";
|
|
26
|
+
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
26
27
|
// ---------------------------------------------------------------------------
|
|
27
28
|
// Registry
|
|
28
29
|
// ---------------------------------------------------------------------------
|
|
@@ -84,8 +85,8 @@ const EXPLAIN_REGISTRY = {
|
|
|
84
85
|
filesCreated: ["results/latest/score-summary.json"],
|
|
85
86
|
filesRead: [
|
|
86
87
|
"results/latest/eval-results.json",
|
|
87
|
-
"config/rubrics.yaml",
|
|
88
|
-
"config/models.yaml",
|
|
88
|
+
"config/rubrics.ts",
|
|
89
|
+
"config/models.ts",
|
|
89
90
|
],
|
|
90
91
|
steps: [
|
|
91
92
|
{
|
|
@@ -138,12 +139,12 @@ const EXPLAIN_REGISTRY = {
|
|
|
138
139
|
},
|
|
139
140
|
"coverage-audit": {
|
|
140
141
|
description: "Cross-reference feature registry against evaluation tasks for coverage gaps",
|
|
141
|
-
filesRead: ["config/features.
|
|
142
|
+
filesRead: ["config/features.ts", "tasks/*.{yaml,task.ts,task.js}"],
|
|
142
143
|
steps: [
|
|
143
144
|
{
|
|
144
145
|
cacheStatus: "miss",
|
|
145
146
|
name: "Load feature registry",
|
|
146
|
-
reason: "Parse config/features.
|
|
147
|
+
reason: "Parse config/features.ts for product feature list",
|
|
147
148
|
willRun: true,
|
|
148
149
|
},
|
|
149
150
|
{
|
|
@@ -201,7 +202,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
201
202
|
"fetch-docs": {
|
|
202
203
|
description: "Fetch documentation from Sanity CMS and generate canonical context files",
|
|
203
204
|
filesCreated: ["contexts/canonical/*.md"],
|
|
204
|
-
filesRead: ["config/sources.
|
|
205
|
+
filesRead: ["config/sources.ts", "config/models.ts"],
|
|
205
206
|
steps: [
|
|
206
207
|
{
|
|
207
208
|
cacheStatus: "miss",
|
|
@@ -224,7 +225,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
224
225
|
],
|
|
225
226
|
},
|
|
226
227
|
"generate-configs": {
|
|
227
|
-
description: "Generate Promptfoo config files from models.
|
|
228
|
+
description: "Generate Promptfoo config files from models.ts and task definitions",
|
|
228
229
|
filesCreated: [
|
|
229
230
|
"promptfooconfig.yaml",
|
|
230
231
|
"promptfooconfig.observed.yaml",
|
|
@@ -232,16 +233,16 @@ const EXPLAIN_REGISTRY = {
|
|
|
232
233
|
"tasks/.expanded.yaml",
|
|
233
234
|
],
|
|
234
235
|
filesRead: [
|
|
235
|
-
"config/models.yaml",
|
|
236
|
-
"config/prompts.yaml",
|
|
237
|
-
"config/rubrics.yaml",
|
|
238
|
-
"config/sources.yaml",
|
|
236
|
+
"config/models.ts",
|
|
237
|
+
"config/prompts.ts",
|
|
238
|
+
"config/rubrics.ts",
|
|
239
|
+
"config/sources.ts",
|
|
239
240
|
],
|
|
240
241
|
steps: [
|
|
241
242
|
{
|
|
242
243
|
cacheStatus: "miss",
|
|
243
244
|
name: "Load models",
|
|
244
|
-
reason: "Parse config/models.
|
|
245
|
+
reason: "Parse config/models.ts for active model list",
|
|
245
246
|
willRun: true,
|
|
246
247
|
},
|
|
247
248
|
{
|
|
@@ -262,7 +263,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
262
263
|
description: "Grader reliability tools (consistency, compare, sensitivity, validate)",
|
|
263
264
|
filesRead: [
|
|
264
265
|
"results/latest/eval-results.json",
|
|
265
|
-
"config/rubrics.yaml",
|
|
266
|
+
"config/rubrics.ts",
|
|
266
267
|
"canonical/reference-solutions/",
|
|
267
268
|
],
|
|
268
269
|
steps: [
|
|
@@ -369,7 +370,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
369
370
|
filesRead: [
|
|
370
371
|
"results/latest/score-summary.json",
|
|
371
372
|
"results/latest/gap-analysis.json",
|
|
372
|
-
"config/thresholds.yaml",
|
|
373
|
+
"config/thresholds.ts",
|
|
373
374
|
"results/baselines/",
|
|
374
375
|
],
|
|
375
376
|
filesCreated: ["results/latest/readiness-report.md"],
|
|
@@ -377,7 +378,7 @@ const EXPLAIN_REGISTRY = {
|
|
|
377
378
|
{
|
|
378
379
|
cacheStatus: "miss",
|
|
379
380
|
name: "Load scores + thresholds",
|
|
380
|
-
reason: "Read score-summary.json and thresholds.
|
|
381
|
+
reason: "Read score-summary.json and thresholds.ts for gate evaluation",
|
|
381
382
|
willRun: true,
|
|
382
383
|
},
|
|
383
384
|
{
|
|
@@ -395,18 +396,18 @@ const EXPLAIN_REGISTRY = {
|
|
|
395
396
|
],
|
|
396
397
|
},
|
|
397
398
|
validate: {
|
|
398
|
-
description: "Validate all
|
|
399
|
+
description: "Validate all config files, task definitions, reference solutions, and environment",
|
|
399
400
|
filesRead: [
|
|
400
|
-
"config/models.yaml",
|
|
401
|
-
"config/rubrics.yaml",
|
|
402
|
-
"config/features.yaml",
|
|
403
|
-
"config/thresholds.yaml",
|
|
401
|
+
"config/models.ts",
|
|
402
|
+
"config/rubrics.ts",
|
|
403
|
+
"config/features.ts",
|
|
404
|
+
"config/thresholds.ts",
|
|
404
405
|
],
|
|
405
406
|
steps: [
|
|
406
407
|
{
|
|
407
408
|
cacheStatus: "miss",
|
|
408
409
|
name: "Validate configuration",
|
|
409
|
-
reason: "Parse all
|
|
410
|
+
reason: "Parse all config files through Zod schemas, cross-reference mappings",
|
|
410
411
|
willRun: true,
|
|
411
412
|
},
|
|
412
413
|
{
|
|
@@ -454,12 +455,12 @@ const EXPLAIN_REGISTRY = {
|
|
|
454
455
|
},
|
|
455
456
|
"weekly-digest": {
|
|
456
457
|
description: "Generate and deliver a weekly evaluation trend digest via Slack",
|
|
457
|
-
filesRead: ["config/schedules.
|
|
458
|
+
filesRead: ["config/schedules.ts", "config/sinks.ts"],
|
|
458
459
|
steps: [
|
|
459
460
|
{
|
|
460
461
|
cacheStatus: "miss",
|
|
461
462
|
name: "Load digest config",
|
|
462
|
-
reason: "Read schedules.
|
|
463
|
+
reason: "Read schedules.ts for lookback window and delivery targets",
|
|
463
464
|
willRun: true,
|
|
464
465
|
},
|
|
465
466
|
{
|
|
@@ -670,7 +671,7 @@ async function buildPipelineExplainPlan(actionCommand, rootDir) {
|
|
|
670
671
|
graderReplications: raw.graderReplications,
|
|
671
672
|
header: raw.header ?? [],
|
|
672
673
|
headers: raw.headers ?? [],
|
|
673
|
-
mode: raw.mode ??
|
|
674
|
+
mode: raw.mode ?? LiteracyVariant.FULL,
|
|
674
675
|
output: raw.output,
|
|
675
676
|
promptfooUrl: raw.promptfooUrl,
|
|
676
677
|
publish: raw.publish,
|
|
@@ -714,6 +715,7 @@ async function buildPipelineExplainPlan(actionCommand, rootDir) {
|
|
|
714
715
|
gapAnalysisEnabled: resolved.gapAnalysisEnabled,
|
|
715
716
|
graderReplications: resolved.graderReplications,
|
|
716
717
|
mode: resolved.mode,
|
|
718
|
+
variant: resolved.variant,
|
|
717
719
|
noCache: resolved.noCache,
|
|
718
720
|
publishEnabled: resolved.publishEnabled,
|
|
719
721
|
readinessEnabled: resolved.readinessEnabled,
|
|
@@ -41,7 +41,7 @@ async function executeFetchDocs(opts) {
|
|
|
41
41
|
// Build a minimal ResolvedConfig for the composition root
|
|
42
42
|
const ctx = createAppContext({
|
|
43
43
|
rootDir: ROOT,
|
|
44
|
-
mode: "
|
|
44
|
+
mode: "literacy",
|
|
45
45
|
noAutoScope: false,
|
|
46
46
|
skipFetch: false,
|
|
47
47
|
skipEval: true,
|
|
@@ -83,7 +83,8 @@ async function executeFetchDocs(opts) {
|
|
|
83
83
|
}
|
|
84
84
|
// Canonical contexts — same code path as the pipeline
|
|
85
85
|
const tasks = await ctx.taskSource.loadTasks();
|
|
86
|
-
|
|
86
|
+
// Bridge: narrow to literacy tasks with docs (only literacy tasks have context.docs)
|
|
87
|
+
const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
|
|
87
88
|
if (tasksWithDocs.length > 0) {
|
|
88
89
|
console.log("\nGenerating canonical (gold-retrieval) contexts...\n");
|
|
89
90
|
const result = await fetcher.fetch(tasksWithDocs, resolvedSource);
|
|
@@ -9,6 +9,10 @@
|
|
|
9
9
|
* Uses @inquirer/prompts for a clean, modern terminal UI.
|
|
10
10
|
*/
|
|
11
11
|
import { Command } from "commander";
|
|
12
|
+
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
13
|
+
// CLI command name for the baseline snapshot management subcommand.
|
|
14
|
+
// Defined as a constant to avoid scattering the literal string across routing code.
|
|
15
|
+
const BASELINE_CMD = "baseline";
|
|
12
16
|
export function createInteractiveCommand() {
|
|
13
17
|
return new Command("interactive")
|
|
14
18
|
.description("Guided wizard for common evaluation workflows")
|
|
@@ -65,7 +69,7 @@ async function runInteractiveWizard() {
|
|
|
65
69
|
{
|
|
66
70
|
description: "Save, compare, or list historical score snapshots",
|
|
67
71
|
name: "Manage baselines",
|
|
68
|
-
value: "baseline",
|
|
72
|
+
value: BASELINE_CMD,
|
|
69
73
|
},
|
|
70
74
|
{
|
|
71
75
|
description: "Weekly evaluation trends and area summaries",
|
|
@@ -93,7 +97,7 @@ async function runInteractiveWizard() {
|
|
|
93
97
|
});
|
|
94
98
|
return { args: dryRun ? ["--dry-run"] : [], command: "weekly-digest" };
|
|
95
99
|
}
|
|
96
|
-
if (workflow === "baseline") {
|
|
100
|
+
if (workflow === BASELINE_CMD) {
|
|
97
101
|
const subcommand = await select({
|
|
98
102
|
choices: [
|
|
99
103
|
{ name: "Save current scores", value: "save" },
|
|
@@ -102,7 +106,7 @@ async function runInteractiveWizard() {
|
|
|
102
106
|
],
|
|
103
107
|
message: "Baseline operation:",
|
|
104
108
|
});
|
|
105
|
-
return { args: [subcommand], command: "baseline" };
|
|
109
|
+
return { args: [subcommand], command: BASELINE_CMD };
|
|
106
110
|
}
|
|
107
111
|
if (workflow === "grader") {
|
|
108
112
|
const subcommand = await select({
|
|
@@ -140,22 +144,22 @@ async function runInteractiveWizard() {
|
|
|
140
144
|
{
|
|
141
145
|
description: "Evaluate with pre-fetched documentation context",
|
|
142
146
|
name: "Baseline (with docs vs without docs)",
|
|
143
|
-
value:
|
|
147
|
+
value: LiteracyVariant.STANDARD,
|
|
144
148
|
},
|
|
145
149
|
{
|
|
146
150
|
description: "Baseline + record HTTP request patterns",
|
|
147
151
|
name: "Observed (instrumented)",
|
|
148
|
-
value:
|
|
152
|
+
value: LiteracyVariant.OBSERVED,
|
|
149
153
|
},
|
|
150
154
|
{
|
|
151
155
|
description: "Agent searches for docs itself via web tools",
|
|
152
156
|
name: "Agentic (agent-driven retrieval)",
|
|
153
|
-
value:
|
|
157
|
+
value: LiteracyVariant.AGENTIC,
|
|
154
158
|
},
|
|
155
159
|
],
|
|
156
160
|
message: "Evaluation mode:",
|
|
157
161
|
});
|
|
158
|
-
if (mode !==
|
|
162
|
+
if (mode !== LiteracyVariant.STANDARD) {
|
|
159
163
|
args.push("--mode", mode);
|
|
160
164
|
}
|
|
161
165
|
// Step 3: Area scoping
|
|
@@ -31,6 +31,8 @@ export interface ResolvedOptions {
|
|
|
31
31
|
headerArgs: string[];
|
|
32
32
|
impactSummary?: ImpactSummary;
|
|
33
33
|
mode: EvalMode;
|
|
34
|
+
/** Literacy variant — set when the user passes a legacy mode name */
|
|
35
|
+
variant?: string;
|
|
34
36
|
noAutoScope: boolean;
|
|
35
37
|
noCache: boolean;
|
|
36
38
|
noRemoteCache: boolean;
|
|
@@ -14,6 +14,7 @@ import { existsSync, readFileSync, writeFileSync } from "fs";
|
|
|
14
14
|
import { dirname, resolve } from "path";
|
|
15
15
|
import { fileURLToPath } from "url";
|
|
16
16
|
import { classifyUrls } from "../pipeline/classify-url.js";
|
|
17
|
+
import { normalizeMode } from "../pipeline/normalize-mode.js";
|
|
17
18
|
import { assessImpact, buildReverseMapping, } from "../pipeline/reverse-mapping.js";
|
|
18
19
|
import { buildAppContext } from "../orchestration/build-app-context.js";
|
|
19
20
|
import { buildStepSequence } from "../orchestration/build-step-sequence.js";
|
|
@@ -23,9 +24,8 @@ import { parseRepoConfig, } from "../adapters/task-sources/repo-schemas.js";
|
|
|
23
24
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
24
25
|
const ROOT = resolve(__dirname, "..", "..");
|
|
25
26
|
// ---------------------------------------------------------------------------
|
|
26
|
-
// Valid
|
|
27
|
+
// Valid search modes
|
|
27
28
|
// ---------------------------------------------------------------------------
|
|
28
|
-
const VALID_MODES = ["baseline", "observed", "agentic", "full"];
|
|
29
29
|
const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
|
|
30
30
|
/**
|
|
31
31
|
* Pure option resolution — computes ResolvedOptions from CLI flags without
|
|
@@ -36,10 +36,19 @@ const VALID_SEARCH_MODES = ["open", "origin-only", "off"];
|
|
|
36
36
|
export function computeResolvedOptions(opts) {
|
|
37
37
|
// Resolve paths relative to the caller's cwd, not the eval package root
|
|
38
38
|
const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
|
|
39
|
-
// Validate mode
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
39
|
+
// Validate + normalize mode via the single boundary function.
|
|
40
|
+
// normalizeMode() maps legacy variant names (baseline, agentic, etc.)
|
|
41
|
+
// to canonical mode "literacy" + variant, and throws on invalid input.
|
|
42
|
+
let mode;
|
|
43
|
+
let variant;
|
|
44
|
+
try {
|
|
45
|
+
const normalized = normalizeMode(opts.mode);
|
|
46
|
+
mode = normalized.mode;
|
|
47
|
+
// Explicit --variant flag takes precedence over what normalizeMode inferred
|
|
48
|
+
variant = opts.variant ?? normalized.variant;
|
|
49
|
+
}
|
|
50
|
+
catch (err) {
|
|
51
|
+
console.error(`❌ ${err instanceof Error ? err.message : String(err)}`);
|
|
43
52
|
process.exit(1);
|
|
44
53
|
}
|
|
45
54
|
// Debug options — any sub-flag (--debug-n, --debug-pattern, --debug-sample)
|
|
@@ -220,6 +229,7 @@ export function computeResolvedOptions(opts) {
|
|
|
220
229
|
headerArgs,
|
|
221
230
|
impactSummary,
|
|
222
231
|
mode,
|
|
232
|
+
variant,
|
|
223
233
|
noAutoScope: opts.autoScope === false,
|
|
224
234
|
noCache: !opts.cache,
|
|
225
235
|
noRemoteCache: opts.remoteCache === false,
|
|
@@ -8,11 +8,13 @@
|
|
|
8
8
|
* @see docs/CLI.md for the full flag reference.
|
|
9
9
|
*/
|
|
10
10
|
import { Command } from "commander";
|
|
11
|
+
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
11
12
|
import { addAgenticOptions, addDebugOptions, addSanitySourceOptions, } from "./shared/options.js";
|
|
12
13
|
export function createPipelineCommand() {
|
|
13
14
|
const cmd = new Command("pipeline")
|
|
14
15
|
.description("Run the full evaluation pipeline")
|
|
15
|
-
.option("-m, --mode <mode>", "Evaluation mode:
|
|
16
|
+
.option("-m, --mode <mode>", "Evaluation mode: literacy (default), mcp-server, agent-harness, knowledge-probe, custom. Legacy aliases (baseline, agentic, observed, full) are accepted and normalized to literacy + variant.", LiteracyVariant.FULL)
|
|
17
|
+
.option("--variant <variant>", "Literacy variant: full (default — standard + agentic), baseline (standard only), agentic (agentic only), observed. Only applies to --mode literacy.")
|
|
16
18
|
.option("-s, --source <name>", "Documentation source name (from sources.yaml)")
|
|
17
19
|
.option("-n, --dry-run", "Validate configuration only, no execution", false)
|
|
18
20
|
.option("--skip-fetch", "Reuse cached documentation contexts", false)
|
|
@@ -44,7 +46,7 @@ export function createPipelineCommand() {
|
|
|
44
46
|
.option("--publish-tag <tag>", "Label for published report")
|
|
45
47
|
.option("--report-dataset <name>", "Sanity dataset for report store")
|
|
46
48
|
.option("--report-project <id>", "Sanity project ID for report store")
|
|
47
|
-
.option("--config <path>", "Load pipeline config from a
|
|
49
|
+
.option("--config <path>", "Load pipeline config from a TS/JS/YAML/JSON file (overrides most CLI flags)")
|
|
48
50
|
.option("-o, --output <path>", "Write PR comment markdown to file")
|
|
49
51
|
.option("--promptfoo-url <url>", "Promptfoo share URL for report")
|
|
50
52
|
.option("--task-source <type>", "Task definition source: content-lake (default — Sanity Content Lake), repo (repo tasks only, no Content Lake merge), yaml (tasks/*.yaml files, legacy)", "content-lake")
|
package/dist/commands/publish.js
CHANGED
|
@@ -52,7 +52,7 @@ export function createPublishCommand() {
|
|
|
52
52
|
*/
|
|
53
53
|
function buildProvenanceFromSummary(summary) {
|
|
54
54
|
const areas = summary.scores.map((s) => s.feature);
|
|
55
|
-
const mode = (process.env.EVAL_MODE ?? "
|
|
55
|
+
const mode = (process.env.EVAL_MODE ?? "literacy");
|
|
56
56
|
const source = {
|
|
57
57
|
baseUrl: summary.source?.baseUrl ?? "https://www.sanity.io/docs",
|
|
58
58
|
dataset: summary.source?.dataset ?? process.env.SANITY_DATASET ?? "next",
|
|
@@ -83,7 +83,7 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
83
83
|
compareEnabled: false,
|
|
84
84
|
discoveryReportEnabled: false,
|
|
85
85
|
gapAnalysisEnabled: false,
|
|
86
|
-
mode: "
|
|
86
|
+
mode: "literacy",
|
|
87
87
|
noAutoScope: false,
|
|
88
88
|
noCache: true,
|
|
89
89
|
noRemoteCache: true,
|
|
@@ -10,14 +10,14 @@ import { Command } from "commander";
|
|
|
10
10
|
import { existsSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
11
11
|
import { dirname, join, resolve } from "path";
|
|
12
12
|
import { fileURLToPath } from "url";
|
|
13
|
-
import {
|
|
13
|
+
import { ConfigNotFoundError, loadConfigFile, } from "../pipeline/compiler/config-loader.js";
|
|
14
14
|
import { formatReadinessMarkdown, generateReadinessReport, } from "../pipeline/readiness-report.js";
|
|
15
15
|
import { ThresholdConfigSchema, } from "../pipeline/schemas.js";
|
|
16
16
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
17
17
|
const ROOT = resolve(__dirname, "..", "..");
|
|
18
18
|
const SCORE_SUMMARY_PATH = join(ROOT, "results", "latest", "score-summary.json");
|
|
19
19
|
const GAP_ANALYSIS_PATH = join(ROOT, "results", "latest", "gap-analysis.json");
|
|
20
|
-
|
|
20
|
+
// Thresholds are no longer read from a fixed file path; they are loaded via loadConfigFile("thresholds", ROOT) below.
|
|
21
21
|
const BASELINES_DIR = join(ROOT, "results", "baselines");
|
|
22
22
|
export function createReadinessReportCommand() {
|
|
23
23
|
return new Command("readiness-report")
|
|
@@ -33,12 +33,19 @@ export function createReadinessReportCommand() {
|
|
|
33
33
|
}
|
|
34
34
|
const scoreSummary = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
|
|
35
35
|
// Load threshold config
|
|
36
|
-
|
|
37
|
-
|
|
36
|
+
let parsedThresholds;
|
|
37
|
+
try {
|
|
38
|
+
parsedThresholds = loadConfigFile("thresholds", ROOT).data;
|
|
39
|
+
}
|
|
40
|
+
catch (err) {
|
|
41
|
+
if (err instanceof ConfigNotFoundError) {
|
|
42
|
+
console.error("❌ Threshold config not found in config/.");
|
|
43
|
+
}
|
|
44
|
+
else {
|
|
45
|
+
console.error(`❌ Failed to load threshold config: ${err instanceof Error ? err.message : err}`);
|
|
46
|
+
}
|
|
38
47
|
process.exit(1);
|
|
39
48
|
}
|
|
40
|
-
const rawThresholds = readFileSync(THRESHOLDS_PATH, "utf-8");
|
|
41
|
-
const parsedThresholds = load(rawThresholds);
|
|
42
49
|
const thresholdResult = ThresholdConfigSchema.safeParse(parsedThresholds);
|
|
43
50
|
if (!thresholdResult.success) {
|
|
44
51
|
const messages = thresholdResult.error.issues
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* @see packages/core/src/ports/context.ts — AppContext interface
|
|
16
16
|
* @see docs/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
|
-
import type
|
|
18
|
+
import { type AppContext, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
|
|
19
19
|
/**
|
|
20
20
|
* Create a fully wired AppContext from resolved configuration.
|
|
21
21
|
*
|
package/dist/composition-root.js
CHANGED
|
@@ -15,12 +15,13 @@
|
|
|
15
15
|
* @see packages/core/src/ports/context.ts — AppContext interface
|
|
16
16
|
* @see docs/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
|
+
import { InMemoryPluginRegistry, } from "./_vendor/ailf-core/index.js";
|
|
18
19
|
import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
|
|
19
20
|
import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
|
|
20
|
-
import { SanityDocFetcher } from "./adapters/doc-fetchers/index.js";
|
|
21
21
|
import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
|
|
22
22
|
import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
|
|
23
23
|
import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, YamlTaskSource, } from "./adapters/task-sources/index.js";
|
|
24
|
+
import { createSanityLiteracyPreset } from "./pipeline/compiler/presets/index.js";
|
|
24
25
|
import { getSanityClient } from "./sanity/client.js";
|
|
25
26
|
import { ReportStore } from "./report-store.js";
|
|
26
27
|
import { loadSinks } from "./sinks/index.js";
|
|
@@ -38,13 +39,18 @@ export function createAppContext(config) {
|
|
|
38
39
|
const cache = config.noCache ? undefined : createCache(config);
|
|
39
40
|
// Task source — selected by config.taskSourceType
|
|
40
41
|
const taskSource = createTaskSource(config);
|
|
41
|
-
//
|
|
42
|
-
|
|
42
|
+
// Plugin registry — mode handlers, assertions, rubric templates, doc fetcher.
|
|
43
|
+
// The Sanity preset is registered here with config.rootDir so its doc fetcher
|
|
44
|
+
// factory resolves paths relative to the eval package root (not cwd).
|
|
45
|
+
const registry = createRegistry(config.rootDir);
|
|
46
|
+
// Doc fetcher — created by the factory registered in the registry; undefined when no factory is registered
|
|
47
|
+
const docFetcherFactory = registry.getDocFetcherFactory();
|
|
48
|
+
const docFetcher = docFetcherFactory ? docFetcherFactory() : undefined;
|
|
43
49
|
// Eval runner — Promptfoo subprocess
|
|
44
50
|
const evalRunner = new PromptfooEvalAdapter(config.rootDir);
|
|
45
51
|
// Report store — Sanity Content Lake (for publish + auto-compare)
|
|
46
52
|
const reportStore = createReportStore(config);
|
|
47
|
-
// Sinks — loaded from config/sinks
|
|
53
|
+
// Sinks — loaded from config/sinks
|
|
48
54
|
const sinks = loadSinks();
|
|
49
55
|
return {
|
|
50
56
|
cache,
|
|
@@ -52,6 +58,7 @@ export function createAppContext(config) {
|
|
|
52
58
|
docFetcher,
|
|
53
59
|
evalRunner,
|
|
54
60
|
logger,
|
|
61
|
+
registry,
|
|
55
62
|
reportStore,
|
|
56
63
|
sinks,
|
|
57
64
|
taskSource,
|
|
@@ -113,6 +120,62 @@ function createTaskSource(config) {
|
|
|
113
120
|
}
|
|
114
121
|
return primary;
|
|
115
122
|
}
|
|
123
|
+
// ---------------------------------------------------------------------------
|
|
124
|
+
// Built-in mode registrations for non-literacy modes
|
|
125
|
+
// ---------------------------------------------------------------------------
|
|
126
|
+
const BUILT_IN_MODES = [
|
|
127
|
+
{
|
|
128
|
+
id: "knowledge-probe",
|
|
129
|
+
label: "Knowledge Probe",
|
|
130
|
+
validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
|
|
131
|
+
rubricTemplateIds: [],
|
|
132
|
+
handlerModule: "./mode-handlers/knowledge-probe-handler.js",
|
|
133
|
+
},
|
|
134
|
+
{
|
|
135
|
+
id: "mcp-server",
|
|
136
|
+
label: "MCP Server Testing",
|
|
137
|
+
validProviderPatterns: ["^mcp:", "^file://"],
|
|
138
|
+
rubricTemplateIds: [
|
|
139
|
+
"mcp-input-validation",
|
|
140
|
+
"mcp-output-correctness",
|
|
141
|
+
"mcp-error-handling",
|
|
142
|
+
],
|
|
143
|
+
handlerModule: "./mode-handlers/mcp-server-handler.js",
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
id: "agent-harness",
|
|
147
|
+
label: "Agent Harness",
|
|
148
|
+
validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
|
|
149
|
+
rubricTemplateIds: [],
|
|
150
|
+
handlerModule: "./mode-handlers/agent-harness-handler.js",
|
|
151
|
+
},
|
|
152
|
+
];
|
|
153
|
+
/**
|
|
154
|
+
* Build and populate the plugin registry.
|
|
155
|
+
*
|
|
156
|
+
* Preset registration flow:
|
|
157
|
+
* 1. A preset is a PresetDefinition — a bundle of modes, assertions, rubric
|
|
158
|
+
* templates, prompt templates, scoring profiles, a doc fetcher factory,
|
|
159
|
+
* source definitions, and feature definitions.
|
|
160
|
+
* 2. registerPreset() iterates the preset's fields and delegates each one to
|
|
161
|
+
* the appropriate register method (registerMode, registerRubricTemplate, …).
|
|
162
|
+
* 3. After registration the rest of createAppContext() can pull capabilities
|
|
163
|
+
* from the registry (e.g. getDocFetcherFactory()) without knowing which
|
|
164
|
+
* preset provided them.
|
|
165
|
+
*
|
|
166
|
+
* To add a new preset: create a PresetDefinition, then call
|
|
167
|
+
* registry.registerPreset() here before the built-in mode registrations.
|
|
168
|
+
*/
|
|
169
|
+
function createRegistry(rootDir) {
|
|
170
|
+
const registry = new InMemoryPluginRegistry();
|
|
171
|
+
// Register the sanity-literacy preset — the Sanity-specific evaluation bundle.
|
|
172
|
+
registry.registerPreset(createSanityLiteracyPreset({ rootDir }));
|
|
173
|
+
// Register other built-in modes (not part of any preset yet)
|
|
174
|
+
for (const mode of BUILT_IN_MODES) {
|
|
175
|
+
registry.registerMode(mode);
|
|
176
|
+
}
|
|
177
|
+
return registry;
|
|
178
|
+
}
|
|
116
179
|
function createReportStore(config) {
|
|
117
180
|
return new ReportStore({
|
|
118
181
|
dataset: process.env.AILF_REPORT_DATASET ??
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* PipelineStep objects determined by config flags like skipFetch,
|
|
6
6
|
* skipEval, compareEnabled, etc.
|
|
7
7
|
*/
|
|
8
|
-
import {
|
|
8
|
+
import { LiteracyVariant } from "../pipeline/normalize-mode.js";
|
|
9
9
|
import { CallbackStep } from "./steps/callback-step.js";
|
|
10
10
|
import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
|
|
11
11
|
import { CompareStep } from "./steps/compare-step.js";
|
|
@@ -40,11 +40,29 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
|
|
|
40
40
|
// Step 2: Generate Promptfoo configs
|
|
41
41
|
steps.push(new GenerateConfigsStep());
|
|
42
42
|
// Step 3: Run evaluation (steps handle --skip-eval internally)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
43
|
+
//
|
|
44
|
+
// For literacy mode, the variant determines how many eval steps run:
|
|
45
|
+
// "full" → baseline + agentic (two steps)
|
|
46
|
+
// "baseline" / "agentic" / "observed" → one step
|
|
47
|
+
// undefined → defaults to baseline (LiteracyVariant.STANDARD), one step
|
|
48
|
+
//
|
|
49
|
+
// For all other modes, one eval step per mode.
|
|
50
|
+
if (config.mode === "literacy") {
|
|
51
|
+
const variant = config.variant ?? LiteracyVariant.STANDARD;
|
|
52
|
+
if (variant === LiteracyVariant.FULL) {
|
|
53
|
+
for (const submode of [
|
|
54
|
+
LiteracyVariant.STANDARD,
|
|
55
|
+
LiteracyVariant.AGENTIC,
|
|
56
|
+
]) {
|
|
57
|
+
steps.push(new RunEvalStep(submode));
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
else {
|
|
61
|
+
steps.push(new RunEvalStep(variant));
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
else {
|
|
65
|
+
steps.push(new RunEvalStep(config.mode));
|
|
48
66
|
}
|
|
49
67
|
// Step 3c: Grader consistency (optional, conditional)
|
|
50
68
|
if (config.graderReplications) {
|