@sanity/ailf 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +51 -31
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +24 -24
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,565 @@
|
|
|
1
|
+
# .expanded.yaml
|
|
2
|
+
#
|
|
3
|
+
# AUTO-GENERATED by compiler pipeline — do not edit directly.
|
|
4
|
+
# Run: npx @sanity/ailf generate-configs
|
|
5
|
+
|
|
6
|
+
- description: GROQ - Blog queries with filtering and pagination (gold)
|
|
7
|
+
vars:
|
|
8
|
+
task: |-
|
|
9
|
+
Write GROQ queries for a Sanity blog application:
|
|
10
|
+
|
|
11
|
+
1. Fetch all published blog posts ordered by publishedAt descending,
|
|
12
|
+
with a projection that includes: _id, title, slug (from slug.current),
|
|
13
|
+
publishedAt, excerpt, and the author's name (resolved from a reference)
|
|
14
|
+
2. Add pagination to return only the first 10 results
|
|
15
|
+
3. Fetch a single post by its slug parameter, including the full body
|
|
16
|
+
content and resolved author and category references
|
|
17
|
+
4. Fetch posts published after a specific date
|
|
18
|
+
5. Fetch posts that belong to a specific category (where categories
|
|
19
|
+
is an array of references)
|
|
20
|
+
|
|
21
|
+
Use @sanity/client with client.fetch() for all queries. Include
|
|
22
|
+
TypeScript types for the query results.
|
|
23
|
+
docs: file://contexts/canonical/groq-blog-queries.md
|
|
24
|
+
__featureArea: groq
|
|
25
|
+
assert:
|
|
26
|
+
- type: llm-rubric
|
|
27
|
+
value: |-
|
|
28
|
+
Score task completion from 0 to 100:
|
|
29
|
+
- 0: Couldn't attempt — missing critical information
|
|
30
|
+
- 20: Attempted but fundamentally wrong approach
|
|
31
|
+
- 50: Partial implementation — major functional gaps
|
|
32
|
+
- 80: Mostly complete — minor issues or missing edge cases
|
|
33
|
+
- 100: Fully functional code — works as expected
|
|
34
|
+
|
|
35
|
+
Must demonstrate:
|
|
36
|
+
- GROQ filter with _type == "post"
|
|
37
|
+
- Projection with aliased slug field ("slug": slug.current)
|
|
38
|
+
- Reference resolution with -> for author
|
|
39
|
+
- Ordering with | order(publishedAt desc)
|
|
40
|
+
- Slice/pagination syntax [0...10] or [0..9]
|
|
41
|
+
- Parameterized query with $slug for single post fetch
|
|
42
|
+
- Date filtering with dateTime() or string comparison
|
|
43
|
+
- Category filtering using references or array contains
|
|
44
|
+
|
|
45
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
46
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
47
|
+
metadata:
|
|
48
|
+
dimension: task-completion
|
|
49
|
+
maxScore: 100
|
|
50
|
+
- type: llm-rubric
|
|
51
|
+
value: |-
|
|
52
|
+
Score code correctness from 0 to 100:
|
|
53
|
+
- 0: Broken code, syntax errors, or deprecated APIs
|
|
54
|
+
- 30: Works but uses anti-patterns or inefficient approaches
|
|
55
|
+
- 50: Works but not idiomatic
|
|
56
|
+
- 80: Follows most best practices
|
|
57
|
+
- 100: Follows all best practices, idiomatic implementation
|
|
58
|
+
|
|
59
|
+
Check for:
|
|
60
|
+
- Valid GROQ syntax (proper filter brackets, projection braces)
|
|
61
|
+
- Uses @sanity/client createClient + client.fetch()
|
|
62
|
+
- Correct parameter passing syntax ($param)
|
|
63
|
+
- Proper reference dereference with ->
|
|
64
|
+
- No deprecated patterns
|
|
65
|
+
|
|
66
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
67
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
68
|
+
metadata:
|
|
69
|
+
dimension: code-correctness
|
|
70
|
+
maxScore: 100
|
|
71
|
+
- type: contains-any
|
|
72
|
+
value:
|
|
73
|
+
- client.fetch
|
|
74
|
+
- createClient
|
|
75
|
+
weight: 1
|
|
76
|
+
- type: contains-any
|
|
77
|
+
value:
|
|
78
|
+
- order(publishedAt
|
|
79
|
+
- order(_createdAt
|
|
80
|
+
- '| order('
|
|
81
|
+
weight: 1
|
|
82
|
+
- type: contains-any
|
|
83
|
+
value:
|
|
84
|
+
- '[0...10]'
|
|
85
|
+
- '[0..9]'
|
|
86
|
+
- '[0...'
|
|
87
|
+
weight: 1
|
|
88
|
+
- type: llm-rubric
|
|
89
|
+
value: |-
|
|
90
|
+
Score documentation coverage from 0 to 100:
|
|
91
|
+
- 0: Had to hallucinate/guess most implementation details
|
|
92
|
+
- 30: Significant gaps — filled with assumptions
|
|
93
|
+
- 50: Some gaps — inferred from partial information
|
|
94
|
+
- 80: Minor gaps — almost everything was documented
|
|
95
|
+
- 100: Complete coverage — all necessary info was in docs
|
|
96
|
+
|
|
97
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
98
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
99
|
+
metadata:
|
|
100
|
+
dimension: doc-coverage
|
|
101
|
+
maxScore: 100
|
|
102
|
+
prompts:
|
|
103
|
+
- with-docs
|
|
104
|
+
- description: GROQ - Blog queries with filtering and pagination (baseline)
|
|
105
|
+
vars:
|
|
106
|
+
task: |-
|
|
107
|
+
Write GROQ queries for a Sanity blog application:
|
|
108
|
+
|
|
109
|
+
1. Fetch all published blog posts ordered by publishedAt descending,
|
|
110
|
+
with a projection that includes: _id, title, slug (from slug.current),
|
|
111
|
+
publishedAt, excerpt, and the author's name (resolved from a reference)
|
|
112
|
+
2. Add pagination to return only the first 10 results
|
|
113
|
+
3. Fetch a single post by its slug parameter, including the full body
|
|
114
|
+
content and resolved author and category references
|
|
115
|
+
4. Fetch posts published after a specific date
|
|
116
|
+
5. Fetch posts that belong to a specific category (where categories
|
|
117
|
+
is an array of references)
|
|
118
|
+
|
|
119
|
+
Use @sanity/client with client.fetch() for all queries. Include
|
|
120
|
+
TypeScript types for the query results.
|
|
121
|
+
docs: file://contexts/canonical/groq-blog-queries.md
|
|
122
|
+
__featureArea: groq
|
|
123
|
+
assert:
|
|
124
|
+
- type: llm-rubric
|
|
125
|
+
value: |-
|
|
126
|
+
Score task completion from 0 to 100:
|
|
127
|
+
- 0: Couldn't attempt — missing critical information
|
|
128
|
+
- 20: Attempted but fundamentally wrong approach
|
|
129
|
+
- 50: Partial implementation — major functional gaps
|
|
130
|
+
- 80: Mostly complete — minor issues or missing edge cases
|
|
131
|
+
- 100: Fully functional code — works as expected
|
|
132
|
+
|
|
133
|
+
Must demonstrate:
|
|
134
|
+
- GROQ filter with _type == "post"
|
|
135
|
+
- Projection with aliased slug field ("slug": slug.current)
|
|
136
|
+
- Reference resolution with -> for author
|
|
137
|
+
- Ordering with | order(publishedAt desc)
|
|
138
|
+
- Slice/pagination syntax [0...10] or [0..9]
|
|
139
|
+
- Parameterized query with $slug for single post fetch
|
|
140
|
+
- Date filtering with dateTime() or string comparison
|
|
141
|
+
- Category filtering using references or array contains
|
|
142
|
+
|
|
143
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
144
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
145
|
+
metadata:
|
|
146
|
+
dimension: task-completion
|
|
147
|
+
maxScore: 100
|
|
148
|
+
- type: llm-rubric
|
|
149
|
+
value: |-
|
|
150
|
+
Score code correctness from 0 to 100:
|
|
151
|
+
- 0: Broken code, syntax errors, or deprecated APIs
|
|
152
|
+
- 30: Works but uses anti-patterns or inefficient approaches
|
|
153
|
+
- 50: Works but not idiomatic
|
|
154
|
+
- 80: Follows most best practices
|
|
155
|
+
- 100: Follows all best practices, idiomatic implementation
|
|
156
|
+
|
|
157
|
+
Check for:
|
|
158
|
+
- Valid GROQ syntax (proper filter brackets, projection braces)
|
|
159
|
+
- Uses @sanity/client createClient + client.fetch()
|
|
160
|
+
- Correct parameter passing syntax ($param)
|
|
161
|
+
- Proper reference dereference with ->
|
|
162
|
+
- No deprecated patterns
|
|
163
|
+
|
|
164
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
165
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
166
|
+
metadata:
|
|
167
|
+
dimension: code-correctness
|
|
168
|
+
maxScore: 100
|
|
169
|
+
- type: contains-any
|
|
170
|
+
value:
|
|
171
|
+
- client.fetch
|
|
172
|
+
- createClient
|
|
173
|
+
weight: 1
|
|
174
|
+
- type: contains-any
|
|
175
|
+
value:
|
|
176
|
+
- order(publishedAt
|
|
177
|
+
- order(_createdAt
|
|
178
|
+
- '| order('
|
|
179
|
+
weight: 1
|
|
180
|
+
- type: contains-any
|
|
181
|
+
value:
|
|
182
|
+
- '[0...10]'
|
|
183
|
+
- '[0..9]'
|
|
184
|
+
- '[0...'
|
|
185
|
+
weight: 1
|
|
186
|
+
- type: llm-rubric
|
|
187
|
+
value: |-
|
|
188
|
+
Score documentation coverage from 0 to 100:
|
|
189
|
+
- 0: Had to hallucinate/guess most implementation details
|
|
190
|
+
- 30: Significant gaps — filled with assumptions
|
|
191
|
+
- 50: Some gaps — inferred from partial information
|
|
192
|
+
- 80: Minor gaps — almost everything was documented
|
|
193
|
+
- 100: Complete coverage — all necessary info was in docs
|
|
194
|
+
|
|
195
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
196
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
197
|
+
metadata:
|
|
198
|
+
dimension: doc-coverage
|
|
199
|
+
maxScore: 100
|
|
200
|
+
prompts:
|
|
201
|
+
- without-docs
|
|
202
|
+
- description: GROQ - Joins and reference resolution (gold)
|
|
203
|
+
vars:
|
|
204
|
+
task: |-
|
|
205
|
+
Write GROQ queries that demonstrate join patterns in Sanity:
|
|
206
|
+
|
|
207
|
+
1. Follow a single reference to resolve an author's full profile
|
|
208
|
+
from a post (post.author -> author document with name, bio, image)
|
|
209
|
+
2. Resolve an array of category references from a post
|
|
210
|
+
(post.categories[]-> with title and slug)
|
|
211
|
+
3. Write a reverse reference query: given an author's ID, find all
|
|
212
|
+
posts by that author using a subquery and the parent scope operator (^)
|
|
213
|
+
4. Create a nested join: for each author, include their 5 most recent
|
|
214
|
+
posts as a nested array
|
|
215
|
+
5. Use the references() function to find all documents that reference
|
|
216
|
+
a specific document ID
|
|
217
|
+
|
|
218
|
+
Use @sanity/client with client.fetch(). Include TypeScript types.
|
|
219
|
+
docs: file://contexts/canonical/groq-joins-references.md
|
|
220
|
+
__featureArea: groq
|
|
221
|
+
assert:
|
|
222
|
+
- type: llm-rubric
|
|
223
|
+
value: |-
|
|
224
|
+
Score task completion from 0 to 100:
|
|
225
|
+
- 0: Couldn't attempt — missing critical information
|
|
226
|
+
- 20: Attempted but fundamentally wrong approach
|
|
227
|
+
- 50: Partial implementation — major functional gaps
|
|
228
|
+
- 80: Mostly complete — minor issues or missing edge cases
|
|
229
|
+
- 100: Fully functional code — works as expected
|
|
230
|
+
|
|
231
|
+
Must demonstrate:
|
|
232
|
+
- Single reference follow with -> operator
|
|
233
|
+
- Array reference resolution with []->
|
|
234
|
+
- Reverse reference / subquery using *[references(^._id)]
|
|
235
|
+
- Nested join pattern with parent scope (^)
|
|
236
|
+
- The references() function
|
|
237
|
+
|
|
238
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
239
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
240
|
+
metadata:
|
|
241
|
+
dimension: task-completion
|
|
242
|
+
maxScore: 100
|
|
243
|
+
- type: llm-rubric
|
|
244
|
+
value: |-
|
|
245
|
+
Score code correctness from 0 to 100:
|
|
246
|
+
- 0: Broken code, syntax errors, or deprecated APIs
|
|
247
|
+
- 30: Works but uses anti-patterns or inefficient approaches
|
|
248
|
+
- 50: Works but not idiomatic
|
|
249
|
+
- 80: Follows most best practices
|
|
250
|
+
- 100: Follows all best practices, idiomatic implementation
|
|
251
|
+
|
|
252
|
+
Check for:
|
|
253
|
+
- Correct -> dereference syntax
|
|
254
|
+
- Valid []-> array dereference
|
|
255
|
+
- Proper use of ^ parent scope operator
|
|
256
|
+
- Valid references() function usage
|
|
257
|
+
- No made-up syntax
|
|
258
|
+
|
|
259
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
260
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
261
|
+
metadata:
|
|
262
|
+
dimension: code-correctness
|
|
263
|
+
maxScore: 100
|
|
264
|
+
- type: contains
|
|
265
|
+
value: '->'
|
|
266
|
+
weight: 1
|
|
267
|
+
- type: contains-any
|
|
268
|
+
value:
|
|
269
|
+
- references(
|
|
270
|
+
- references(^
|
|
271
|
+
weight: 1
|
|
272
|
+
- type: llm-rubric
|
|
273
|
+
value: |-
|
|
274
|
+
Score documentation coverage from 0 to 100:
|
|
275
|
+
- 0: Had to hallucinate/guess most implementation details
|
|
276
|
+
- 30: Significant gaps — filled with assumptions
|
|
277
|
+
- 50: Some gaps — inferred from partial information
|
|
278
|
+
- 80: Minor gaps — almost everything was documented
|
|
279
|
+
- 100: Complete coverage — all necessary info was in docs
|
|
280
|
+
|
|
281
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
282
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
283
|
+
metadata:
|
|
284
|
+
dimension: doc-coverage
|
|
285
|
+
maxScore: 100
|
|
286
|
+
prompts:
|
|
287
|
+
- with-docs
|
|
288
|
+
- description: GROQ - Joins and reference resolution (baseline)
|
|
289
|
+
vars:
|
|
290
|
+
task: |-
|
|
291
|
+
Write GROQ queries that demonstrate join patterns in Sanity:
|
|
292
|
+
|
|
293
|
+
1. Follow a single reference to resolve an author's full profile
|
|
294
|
+
from a post (post.author -> author document with name, bio, image)
|
|
295
|
+
2. Resolve an array of category references from a post
|
|
296
|
+
(post.categories[]-> with title and slug)
|
|
297
|
+
3. Write a reverse reference query: given an author's ID, find all
|
|
298
|
+
posts by that author using a subquery and the parent scope operator (^)
|
|
299
|
+
4. Create a nested join: for each author, include their 5 most recent
|
|
300
|
+
posts as a nested array
|
|
301
|
+
5. Use the references() function to find all documents that reference
|
|
302
|
+
a specific document ID
|
|
303
|
+
|
|
304
|
+
Use @sanity/client with client.fetch(). Include TypeScript types.
|
|
305
|
+
docs: file://contexts/canonical/groq-joins-references.md
|
|
306
|
+
__featureArea: groq
|
|
307
|
+
assert:
|
|
308
|
+
- type: llm-rubric
|
|
309
|
+
value: |-
|
|
310
|
+
Score task completion from 0 to 100:
|
|
311
|
+
- 0: Couldn't attempt — missing critical information
|
|
312
|
+
- 20: Attempted but fundamentally wrong approach
|
|
313
|
+
- 50: Partial implementation — major functional gaps
|
|
314
|
+
- 80: Mostly complete — minor issues or missing edge cases
|
|
315
|
+
- 100: Fully functional code — works as expected
|
|
316
|
+
|
|
317
|
+
Must demonstrate:
|
|
318
|
+
- Single reference follow with -> operator
|
|
319
|
+
- Array reference resolution with []->
|
|
320
|
+
- Reverse reference / subquery using *[references(^._id)]
|
|
321
|
+
- Nested join pattern with parent scope (^)
|
|
322
|
+
- The references() function
|
|
323
|
+
|
|
324
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
325
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
326
|
+
metadata:
|
|
327
|
+
dimension: task-completion
|
|
328
|
+
maxScore: 100
|
|
329
|
+
- type: llm-rubric
|
|
330
|
+
value: |-
|
|
331
|
+
Score code correctness from 0 to 100:
|
|
332
|
+
- 0: Broken code, syntax errors, or deprecated APIs
|
|
333
|
+
- 30: Works but uses anti-patterns or inefficient approaches
|
|
334
|
+
- 50: Works but not idiomatic
|
|
335
|
+
- 80: Follows most best practices
|
|
336
|
+
- 100: Follows all best practices, idiomatic implementation
|
|
337
|
+
|
|
338
|
+
Check for:
|
|
339
|
+
- Correct -> dereference syntax
|
|
340
|
+
- Valid []-> array dereference
|
|
341
|
+
- Proper use of ^ parent scope operator
|
|
342
|
+
- Valid references() function usage
|
|
343
|
+
- No made-up syntax
|
|
344
|
+
|
|
345
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
346
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
347
|
+
metadata:
|
|
348
|
+
dimension: code-correctness
|
|
349
|
+
maxScore: 100
|
|
350
|
+
- type: contains
|
|
351
|
+
value: '->'
|
|
352
|
+
weight: 1
|
|
353
|
+
- type: contains-any
|
|
354
|
+
value:
|
|
355
|
+
- references(
|
|
356
|
+
- references(^
|
|
357
|
+
weight: 1
|
|
358
|
+
- type: llm-rubric
|
|
359
|
+
value: |-
|
|
360
|
+
Score documentation coverage from 0 to 100:
|
|
361
|
+
- 0: Had to hallucinate/guess most implementation details
|
|
362
|
+
- 30: Significant gaps — filled with assumptions
|
|
363
|
+
- 50: Some gaps — inferred from partial information
|
|
364
|
+
- 80: Minor gaps — almost everything was documented
|
|
365
|
+
- 100: Complete coverage — all necessary info was in docs
|
|
366
|
+
|
|
367
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
368
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
369
|
+
metadata:
|
|
370
|
+
dimension: doc-coverage
|
|
371
|
+
maxScore: 100
|
|
372
|
+
prompts:
|
|
373
|
+
- without-docs
|
|
374
|
+
- description: GROQ - Advanced filtering and projections (gold)
|
|
375
|
+
vars:
|
|
376
|
+
task: |-
|
|
377
|
+
Write GROQ queries demonstrating advanced filtering and projection patterns:
|
|
378
|
+
|
|
379
|
+
1. Use select() for conditional projections — return different fields
|
|
380
|
+
based on the document's _type (e.g., posts get excerpt, events get
|
|
381
|
+
date and venue)
|
|
382
|
+
2. Use coalesce() for fallback values — e.g., use seoTitle if it
|
|
383
|
+
exists, otherwise fall back to title
|
|
384
|
+
3. Use the match operator for full-text search in titles
|
|
385
|
+
4. Use count() to count documents matching a filter and to count
|
|
386
|
+
items within an array field
|
|
387
|
+
5. Use defined() to filter for documents that have a specific field set
|
|
388
|
+
6. Filter items within an array using [condition] syntax
|
|
389
|
+
7. Order results by multiple fields (e.g., featured status first,
|
|
390
|
+
then by publishedAt)
|
|
391
|
+
|
|
392
|
+
Use @sanity/client with client.fetch(). Include TypeScript types.
|
|
393
|
+
docs: file://contexts/canonical/groq-advanced-filtering.md
|
|
394
|
+
__featureArea: groq
|
|
395
|
+
assert:
|
|
396
|
+
- type: llm-rubric
|
|
397
|
+
value: |-
|
|
398
|
+
Score task completion from 0 to 100:
|
|
399
|
+
- 0: Couldn't attempt — missing critical information
|
|
400
|
+
- 20: Attempted but fundamentally wrong approach
|
|
401
|
+
- 50: Partial implementation — major functional gaps
|
|
402
|
+
- 80: Mostly complete — minor issues or missing edge cases
|
|
403
|
+
- 100: Fully functional code — works as expected
|
|
404
|
+
|
|
405
|
+
Must demonstrate:
|
|
406
|
+
- select() for conditional projections
|
|
407
|
+
- coalesce() for fallback values
|
|
408
|
+
- match operator for text search
|
|
409
|
+
- count() function usage
|
|
410
|
+
- defined() function for existence checks
|
|
411
|
+
- Array filtering with [condition]
|
|
412
|
+
- Multi-field ordering
|
|
413
|
+
|
|
414
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
415
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
416
|
+
metadata:
|
|
417
|
+
dimension: task-completion
|
|
418
|
+
maxScore: 100
|
|
419
|
+
- type: llm-rubric
|
|
420
|
+
value: |-
|
|
421
|
+
Score code correctness from 0 to 100:
|
|
422
|
+
- 0: Broken code, syntax errors, or deprecated APIs
|
|
423
|
+
- 30: Works but uses anti-patterns or inefficient approaches
|
|
424
|
+
- 50: Works but not idiomatic
|
|
425
|
+
- 80: Follows most best practices
|
|
426
|
+
- 100: Follows all best practices, idiomatic implementation
|
|
427
|
+
|
|
428
|
+
Check for:
|
|
429
|
+
- Valid select() syntax with => arrow notation
|
|
430
|
+
- Correct coalesce() usage
|
|
431
|
+
- Proper match operator usage (on text fields)
|
|
432
|
+
- Valid count() and defined() function calls
|
|
433
|
+
- Correct array filter syntax
|
|
434
|
+
|
|
435
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
436
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
437
|
+
metadata:
|
|
438
|
+
dimension: code-correctness
|
|
439
|
+
maxScore: 100
|
|
440
|
+
- type: contains-any
|
|
441
|
+
value:
|
|
442
|
+
- select(
|
|
443
|
+
- coalesce(
|
|
444
|
+
weight: 1
|
|
445
|
+
- type: contains-any
|
|
446
|
+
value:
|
|
447
|
+
- count(
|
|
448
|
+
- defined(
|
|
449
|
+
weight: 1
|
|
450
|
+
- type: contains-any
|
|
451
|
+
value:
|
|
452
|
+
- match
|
|
453
|
+
weight: 1
|
|
454
|
+
- type: llm-rubric
|
|
455
|
+
value: |-
|
|
456
|
+
Score documentation coverage from 0 to 100:
|
|
457
|
+
- 0: Had to hallucinate/guess most implementation details
|
|
458
|
+
- 30: Significant gaps — filled with assumptions
|
|
459
|
+
- 50: Some gaps — inferred from partial information
|
|
460
|
+
- 80: Minor gaps — almost everything was documented
|
|
461
|
+
- 100: Complete coverage — all necessary info was in docs
|
|
462
|
+
|
|
463
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
464
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
465
|
+
metadata:
|
|
466
|
+
dimension: doc-coverage
|
|
467
|
+
maxScore: 100
|
|
468
|
+
prompts:
|
|
469
|
+
- with-docs
|
|
470
|
+
- description: GROQ - Advanced filtering and projections (baseline)
|
|
471
|
+
vars:
|
|
472
|
+
task: |-
|
|
473
|
+
Write GROQ queries demonstrating advanced filtering and projection patterns:
|
|
474
|
+
|
|
475
|
+
1. Use select() for conditional projections — return different fields
|
|
476
|
+
based on the document's _type (e.g., posts get excerpt, events get
|
|
477
|
+
date and venue)
|
|
478
|
+
2. Use coalesce() for fallback values — e.g., use seoTitle if it
|
|
479
|
+
exists, otherwise fall back to title
|
|
480
|
+
3. Use the match operator for full-text search in titles
|
|
481
|
+
4. Use count() to count documents matching a filter and to count
|
|
482
|
+
items within an array field
|
|
483
|
+
5. Use defined() to filter for documents that have a specific field set
|
|
484
|
+
6. Filter items within an array using [condition] syntax
|
|
485
|
+
7. Order results by multiple fields (e.g., featured status first,
|
|
486
|
+
then by publishedAt)
|
|
487
|
+
|
|
488
|
+
Use @sanity/client with client.fetch(). Include TypeScript types.
|
|
489
|
+
docs: file://contexts/canonical/groq-advanced-filtering.md
|
|
490
|
+
__featureArea: groq
|
|
491
|
+
assert:
|
|
492
|
+
- type: llm-rubric
|
|
493
|
+
value: |-
|
|
494
|
+
Score task completion from 0 to 100:
|
|
495
|
+
- 0: Couldn't attempt — missing critical information
|
|
496
|
+
- 20: Attempted but fundamentally wrong approach
|
|
497
|
+
- 50: Partial implementation — major functional gaps
|
|
498
|
+
- 80: Mostly complete — minor issues or missing edge cases
|
|
499
|
+
- 100: Fully functional code — works as expected
|
|
500
|
+
|
|
501
|
+
Must demonstrate:
|
|
502
|
+
- select() for conditional projections
|
|
503
|
+
- coalesce() for fallback values
|
|
504
|
+
- match operator for text search
|
|
505
|
+
- count() function usage
|
|
506
|
+
- defined() function for existence checks
|
|
507
|
+
- Array filtering with [condition]
|
|
508
|
+
- Multi-field ordering
|
|
509
|
+
|
|
510
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
511
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
512
|
+
metadata:
|
|
513
|
+
dimension: task-completion
|
|
514
|
+
maxScore: 100
|
|
515
|
+
- type: llm-rubric
|
|
516
|
+
value: |-
|
|
517
|
+
Score code correctness from 0 to 100:
|
|
518
|
+
- 0: Broken code, syntax errors, or deprecated APIs
|
|
519
|
+
- 30: Works but uses anti-patterns or inefficient approaches
|
|
520
|
+
- 50: Works but not idiomatic
|
|
521
|
+
- 80: Follows most best practices
|
|
522
|
+
- 100: Follows all best practices, idiomatic implementation
|
|
523
|
+
|
|
524
|
+
Check for:
|
|
525
|
+
- Valid select() syntax with => arrow notation
|
|
526
|
+
- Correct coalesce() usage
|
|
527
|
+
- Proper match operator usage (on text fields)
|
|
528
|
+
- Valid count() and defined() function calls
|
|
529
|
+
- Correct array filter syntax
|
|
530
|
+
|
|
531
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
532
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
533
|
+
metadata:
|
|
534
|
+
dimension: code-correctness
|
|
535
|
+
maxScore: 100
|
|
536
|
+
- type: contains-any
|
|
537
|
+
value:
|
|
538
|
+
- select(
|
|
539
|
+
- coalesce(
|
|
540
|
+
weight: 1
|
|
541
|
+
- type: contains-any
|
|
542
|
+
value:
|
|
543
|
+
- count(
|
|
544
|
+
- defined(
|
|
545
|
+
weight: 1
|
|
546
|
+
- type: contains-any
|
|
547
|
+
value:
|
|
548
|
+
- match
|
|
549
|
+
weight: 1
|
|
550
|
+
- type: llm-rubric
|
|
551
|
+
value: |-
|
|
552
|
+
Score documentation coverage from 0 to 100:
|
|
553
|
+
- 0: Had to hallucinate/guess most implementation details
|
|
554
|
+
- 30: Significant gaps — filled with assumptions
|
|
555
|
+
- 50: Some gaps — inferred from partial information
|
|
556
|
+
- 80: Minor gaps — almost everything was documented
|
|
557
|
+
- 100: Complete coverage — all necessary info was in docs
|
|
558
|
+
|
|
559
|
+
Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
|
|
560
|
+
provider: anthropic:messages:claude-opus-4-5-20251101
|
|
561
|
+
metadata:
|
|
562
|
+
dimension: doc-coverage
|
|
563
|
+
maxScore: 100
|
|
564
|
+
prompts:
|
|
565
|
+
- without-docs
|
|
@@ -3,21 +3,30 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Tests whether models know the current typed schema API
|
|
5
5
|
* vs the legacy untyped format.
|
|
6
|
+
*
|
|
7
|
+
* Knowledge probes test the model's built-in knowledge WITHOUT providing documentation
|
|
8
|
+
* context (no `context.docs`). Contrast with "literacy" tasks which inject docs.
|
|
6
9
|
*/
|
|
7
10
|
import { defineTask } from "@sanity/ailf-core"
|
|
8
11
|
|
|
9
12
|
export default defineTask({
|
|
13
|
+
// "knowledge-probe" mode: no docs injected — measures what the model already knows
|
|
10
14
|
mode: "knowledge-probe",
|
|
11
15
|
id: "kp-define-type-api",
|
|
12
16
|
title: "What is Sanity's defineType API?",
|
|
13
17
|
description:
|
|
14
18
|
"Explain how to define document schemas in Sanity using the defineType, " +
|
|
15
19
|
"defineField, and defineArrayMember helper functions.",
|
|
20
|
+
// Used for score aggregation in reports and --area CLI filtering
|
|
16
21
|
area: "studio",
|
|
22
|
+
// Metadata for reporting; does not affect evaluation behavior
|
|
17
23
|
difficulty: "basic",
|
|
24
|
+
// Freeform labels for --tag CLI filtering
|
|
18
25
|
tags: ["knowledge-probe", "studio", "schema"],
|
|
26
|
+
// Controls how the probe explores knowledge: "breadth-first" covers many topics, "depth-first" drills deep
|
|
19
27
|
probeStrategy: "breadth-first",
|
|
20
28
|
prompt: {
|
|
29
|
+
// Direct prompt text sent to the model (knowledge probes use text, literacy tasks use vars.task with a template)
|
|
21
30
|
text:
|
|
22
31
|
"Explain Sanity's schema definition API:\n\n" +
|
|
23
32
|
"1. What is `defineType` and how do you use it?\n" +
|
|
@@ -34,6 +43,7 @@ export default defineTask({
|
|
|
34
43
|
assertions: [
|
|
35
44
|
{ type: "contains", value: "defineType" },
|
|
36
45
|
{ type: "contains", value: "defineField" },
|
|
46
|
+
// Inline llm-rubric (value is the rubric text itself, unlike template+criteria in literacy tasks)
|
|
37
47
|
{
|
|
38
48
|
type: "llm-rubric",
|
|
39
49
|
value:
|
|
@@ -41,6 +51,7 @@ export default defineTask({
|
|
|
41
51
|
"Check that the code examples use the current API, not the legacy " +
|
|
42
52
|
"untyped format. Penalize if the response uses the old pattern " +
|
|
43
53
|
"without mentioning defineType.",
|
|
54
|
+
// weight: relative weight in the overall score (these two rubrics split evenly at 0.5 each)
|
|
44
55
|
weight: 0.5,
|
|
45
56
|
},
|
|
46
57
|
{
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Tests deep knowledge of Sanity's query language without
|
|
5
5
|
* providing any documentation context.
|
|
6
|
+
*
|
|
7
|
+
* See define-type-api.task.ts for detailed explanations of knowledge-probe properties.
|
|
6
8
|
*/
|
|
7
9
|
import { defineTask } from "@sanity/ailf-core"
|
|
8
10
|
|
|
@@ -18,6 +20,7 @@ export default defineTask({
|
|
|
18
20
|
area: "groq",
|
|
19
21
|
difficulty: "intermediate",
|
|
20
22
|
tags: ["knowledge-probe", "groq", "syntax"],
|
|
23
|
+
// "depth-first" drills deep into one topic (vs "breadth-first" which covers many shallowly)
|
|
21
24
|
probeStrategy: "depth-first",
|
|
22
25
|
prompt: {
|
|
23
26
|
text:
|