@sanity/ailf 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/README.md +2 -2
- package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
- package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
- package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
- package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
- package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
- package/config/features.ts +1 -1
- package/config/models.ts +28 -23
- package/config/sources.ts +1 -1
- package/config/thresholds.ts +1 -1
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
- package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
- package/dist/_vendor/ailf-core/config-helpers.js +29 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
- package/dist/_vendor/ailf-core/examples/index.js +208 -114
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
- package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
- package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
- package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
- package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
- package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/services/scoring.js +9 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
- package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
- package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
- package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
- package/dist/_vendor/ailf-tasks/cli.js +61 -0
- package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
- package/dist/_vendor/ailf-tasks/index.js +16 -0
- package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
- package/dist/_vendor/ailf-tasks/parser.js +73 -0
- package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
- package/dist/_vendor/ailf-tasks/schemas.js +180 -0
- package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
- package/dist/_vendor/ailf-tasks/validation.js +162 -0
- package/dist/adapters/api-client/remediation.js +2 -2
- package/dist/adapters/config-sources/file-config-adapter.js +6 -1
- package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
- package/dist/adapters/index.d.ts +0 -1
- package/dist/adapters/index.js +0 -1
- package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
- package/dist/adapters/task-sources/index.d.ts +1 -2
- package/dist/adapters/task-sources/index.js +1 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
- package/dist/adapters/task-sources/repo-schemas.js +2 -2
- package/dist/adapters/task-sources/repo-task-source.js +1 -1
- package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
- package/dist/adapters/task-sources/repo-trigger.js +1 -1
- package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
- package/dist/adapters/task-sources/task-file-loader.js +20 -6
- package/dist/agent-observer/test-imports.d.ts +7 -0
- package/dist/agent-observer/test-imports.js +185 -0
- package/dist/artifact-capture/comparator.d.ts +22 -0
- package/dist/artifact-capture/comparator.js +493 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
- package/dist/artifact-capture/filesystem-collector.js +237 -0
- package/dist/artifact-capture/redact-artifact.d.ts +20 -0
- package/dist/artifact-capture/redact-artifact.js +115 -0
- package/dist/assertions/source-isolation.d.ts +1 -1
- package/dist/assertions/source-isolation.js +1 -1
- package/dist/cli.js +4 -0
- package/dist/commands/calculate-scores.js +1 -0
- package/dist/commands/capture-compare.d.ts +15 -0
- package/dist/commands/capture-compare.js +253 -0
- package/dist/commands/capture-list.d.ts +12 -0
- package/dist/commands/capture-list.js +147 -0
- package/dist/commands/capture.d.ts +9 -0
- package/dist/commands/capture.js +16 -0
- package/dist/commands/chronic-failures.d.ts +8 -0
- package/dist/commands/chronic-failures.js +33 -0
- package/dist/commands/explain-handler.d.ts +1 -1
- package/dist/commands/explain-handler.js +37 -8
- package/dist/commands/fetch-docs.js +1 -0
- package/dist/commands/generate-configs.d.ts +3 -3
- package/dist/commands/generate-configs.js +20 -8
- package/dist/commands/init.d.ts +2 -3
- package/dist/commands/init.js +56 -170
- package/dist/commands/pipeline-action.d.ts +7 -1
- package/dist/commands/pipeline-action.js +43 -19
- package/dist/commands/pipeline.d.ts +6 -1
- package/dist/commands/pipeline.js +7 -2
- package/dist/commands/pr-comment.js +1 -0
- package/dist/commands/publish.js +1 -0
- package/dist/commands/shared/help.js +2 -2
- package/dist/commands/update-quality-scores.d.ts +5 -0
- package/dist/commands/update-quality-scores.js +20 -0
- package/dist/composition-root.d.ts +2 -3
- package/dist/composition-root.js +27 -14
- package/dist/config/features.ts +23 -0
- package/dist/config/models.ts +100 -0
- package/dist/config/prompts.ts +16 -0
- package/dist/config/rubrics.ts +225 -0
- package/dist/config/schedules.ts +47 -0
- package/dist/config/sinks.ts +37 -0
- package/dist/config/sources.ts +21 -0
- package/dist/config/thresholds.ts +61 -0
- package/dist/lib/agent-behavior-report.d.ts +8 -0
- package/dist/lib/agent-behavior-report.js +185 -0
- package/dist/lib/baseline.d.ts +19 -0
- package/dist/lib/baseline.js +153 -0
- package/dist/lib/calculate-scores.d.ts +23 -0
- package/dist/lib/calculate-scores.js +42 -0
- package/dist/lib/compare.d.ts +18 -0
- package/dist/lib/compare.js +170 -0
- package/dist/lib/coverage-audit.d.ts +4 -0
- package/dist/lib/coverage-audit.js +42 -0
- package/dist/lib/discovery-report.d.ts +13 -0
- package/dist/lib/discovery-report.js +57 -0
- package/dist/lib/fetch-docs.d.ts +30 -0
- package/dist/lib/fetch-docs.js +171 -0
- package/dist/lib/generate-configs.d.ts +25 -0
- package/dist/lib/generate-configs.js +42 -0
- package/dist/lib/grader-api.d.ts +21 -0
- package/dist/lib/grader-api.js +34 -0
- package/dist/lib/grader-compare.d.ts +19 -0
- package/dist/lib/grader-compare.js +91 -0
- package/dist/lib/grader-consistency.d.ts +27 -0
- package/dist/lib/grader-consistency.js +79 -0
- package/dist/lib/grader-sensitivity.d.ts +19 -0
- package/dist/lib/grader-sensitivity.js +75 -0
- package/dist/lib/grader-validate.d.ts +19 -0
- package/dist/lib/grader-validate.js +78 -0
- package/dist/lib/measure-retrieval.d.ts +14 -0
- package/dist/lib/measure-retrieval.js +71 -0
- package/dist/lib/pr-comment.d.ts +16 -0
- package/dist/lib/pr-comment.js +28 -0
- package/dist/lib/readiness-report.d.ts +13 -0
- package/dist/lib/readiness-report.js +108 -0
- package/dist/lib/webhook-server.d.ts +11 -0
- package/dist/lib/webhook-server.js +24 -0
- package/dist/lib/weekly-digest.d.ts +24 -0
- package/dist/lib/weekly-digest.js +148 -0
- package/dist/orchestration/build-app-context.js +13 -0
- package/dist/orchestration/cache-context.d.ts +23 -0
- package/dist/orchestration/cache-context.js +43 -0
- package/dist/orchestration/env-bridge.d.ts +21 -0
- package/dist/orchestration/env-bridge.js +66 -0
- package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
- package/dist/orchestration/load-pipeline-tasks.js +52 -0
- package/dist/orchestration/pipeline-orchestrator.js +75 -5
- package/dist/orchestration/step-runner.js +5 -1
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
- package/dist/orchestration/steps/calculate-scores-step.js +13 -0
- package/dist/orchestration/steps/callback-step.js +10 -1
- package/dist/orchestration/steps/compare-step.js +6 -3
- package/dist/orchestration/steps/discovery-report-step.js +6 -2
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
- package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +30 -16
- package/dist/orchestration/steps/gap-analysis-step.js +13 -2
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
- package/dist/orchestration/steps/generate-configs-step.js +50 -15
- package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/publish-report-step.js +19 -0
- package/dist/orchestration/steps/readiness-step.js +8 -3
- package/dist/orchestration/steps/report-step.js +17 -4
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
- package/dist/orchestration/steps/run-eval-step.js +52 -32
- package/dist/pipeline/agent-behavior-report.js +6 -0
- package/dist/pipeline/attribution.d.ts +1 -1
- package/dist/pipeline/attribution.js +1 -1
- package/dist/pipeline/cache.js +29 -15
- package/dist/pipeline/calculate-scores.d.ts +2 -0
- package/dist/pipeline/calculate-scores.js +70 -33
- package/dist/pipeline/checks.d.ts +8 -3
- package/dist/pipeline/checks.js +23 -3
- package/dist/pipeline/chronic-failures.d.ts +55 -0
- package/dist/pipeline/chronic-failures.js +110 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
- package/dist/pipeline/compiler/assertion-mapper.js +1 -1
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
- package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
- package/dist/pipeline/compiler/config-loader.d.ts +14 -0
- package/dist/pipeline/compiler/config-loader.js +42 -2
- package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/fixture-resolver.js +1 -1
- package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
- package/dist/pipeline/compiler/ignore-fields.js +1 -1
- package/dist/pipeline/compiler/index.d.ts +2 -5
- package/dist/pipeline/compiler/index.js +2 -5
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
- package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
- package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
- package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
- package/dist/pipeline/compiler/provider-assembler.js +13 -7
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
- package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/index.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
- package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/scoring-bridge.js +1 -1
- package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
- package/dist/pipeline/compiler/task-bridge.js +92 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
- package/dist/pipeline/compiler/task-graph-builder.js +1 -4
- package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
- package/dist/pipeline/compiler/telemetry/index.js +1 -1
- package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
- package/dist/pipeline/compiler/variable-resolver.js +1 -1
- package/dist/pipeline/coverage-audit.d.ts +1 -1
- package/dist/pipeline/coverage-audit.js +1 -1
- package/dist/pipeline/degradations.d.ts +1 -1
- package/dist/pipeline/degradations.js +1 -1
- package/dist/pipeline/failure-modes.d.ts +1 -1
- package/dist/pipeline/failure-modes.js +13 -1
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +3 -1
- package/dist/pipeline/generate-configs.d.ts +2 -2
- package/dist/pipeline/generate-configs.js +15 -8
- package/dist/pipeline/grader-compare-runner.d.ts +1 -1
- package/dist/pipeline/grader-compare-runner.js +7 -1
- package/dist/pipeline/grader-comparison.d.ts +1 -1
- package/dist/pipeline/grader-comparison.js +1 -1
- package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
- package/dist/pipeline/grader-consistency-runner.js +7 -1
- package/dist/pipeline/grader-consistency.d.ts +1 -1
- package/dist/pipeline/grader-consistency.js +1 -1
- package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity-runner.js +1 -1
- package/dist/pipeline/grader-sensitivity.d.ts +1 -1
- package/dist/pipeline/grader-sensitivity.js +1 -1
- package/dist/pipeline/grader-validate-runner.d.ts +1 -1
- package/dist/pipeline/grader-validate-runner.js +2 -2
- package/dist/pipeline/grader-validation.d.ts +1 -1
- package/dist/pipeline/grader-validation.js +1 -1
- package/dist/pipeline/map-request-to-config.js +15 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
- package/dist/pipeline/mirror-repo-tasks.js +1 -1
- package/dist/pipeline/plan-format.d.ts +1 -1
- package/dist/pipeline/plan-format.js +1 -1
- package/dist/pipeline/plan.d.ts +1 -1
- package/dist/pipeline/plan.js +67 -29
- package/dist/pipeline/probe.d.ts +1 -1
- package/dist/pipeline/probe.js +1 -1
- package/dist/pipeline/readiness-report.d.ts +2 -2
- package/dist/pipeline/readiness-report.js +2 -2
- package/dist/pipeline/release-classification.d.ts +1 -1
- package/dist/pipeline/release-classification.js +1 -1
- package/dist/pipeline/release-report.d.ts +1 -1
- package/dist/pipeline/release-report.js +1 -1
- package/dist/pipeline/repo-eval-comment.d.ts +1 -1
- package/dist/pipeline/repo-eval-comment.js +1 -1
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/resolve-mappings.d.ts +6 -6
- package/dist/pipeline/resolve-mappings.js +44 -44
- package/dist/pipeline/retrieval-metrics.d.ts +3 -3
- package/dist/pipeline/retrieval-metrics.js +28 -20
- package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/calculate-scores-step.js +89 -0
- package/dist/pipeline/steps/compare-step.d.ts +18 -0
- package/dist/pipeline/steps/compare-step.js +90 -0
- package/dist/pipeline/steps/eval-step.d.ts +53 -0
- package/dist/pipeline/steps/eval-step.js +347 -0
- package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
- package/dist/pipeline/steps/fetch-docs-step.js +84 -0
- package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
- package/dist/pipeline/steps/generate-configs-step.js +98 -0
- package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
- package/dist/pipeline/steps/grader-consistency-step.js +74 -0
- package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
- package/dist/pipeline/steps/publish-report-step.js +243 -0
- package/dist/pipeline/steps/report-step.d.ts +13 -0
- package/dist/pipeline/steps/report-step.js +56 -0
- package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
- package/dist/pipeline/steps/update-scores-step.js +42 -0
- package/dist/pipeline/targeted-loo.d.ts +1 -1
- package/dist/pipeline/targeted-loo.js +1 -1
- package/dist/pipeline/thresholds.d.ts +1 -1
- package/dist/pipeline/thresholds.js +1 -1
- package/dist/pipeline/validate.js +13 -0
- package/dist/report-store.d.ts +17 -0
- package/dist/report-store.js +24 -0
- package/dist/scripts/agent-behavior-report.d.ts +19 -0
- package/dist/scripts/agent-behavior-report.js +315 -0
- package/dist/scripts/baseline.d.ts +43 -0
- package/dist/scripts/baseline.js +267 -0
- package/dist/scripts/calculate-scores.d.ts +166 -0
- package/dist/scripts/calculate-scores.js +1296 -0
- package/dist/scripts/compare.d.ts +22 -0
- package/dist/scripts/compare.js +334 -0
- package/dist/scripts/coverage-audit.d.ts +44 -0
- package/dist/scripts/coverage-audit.js +209 -0
- package/dist/scripts/debug-eval.d.ts +19 -0
- package/dist/scripts/debug-eval.js +73 -0
- package/dist/scripts/discovery-report.d.ts +58 -0
- package/dist/scripts/discovery-report.js +250 -0
- package/dist/scripts/fetch-docs.d.ts +35 -0
- package/dist/scripts/fetch-docs.js +472 -0
- package/dist/scripts/generate-configs.d.ts +66 -0
- package/dist/scripts/generate-configs.js +459 -0
- package/dist/scripts/grader-api.d.ts +27 -0
- package/dist/scripts/grader-api.js +206 -0
- package/dist/scripts/grader-compare.d.ts +22 -0
- package/dist/scripts/grader-compare.js +368 -0
- package/dist/scripts/grader-consistency.d.ts +20 -0
- package/dist/scripts/grader-consistency.js +313 -0
- package/dist/scripts/grader-sensitivity.d.ts +22 -0
- package/dist/scripts/grader-sensitivity.js +354 -0
- package/dist/scripts/grader-validate.d.ts +19 -0
- package/dist/scripts/grader-validate.js +267 -0
- package/dist/scripts/measure-retrieval.d.ts +10 -0
- package/dist/scripts/measure-retrieval.js +145 -0
- package/dist/scripts/migrate-task-mode.d.ts +1 -1
- package/dist/scripts/migrate-task-mode.js +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
- package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
- package/dist/scripts/pipeline.d.ts +76 -0
- package/dist/scripts/pipeline.js +1031 -0
- package/dist/scripts/pr-comment.d.ts +10 -0
- package/dist/scripts/pr-comment.js +510 -0
- package/dist/scripts/readiness-report.d.ts +88 -0
- package/dist/scripts/readiness-report.js +342 -0
- package/dist/scripts/update-quality-scores.d.ts +15 -0
- package/dist/scripts/update-quality-scores.js +184 -0
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +1 -1
- package/dist/scripts/validate.d.ts +13 -0
- package/dist/scripts/validate.js +79 -0
- package/dist/scripts/webhook-server.d.ts +26 -0
- package/dist/scripts/webhook-server.js +147 -0
- package/dist/scripts/weekly-digest.d.ts +24 -0
- package/dist/scripts/weekly-digest.js +144 -0
- package/dist/sinks/format-slack.d.ts +64 -0
- package/dist/sinks/format-slack.js +306 -0
- package/dist/sinks/slack-sink.d.ts +27 -0
- package/dist/sinks/slack-sink.js +78 -0
- package/dist/sinks/types.d.ts +1 -1
- package/dist/sinks/types.js +1 -1
- package/dist/sinks/webhook-sink.d.ts +19 -0
- package/dist/sinks/webhook-sink.js +50 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
- package/dist/tasks/literacy/content-lake.task.ts +181 -0
- package/dist/tasks/literacy/frameworks.task.ts +129 -0
- package/dist/tasks/literacy/functions.task.ts +70 -0
- package/dist/tasks/literacy/groq.task.ts +259 -0
- package/dist/tasks/literacy/image-handling.task.ts +95 -0
- package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
- package/dist/tasks/literacy/portable-text.task.ts +169 -0
- package/dist/tasks/literacy/studio-setup.task.ts +134 -0
- package/dist/tasks/literacy/visual-editing.task.ts +147 -0
- package/package.json +25 -25
- package/tasks/.expanded.agentic.yaml +280 -0
- package/tasks/.expanded.yaml +565 -0
- package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
- package/tasks/literacy/content-lake.task.ts +181 -0
- package/tasks/literacy/frameworks.task.ts +1 -0
- package/tasks/literacy/functions.task.ts +1 -0
- package/tasks/literacy/groq.task.ts +1 -0
- package/tasks/literacy/image-handling.task.ts +95 -0
- package/tasks/literacy/nextjs-live.task.ts +2 -1
- package/tasks/literacy/portable-text.task.ts +169 -0
- package/tasks/literacy/studio-setup.task.ts +5 -2
- package/tasks/literacy/visual-editing.task.ts +1 -0
- package/LICENSE +0 -21
- package/tasks/frameworks.yaml +0 -98
- package/tasks/functions.yaml +0 -51
- package/tasks/groq.yaml +0 -216
- package/tasks/nextjs-live.yaml +0 -62
- package/tasks/studio-setup.yaml +0 -111
- package/tasks/visual-editing.yaml +0 -120
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent-behavior-report.ts
|
|
3
|
+
*
|
|
4
|
+
* Standalone script that reads Promptfoo evaluation results containing
|
|
5
|
+
* agent behavior observation data and generates a detailed report.
|
|
6
|
+
*
|
|
7
|
+
* This provides deeper analysis than the summary included in the main
|
|
8
|
+
* calculate-scores report, including:
|
|
9
|
+
*
|
|
10
|
+
* - Per-task behavior breakdown (which specific pages each task visited)
|
|
11
|
+
* - Canonical doc coverage (did the agent find the "right" docs?)
|
|
12
|
+
* - Request timeline and latency analysis
|
|
13
|
+
* - Search strategy analysis
|
|
14
|
+
* - Cross-task navigation pattern detection
|
|
15
|
+
*
|
|
16
|
+
* Usage:
|
|
17
|
+
* tsx src/scripts/agent-behavior-report.ts [results-path]
|
|
18
|
+
*/
|
|
19
|
+
// oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
import "dotenv/config";
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "fs";
import { join, dirname } from "path";
import { fileURLToPath } from "url";
|
|
23
|
+
// Canonical doc mapping: feature area -> expected doc slugs.
// Keys are the feature-area names produced by detectFeatureArea(); values are
// the slugs a well-informed agent *should* visit for tasks in that area.
// analyzeResults() counts a canonical slug as covered when it appears as a
// substring of any visited slug.
// The top-level mapping is frozen so a feature area cannot be added, removed,
// or reassigned at runtime; the slug arrays themselves are left unfrozen.
const CANONICAL_DOC_MAP = Object.freeze({
    frameworks: [
        "remix",
        "nuxt",
        "svelte",
        "astro",
        "gatsby",
        "client-libraries",
    ],
    functions: [
        "functions",
        "webhooks",
        "groq-powered-webhooks",
        "event-driven",
        "automations",
    ],
    "nextjs-live": [
        "next-js",
        "live-content-api",
        "content-source-maps",
        "app-router",
        "groq",
        "client-libraries",
    ],
    "studio-setup": [
        "studio",
        "schema-types",
        "structure-builder",
        "configuration",
        "plugins",
    ],
    "visual-editing": [
        "visual-editing",
        "presentation",
        "preview",
        "overlays",
        "loaders",
    ],
});
|
|
64
|
+
/**
 * Read a Promptfoo results file and aggregate the agent-behavior observations
 * it contains, per task and per feature area.
 *
 * Results without `metadata.agentBehaviorSummary` are ignored. A file with no
 * usable `results` list is treated the same as a file with no behavior data.
 *
 * @param {string} resultsPath - Path to the evaluation results JSON file.
 * @returns {{ features: object[], hasData: boolean, tasks: object[] }}
 *   `features` is sorted by feature name; `hasData` is false (with both lists
 *   empty) when no result carries behavior data.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function analyzeResults(resultsPath) {
    const json = JSON.parse(readFileSync(resultsPath, "utf-8"));
    // Support both the flat shape ({ results: TestResult[] }) and the full
    // Promptfoo envelope shape ({ results: { results: TestResult[] } }).
    // Anything else is treated as "no results" rather than crashing.
    const candidate = Array.isArray(json?.results)
        ? json.results
        : json?.results?.results;
    const results = Array.isArray(candidate) ? candidate : [];
    const tasks = [];
    for (const result of results) {
        const behavior = result?.metadata?.agentBehaviorSummary;
        if (!behavior)
            continue;
        const { description } = result;
        // `vars` and `vars.docs` may be absent, and `docs` is only meaningful
        // here when it is a non-blank string.
        const docs = result.vars?.docs;
        tasks.push({
            behavior,
            description,
            // Test descriptions are optional; classify a missing one as "".
            feature: detectFeatureArea(typeof description === "string" ? description : ""),
            hasDocs: typeof docs === "string" && docs.trim().length > 0,
        });
    }
    if (tasks.length === 0) {
        return { features: [], hasData: false, tasks: [] };
    }
    // Group by feature
    const byFeature = new Map();
    for (const task of tasks) {
        const bucket = byFeature.get(task.feature);
        if (bucket) {
            bucket.push(task);
        }
        else {
            byFeature.set(task.feature, [task]);
        }
    }
    const features = [...byFeature]
        .map(([feature, featureTasks]) => {
            // Union of one list-valued behavior field across the feature's
            // tasks; a summary that lacks the field contributes nothing.
            const uniqueAcross = (key) => [
                ...new Set(featureTasks.flatMap((t) => t.behavior[key] ?? [])),
            ];
            // Every bucket holds at least one task, so the divisor is never 0.
            const mean = (key) => featureTasks.reduce((sum, t) => sum + t.behavior[key], 0) /
                featureTasks.length;
            const allDocSlugs = uniqueAcross("docSlugsVisited");
            const allSearchQueries = uniqueAcross("uniqueSearchQueries");
            const allExternalDomains = uniqueAcross("externalDomains");
            const canonicalSlugs = CANONICAL_DOC_MAP[feature] || [];
            // A canonical slug counts as covered when it is a substring of
            // any visited slug.
            const matchedCanonical = canonicalSlugs.filter((slug) => allDocSlugs.some((visited) => typeof visited === "string" && visited.includes(slug)));
            const canonicalCoverage = canonicalSlugs.length > 0
                ? matchedCanonical.length / canonicalSlugs.length
                : 0;
            return {
                allDocSlugs,
                allExternalDomains,
                allSearchQueries,
                avgDocPages: mean("docPagesVisited"),
                avgNetworkMs: mean("totalNetworkMs"),
                avgSearches: mean("searchesPerformed"),
                canonicalCoverage,
                canonicalSlugs,
                feature,
                tasks: featureTasks,
            };
        })
        .sort((a, b) => a.feature.localeCompare(b.feature));
    return { features, hasData: true, tasks };
}
|
|
130
|
+
/**
 * Classify a task into a feature area from keywords in its description.
 *
 * Matching is case-insensitive substring search. Rules are checked in order
 * and the first match wins, so a description mentioning both "studio" and
 * "next" is classified as "studio-setup".
 *
 * @param {string} description - Test description from the results file.
 * @returns {string} One of "studio-setup", "visual-editing", "functions",
 *   "nextjs-live", "frameworks", or "other" when nothing matches or the
 *   description is not a string.
 */
function detectFeatureArea(description) {
    // A description is not guaranteed to be present; without one there is
    // nothing to match on.
    if (typeof description !== "string")
        return "other";
    const desc = description.toLowerCase();
    // Ordered [feature, keywords] rules; order defines precedence.
    const rules = [
        ["studio-setup", ["studio"]],
        ["visual-editing", ["visual", "presentation", "live preview"]],
        ["functions", ["function", "webhook"]],
        ["nextjs-live", ["next", "app router"]],
        // "astro" and "gatsby" are listed among the canonical framework docs
        // in CANONICAL_DOC_MAP, so tasks naming them belong here as well.
        ["frameworks", ["remix", "nuxt", "svelte", "astro", "gatsby"]],
    ];
    for (const [feature, keywords] of rules) {
        if (keywords.some((keyword) => desc.includes(keyword)))
            return feature;
    }
    return "other";
}
|
|
148
|
+
// ---------------------------------------------------------------------------
|
|
149
|
+
// Main
|
|
150
|
+
// ---------------------------------------------------------------------------
|
|
151
|
+
/**
 * Script entry point: analyze a results file, print the report, and persist
 * a JSON copy of it.
 *
 * The results path is taken from the first CLI argument, defaulting to
 * `results/latest/eval-results.json` under the directory two levels above
 * this script. Exits with status 1 when the file does not exist and with
 * status 0 when it contains no agent behavior data. The JSON report is always
 * written to `results/latest/agent-behavior-report.json` under that same root.
 */
function main() {
    // fileURLToPath (rather than URL#pathname) yields a real filesystem path:
    // it decodes percent-escapes such as %20 and handles Windows drive letters.
    const scriptDir = dirname(fileURLToPath(import.meta.url));
    const ROOT = join(scriptDir, "..", "..");
    const resultsPath = process.argv[2] || join(ROOT, "results", "latest", "eval-results.json");
    if (!existsSync(resultsPath)) {
        console.error(`Results file not found: ${resultsPath}`);
        console.error("Run an evaluation first: pnpm eval:observed");
        process.exit(1);
    }
    console.log(`Reading results from: ${resultsPath}`);
    console.log();
    const analysis = analyzeResults(resultsPath);
    if (!analysis.hasData) {
        console.log("No agent behavior data found in the results.");
        console.log("Make sure you ran the evaluation with the observed config:");
        console.log(" pnpm eval:observed");
        process.exit(0);
    }
    printReport(analysis);
    // Persist detailed report as JSON
    const outDir = join(ROOT, "results", "latest");
    mkdirSync(outDir, { recursive: true });
    const reportData = {
        // Per-feature aggregates; the task list is reduced to a count here
        // because the full per-task records are emitted under `tasks`.
        features: analysis.features.map((f) => ({
            avgDocPages: f.avgDocPages,
            avgNetworkMs: f.avgNetworkMs,
            avgSearches: f.avgSearches,
            canonicalCoverage: f.canonicalCoverage,
            canonicalSlugs: f.canonicalSlugs,
            docSlugsVisited: f.allDocSlugs,
            externalDomains: f.allExternalDomains,
            feature: f.feature,
            searchQueries: f.allSearchQueries,
            taskCount: f.tasks.length,
        })),
        tasks: analysis.tasks.map((t) => ({
            behavior: t.behavior,
            description: t.description,
            feature: t.feature,
            hasDocs: t.hasDocs,
        })),
        timestamp: new Date().toISOString(),
        totalTasks: analysis.tasks.length,
    };
    writeFileSync(join(outDir, "agent-behavior-report.json"), JSON.stringify(reportData, null, 2));
    console.log("Agent behavior report written to results/latest/agent-behavior-report.json");
}
|
|
197
|
+
// ---------------------------------------------------------------------------
|
|
198
|
+
// Report printing
|
|
199
|
+
// ---------------------------------------------------------------------------
|
|
200
|
+
/**
 * Print the agent behavior report to stdout.
 *
 * Sections, in order: overview table per feature area, canonical
 * documentation coverage, search queries (only when any exist), per-task
 * detail, external domains (only when any exist), and overall statistics.
 *
 * @param {object} analysis - Analysis result.
 * @param {Array<object>} analysis.features - Per-feature aggregates; each
 *   entry is read for `feature`, `tasks`, `avgDocPages`, `avgSearches`,
 *   `avgNetworkMs`, `canonicalCoverage` (a 0..1 fraction, multiplied by 100
 *   for display), `canonicalSlugs`, `allDocSlugs`, `allSearchQueries` and
 *   `allExternalDomains`.
 * @param {Array<object>} analysis.tasks - All observed tasks; each is read
 *   for `behavior.usedDocs` and `behavior.usedSearch`.
 * @returns {void}
 */
function printReport(analysis) {
    const banner = "=".repeat(80);
    const rule = "-".repeat(80);
    // Whole-number percentage that prints 0 instead of NaN for an empty set.
    const percentOf = (part, whole) => (whole > 0 ? (part / whole) * 100 : 0).toFixed(0);
    console.log(banner);
    console.log(" AGENT BEHAVIOR OBSERVATION REPORT");
    console.log(banner);
    console.log();
    // ---- Overview table ----
    console.log("OVERVIEW BY FEATURE AREA");
    console.log(rule);
    // Header cells are padded to the same widths the data rows use so the
    // header, separator and rows always line up.
    const columns = [
        ["Feature Area", 19],
        ["Tasks", 5],
        ["Avg Docs", 8],
        ["Avg Search", 10],
        ["Avg Net(ms)", 11],
        ["Canon%", 6],
    ];
    const header = `| ${columns.map(([label, width]) => label.padEnd(width)).join(" | ")} |`;
    const separator = `|${columns.map(([, width]) => "-".repeat(width + 2)).join("|")}|`;
    console.log(header);
    console.log(separator);
    for (const f of analysis.features) {
        console.log(`| ${f.feature.padEnd(19)} | ` +
            `${f.tasks.length.toString().padStart(5)} | ` +
            `${f.avgDocPages.toFixed(1).padStart(8)} | ` +
            `${f.avgSearches.toFixed(1).padStart(10)} | ` +
            `${Math.round(f.avgNetworkMs).toString().padStart(11)} | ` +
            `${(f.canonicalCoverage * 100).toFixed(0).padStart(5)}% |`);
    }
    console.log();
    // ---- Canonical coverage breakdown ----
    console.log("CANONICAL DOCUMENTATION COVERAGE");
    console.log(rule);
    console.log();
    for (const f of analysis.features) {
        console.log(` ${f.feature} (${(f.canonicalCoverage * 100).toFixed(0)}% canonical coverage):`);
        if (f.canonicalSlugs.length === 0) {
            console.log(" (no canonical docs defined)");
        }
        else {
            for (const slug of f.canonicalSlugs) {
                // A canonical slug counts as visited when any visited slug
                // contains it as a substring.
                const found = f.allDocSlugs.some((visited) => visited.includes(slug));
                console.log(` ${found ? "[x]" : "[ ]"} ${slug}`);
            }
        }
        const nonCanonical = f.allDocSlugs.filter((slug) => !f.canonicalSlugs.some((c) => slug.includes(c)));
        if (nonCanonical.length > 0) {
            console.log(" Additional docs visited:");
            for (const slug of nonCanonical) {
                console.log(` + ${slug}`);
            }
        }
        console.log();
    }
    // ---- Search strategy ----
    const hasSearches = analysis.features.some((f) => f.allSearchQueries.length > 0);
    if (hasSearches) {
        console.log("SEARCH STRATEGY");
        console.log(rule);
        console.log();
        for (const f of analysis.features) {
            if (f.allSearchQueries.length === 0) {
                continue;
            }
            console.log(` ${f.feature}:`);
            for (const q of f.allSearchQueries) {
                console.log(` -> "${q}"`);
            }
        }
        console.log();
    }
    // ---- Per-task detail ----
    console.log("PER-TASK DETAIL");
    console.log(rule);
    console.log();
    for (const f of analysis.features) {
        console.log(` ## ${f.feature}`);
        console.log();
        for (const t of f.tasks) {
            const variant = t.hasDocs ? "[gold]" : "[baseline]";
            console.log(` ${variant} ${t.description}`);
            console.log(` Requests: ${t.behavior.totalRequests} | ` +
                `Doc pages: ${t.behavior.docPagesVisited} | ` +
                `Searches: ${t.behavior.searchesPerformed} | ` +
                `External: ${t.behavior.externalRequestCount}`);
            if (t.behavior.docSlugsVisited.length > 0) {
                console.log(` Docs: ${t.behavior.docSlugsVisited.join(", ")}`);
            }
            if (t.behavior.uniqueSearchQueries.length > 0) {
                console.log(` Queries: ${t.behavior.uniqueSearchQueries.map((q) => `"${q}"`).join(", ")}`);
            }
            console.log();
        }
    }
    // ---- External domains ----
    // De-duplicated across features, in first-seen order.
    const allDomains = [
        ...new Set(analysis.features.flatMap((f) => f.allExternalDomains)),
    ];
    if (allDomains.length > 0) {
        console.log("EXTERNAL DOMAINS");
        console.log(rule);
        console.log();
        for (const d of allDomains) {
            console.log(` - ${d}`);
        }
        console.log();
    }
    // ---- Summary stats ----
    console.log("OVERALL STATISTICS");
    console.log(rule);
    console.log();
    const totalTasks = analysis.tasks.length;
    const tasksUsingDocs = analysis.tasks.filter((t) => t.behavior.usedDocs).length;
    const tasksUsingSearch = analysis.tasks.filter((t) => t.behavior.usedSearch).length;
    // Unweighted mean over feature areas; `|| 1` avoids dividing by zero.
    const avgCanonical = analysis.features.reduce((s, f) => s + f.canonicalCoverage, 0) /
        (analysis.features.length || 1);
    console.log(` Total tasks observed: ${totalTasks}`);
    console.log(` Tasks that used docs: ${tasksUsingDocs}/${totalTasks} (${percentOf(tasksUsingDocs, totalTasks)}%)`);
    console.log(` Tasks that used search: ${tasksUsingSearch}/${totalTasks} (${percentOf(tasksUsingSearch, totalTasks)}%)`);
    console.log(` Avg canonical coverage: ${(avgCanonical * 100).toFixed(1)}%`);
    console.log();
}
|
|
315
|
+
main();
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline.ts
|
|
3
|
+
*
|
|
4
|
+
* Manages historical baseline snapshots of evaluation scores.
|
|
5
|
+
* Allows saving, comparing, and listing score baselines over time.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* pnpm baseline:save # save current scores as baseline
|
|
9
|
+
* pnpm baseline:save --tag "pre-groq" # save with a descriptive tag
|
|
10
|
+
* pnpm baseline:compare # compare current vs latest baseline
|
|
11
|
+
* pnpm baseline:history # list all saved baselines
|
|
12
|
+
*/
|
|
13
|
+
/** Summary of one saved baseline snapshot, as produced by `listBaselines()`. */
interface BaselineMetadata {
    /** Number of entries in the snapshot's `scores` array. */
    areaCount: number;
    /** The snapshot's overall average score, rounded to an integer. */
    avgScore: number;
    /** File name of the snapshot inside the baselines directory. */
    filename: string;
    /** Grader cost recorded in the snapshot, when cost data is present. */
    graderCost?: number;
    /** Tag supplied when the baseline was saved, if any. */
    tag?: string;
    /** The `timestamp` value stored in the snapshot. */
    timestamp: string;
    /** Total cost recorded in the snapshot, when cost data is present. */
    totalCost?: number;
}
|
|
22
|
+
/** Outcome of `compareBaseline()`. */
interface CompareResult {
    /** Per-feature comparison rows; set when `success` is true. */
    comparisons?: ScoreComparison[];
    /** Human-readable status or failure reason. */
    message: string;
    /** Difference between the rounded overall averages; set when `success` is true. */
    overallDelta?: number;
    /** False when a required input file was missing. */
    success: boolean;
}
|
|
28
|
+
/** Score (and optional cost) comparison for a single feature area. */
interface ScoreComparison {
    /** Score in the baseline snapshot (0 when the feature is absent there). */
    baseline: number;
    /** Baseline cost; omitted when no positive cost is recorded. */
    costBaseline?: number;
    /** Current cost; omitted when no positive cost is recorded. */
    costCurrent?: number;
    /** Current cost minus baseline cost; omitted when neither side has a cost. */
    costDelta?: number;
    /** Score in the current summary (0 when the feature is absent there). */
    current: number;
    /** `current - baseline`. */
    delta: number;
    /** Feature area name. */
    feature: string;
}
|
|
37
|
+
/**
 * Compare the current score summary with a saved baseline.
 *
 * @param baselineFile - File name inside the baselines directory; defaults
 *   to the newest saved baseline.
 */
export declare function compareBaseline(baselineFile?: string): CompareResult;
/** List saved baselines, newest first. */
export declare function listBaselines(): BaselineMetadata[];
/**
 * Save the current score summary as a new baseline snapshot.
 *
 * @param tag - Optional descriptive tag appended to the snapshot file name.
 */
export declare function saveBaseline(tag?: string): {
    success: boolean;
    message: string;
};
export {};
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Baseline.ts
|
|
3
|
+
*
|
|
4
|
+
* Manages historical baseline snapshots of evaluation scores.
|
|
5
|
+
* Allows saving, comparing, and listing score baselines over time.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* pnpm baseline:save # save current scores as baseline
|
|
9
|
+
* pnpm baseline:save --tag "pre-groq" # save with a descriptive tag
|
|
10
|
+
* pnpm baseline:compare # compare current vs latest baseline
|
|
11
|
+
* pnpm baseline:history # list all saved baselines
|
|
12
|
+
*/
|
|
13
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
|
|
14
|
+
import { dirname, join, resolve } from "path";
|
|
15
|
+
import { fileURLToPath } from "url";
|
|
16
|
+
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Two directories above the one containing this module.
const ROOT = resolve(__dirname, "../..");
// Directory holding the saved baseline snapshots.
const BASELINES_DIR = join(ROOT, "results/baselines");
// Current score summary that baselines are saved from and compared against.
const SCORE_SUMMARY_PATH = join(ROOT, "results/latest", "score-summary.json");
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Compare
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
/**
 * Compare the current score summary against a saved baseline.
 *
 * Produces one row per feature found in either file. A feature missing on
 * one side is scored 0 on that side. Rows are sorted by score delta, largest
 * improvement first.
 *
 * @param {string} [baselineFile] - File name inside the baselines directory.
 *   Defaults to the newest baseline reported by `listBaselines()`.
 * @returns {{comparisons?: object[], message: string, overallDelta?: number, success: boolean}}
 *   `success: false` with an explanatory `message` when the current summary,
 *   any baseline, or the requested baseline file is missing; otherwise the
 *   comparison rows and the difference of the rounded overall averages.
 * @throws {SyntaxError} When either file does not contain valid JSON.
 */
export function compareBaseline(baselineFile) {
    if (!existsSync(SCORE_SUMMARY_PATH)) {
        return {
            message: "No current score-summary.json found.",
            success: false,
        };
    }
    // Find baseline to compare against
    const baselines = listBaselines();
    if (baselines.length === 0) {
        return {
            message: "No baselines saved yet. Run 'pnpm baseline:save' first.",
            success: false,
        };
    }
    const targetFile = baselineFile ?? baselines[0].filename;
    const baselinePath = join(BASELINES_DIR, targetFile);
    if (!existsSync(baselinePath)) {
        return {
            message: `Baseline file not found: ${targetFile}`,
            success: false,
        };
    }
    const current = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
    const baseline = JSON.parse(readFileSync(baselinePath, "utf-8"));
    const baselineMap = new Map(baseline.scores.map((s) => [s.feature, s.totalScore]));
    const baselineCostMap = new Map(baseline.scores.map((s) => [s.feature, s.totalCost ?? 0]));
    const comparisons = current.scores.map((s) => buildScoreComparison(s.feature, s.totalScore, baselineMap.get(s.feature) ?? 0, s.totalCost ?? 0, baselineCostMap.get(s.feature) ?? 0));
    // Areas present in the baseline but not in the current run. These keep
    // their baseline cost so the cost table does not silently drop them.
    const currentFeatures = new Set(current.scores.map((s) => s.feature));
    for (const [feature, score] of baselineMap) {
        if (!currentFeatures.has(feature)) {
            comparisons.push(buildScoreComparison(feature, 0, score, 0, baselineCostMap.get(feature) ?? 0));
        }
    }
    comparisons.sort((a, b) => b.delta - a.delta);
    // Both averages are rounded before subtracting, so the delta matches the
    // rounded averages shown elsewhere (e.g. in the baseline history).
    const overallDelta = Math.round(current.overall.avgScore) - Math.round(baseline.overall.avgScore);
    return {
        comparisons,
        message: `Compared against ${targetFile}`,
        overallDelta,
        success: true,
    };
}
/**
 * Build one comparison row. Cost fields are left undefined when the
 * corresponding cost is not positive, so callers can tell "no cost data"
 * apart from a real value.
 */
function buildScoreComparison(feature, currentScore, baselineScore, currentCost, baselineCost) {
    const hasAnyCost = currentCost > 0 || baselineCost > 0;
    return {
        baseline: baselineScore,
        costBaseline: baselineCost > 0 ? baselineCost : undefined,
        costCurrent: currentCost > 0 ? currentCost : undefined,
        costDelta: hasAnyCost ? currentCost - baselineCost : undefined,
        current: currentScore,
        delta: currentScore - baselineScore,
        feature,
    };
}
|
|
84
|
+
/**
 * List the saved baseline snapshots, newest first.
 *
 * Every `.json` file in the baselines directory is read and summarized.
 * Ordering is by file name, descending; this relies on file names starting
 * with a sortable timestamp, as the names generated by `saveBaseline` do.
 * Files that cannot be parsed, or that lack the `scores` array or `overall`
 * object, are skipped with a warning so that one bad file does not break
 * listing or comparing the remaining baselines.
 *
 * @returns {Array<{areaCount: number, avgScore: number, filename: string,
 *   graderCost?: number, tag?: string, timestamp: string, totalCost?: number}>}
 *   Empty when the baselines directory does not exist.
 */
export function listBaselines() {
    if (!existsSync(BASELINES_DIR)) {
        return [];
    }
    const files = readdirSync(BASELINES_DIR)
        .filter((f) => f.endsWith(".json"))
        .sort()
        .reverse(); // Newest first
    const baselines = [];
    for (const filename of files) {
        let data;
        try {
            data = JSON.parse(readFileSync(join(BASELINES_DIR, filename), "utf-8"));
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`Skipping unreadable baseline ${filename}: ${reason}`);
            continue;
        }
        const hasExpectedShape = Array.isArray(data?.scores) &&
            typeof data.overall === "object" &&
            data.overall !== null;
        if (!hasExpectedShape) {
            console.warn(`Skipping baseline ${filename}: missing "scores" or "overall"`);
            continue;
        }
        baselines.push({
            areaCount: data.scores.length,
            avgScore: Math.round(data.overall.avgScore),
            filename,
            graderCost: data.overall.cost?.graderTotal,
            tag: data.baselineMeta?.tag,
            timestamp: data.timestamp,
            totalCost: data.overall.cost?.total,
        });
    }
    return baselines;
}
|
|
106
|
+
/**
 * Snapshot the current score summary into the baselines directory.
 *
 * The snapshot is the parsed summary plus a `baselineMeta` object holding
 * the save time and the tag (if any).
 *
 * @param {string} [tag] - Optional label. Characters other than letters,
 *   digits and "-" are replaced by "-" and the result is lower-cased before
 *   being appended to the file name.
 * @returns {{message: string, success: boolean}} `success: false` when the
 *   score summary file does not exist.
 */
export function saveBaseline(tag) {
    if (!existsSync(SCORE_SUMMARY_PATH)) {
        return {
            message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
            success: false,
        };
    }
    const summary = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
    mkdirSync(BASELINES_DIR, { recursive: true });
    // File name: YYYYMMDD_HH_mm_ss[_tag].json, taken from the UTC ISO string,
    // e.g. "2024-01-02T03:04:05.678Z" -> "20240102_03_04_05".
    const savedAt = new Date();
    const stamp = savedAt
        .toISOString()
        .slice(0, 19)
        .replace(/[T:]/g, "_")
        .split("-")
        .join("");
    let suffix = "";
    if (tag) {
        suffix = `_${tag.replace(/[^a-z0-9-]/gi, "-").toLowerCase()}`;
    }
    const filename = `${stamp}${suffix}.json`;
    const snapshot = {
        ...summary,
        baselineMeta: {
            savedAt: savedAt.toISOString(),
            // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string tag should be treated as no tag
            tag: tag || undefined,
        },
    };
    writeFileSync(join(BASELINES_DIR, filename), JSON.stringify(snapshot, null, 2));
    const roundedAvg = Math.round(summary.overall.avgScore);
    return {
        message: `Saved baseline to results/baselines/${filename} (avg: ${roundedAvg}, ${summary.scores.length} areas)`,
        success: true,
    };
}
|
|
141
|
+
// ---------------------------------------------------------------------------
|
|
142
|
+
// CLI
|
|
143
|
+
// ---------------------------------------------------------------------------
|
|
144
|
+
// Run the CLI only when the executed script's path ends with this module's
// file name (source or compiled form).
if (["baseline.ts", "baseline.js"].some((suffix) => process.argv[1]?.endsWith(suffix))) {
    const cliArgs = process.argv.slice(2);
    // First positional argument selects the sub-command; default is "save".
    const command = cliArgs[0] || "save";
    // Value following `--<name>`, or undefined when the flag is absent or is
    // the last argument.
    const getArg = (name) => {
        const flagAt = cliArgs.indexOf(`--${name}`);
        const hasValue = flagAt !== -1 && flagAt + 1 < cliArgs.length;
        return hasValue ? cliArgs[flagAt + 1] : undefined;
    };
    // Up / down / unchanged marker for a numeric delta.
    const trendIcon = (d) => {
        if (d > 0) {
            return "📈";
        }
        if (d < 0) {
            return "📉";
        }
        return "➡️";
    };
    // Score delta with an explicit "+" sign, or "=" when unchanged.
    const signedPoints = (d) => {
        if (d > 0) {
            return `+${d}`;
        }
        if (d === 0) {
            return "=";
        }
        return String(d);
    };
    // Column header shared by the score table and the cost table.
    const printComparisonHeader = () => {
        console.log(" " +
            "Feature Area".padEnd(18) +
            "Current".padEnd(10) +
            "Baseline".padEnd(10) +
            "Delta");
        console.log(" " + "-".repeat(50));
    };
    if (command === "compare") {
        const file = getArg("file");
        console.log("=== Baseline Comparison ===\n");
        const result = compareBaseline(file);
        if (!result.success) {
            console.error(` ❌ ${result.message}`);
            process.exit(1);
        }
        console.log(` ${result.message}\n`);
        printComparisonHeader();
        for (const row of result.comparisons) {
            console.log(" " +
                row.feature.padEnd(18) +
                String(row.current).padEnd(10) +
                String(row.baseline).padEnd(10) +
                `${trendIcon(row.delta)} ${signedPoints(row.delta)}`);
        }
        // Cost comparison (only for rows that carry cost data on either side)
        const costRows = result.comparisons.filter((row) => row.costCurrent !== undefined || row.costBaseline !== undefined);
        if (costRows.length > 0) {
            console.log();
            console.log(" " + "Cost Comparison:");
            printComparisonHeader();
            for (const row of costRows) {
                const currentCell = `$${(row.costCurrent ?? 0).toFixed(4)}`;
                const baselineCell = `$${(row.costBaseline ?? 0).toFixed(4)}`;
                const costDelta = row.costDelta ?? 0;
                let deltaCell = "=";
                if (costDelta > 0) {
                    deltaCell = `+$${costDelta.toFixed(4)}`;
                }
                else if (costDelta < 0) {
                    deltaCell = `-$${Math.abs(costDelta).toFixed(4)}`;
                }
                console.log(" " +
                    row.feature.padEnd(18) +
                    currentCell.padEnd(10) +
                    baselineCell.padEnd(10) +
                    `${trendIcon(costDelta)} ${deltaCell}`);
            }
        }
        console.log();
        console.log(` Overall: ${trendIcon(result.overallDelta)} ${signedPoints(result.overallDelta)} points`);
    }
    else if (command === "history") {
        console.log("=== Baseline History ===\n");
        const baselines = listBaselines();
        if (baselines.length === 0) {
            console.log(" No baselines saved yet.");
        }
        else {
            // The cost column is shown only when at least one baseline has cost data.
            const hasCosts = baselines.some((b) => b.totalCost !== undefined || b.graderCost !== undefined);
            const costHeader = hasCosts ? "Cost".padEnd(10) : "";
            console.log(" " +
                "Date".padEnd(22) +
                "Avg".padEnd(6) +
                "Areas".padEnd(7) +
                costHeader +
                "Tag");
            console.log(" " + "-".repeat(hasCosts ? 60 : 50));
            for (const b of baselines) {
                const date = new Date(b.timestamp).toLocaleString();
                const combinedCost = (b.totalCost ?? 0) + (b.graderCost ?? 0);
                let costCell = "";
                if (hasCosts) {
                    const shown = combinedCost > 0 ? `$${combinedCost.toFixed(2)}` : "-";
                    costCell = shown.padEnd(10);
                }
                console.log(" " +
                    date.padEnd(22) +
                    String(b.avgScore).padEnd(6) +
                    String(b.areaCount).padEnd(7) +
                    costCell +
                    (b.tag ?? ""));
            }
        }
    }
    else if (command === "save") {
        const tag = getArg("tag");
        console.log("=== Saving baseline snapshot ===\n");
        const result = saveBaseline(tag);
        if (result.success) {
            console.log(` ✅ ${result.message}`);
        }
        else {
            console.error(` ❌ ${result.message}`);
            process.exit(1);
        }
    }
    else {
        console.error(`Unknown command: "${command}". Use: save, history, compare`);
        process.exit(1);
    }
}
|