@sanity/ailf 0.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/features.ts +23 -0
- package/config/models.ts +95 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
- package/dist/_vendor/ailf-core/config-helpers.js +170 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +39 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +141 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
- package/dist/adapters/task-sources/index.d.ts +3 -2
- package/dist/adapters/task-sources/index.js +3 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
- package/dist/adapters/task-sources/repo-task-source.js +92 -80
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +9 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +99 -4
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +28 -8
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +8 -7
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +261 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +7 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
- package/dist/pipeline/mirror-repo-tasks.js +22 -21
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +15 -4
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -0,0 +1,503 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* telemetry.test.ts — Tests for the observability & telemetry subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Covers tool call classification, trace collection, cost tracking,
|
|
5
|
+
* redaction pipeline, trace storage, and per-turn trace merging.
|
|
6
|
+
*
|
|
7
|
+
* Run: npx tsx --test src/pipeline/compiler/__tests__/telemetry.test.ts
|
|
8
|
+
*/
|
|
9
|
+
import assert from "node:assert/strict";
|
|
10
|
+
import { existsSync, rmSync } from "node:fs";
|
|
11
|
+
import { afterEach, describe, it } from "node:test";
|
|
12
|
+
import { tmpdir } from "os";
|
|
13
|
+
import { resolve } from "path";
|
|
14
|
+
import { classifyToolCall, classifyToolCalls, } from "../telemetry/tool-classifier.js";
|
|
15
|
+
import { collectTrace, mergeTraces } from "../telemetry/trace-collector.js";
|
|
16
|
+
import { checkBudget, computeCost, estimateRunCost, lookupPricing, } from "../telemetry/cost-tracker.js";
|
|
17
|
+
import { redactTrace } from "../telemetry/redactor.js";
|
|
18
|
+
import { extractTraceSummary, LocalTraceStore, } from "../telemetry/trace-store.js";
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Tool call classification
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
/**
 * Tests for classifyToolCall: one tool name in, one category string out.
 * Each case below pins the category returned for specific names, with and
 * without a caller-supplied mapping object as the second argument.
 */
describe("classifyToolCall", () => {
    /** Asserts the category for every [toolName, expectedCategory] pair, in order. */
    const expectCategories = (pairs) => {
        for (const [toolName, expectedCategory] of pairs) {
            assert.equal(classifyToolCall(toolName), expectedCategory);
        }
    };
    it("classifies known tools by exact name", () => {
        expectCategories([
            ["WebSearch", "search"],
            ["Read", "read"],
            ["Write", "write"],
            ["Bash", "execute"],
            ["Browser.navigate", "navigate"],
            ["AskUser", "communicate"],
        ]);
    });
    it("uses heuristic for unknown tools", () => {
        expectCategories([
            ["custom_search_tool", "search"],
            ["ReadFromDB", "read"],
            ["writeConfig", "write"],
            ["executeScript", "execute"],
        ]);
    });
    it("uses custom mappings over defaults", () => {
        // The second argument maps a tool name to the category it should get.
        const customMappings = { MyTool: "communicate" };
        const category = classifyToolCall("MyTool", customMappings);
        assert.equal(category, "communicate");
    });
    it("falls back to execute for truly unknown tools", () => {
        const category = classifyToolCall("zzz_unknown_zzz");
        assert.equal(category, "execute");
    });
});
|
|
44
|
+
/**
 * Tests for classifyToolCalls: a list of tool names in, an object with
 * `categories` and `unrecognized` out.
 */
describe("classifyToolCalls", () => {
    it("classifies a batch and reports unrecognized names", () => {
        const toolNames = ["WebSearch", "Read", "zzz_mystery_tool"];
        const result = classifyToolCalls(toolNames);
        // One category per input name; the first two are pinned explicitly.
        assert.equal(result.categories.length, 3);
        assert.equal(result.categories[0], "search");
        assert.equal(result.categories[1], "read");
        // The name that matches nothing must be reported back to the caller.
        assert.ok(result.unrecognized.includes("zzz_mystery_tool"));
    });
});
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
// Trace collection
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
/**
 * Tests for collectTrace: a provider response object plus run/task options
 * in, a trace object out. The cases check the identifiers copied from the
 * options, the tool calls and token counts taken from the response, the
 * derived URL / search-term / file lists, the event log and the root span.
 */
describe("collectTrace", () => {
    // Options shared by every case in this suite.
    const baseOptions = {
        runId: "run-1",
        taskId: "task-1",
        testCaseIndex: 0,
        modelId: "openai:chat:gpt-4o",
    };
    /**
     * Collects a trace for a response whose metadata carries `toolCalls`.
     * `extra` holds any additional top-level response fields (e.g. latencyMs).
     */
    const traceForToolCalls = (toolCalls, extra = {}) => collectTrace({ metadata: { toolCalls }, ...extra }, baseOptions);
    it("creates a trace from an empty response", () => {
        const trace = collectTrace({}, baseOptions);
        assert.equal(trace.runId, "run-1");
        assert.equal(trace.taskId, "task-1");
        assert.equal(trace.modelId, "openai:chat:gpt-4o");
        assert.equal(trace.toolCalls.length, 0);
        assert.equal(trace.tokensUsed.totalTokens, 0);
    });
    it("extracts tool calls from metadata", () => {
        const trace = traceForToolCalls([
            { name: "WebSearch", input: { query: "GROQ" }, durationMs: 100 },
            { name: "Read", input: { path: "/docs/groq.md" }, durationMs: 50 },
        ]);
        assert.equal(trace.toolCalls.length, 2);
        const expectedCalls = [
            ["WebSearch", "search"],
            ["Read", "read"],
        ];
        expectedCalls.forEach(([name, category], position) => {
            assert.equal(trace.toolCalls[position].name, name);
            assert.equal(trace.toolCalls[position].category, category);
        });
    });
    it("extracts token usage", () => {
        const response = {
            tokenUsage: { prompt: 1000, completion: 500, total: 1500 },
        };
        const { tokensUsed } = collectTrace(response, baseOptions);
        assert.equal(tokensUsed.promptTokens, 1000);
        assert.equal(tokensUsed.completionTokens, 500);
        assert.equal(tokensUsed.totalTokens, 1500);
    });
    it("extracts URLs from tool calls", () => {
        const trace = traceForToolCalls([
            { name: "WebFetch", input: { url: "https://sanity.io/docs" } },
        ]);
        assert.ok(trace.urlsVisited.includes("https://sanity.io/docs"));
    });
    it("extracts search terms", () => {
        const trace = traceForToolCalls([
            { name: "WebSearch", input: { query: "GROQ projection" } },
        ]);
        assert.ok(trace.searchTerms.includes("GROQ projection"));
    });
    it("extracts files read and written", () => {
        const trace = traceForToolCalls([
            { name: "Read", input: { path: "/src/schema.ts" } },
            { name: "Write", input: { path: "/src/config.ts" } },
        ]);
        assert.ok(trace.filesRead.includes("/src/schema.ts"));
        assert.ok(trace.filesWritten.includes("/src/config.ts"));
    });
    it("creates event log from tool calls", () => {
        const trace = traceForToolCalls([{ name: "WebSearch", input: { query: "test" }, durationMs: 100 }], { latencyMs: 500 });
        // One tool call is expected to yield exactly these four events, in order.
        const expectedTypes = [
            "llm_request",
            "tool_call_start",
            "tool_call_end",
            "llm_response",
        ];
        assert.equal(trace.events.length, 4);
        expectedTypes.forEach((type, position) => {
            assert.equal(trace.events[position].type, type);
        });
    });
    it("builds a root span", () => {
        const trace = collectTrace({ latencyMs: 1000 }, baseOptions);
        assert.equal(trace.spans.length, 1);
        const rootSpan = trace.spans[0];
        assert.equal(rootSpan.operation, "test-case");
        // A root span has no parent.
        assert.equal(rootSpan.parentSpanId, null);
    });
});
|
|
151
|
+
// ---------------------------------------------------------------------------
|
|
152
|
+
// mergeTraces (per-turn tracing — task 6f)
|
|
153
|
+
// ---------------------------------------------------------------------------
|
|
154
|
+
describe("mergeTraces", () => {
    // Options passed to mergeTraces; per-turn traces reuse them with their own testCaseIndex.
    const parentOptions = {
        runId: "run-1",
        taskId: "task-1",
        testCaseIndex: 0,
        modelId: "openai:chat:gpt-4o",
    };
    /**
     * Collect the trace of one turn: a single 50 ms WebSearch call,
     * 100/50/150 prompt/completion/total tokens and 200 ms latency.
     */
    const makeTurn = (index) => {
        const searchCall = {
            name: "WebSearch",
            input: { query: `turn ${index}` },
            durationMs: 50,
        };
        const turnResponse = {
            metadata: { toolCalls: [searchCall] },
            tokenUsage: { prompt: 100, completion: 50, total: 150 },
            latencyMs: 200,
        };
        return collectTrace(turnResponse, { ...parentOptions, testCaseIndex: index });
    };
    it("merges multiple turns into one trace", () => {
        const turns = [0, 1, 2].map((turnIndex) => makeTurn(turnIndex));
        const merged = mergeTraces(turns, parentOptions);
        // Three turns, so every total is three times the per-turn figure.
        assert.equal(merged.toolCalls.length, 3);
        assert.equal(merged.tokensUsed.promptTokens, 300);
        assert.equal(merged.tokensUsed.completionTokens, 150);
        assert.equal(merged.durationMs, 600);
    });
    it("creates per-turn spans under root", () => {
        const merged = mergeTraces([makeTurn(0), makeTurn(1)], parentOptions);
        const { spans } = merged;
        // root + 2 turns
        assert.equal(spans.length, 3);
        const rootSpan = spans[0];
        assert.equal(rootSpan.operation, "test-case");
        assert.equal(rootSpan.parentSpanId, null);
        assert.equal(spans[1].operation, "turn-0");
        assert.equal(spans[1].parentSpanId, rootSpan.spanId);
        assert.equal(spans[2].operation, "turn-1");
    });
    it("deduplicates URLs and search terms", () => {
        // Both turns run the same search and fetch the same URL.
        const repeatedCalls = () => [
            { name: "WebSearch", input: { query: "GROQ" } },
            { name: "WebFetch", input: { url: "https://sanity.io" } },
        ];
        const firstTurn = collectTrace(
            { metadata: { toolCalls: repeatedCalls() } },
            { ...parentOptions, testCaseIndex: 0 },
        );
        const secondTurn = collectTrace(
            { metadata: { toolCalls: repeatedCalls() } },
            { ...parentOptions, testCaseIndex: 1 },
        );
        const merged = mergeTraces([firstTurn, secondTurn], parentOptions);
        assert.equal(merged.searchTerms.length, 1); // deduplicated
        assert.equal(merged.urlsVisited.length, 1); // deduplicated
    });
    it("handles empty turns", () => {
        const noTurns = [];
        const merged = mergeTraces(noTurns, parentOptions);
        assert.equal(merged.toolCalls.length, 0);
        assert.equal(merged.spans.length, 1); // root only
    });
});
|
|
222
|
+
// ---------------------------------------------------------------------------
|
|
223
|
+
// Cost tracking
|
|
224
|
+
// ---------------------------------------------------------------------------
|
|
225
|
+
describe("computeCost", () => {
    it("computes cost from token usage and pricing", () => {
        const usage = { promptTokens: 1000, completionTokens: 500, totalTokens: 1500 };
        const pricing = { input: 3.0, output: 15.0 };
        const cost = computeCost(usage, pricing);
        // 1000 * 3.0/1M + 500 * 15.0/1M = 0.003 + 0.0075 = 0.0105
        const expected = 0.0105;
        const tolerance = 0.0001;
        assert.ok(Math.abs(cost - expected) < tolerance);
    });
    it("accounts for cached input tokens", () => {
        const usage = {
            promptTokens: 1000,
            completionTokens: 500,
            totalTokens: 1500,
            toolTokens: 300,
        };
        const pricing = { input: 3.0, output: 15.0, cachedInput: 0.3 };
        const cost = computeCost(usage, pricing);
        // 700 * 3.0/1M + 300 * 0.3/1M + 500 * 15.0/1M = 0.0021 + 0.00009 + 0.0075
        // NOTE(review): the fixture supplies `toolTokens`, while the arithmetic above
        // treats those 300 tokens as cached input. Only a loose range is asserted,
        // so this test does not prove the cached rate is applied. Confirm against
        // computeCost before tightening.
        assert.ok(cost > 0);
        assert.ok(cost < 0.02);
    });
});
|
|
243
|
+
describe("lookupPricing", () => {
    it("finds exact match", () => {
        const modelId = "openai:chat:gpt-4o";
        const pricing = lookupPricing(modelId);
        assert.ok(pricing);
        assert.ok(pricing.input > 0);
    });
    it("falls back to prefix match", () => {
        // A dated variant of the model id above is still expected to resolve.
        const datedModelId = "openai:chat:gpt-4o-2024-11-20";
        const pricing = lookupPricing(datedModelId);
        assert.ok(pricing);
    });
    it("returns undefined for unknown model", () => {
        const unknownModelId = "unknown:model:xyz";
        const pricing = lookupPricing(unknownModelId);
        assert.equal(pricing, undefined);
    });
    it("uses custom pricing over defaults", () => {
        // The second argument carries caller-supplied pricing keyed by model id.
        const customTable = {
            "custom:model": { input: 1.0, output: 2.0 },
        };
        const pricing = lookupPricing("custom:model", customTable);
        assert.ok(pricing);
        assert.equal(pricing.input, 1.0);
    });
});
|
|
265
|
+
describe("estimateRunCost", () => {
    it("estimates cost for a run", () => {
        const modelIds = ["openai:chat:gpt-4o"];
        const estimate = estimateRunCost(5, modelIds);
        assert.ok(estimate.totalUSD > 0);
        // One entry per requested model.
        assert.equal(estimate.perModel.length, 1);
    });
    it("flags budget warning", () => {
        const modelIds = [
            "openai:chat:gpt-4o",
            "anthropic:messages:claude-sonnet-4-6",
        ];
        // A very low warn threshold that the estimate is expected to cross.
        const budget = { perRun: { warn: 0.01, stop: 1.0 } };
        const estimate = estimateRunCost(100, modelIds, budget);
        assert.equal(estimate.exceedsWarning, true);
    });
});
|
|
276
|
+
describe("checkBudget", () => {
    // Fresh budget config per test: warn at 5.0, stop at 20.0 for the "perRun" scope.
    const perRunBudget = () => ({ perRun: { warn: 5.0, stop: 20.0 } });
    it("allows spend below thresholds", () => {
        const spend = 1.0;
        const result = checkBudget(spend, perRunBudget(), "perRun");
        assert.equal(result.proceed, true);
        assert.equal(result.warning, undefined);
    });
    it("warns at warn threshold", () => {
        // Spend sits between the warn and stop thresholds.
        const spend = 5.5;
        const result = checkBudget(spend, perRunBudget(), "perRun");
        assert.equal(result.proceed, true);
        assert.ok(result.warning?.includes("warning"));
    });
    it("stops at stop threshold", () => {
        // Spend is above the stop threshold.
        const spend = 25.0;
        const result = checkBudget(spend, perRunBudget(), "perRun");
        assert.equal(result.proceed, false);
        assert.ok(result.warning?.includes("exceeded"));
    });
});
|
|
293
|
+
// ---------------------------------------------------------------------------
|
|
294
|
+
// Redaction
|
|
295
|
+
// ---------------------------------------------------------------------------
|
|
296
|
+
describe("redactTrace", () => {
    /**
     * Build a trace whose only populated field is `toolCalls`;
     * every other collection is empty and every counter is zero.
     */
    const makeTrace = (toolCalls) => ({
        traceId: "trace-1",
        runId: "run-1",
        taskId: "task-1",
        testCaseIndex: 0,
        modelId: "openai:chat:gpt-4o",
        spans: [],
        toolCalls,
        urlsVisited: [],
        searchTerms: [],
        filesRead: [],
        filesWritten: [],
        tokensUsed: { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
        costEstimate: 0,
        durationMs: 0,
        events: [],
        startedAt: new Date().toISOString(),
        completedAt: new Date().toISOString(),
    });
    /** Serialize the input of the first tool call so it can be searched for secrets. */
    const firstInputAsJson = (trace) => JSON.stringify(trace.toolCalls[0].input);
    it("redacts Bearer tokens in tool call inputs", () => {
        const fetchCall = {
            name: "WebFetch",
            input: {
                url: "https://api.sanity.io",
                auth: "Bearer sk_live_abc123def456ghi789",
            },
            output: "OK",
            durationMs: 100,
            category: "read",
        };
        const outcome = redactTrace(makeTrace([fetchCall]));
        const inputStr = firstInputAsJson(outcome.trace);
        assert.ok(!inputStr.includes("sk_live_abc123def456ghi789"));
        assert.ok(inputStr.includes("[REDACTED]"));
        assert.ok(outcome.redactionCount > 0);
    });
    it("redacts Sanity tokens", () => {
        const writeCall = {
            name: "Write",
            input: { token: "skAbcDefGhiJklMnoPqrStUvWxYz0123456789" },
            output: null,
            durationMs: 10,
            category: "write",
        };
        const outcome = redactTrace(makeTrace([writeCall]));
        const inputStr = firstInputAsJson(outcome.trace);
        assert.ok(inputStr.includes("[REDACTED_SANITY_TOKEN]"));
    });
    it("redacts OpenAI keys", () => {
        const bashCall = {
            name: "Bash",
            input: {
                command: "export OPENAI_API_KEY=sk-proj-abcdefghij1234567890abcdefghij",
            },
            output: null,
            durationMs: 10,
            category: "execute",
        };
        const outcome = redactTrace(makeTrace([bashCall]));
        const inputStr = firstInputAsJson(outcome.trace);
        assert.ok(!inputStr.includes("sk-proj-abcdefghij1234567890abcdefghij"), "OpenAI key should be redacted");
    });
    it("does not mutate the original trace", () => {
        const fetchCall = {
            name: "WebFetch",
            input: { auth: "Bearer secrettoken1234567890" },
            output: null,
            durationMs: 10,
            category: "read",
        };
        const original = makeTrace([fetchCall]);
        // Snapshot before redaction; the serialized form must be unchanged afterwards.
        const snapshot = JSON.stringify(original);
        redactTrace(original);
        assert.equal(JSON.stringify(original), snapshot);
    });
    it("reports which rules fired", () => {
        const curlCall = {
            name: "Bash",
            input: {
                cmd: "curl -H 'Authorization: Bearer abc123def456789' https://api.example.com",
            },
            output: null,
            durationMs: 10,
            category: "execute",
        };
        const outcome = redactTrace(makeTrace([curlCall]));
        assert.ok(outcome.rulesApplied.includes("bearer_tokens"));
    });
});
|
|
397
|
+
// ---------------------------------------------------------------------------
|
|
398
|
+
// Trace storage
|
|
399
|
+
// ---------------------------------------------------------------------------
|
|
400
|
+
describe("LocalTraceStore", () => {
    // Scratch directory under the OS temp dir, named after the current process id.
    const storeDir = resolve(tmpdir(), `ailf-trace-test-${process.pid}`);
    // Remove the scratch directory after each test if one was created.
    afterEach(() => {
        if (!existsSync(storeDir)) {
            return;
        }
        rmSync(storeDir, { recursive: true, force: true });
    });
    it("stores and retrieves a trace", async () => {
        const traceId = "trace-store-test";
        const trace = {
            traceId,
            runId: "run-1",
            taskId: "task-1",
            testCaseIndex: 0,
            modelId: "openai:chat:gpt-4o",
            spans: [],
            toolCalls: [],
            urlsVisited: [],
            searchTerms: [],
            filesRead: [],
            filesWritten: [],
            tokensUsed: { promptTokens: 100, completionTokens: 50, totalTokens: 150 },
            costEstimate: 0.001,
            durationMs: 500,
            events: [],
            startedAt: new Date().toISOString(),
            completedAt: new Date().toISOString(),
        };
        const store = new LocalTraceStore(storeDir);
        const stored = await store.store(trace);
        assert.ok(stored.uri.startsWith("file://"));
        assert.ok(stored.sizeBytes > 0);
        // Round-trip: the URI returned by store() must resolve to the same trace.
        const retrieved = await store.retrieve(stored.uri);
        assert.ok(retrieved);
        assert.equal(retrieved.traceId, "trace-store-test");
    });
    it("returns null for non-existent trace", async () => {
        const missingUri = "file:///nonexistent/path.json";
        const store = new LocalTraceStore(storeDir);
        const retrieved = await store.retrieve(missingUri);
        assert.equal(retrieved, null);
    });
});
|
|
441
|
+
// ---------------------------------------------------------------------------
|
|
442
|
+
// Trace summary extraction
|
|
443
|
+
// ---------------------------------------------------------------------------
|
|
444
|
+
describe("extractTraceSummary", () => {
    it("extracts sanitized summary from full trace", () => {
        /** Tool call record with an empty input and a null output. */
        const toolCall = (name, durationMs, category) => ({
            name,
            input: {},
            output: null,
            durationMs,
            category,
        });
        const trace = {
            traceId: "trace-summary-test",
            runId: "run-1",
            taskId: "task-1",
            testCaseIndex: 0,
            modelId: "openai:chat:gpt-4o",
            spans: [],
            // One search call and two read calls.
            toolCalls: [
                toolCall("WebSearch", 100, "search"),
                toolCall("Read", 50, "read"),
                toolCall("Read", 30, "read"),
            ],
            urlsVisited: ["https://sanity.io/docs"],
            searchTerms: ["GROQ"],
            filesRead: ["/src/schema.ts"],
            filesWritten: [],
            tokensUsed: {
                promptTokens: 1000,
                completionTokens: 500,
                totalTokens: 1500,
            },
            costEstimate: 0.01,
            durationMs: 2000,
            events: [],
            startedAt: new Date().toISOString(),
            completedAt: new Date().toISOString(),
        };
        const traceUri = "file:///traces/trace-1.json";
        const summary = extractTraceSummary(trace, traceUri);
        // Identity and storage location are carried over.
        assert.equal(summary.traceId, "trace-summary-test");
        assert.equal(summary.traceDataUri, "file:///traces/trace-1.json");
        // Tool calls are reduced to a total and per-category counts.
        assert.equal(summary.toolCallCount, 3);
        assert.equal(summary.toolCallCategories.search, 1);
        assert.equal(summary.toolCallCategories.read, 2);
        assert.equal(summary.totalTokens, 1500);
        assert.equal(summary.costEstimate, 0.01);
        // URL and file lists are reduced to counts.
        assert.equal(summary.urlsVisitedCount, 1);
        assert.equal(summary.filesReadCount, 1);
        assert.equal(summary.filesWrittenCount, 0);
    });
});
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Assertion type mapper — maps AILF assertion types to Promptfoo assertion types.
|
|
3
|
+
*
|
|
4
|
+
* AILF assertions have two flavors:
|
|
5
|
+
* 1. Templated assertions (`type: "llm-rubric"` with `template` + `criteria`)
|
|
6
|
+
* → resolved into Promptfoo's `llm-rubric` with a fully assembled rubric prompt
|
|
7
|
+
* 2. Value assertions (any other `type` with a `value`)
|
|
8
|
+
* → passed through to Promptfoo mostly as-is
|
|
9
|
+
*
|
|
10
|
+
* This module handles the mapping for both, validates mode compatibility
|
|
11
|
+
* (e.g., `tool-called` is only valid for agent-harness/mcp-server modes),
|
|
12
|
+
* and normalizes weight fields.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/design-docs/architecture-overhaul/scoring-rubrics-assertions.md
|
|
15
|
+
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
16
|
+
*/
|
|
17
|
+
import type { GeneralizedAssertionDefinition } from "../../_vendor/ailf-core/index.d.ts";
|
|
18
|
+
import type { EvalMode } from "../../_vendor/ailf-shared/index.d.ts";
|
|
19
|
+
/**
 * Assertion object in the shape Promptfoo accepts.
 *
 * Only `type` is required. Keys not listed here are allowed through the
 * index signature and are typed as `unknown`.
 */
export interface PromptfooAssertion {
    /** Assertion type name. */
    type: string;
    /** Assertion value, when the type takes one. */
    value?: unknown;
    /** Weight of this assertion. */
    weight?: number;
    /** Promptfoo field: provider used for model-graded assertions. */
    provider?: string;
    /** Promptfoo field: text of the rubric prompt. */
    rubricPrompt?: string;
    /** Promptfoo field: similarity threshold. */
    threshold?: number;
    /** Any further properties are passed through untouched. */
    [key: string]: unknown;
}
|
|
33
|
+
/**
 * Optional settings accepted by the assertion mapper.
 * Both fields may be omitted.
 */
export interface AssertionMapperOptions {
    /** Evaluation mode, consulted when checking assertion/mode compatibility. */
    mode?: EvalMode;
    /** Grader provider applied by default to LLM-graded assertions. */
    graderProvider?: string;
}
|
|
40
|
+
/**
 * Convert a list of AILF assertion definitions into Promptfoo assertions.
 *
 * @param assertions - The AILF assertion definitions to convert
 * @param options - Optional mapper settings (mode, default grader provider)
 * @returns An object holding the converted assertions (`mapped`) and any
 *   warnings raised during conversion (`warnings`)
 */
export declare function mapAssertions(
    assertions: GeneralizedAssertionDefinition[],
    options?: AssertionMapperOptions,
): {
    mapped: PromptfooAssertion[];
    warnings: string[];
};
|
|
51
|
+
/**
 * Report whether the given assertion type is valid.
 *
 * @param type - Assertion type name to check
 * @returns `true` when the type is valid, `false` otherwise
 */
export declare function isValidAssertionType(type: string): boolean;
|
|
55
|
+
/**
 * Report whether an assertion type may be used with a given evaluation mode.
 *
 * @param type - Assertion type name to check
 * @param mode - Evaluation mode to check the type against
 * @returns `true` when the type is compatible with the mode, `false` otherwise
 */
export declare function isAssertionCompatibleWithMode(type: string, mode: EvalMode): boolean;
|