@sanity/ailf 0.5.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/features.ts +23 -0
- package/config/models.ts +95 -0
- package/config/prompts.ts +16 -0
- package/config/rubrics.ts +225 -0
- package/config/schedules.ts +47 -0
- package/config/sinks.ts +37 -0
- package/config/sources.ts +21 -0
- package/config/thresholds.ts +61 -0
- package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
- package/dist/_vendor/ailf-core/config-helpers.js +170 -0
- package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
- package/dist/_vendor/ailf-core/env-helper.js +45 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/index.js +5 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
- package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
- package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
- package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
- package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
- package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
- package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
- package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
- package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/services/index.js +2 -1
- package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
- package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
- package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
- package/dist/_vendor/ailf-core/services/scoring.js +25 -15
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
- package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
- package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
- package/dist/_vendor/ailf-core/types/index.js +8 -1
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
- package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
- package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
- package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
- package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
- package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
- package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
- package/dist/_vendor/ailf-core/types/trace.js +18 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
- package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
- package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
- package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
- package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
- package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
- package/dist/_vendor/ailf-shared/index.d.ts +0 -1
- package/dist/_vendor/ailf-shared/index.js +0 -1
- package/dist/adapters/api-client/build-request.js +14 -13
- package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
- package/dist/adapters/config-sources/file-config-adapter.js +39 -12
- package/dist/adapters/config-sources/index.d.ts +2 -0
- package/dist/adapters/config-sources/index.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
- package/dist/adapters/config-sources/ts-config-loader.js +141 -0
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
- package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
- package/dist/adapters/task-sources/composite-task-source.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
- package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
- package/dist/adapters/task-sources/index.d.ts +3 -2
- package/dist/adapters/task-sources/index.js +3 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
- package/dist/adapters/task-sources/repo-task-source.js +92 -80
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
- package/dist/adapters/task-sources/task-file-loader.js +83 -0
- package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
- package/dist/adapters/task-sources/yaml-task-source.js +19 -16
- package/dist/cli.js +0 -2
- package/dist/commands/baseline.js +4 -1
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/coverage-audit.js +9 -1
- package/dist/commands/explain-handler.js +25 -23
- package/dist/commands/fetch-docs.js +3 -2
- package/dist/commands/generate-configs.js +1 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/interactive.js +11 -7
- package/dist/commands/pipeline-action.d.ts +2 -0
- package/dist/commands/pipeline-action.js +16 -6
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +4 -2
- package/dist/commands/pr-comment.js +1 -1
- package/dist/commands/publish.js +2 -2
- package/dist/commands/readiness-report.js +13 -6
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +99 -4
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +28 -8
- package/dist/orchestration/steps/calculate-scores-step.js +24 -11
- package/dist/orchestration/steps/fetch-docs-step.js +8 -7
- package/dist/orchestration/steps/gap-analysis-step.js +8 -7
- package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
- package/dist/orchestration/steps/generate-configs-step.js +261 -51
- package/dist/orchestration/steps/grader-consistency-step.js +7 -4
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
- package/dist/orchestration/steps/readiness-step.js +5 -6
- package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
- package/dist/orchestration/steps/run-eval-step.js +8 -7
- package/dist/pipeline/cache.d.ts +1 -1
- package/dist/pipeline/cache.js +36 -8
- package/dist/pipeline/calculate-scores.d.ts +2 -4
- package/dist/pipeline/calculate-scores.js +43 -113
- package/dist/pipeline/checks.js +2 -2
- package/dist/pipeline/compare.js +8 -8
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
- package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
- package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
- package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
- package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
- package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
- package/dist/pipeline/compiler/assertion-mapper.js +175 -0
- package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
- package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
- package/dist/pipeline/compiler/config-loader.d.ts +56 -0
- package/dist/pipeline/compiler/config-loader.js +111 -0
- package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
- package/dist/pipeline/compiler/fixture-resolver.js +113 -0
- package/dist/pipeline/compiler/hash.d.ts +11 -0
- package/dist/pipeline/compiler/hash.js +18 -0
- package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
- package/dist/pipeline/compiler/ignore-fields.js +113 -0
- package/dist/pipeline/compiler/index.d.ts +29 -0
- package/dist/pipeline/compiler/index.js +45 -0
- package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
- package/dist/pipeline/compiler/literacy-bridge.js +172 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/index.d.ts +9 -0
- package/dist/pipeline/compiler/presets/index.js +8 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
- package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
- package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
- package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
- package/dist/pipeline/compiler/provider-assembler.js +137 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
- package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
- package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
- package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
- package/dist/pipeline/compiler/sandbox/index.js +11 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
- package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
- package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
- package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
- package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
- package/dist/pipeline/compiler/scoring-bridge.js +114 -0
- package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
- package/dist/pipeline/compiler/task-graph-builder.js +291 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
- package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
- package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
- package/dist/pipeline/compiler/telemetry/index.js +19 -0
- package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
- package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
- package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
- package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
- package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
- package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
- package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
- package/dist/pipeline/compiler/variable-resolver.js +115 -0
- package/dist/pipeline/coverage-audit.d.ts +15 -5
- package/dist/pipeline/coverage-audit.js +41 -22
- package/dist/pipeline/eval-constants.d.ts +16 -6
- package/dist/pipeline/eval-constants.js +25 -4
- package/dist/pipeline/eval-fingerprint.d.ts +2 -2
- package/dist/pipeline/eval-fingerprint.js +8 -9
- package/dist/pipeline/expand-tasks.d.ts +19 -10
- package/dist/pipeline/expand-tasks.js +34 -28
- package/dist/pipeline/gap-analysis.d.ts +1 -1
- package/dist/pipeline/gap-analysis.js +2 -2
- package/dist/pipeline/generate-configs.d.ts +22 -4
- package/dist/pipeline/generate-configs.js +53 -24
- package/dist/pipeline/grader-api.d.ts +3 -3
- package/dist/pipeline/grader-api.js +5 -12
- package/dist/pipeline/grader-compare-runner.js +20 -27
- package/dist/pipeline/grader-comparison.d.ts +4 -8
- package/dist/pipeline/grader-comparison.js +11 -17
- package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
- package/dist/pipeline/grader-consistency-runner.js +16 -20
- package/dist/pipeline/grader-consistency.d.ts +6 -10
- package/dist/pipeline/grader-consistency.js +13 -32
- package/dist/pipeline/grader-sensitivity-runner.js +7 -5
- package/dist/pipeline/grader-sensitivity.d.ts +2 -6
- package/dist/pipeline/grader-sensitivity.js +10 -10
- package/dist/pipeline/grader-validate-runner.js +7 -5
- package/dist/pipeline/grader-validation.d.ts +2 -6
- package/dist/pipeline/grader-validation.js +14 -22
- package/dist/pipeline/map-request-to-config.js +7 -1
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
- package/dist/pipeline/mirror-repo-tasks.js +22 -21
- package/dist/pipeline/normalize-mode.d.ts +49 -0
- package/dist/pipeline/normalize-mode.js +64 -0
- package/dist/pipeline/plan.d.ts +5 -2
- package/dist/pipeline/plan.js +134 -78
- package/dist/pipeline/pr-comment.js +2 -0
- package/dist/pipeline/profile-resolution.d.ts +22 -14
- package/dist/pipeline/profile-resolution.js +41 -19
- package/dist/pipeline/provenance.d.ts +2 -2
- package/dist/pipeline/provenance.js +12 -17
- package/dist/pipeline/release-report.js +4 -4
- package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
- package/dist/pipeline/repo-threshold-evaluator.js +1 -1
- package/dist/pipeline/rubric-loader.d.ts +20 -0
- package/dist/pipeline/rubric-loader.js +37 -0
- package/dist/pipeline/validate.d.ts +4 -4
- package/dist/pipeline/validate.js +64 -53
- package/dist/schedules/loader.js +18 -8
- package/dist/scripts/migrate-task-mode.d.ts +24 -0
- package/dist/scripts/migrate-task-mode.js +85 -0
- package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
- package/dist/scripts/validate-task-sources.d.ts +1 -1
- package/dist/scripts/validate-task-sources.js +15 -15
- package/dist/sinks/loader.js +5 -7
- package/dist/sources.d.ts +7 -7
- package/dist/sources.js +22 -24
- package/dist/webhook/dispatch.js +2 -1
- package/package.json +15 -4
- package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
- package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
- package/tasks/literacy/frameworks.task.ts +128 -0
- package/tasks/literacy/functions.task.ts +69 -0
- package/tasks/literacy/groq.task.ts +258 -0
- package/tasks/literacy/nextjs-live.task.ts +75 -0
- package/tasks/literacy/studio-setup.task.ts +131 -0
- package/tasks/literacy/visual-editing.task.ts +146 -0
- package/config/features.yaml +0 -116
- package/config/models.yaml +0 -116
- package/config/prompts.yaml +0 -75
- package/config/rubrics.yaml +0 -81
- package/config/schedules.yaml +0 -43
- package/config/sinks.yaml +0 -54
- package/config/sources.yaml +0 -51
- package/config/thresholds.yaml +0 -49
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TraceCollector — extracts structured trace data from provider responses.
|
|
3
|
+
*
|
|
4
|
+
* Parses tool calls, token usage, and timing data from Promptfoo result
|
|
5
|
+
* objects and normalizes them into the canonical `EvalTrace` shape.
|
|
6
|
+
*
|
|
7
|
+
* Works via inline extraction — parsing provider response metadata
|
|
8
|
+
* directly, without requiring additional infrastructure.
|
|
9
|
+
*
|
|
10
|
+
* @see docs/design-docs/architecture-overhaul/observability-telemetry.md
|
|
11
|
+
* @see packages/core/src/types/trace.ts — EvalTrace types
|
|
12
|
+
*/
|
|
13
|
+
import { classifyToolCall } from "./tool-classifier.js";
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Public API
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
/**
 * Collect a trace from a single provider response.
 *
 * Extracts tool calls (`response.metadata.toolCalls`), token usage
 * (`response.tokenUsage`) and latency (`response.latencyMs`), derives the
 * access patterns (URLs, search terms, files read/written) from the tool
 * calls, and builds the chronological event log and the spans.
 *
 * Timestamps: this function runs after the provider has already answered,
 * so the moment of collection is recorded as `completedAt` and `startedAt`
 * is back-dated by the reported latency. This keeps
 * `completedAt - startedAt` consistent with `durationMs`. When no usable
 * latency is reported both timestamps are identical.
 *
 * @param response - Provider response object. Missing `latencyMs` is
 *   treated as 0.
 * @param options - Trace identity and tuning: `runId`, `taskId`,
 *   `testCaseIndex`, `modelId`, optional `toolCategories` (forwarded to the
 *   tool classifier) and optional `maxOutputBytes` (defaults to 10 240).
 * @returns The trace for this response. `costEstimate` is always 0 here;
 *   it is filled in separately by the cost calculator.
 */
export function collectTrace(response, options) {
    const completedAtMs = Date.now();
    const completedAt = new Date(completedAtMs).toISOString();
    const traceId = `trace-${options.runId}-${options.taskId}-${options.testCaseIndex}`;
    // Extract tool calls
    const toolCalls = extractToolCalls(response, options.toolCategories, options.maxOutputBytes ?? 10_240);
    // Extract token usage
    const tokensUsed = extractTokenUsage(response);
    // Provider-reported latency; 0 when the provider did not report one
    const durationMs = response.latencyMs ?? 0;
    // Back-date the start by the latency. Guard against non-numeric,
    // negative or absurdly large values: `toISOString()` throws a
    // RangeError on an invalid Date.
    let startedAt = completedAt;
    if (Number.isFinite(durationMs) && durationMs > 0) {
        const started = new Date(completedAtMs - durationMs);
        if (!Number.isNaN(started.getTime())) {
            startedAt = started.toISOString();
        }
    }
    // Build event log (events are stamped with the collection time)
    const events = buildEventLog(toolCalls, tokensUsed, durationMs, completedAt);
    // Build spans — one root span for the test case
    const spans = buildSpans(options, durationMs);
    // Extract URLs, search terms and file paths from tool calls
    const { urlsVisited, searchTerms, filesRead, filesWritten } = extractAccessPatterns(toolCalls);
    return {
        traceId,
        runId: options.runId,
        taskId: options.taskId,
        testCaseIndex: options.testCaseIndex,
        modelId: options.modelId,
        spans,
        toolCalls,
        urlsVisited,
        searchTerms,
        filesRead,
        filesWritten,
        tokensUsed,
        costEstimate: 0, // Set by cost calculator separately
        durationMs,
        events,
        startedAt,
        completedAt,
    };
}
|
|
58
|
+
/**
 * Merge multiple per-turn traces into a single test case trace.
 *
 * Each turn produces its own trace. This function combines them into a
 * parent trace with one root span plus one child span per turn. Turn spans
 * are laid end to end: a turn starts where the previous one ended, using
 * each turn's `durationMs`.
 *
 * Aggregation rules:
 * - `toolCalls` and `events` are concatenated in turn order.
 * - `urlsVisited`, `searchTerms`, `filesRead`, `filesWritten` are
 *   concatenated and de-duplicated (first occurrence wins the position).
 * - Token counts, `costEstimate` and `durationMs` are summed; a turn
 *   without `toolTokens` contributes 0.
 * - `startedAt` comes from the first turn, `completedAt` from the last.
 *
 * @param turns - Per-turn traces in chronological order.
 * @param parentOptions - Identity of the parent trace (`runId`, `taskId`,
 *   `testCaseIndex`, `modelId`).
 * @returns The merged trace. With no turns, an empty trace is produced via
 *   `collectTrace({}, parentOptions)`.
 */
export function mergeTraces(turns, parentOptions) {
    if (turns.length === 0) {
        return collectTrace({}, parentOptions);
    }
    const first = turns[0];
    const last = turns[turns.length - 1];
    const traceId = `trace-${parentOptions.runId}-${parentOptions.taskId}-${parentOptions.testCaseIndex}`;
    const rootSpanId = `${traceId}-root`;
    // Merge all tool calls, events, and access patterns
    const toolCalls = turns.flatMap((t) => t.toolCalls);
    const events = turns.flatMap((t) => t.events);
    const urlsVisited = [...new Set(turns.flatMap((t) => t.urlsVisited))];
    const searchTerms = [...new Set(turns.flatMap((t) => t.searchTerms))];
    const filesRead = [...new Set(turns.flatMap((t) => t.filesRead))];
    const filesWritten = [...new Set(turns.flatMap((t) => t.filesWritten))];
    // Single left-to-right pass: accumulate token usage, cost and the
    // running time offset, emitting one span per turn along the way.
    // (Summation order matches a plain reduce from 0, so totals are
    // identical to summing each field independently.)
    const tokensUsed = {
        promptTokens: 0,
        completionTokens: 0,
        totalTokens: 0,
        toolTokens: 0,
    };
    const turnSpans = [];
    let costEstimate = 0;
    let elapsedMs = 0;
    turns.forEach((turn, i) => {
        const startMs = elapsedMs;
        elapsedMs += turn.durationMs;
        turnSpans.push({
            spanId: `${traceId}-turn-${i}`,
            parentSpanId: rootSpanId,
            operation: `turn-${i}`,
            startMs,
            endMs: elapsedMs,
            attributes: {
                modelId: turn.modelId,
                toolCallCount: turn.toolCalls.length,
            },
        });
        tokensUsed.promptTokens += turn.tokensUsed.promptTokens;
        tokensUsed.completionTokens += turn.tokensUsed.completionTokens;
        tokensUsed.totalTokens += turn.tokensUsed.totalTokens;
        tokensUsed.toolTokens += turn.tokensUsed.toolTokens ?? 0;
        costEstimate += turn.costEstimate;
    });
    // Root span covers the whole test case; per-turn spans hang off it
    const spans = [
        {
            spanId: rootSpanId,
            parentSpanId: null,
            operation: "test-case",
            startMs: 0,
            endMs: elapsedMs,
            attributes: { turnCount: turns.length },
        },
        ...turnSpans,
    ];
    return {
        traceId,
        runId: parentOptions.runId,
        taskId: parentOptions.taskId,
        testCaseIndex: parentOptions.testCaseIndex,
        modelId: parentOptions.modelId,
        spans,
        toolCalls,
        urlsVisited,
        searchTerms,
        filesRead,
        filesWritten,
        tokensUsed,
        costEstimate,
        durationMs: elapsedMs,
        events,
        startedAt: first.startedAt,
        completedAt: last.completedAt,
    };
}
|
|
127
|
+
// ---------------------------------------------------------------------------
|
|
128
|
+
// Tool call extraction
|
|
129
|
+
// ---------------------------------------------------------------------------
|
|
130
|
+
/**
 * Normalize the raw tool calls found on `response.metadata.toolCalls`.
 *
 * Two raw layouts are accepted per call: a flat one (`name`, `input`) and a
 * nested one (`function.name`, `function.arguments` as a JSON string, parsed
 * with `parseJsonSafe`). Missing names become `"unknown"`, missing inputs
 * become `{}`, and a missing `durationMs` becomes 0. Each call is tagged
 * with the category returned by `classifyToolCall`.
 *
 * A response without metadata, or with a `toolCalls` value that is not an
 * array, yields an empty list rather than throwing, and null entries are
 * treated as empty calls, so that one malformed payload cannot abort trace
 * collection.
 *
 * @param response - Provider response object.
 * @param customCategories - Extra tool categories forwarded to the classifier.
 * @param maxOutputBytes - Size limit for a call's serialized output; a falsy
 *   value disables truncation.
 * @returns One normalized tool call per raw entry, in the original order.
 */
function extractToolCalls(response, customCategories, maxOutputBytes) {
    const rawCalls = response?.metadata?.toolCalls;
    if (!Array.isArray(rawCalls)) {
        return [];
    }
    return rawCalls.map((entry) => {
        const raw = entry ?? {};
        const name = raw.name ?? raw.function?.name ?? "unknown";
        const input = raw.input ?? parseJsonSafe(raw.function?.arguments) ?? {};
        return {
            name,
            input,
            output: truncateToolOutput(raw.output, maxOutputBytes),
            durationMs: raw.durationMs ?? 0,
            error: raw.error,
            category: classifyToolCall(name, customCategories),
        };
    });
}
/**
 * Truncate a tool output whose JSON serialization exceeds the limit.
 *
 * Oversized outputs are replaced by the first `maxOutputBytes` characters of
 * their JSON text followed by `"... [truncated]"`; everything else is
 * returned untouched. Note the limit is compared against the string length
 * of the serialized form, not its encoded byte size.
 */
function truncateToolOutput(output, maxOutputBytes) {
    if (!maxOutputBytes || !output) {
        return output;
    }
    let serialized;
    try {
        // Throws on circular structures and BigInt values
        serialized = JSON.stringify(output);
    }
    catch {
        return output;
    }
    // JSON.stringify yields undefined for functions and symbols; such an
    // output cannot be measured, so it is passed through as-is.
    if (typeof serialized !== "string" || serialized.length <= maxOutputBytes) {
        return output;
    }
    return serialized.slice(0, maxOutputBytes) + "... [truncated]";
}
|
|
153
|
+
// ---------------------------------------------------------------------------
|
|
154
|
+
// Token usage extraction
|
|
155
|
+
// ---------------------------------------------------------------------------
|
|
156
|
+
/**
 * Normalize the token usage reported on `response.tokenUsage`.
 *
 * Maps `prompt` / `completion` / `total` onto `promptTokens` /
 * `completionTokens` / `totalTokens`. Missing prompt or completion counts
 * default to 0, and a missing total falls back to their sum. A truthy
 * `cached` count is surfaced as `toolTokens`; otherwise that key is omitted.
 *
 * NOTE(review): `cached` is reported under the name `toolTokens` — confirm
 * that this mapping is intended by the trace schema.
 *
 * @param response - Provider response object.
 * @returns Token counts; all zero when the response carries no usage data.
 */
function extractTokenUsage(response) {
    const { tokenUsage } = response;
    if (tokenUsage) {
        const promptTokens = tokenUsage.prompt ?? 0;
        const completionTokens = tokenUsage.completion ?? 0;
        const cachedPart = tokenUsage.cached ? { toolTokens: tokenUsage.cached } : {};
        return {
            promptTokens,
            completionTokens,
            totalTokens: tokenUsage.total ?? promptTokens + completionTokens,
            ...cachedPart,
        };
    }
    // No usage reported by the provider
    return { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
}
|
|
174
|
+
// ---------------------------------------------------------------------------
|
|
175
|
+
// Access pattern extraction
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
/**
 * Derive access patterns from categorized tool calls.
 *
 * Per category, looked up on the call's `input` via `extractString`:
 * - "navigate" and "read": a `url` is recorded as visited; for "read" only,
 *   a `path` / `file` / `filename` is also recorded as a file read.
 * - "search": a `query` / `term` / `search` is recorded as a search term.
 * - "write": a `path` / `file` / `filename` is recorded as a file written.
 * Calls in any other category are ignored.
 *
 * @param toolCalls - Normalized tool calls carrying `category` and `input`.
 * @returns De-duplicated lists in first-seen order: `urlsVisited`,
 *   `searchTerms`, `filesRead`, `filesWritten`.
 */
function extractAccessPatterns(toolCalls) {
    const urls = new Set();
    const queries = new Set();
    const reads = new Set();
    const writes = new Set();
    for (const call of toolCalls) {
        const kind = call.category;
        if (kind === "navigate" || kind === "read") {
            const url = extractString(call.input, "url");
            if (url) {
                urls.add(url);
            }
            // Paths count as file reads only for the "read" category
            const filePath = extractString(call.input, "path", "file", "filename");
            if (filePath && kind === "read") {
                reads.add(filePath);
            }
        }
        else if (kind === "search") {
            const query = extractString(call.input, "query", "term", "search");
            if (query) {
                queries.add(query);
            }
        }
        else if (kind === "write") {
            const target = extractString(call.input, "path", "file", "filename");
            if (target) {
                writes.add(target);
            }
        }
    }
    return {
        urlsVisited: Array.from(urls),
        searchTerms: Array.from(queries),
        filesRead: Array.from(reads),
        filesWritten: Array.from(writes),
    };
}
|
|
217
|
+
// ---------------------------------------------------------------------------
|
|
218
|
+
// Event log
|
|
219
|
+
// ---------------------------------------------------------------------------
|
|
220
|
+
/**
 * Assemble the flat event log for one model invocation.
 *
 * The log is: one `llm_request` event, then a `tool_call_start` /
 * `tool_call_end` pair per tool call (in input order), then one
 * `llm_response` event. Every event carries the same `timestamp` that was
 * passed in; no per-event times are computed here.
 *
 * @param toolCalls Normalized tool calls (`name`, `category`, `durationMs`, optional `error`).
 * @param tokensUsed Token usage with `promptTokens`, `completionTokens`, `totalTokens`.
 * @param durationMs Duration reported on the `llm_response` event.
 * @param timestamp Timestamp stamped on every event.
 * @returns Ordered array of `{ timestamp, type, data }` events.
 */
function buildEventLog(toolCalls, tokensUsed, durationMs, timestamp) {
    const makeEvent = (type, data) => ({ timestamp, type, data });
    const log = [];
    log.push(makeEvent("llm_request", { promptTokens: tokensUsed.promptTokens }));
    for (const { name, category, durationMs: callDurationMs, error } of toolCalls) {
        // `error` is only included on the end event when the call reported one.
        const endData = error
            ? { name, durationMs: callDurationMs, error }
            : { name, durationMs: callDurationMs };
        log.push(makeEvent("tool_call_start", { name, category }), makeEvent("tool_call_end", endData));
    }
    log.push(makeEvent("llm_response", {
        completionTokens: tokensUsed.completionTokens,
        totalTokens: tokensUsed.totalTokens,
        durationMs,
    }));
    return log;
}
|
|
257
|
+
// ---------------------------------------------------------------------------
|
|
258
|
+
// Span building
|
|
259
|
+
// ---------------------------------------------------------------------------
|
|
260
|
+
/**
 * Build the span list for a single test-case run.
 *
 * Only one span is produced: a root "test-case" span covering 0..durationMs.
 * Its id is derived from `runId`, `taskId` and `testCaseIndex`.
 *
 * @param options Run options; reads `runId`, `taskId`, `modelId`, `testCaseIndex`.
 * @param durationMs End offset of the root span, in milliseconds.
 * @returns Array containing the single root span.
 */
function buildSpans(options, durationMs) {
    const { runId, taskId, modelId, testCaseIndex } = options;
    const traceId = `trace-${runId}-${taskId}-${testCaseIndex}`;
    const rootSpan = {
        spanId: `${traceId}-root`,
        parentSpanId: null,
        operation: "test-case",
        startMs: 0,
        endMs: durationMs,
        attributes: { taskId, modelId, testCaseIndex },
    };
    return [rootSpan];
}
|
|
277
|
+
// ---------------------------------------------------------------------------
|
|
278
|
+
// Helpers
|
|
279
|
+
// ---------------------------------------------------------------------------
|
|
280
|
+
/**
 * Parse a JSON string without throwing.
 *
 * @param str JSON text; any falsy value (undefined, null, "") is treated as absent.
 * @returns The parsed value, or `null` when the input is falsy or not valid JSON.
 */
function parseJsonSafe(str) {
    let parsed = null;
    if (str) {
        try {
            parsed = JSON.parse(str);
        }
        catch {
            // Malformed JSON is reported as "no value" rather than an error.
            parsed = null;
        }
    }
    return parsed;
}
|
|
290
|
+
/**
 * Return the first non-empty string found under any of the given keys.
 *
 * Keys are tried in the order supplied; values that are not strings, or are
 * empty strings, are skipped.
 *
 * @param input Object to read from. `null` / `undefined` is tolerated and
 *   yields `undefined` instead of throwing on property access.
 * @param keys Candidate property names, in priority order.
 * @returns The first matching string, or `undefined` when none is found.
 */
function extractString(input, ...keys) {
    // Tool-call input may be missing entirely; indexing null/undefined would throw.
    if (input === null || input === undefined) {
        return undefined;
    }
    for (const key of keys) {
        const val = input[key];
        if (typeof val === "string" && val.length > 0) {
            return val;
        }
    }
    return undefined;
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Trace storage — port interface for persisting evaluation traces.
|
|
3
|
+
*
|
|
4
|
+
* Full trace payloads go to blob storage (GCP Cloud Storage or local
|
|
5
|
+
* filesystem). Content Lake documents store only sanitized summaries
|
|
6
|
+
* with a `traceDataUri` pointing to the full payload.
|
|
7
|
+
*
|
|
8
|
+
* CRITICAL SECURITY: The `next` dataset is publicly accessible. Full
|
|
9
|
+
* trace data (which may contain tool call arguments with sensitive data)
|
|
10
|
+
* MUST go to blob storage. Only sanitized summaries in Content Lake.
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/architecture-overhaul/observability-telemetry.md
|
|
13
|
+
* @see docs/design-docs/architecture-overhaul/storage-schema.md
|
|
14
|
+
*/
|
|
15
|
+
import type { EvalTrace } from "../../../_vendor/ailf-core/index.d.ts";
|
|
16
|
+
/**
 * Outcome of persisting one full trace payload.
 */
export interface TraceStoreResult {
    /** Location of the stored payload, expressed as a `blob://` or `file://` URI. */
    uri: string;
    /** Hash of the stored content, usable for integrity verification. */
    contentHash: string;
    /** Size of the stored payload, in bytes. */
    sizeBytes: number;
}
|
|
25
|
+
/**
 * Sanitized view of a trace, intended for Content Lake storage.
 *
 * Holds identifiers, counts and totals only; the full payload is reachable
 * through `traceDataUri`.
 */
export interface TraceSummary {
    /** Identifier of the trace being summarized. */
    traceId: string;
    /** URI of the full trace payload in blob storage. */
    traceDataUri: string;
    /** Identifier of the task. */
    taskId: string;
    /** Identifier of the model. */
    modelId: string;
    /** How many tool calls the trace contains. */
    toolCallCount: number;
    /** Tool call counts keyed by category. */
    toolCallCategories: Record<string, number>;
    /** Total number of tokens used. */
    totalTokens: number;
    /** Estimated cost, in USD. */
    costEstimate: number;
    /** Duration, in milliseconds. */
    durationMs: number;
    /** Number of URLs visited. */
    urlsVisitedCount: number;
    /** Number of files read. */
    filesReadCount: number;
    /** Number of files written. */
    filesWrittenCount: number;
}
|
|
52
|
+
/**
 * Port interface for trace blob storage.
 *
 * Known implementations:
 * - LocalTraceStore — local filesystem (development / CI)
 * - GCPTraceStore — GCP Cloud Storage (production)
 */
export interface TraceStore {
    /**
     * Persist a full trace payload.
     *
     * @param trace Trace to store.
     * @returns Where the payload was stored, plus its hash and size.
     */
    store(trace: EvalTrace): Promise<TraceStoreResult>;
    /**
     * Load a previously stored trace.
     *
     * @param uri URI of the stored payload.
     * @returns The trace, or `null`.
     */
    retrieve(uri: string): Promise<EvalTrace | null>;
}
|
|
65
|
+
/**
 * Trace store backed by the local filesystem — meant for development and CI.
 */
export declare class LocalTraceStore implements TraceStore {
    private readonly baseDir;
    /** @param baseDir Base directory used by this store. */
    constructor(baseDir: string);
    /** Store a full trace payload. */
    store(trace: EvalTrace): Promise<TraceStoreResult>;
    /** Retrieve a trace by URI; resolves to `null` when it cannot be returned. */
    retrieve(uri: string): Promise<EvalTrace | null>;
}
|
|
72
|
+
/**
 * Build the sanitized summary of a full trace.
 *
 * The result is safe to store in Content Lake: it carries no tool call
 * arguments, no file contents and no other potentially sensitive data.
 *
 * @param trace Full trace to summarize.
 * @param traceDataUri URI of the full payload, copied into the summary.
 * @returns The sanitized summary.
 */
export declare function extractTraceSummary(trace: EvalTrace, traceDataUri: string): TraceSummary;
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Trace storage — port interface for persisting evaluation traces.
|
|
3
|
+
*
|
|
4
|
+
* Full trace payloads go to blob storage (GCP Cloud Storage or local
|
|
5
|
+
* filesystem). Content Lake documents store only sanitized summaries
|
|
6
|
+
* with a `traceDataUri` pointing to the full payload.
|
|
7
|
+
*
|
|
8
|
+
* CRITICAL SECURITY: The `next` dataset is publicly accessible. Full
|
|
9
|
+
* trace data (which may contain tool call arguments with sensitive data)
|
|
10
|
+
* MUST go to blob storage. Only sanitized summaries in Content Lake.
|
|
11
|
+
*
|
|
12
|
+
* @see docs/design-docs/architecture-overhaul/observability-telemetry.md
|
|
13
|
+
* @see docs/design-docs/architecture-overhaul/storage-schema.md
|
|
14
|
+
*/
|
|
15
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
16
|
+
import { resolve } from "path";
|
|
17
|
+
import { simpleHash } from "../hash.js";
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Local filesystem implementation
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
/**
 * Local filesystem trace store — for development and CI.
 *
 * Each trace is written as pretty-printed JSON to a single file directly
 * inside `baseDir` and addressed by a `file://` URI.
 */
export class LocalTraceStore {
    baseDir;
    /**
     * @param baseDir Directory that receives the trace files; it is created
     *   (recursively) on the first call to `store`.
     */
    constructor(baseDir) {
        this.baseDir = baseDir;
    }
    /**
     * Write a trace to `<baseDir>/<encoded traceId>.json`.
     *
     * @param trace Trace to persist; `trace.traceId` determines the file name.
     * @returns The `file://` URI of the written file, a hash of the serialized
     *   content, and its byte length.
     */
    async store(trace) {
        mkdirSync(this.baseDir, { recursive: true });
        // The trace id is percent-encoded before it becomes a file name: an id
        // containing "/" or "\" would otherwise point into a sub-directory that
        // does not exist (ENOENT), and "../" segments would escape baseDir.
        // Ids made of letters, digits, "-", "_" and "." are left unchanged.
        const filename = `${encodeURIComponent(trace.traceId)}.json`;
        const filepath = resolve(this.baseDir, filename);
        const content = JSON.stringify(trace, null, 2);
        writeFileSync(filepath, content);
        return {
            uri: `file://${filepath}`,
            contentHash: simpleHash(content),
            sizeBytes: Buffer.byteLength(content),
        };
    }
    /**
     * Read a trace back from disk.
     *
     * @param uri A `file://` URI as returned by `store`, or a plain filesystem path.
     * @returns The parsed trace, or `null` when the file does not exist or
     *   cannot be read or parsed.
     */
    async retrieve(uri) {
        // Strip the "file://" scheme prefix (7 characters) to get the path.
        const filepath = uri.startsWith("file://") ? uri.slice(7) : uri;
        if (!existsSync(filepath))
            return null;
        try {
            const content = readFileSync(filepath, "utf-8");
            return JSON.parse(content);
        }
        catch {
            // Unreadable or corrupt files are treated the same as missing ones.
            return null;
        }
    }
}
|
|
52
|
+
// ---------------------------------------------------------------------------
|
|
53
|
+
// Summary extraction
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
/**
 * Reduce a full trace to its sanitized summary.
 *
 * Only identifiers, counts and totals are copied, so the result contains no
 * tool call arguments or file contents and is safe for Content Lake storage.
 *
 * @param trace Full trace to summarize.
 * @param traceDataUri URI of the full payload; copied into the summary as-is.
 * @returns Summary with per-category tool call counts and aggregate totals.
 */
export function extractTraceSummary(trace, traceDataUri) {
    // Tally tool calls per category.
    const tally = {};
    for (const { category } of trace.toolCalls) {
        const seenSoFar = tally[category] ?? 0;
        tally[category] = seenSoFar + 1;
    }
    const summary = {
        traceId: trace.traceId,
        traceDataUri,
        taskId: trace.taskId,
        modelId: trace.modelId,
        toolCallCount: trace.toolCalls.length,
        toolCallCategories: tally,
        totalTokens: trace.tokensUsed.totalTokens,
        costEstimate: trace.costEstimate,
        durationMs: trace.durationMs,
        urlsVisitedCount: trace.urlsVisited.length,
        filesReadCount: trace.filesRead.length,
        filesWrittenCount: trace.filesWritten.length,
    };
    return summary;
}
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// Helpers
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Variable resolver — handles dynamic variable interpolation.
|
|
3
|
+
*
|
|
4
|
+
* Resolves dynamic variables ($latest, $env, $now, etc.) in task
|
|
5
|
+
* configurations. All resolved values are recorded in the
|
|
6
|
+
* VariableEnvelope with provenance metadata for audit and cache
|
|
7
|
+
* invalidation.
|
|
8
|
+
*
|
|
9
|
+
* Design decision: dynamic variables are opt-in and logged.
|
|
10
|
+
* A task must explicitly use `$latest` or `$env(VAR)` syntax —
|
|
11
|
+
* there's no implicit resolution. Every dynamic resolution is
|
|
12
|
+
* recorded in provenance for reproducibility tracking.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/design-docs/architecture-overhaul/domain-model.md (VariableEnvelope)
|
|
15
|
+
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
16
|
+
*/
|
|
17
|
+
import type { VariableDeclaration, VariableEnvelope } from "../../_vendor/ailf-core/index.d.ts";
|
|
18
|
+
/**
 * Optional settings accepted by `resolveVariables`.
 */
export interface VariableResolverOptions {
    /** Environment variables that `$env()` references are resolved against. */
    env?: Record<string, string | undefined>;
    /** Callback that receives a message for each dynamic resolution. */
    log?: (message: string) => void;
}
|
|
25
|
+
/**
 * What `resolveVariables` hands back to the caller.
 */
export interface VariableResolutionResult {
    /** The variable envelope, updated with the resolved values. */
    envelope: VariableEnvelope;
    /** Log entries describing each resolution that took place. */
    resolutions: string[];
    /** Warnings about references that could not be resolved. */
    warnings: string[];
}
|
|
34
|
+
/**
 * Resolve the dynamic variables contained in a VariableEnvelope.
 *
 * Every string value is scanned for dynamic patterns, which are replaced by
 * their resolved values; values of any other type pass through unchanged.
 *
 * @param envelope Envelope whose values should be resolved.
 * @param options Optional environment source and log callback.
 * @returns The updated envelope together with resolution log entries and warnings.
 */
export declare function resolveVariables(envelope: VariableEnvelope, options?: VariableResolverOptions): VariableResolutionResult;
|
|
41
|
+
/**
 * Wrap a plain key-value map in a VariableEnvelope.
 *
 * Convenience helper: the raw variables are wrapped with empty provenance.
 *
 * @param values Raw variable values.
 * @param declarations Optional variable declarations to attach.
 * @returns The new envelope.
 */
export declare function createEnvelope(values: Record<string, unknown>, declarations?: VariableDeclaration[]): VariableEnvelope;
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Variable resolver — handles dynamic variable interpolation.
|
|
3
|
+
*
|
|
4
|
+
* Resolves dynamic variables ($latest, $env, $now, etc.) in task
|
|
5
|
+
* configurations. All resolved values are recorded in the
|
|
6
|
+
* VariableEnvelope with provenance metadata for audit and cache
|
|
7
|
+
* invalidation.
|
|
8
|
+
*
|
|
9
|
+
* Design decision: dynamic variables are opt-in and logged.
|
|
10
|
+
* A task must explicitly use `$latest` or `$env(VAR)` syntax —
|
|
11
|
+
* there's no implicit resolution. Every dynamic resolution is
|
|
12
|
+
* recorded in provenance for reproducibility tracking.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/design-docs/architecture-overhaul/domain-model.md (VariableEnvelope)
|
|
15
|
+
* @see docs/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
16
|
+
*/
|
|
17
|
+
import { simpleHash } from "./hash.js";
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Dynamic variable patterns
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
/** Pattern for $env(VAR_NAME) references */
const ENV_PATTERN = /\$env\(([A-Z_][A-Z0-9_]*)\)/g;
/** Pattern for $latest references (resolved to current timestamp) */
const LATEST_PATTERN = /\$latest/g;
/** Pattern for $now references (resolved to ISO timestamp) */
const NOW_PATTERN = /\$now/g;
/**
 * All dynamic patterns combined into one alternation so a value is scanned
 * exactly once. Capture group 1 is the variable name of a `$env(...)` match
 * and is undefined for `$latest` / `$now` matches.
 */
const DYNAMIC_PATTERN = new RegExp(`${ENV_PATTERN.source}|${LATEST_PATTERN.source}|${NOW_PATTERN.source}`, "g");
/**
 * Resolve dynamic variables in a VariableEnvelope.
 *
 * Scans all string values for dynamic patterns and replaces them
 * with resolved values. Non-string values are passed through unchanged.
 *
 * - `$env(NAME)` is replaced by the environment value; when it is not set the
 *   reference is left in place and a warning is recorded.
 * - `$latest` and `$now` are both replaced by the same ISO timestamp, taken
 *   once per call.
 *
 * Replacement is done in a single pass over the ORIGINAL text, so substituted
 * content (for example an environment value that happens to contain `$now`)
 * is never re-interpreted as a dynamic reference.
 *
 * @param envelope Envelope to resolve; it is not mutated.
 * @param options Optional `env` map (defaults to `process.env`) and `log` callback.
 * @returns A new envelope with resolved values and updated provenance, plus
 *   the resolution log entries and warnings.
 */
export function resolveVariables(envelope, options) {
    const resolutions = [];
    const warnings = [];
    const resolvedValues = {};
    const resolvedProvenance = {
        ...envelope.provenance,
    };
    const envVars = options?.env ?? process.env;
    const log = options?.log ?? (() => { });
    const now = new Date().toISOString();
    for (const [key, value] of Object.entries(envelope.values)) {
        if (typeof value !== "string") {
            resolvedValues[key] = value;
            continue;
        }
        // Messages are bucketed per kind so the reported order stays
        // $env → $latest → $now regardless of where the references appear.
        const envMessages = [];
        const latestMessages = [];
        const nowMessages = [];
        const resolved = value.replace(DYNAMIC_PATTERN, (match, varName) => {
            if (varName !== undefined) {
                const envValue = envVars[varName];
                if (envValue === undefined) {
                    warnings.push(`Variable "${key}": $env(${varName}) could not be resolved — ` +
                        `environment variable "${varName}" is not set`);
                    return match; // Leave unresolved
                }
                // The resolved value is deliberately not included in the message.
                envMessages.push(`Resolved $env(${varName}) in "${key}"`);
                return envValue;
            }
            // $latest is an alias for $now (same ISO timestamp). Both exist for
            // readability: $now in "run at $now" vs $latest in
            // "compare against $latest baseline". Semantically identical.
            if (match === "$latest") {
                latestMessages.push(`Resolved $latest in "${key}" to ${now} (alias for $now)`);
                return now;
            }
            nowMessages.push(`Resolved $now in "${key}" to ${now}`);
            return now;
        });
        const messages = [...envMessages, ...latestMessages, ...nowMessages];
        for (const msg of messages) {
            resolutions.push(msg);
            log(msg);
        }
        resolvedValues[key] = resolved;
        // Provenance is only rewritten for values that actually changed dynamically.
        if (messages.length > 0) {
            resolvedProvenance[key] = {
                hash: simpleHash(resolved),
                resolvedAt: now,
                source: { expression: value, inputs: [key], type: "derived" },
            };
        }
    }
    return {
        envelope: {
            declarations: envelope.declarations,
            provenance: resolvedProvenance,
            values: resolvedValues,
        },
        resolutions,
        warnings,
    };
}
|
|
101
|
+
/**
 * Wrap a plain key-value map in a VariableEnvelope.
 *
 * The envelope starts with empty provenance; `values` is stored by reference.
 *
 * @param values Raw variable values.
 * @param declarations Optional declarations; an empty list is used when omitted.
 * @returns The new envelope.
 */
export function createEnvelope(values, declarations) {
    const declared = declarations ?? [];
    return { declarations: declared, provenance: {}, values };
}
|
|
113
|
+
// ---------------------------------------------------------------------------
|
|
114
|
+
// Hash helper (shared with fixture-resolver)
|
|
115
|
+
// ---------------------------------------------------------------------------
|
|
@@ -2,14 +2,14 @@
|
|
|
2
2
|
* coverage-audit.ts
|
|
3
3
|
*
|
|
4
4
|
* Pure computation functions for cross-referencing the product feature registry
|
|
5
|
-
* (config/features
|
|
5
|
+
* (config/features) against actual task files (tasks/*.yaml)
|
|
6
6
|
* to produce a documentation coverage audit.
|
|
7
7
|
*
|
|
8
8
|
* Phase 3c of the Scenario Matrix implementation.
|
|
9
9
|
*
|
|
10
10
|
* @see docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
|
|
11
11
|
*/
|
|
12
|
-
import type { Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
12
|
+
import type { Logger, PluginRegistry } from "../_vendor/ailf-core/index.d.ts";
|
|
13
13
|
import type { CoverageAuditReport, ProductFeature } from "./types.js";
|
|
14
14
|
/**
|
|
15
15
|
* Count unique document slugs referenced across all tasks.
|
|
@@ -31,10 +31,20 @@ export declare function formatCoverageConsole(report: CoverageAuditReport): stri
|
|
|
31
31
|
*/
|
|
32
32
|
export declare function formatCoverageMarkdown(report: CoverageAuditReport): string;
|
|
33
33
|
/**
|
|
34
|
-
* Load and validate the feature registry
|
|
34
|
+
* Load and validate the feature registry.
|
|
35
|
+
*
|
|
36
|
+
* Resolution order:
|
|
37
|
+
* 1. config/features file (user overrides) — if non-empty, wins
|
|
38
|
+
* 2. Registry features (preset-provided) — fallback when config is empty
|
|
39
|
+
* 3. null — no features available
|
|
35
40
|
*/
|
|
36
|
-
export declare function loadFeatureRegistry(rootDir: string,
|
|
41
|
+
export declare function loadFeatureRegistry(rootDir: string, options?: {
|
|
42
|
+
logger?: Logger;
|
|
43
|
+
registry?: PluginRegistry;
|
|
44
|
+
}): null | ProductFeature[];
|
|
37
45
|
/**
|
|
38
46
|
* Run the coverage audit and produce a structured report.
|
|
39
47
|
*/
|
|
40
|
-
export declare function runCoverageAudit(rootDir: string
|
|
48
|
+
export declare function runCoverageAudit(rootDir: string, options?: {
|
|
49
|
+
registry?: PluginRegistry;
|
|
50
|
+
}): CoverageAuditReport | null;
|