npm - @sanity/ailf - Versions diffs - 0.5.0 → 1.0.0 - Mend

@sanity/ailf 0.5.0 → 1.0.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (288) hide show

package/config/features.ts +23 -0
package/config/models.ts +83 -0
package/config/prompts.ts +16 -0
package/config/rubrics.ts +225 -0
package/config/schedules.ts +47 -0
package/config/sinks.ts +37 -0
package/config/sources.ts +21 -0
package/config/thresholds.ts +61 -0
package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
package/dist/_vendor/ailf-core/config-helpers.js +150 -0
package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
package/dist/_vendor/ailf-core/env-helper.js +45 -0
package/dist/_vendor/ailf-core/index.d.ts +3 -0
package/dist/_vendor/ailf-core/index.js +5 -0
package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
package/dist/_vendor/ailf-core/services/index.js +2 -1
package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
package/dist/_vendor/ailf-core/services/scoring.js +25 -15
package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
package/dist/_vendor/ailf-core/types/index.js +8 -1
package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
package/dist/_vendor/ailf-core/types/trace.js +18 -0
package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
package/dist/_vendor/ailf-shared/index.d.ts +0 -1
package/dist/_vendor/ailf-shared/index.js +0 -1
package/dist/adapters/api-client/build-request.js +14 -13
package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
package/dist/adapters/config-sources/file-config-adapter.js +38 -12
package/dist/adapters/config-sources/index.d.ts +2 -0
package/dist/adapters/config-sources/index.js +1 -0
package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
package/dist/adapters/config-sources/ts-config-loader.js +133 -0
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
package/dist/adapters/task-sources/composite-task-source.js +1 -1
package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
package/dist/adapters/task-sources/index.d.ts +1 -0
package/dist/adapters/task-sources/index.js +1 -0
package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
package/dist/adapters/task-sources/repo-task-source.js +69 -16
package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
package/dist/adapters/task-sources/task-file-loader.js +83 -0
package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
package/dist/adapters/task-sources/yaml-task-source.js +19 -16
package/dist/cli.js +0 -2
package/dist/commands/baseline.js +4 -1
package/dist/commands/calculate-scores.js +1 -1
package/dist/commands/coverage-audit.js +7 -1
package/dist/commands/explain-handler.js +25 -23
package/dist/commands/fetch-docs.js +3 -2
package/dist/commands/generate-configs.js +1 -1
package/dist/commands/interactive.js +11 -7
package/dist/commands/pipeline-action.d.ts +2 -0
package/dist/commands/pipeline-action.js +16 -6
package/dist/commands/pipeline.d.ts +1 -0
package/dist/commands/pipeline.js +4 -2
package/dist/commands/pr-comment.js +1 -1
package/dist/commands/publish.js +2 -2
package/dist/commands/readiness-report.js +13 -6
package/dist/composition-root.d.ts +1 -1
package/dist/composition-root.js +67 -4
package/dist/orchestration/build-app-context.js +1 -0
package/dist/orchestration/build-step-sequence.js +24 -6
package/dist/orchestration/steps/calculate-scores-step.js +24 -11
package/dist/orchestration/steps/fetch-docs-step.js +6 -4
package/dist/orchestration/steps/gap-analysis-step.js +8 -7
package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
package/dist/orchestration/steps/generate-configs-step.js +245 -51
package/dist/orchestration/steps/grader-consistency-step.js +7 -4
package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
package/dist/orchestration/steps/readiness-step.js +5 -6
package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
package/dist/orchestration/steps/run-eval-step.js +8 -7
package/dist/pipeline/cache.d.ts +1 -1
package/dist/pipeline/cache.js +36 -8
package/dist/pipeline/calculate-scores.d.ts +2 -4
package/dist/pipeline/calculate-scores.js +43 -113
package/dist/pipeline/checks.js +2 -2
package/dist/pipeline/compare.js +8 -8
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
package/dist/pipeline/compiler/assertion-mapper.js +175 -0
package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
package/dist/pipeline/compiler/config-loader.d.ts +56 -0
package/dist/pipeline/compiler/config-loader.js +111 -0
package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
package/dist/pipeline/compiler/fixture-resolver.js +113 -0
package/dist/pipeline/compiler/hash.d.ts +11 -0
package/dist/pipeline/compiler/hash.js +18 -0
package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
package/dist/pipeline/compiler/ignore-fields.js +113 -0
package/dist/pipeline/compiler/index.d.ts +29 -0
package/dist/pipeline/compiler/index.js +45 -0
package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
package/dist/pipeline/compiler/literacy-bridge.js +172 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
package/dist/pipeline/compiler/presets/index.d.ts +9 -0
package/dist/pipeline/compiler/presets/index.js +8 -0
package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
package/dist/pipeline/compiler/provider-assembler.js +137 -0
package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
package/dist/pipeline/compiler/sandbox/index.js +11 -0
package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
package/dist/pipeline/compiler/scoring-bridge.js +114 -0
package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
package/dist/pipeline/compiler/task-graph-builder.js +291 -0
package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
package/dist/pipeline/compiler/telemetry/index.js +19 -0
package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
package/dist/pipeline/compiler/variable-resolver.js +115 -0
package/dist/pipeline/coverage-audit.d.ts +15 -5
package/dist/pipeline/coverage-audit.js +41 -22
package/dist/pipeline/eval-constants.d.ts +16 -6
package/dist/pipeline/eval-constants.js +25 -4
package/dist/pipeline/eval-fingerprint.d.ts +2 -2
package/dist/pipeline/eval-fingerprint.js +8 -9
package/dist/pipeline/expand-tasks.d.ts +19 -10
package/dist/pipeline/expand-tasks.js +34 -28
package/dist/pipeline/gap-analysis.d.ts +1 -1
package/dist/pipeline/gap-analysis.js +2 -2
package/dist/pipeline/generate-configs.d.ts +22 -4
package/dist/pipeline/generate-configs.js +53 -24
package/dist/pipeline/grader-api.d.ts +3 -3
package/dist/pipeline/grader-api.js +5 -12
package/dist/pipeline/grader-compare-runner.js +20 -27
package/dist/pipeline/grader-comparison.d.ts +4 -8
package/dist/pipeline/grader-comparison.js +11 -17
package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
package/dist/pipeline/grader-consistency-runner.js +16 -20
package/dist/pipeline/grader-consistency.d.ts +6 -10
package/dist/pipeline/grader-consistency.js +13 -32
package/dist/pipeline/grader-sensitivity-runner.js +7 -5
package/dist/pipeline/grader-sensitivity.d.ts +2 -6
package/dist/pipeline/grader-sensitivity.js +10 -10
package/dist/pipeline/grader-validate-runner.js +7 -5
package/dist/pipeline/grader-validation.d.ts +2 -6
package/dist/pipeline/grader-validation.js +14 -22
package/dist/pipeline/map-request-to-config.js +6 -1
package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
package/dist/pipeline/mirror-repo-tasks.js +16 -15
package/dist/pipeline/normalize-mode.d.ts +49 -0
package/dist/pipeline/normalize-mode.js +64 -0
package/dist/pipeline/plan.d.ts +5 -2
package/dist/pipeline/plan.js +134 -78
package/dist/pipeline/pr-comment.js +2 -0
package/dist/pipeline/profile-resolution.d.ts +22 -14
package/dist/pipeline/profile-resolution.js +41 -19
package/dist/pipeline/provenance.d.ts +2 -2
package/dist/pipeline/provenance.js +12 -17
package/dist/pipeline/release-report.js +4 -4
package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
package/dist/pipeline/repo-threshold-evaluator.js +1 -1
package/dist/pipeline/rubric-loader.d.ts +20 -0
package/dist/pipeline/rubric-loader.js +37 -0
package/dist/pipeline/validate.d.ts +4 -4
package/dist/pipeline/validate.js +64 -53
package/dist/schedules/loader.js +18 -8
package/dist/scripts/migrate-task-mode.d.ts +24 -0
package/dist/scripts/migrate-task-mode.js +85 -0
package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
package/dist/scripts/validate-task-sources.d.ts +1 -1
package/dist/scripts/validate-task-sources.js +15 -15
package/dist/sinks/loader.js +5 -7
package/dist/sources.d.ts +7 -7
package/dist/sources.js +22 -24
package/dist/webhook/dispatch.js +2 -1
package/package.json +6 -3
package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
package/tasks/literacy/frameworks.task.ts +128 -0
package/tasks/literacy/functions.task.ts +69 -0
package/tasks/literacy/groq.task.ts +258 -0
package/tasks/literacy/nextjs-live.task.ts +75 -0
package/tasks/literacy/studio-setup.task.ts +131 -0
package/tasks/literacy/visual-editing.task.ts +146 -0
package/config/features.yaml +0 -116
package/config/models.yaml +0 -116
package/config/prompts.yaml +0 -75
package/config/rubrics.yaml +0 -81
package/config/schedules.yaml +0 -43
package/config/sinks.yaml +0 -54
package/config/sources.yaml +0 -51
package/config/thresholds.yaml +0 -49
package/dist/agent-observer/test-imports.d.ts +0 -7
package/dist/agent-observer/test-imports.js +0 -185

package/config/features.yaml DELETED Viewed

@@ -1,116 +0,0 @@
-# features.yaml
-#
-# Product feature registry for documentation coverage auditing.
-# Each entry represents a product feature that should have evaluation
-# task coverage. Edit this file to track coverage gaps and priorities.
-#
-# The coverage audit (`pnpm coverage-audit`) cross-references this
-# registry against tasks/*.yaml to identify covered and uncovered features.
-#
-# Status values:
-#   covered      — has evaluation tasks in tasks/*.yaml
-#   uncovered    — no evaluation tasks yet
-#   planned      — tasks are planned but not yet written
-#   out-of-scope — intentionally excluded from evaluation
-#
-# Phase 3c of the Scenario Matrix implementation.
-# See docs/exec-plans/scenario-matrix-implementation/phase-3-gap-analysis.md
-features:
-  # === Currently covered (have evaluation tasks) ===
-  - id: groq
-    name: "GROQ Query Language"
-    sections: [content-lake]
-    status: covered
-    area: groq
-    priority: critical
-    taskCount: 3
-  - id: visual-editing
-    name: "Visual Editing"
-    sections: [visual-editing]
-    status: covered
-    area: visual-editing
-    priority: critical
-    taskCount: 1
-  - id: nextjs-live
-    name: "Next.js Live Preview"
-    sections: [visual-editing]
-    status: covered
-    area: nextjs-live
-    priority: high
-    taskCount: 2
-  - id: functions
-    name: "Sanity Functions"
-    sections: [compute-and-ai]
-    status: covered
-    area: functions
-    priority: high
-    taskCount: 2
-  - id: studio-setup
-    name: "Studio Configuration"
-    sections: [studio]
-    status: covered
-    area: studio-setup
-    priority: high
-    taskCount: 1
-  - id: frameworks
-    name: "Framework Integration"
-    sections: [developer-guides]
-    status: covered
-    area: frameworks
-    priority: high
-    taskCount: 2
-  # === Uncovered (no evaluation tasks yet) ===
-  - id: portable-text
-    name: "Portable Text"
-    sections: [content-lake, studio]
-    status: uncovered
-    priority: high
-  - id: image-assets
-    name: "Image & Asset Handling"
-    sections: [content-lake, apis-and-sdks]
-    status: uncovered
-    priority: high
-  - id: mutations
-    name: "Mutations & Transactions"
-    sections: [content-lake, apis-and-sdks]
-    status: uncovered
-    priority: high
-  - id: schemas
-    name: "Schema Types & Validation"
-    sections: [studio, content-lake]
-    status: uncovered
-    priority: medium
-  - id: authentication
-    name: "Authentication & Access Control"
-    sections: [apis-and-sdks]
-    status: uncovered
-    priority: medium
-  - id: webhooks
-    name: "Webhooks"
-    sections: [content-lake]
-    status: uncovered
-    priority: medium
-  - id: realtime
-    name: "Real-time Listeners"
-    sections: [apis-and-sdks]
-    status: uncovered
-    priority: low
-  - id: ai-assist
-    name: "AI Assist"
-    sections: [compute-and-ai, studio]
-    status: uncovered
-    priority: medium

package/config/models.yaml DELETED Viewed

@@ -1,116 +0,0 @@
-# models.yaml
-#
-# Central model registry for ai-literacy-framework evaluations.
-#
-# Define all models you want to test here. Each eval mode (baseline, observed,
-# agentic) reads this file and generates the appropriate provider entries.
-#
-# Usage:
-#   1. Add/remove models below
-#   2. Run: pnpm generate-configs
-#   3. Run your eval: pnpm eval / pnpm eval:observed / pnpm eval:agentic
-#
-# Model entries support:
-#   - id:          Promptfoo provider identifier (e.g., "openai:gpt-4o")
-#   - label:       Human-readable label for results display
-#   - config:      Model-specific config (temperature, max_tokens, etc.)
-#   - modes:       Which eval modes to include this model in (default: all)
-#                  Options: baseline, observed, agentic-naive, agentic-optimized
-#   - env:         Environment variable name for the API key (auto-detected for
-#                  openai:* and anthropic:* providers)
-#
-# The generator script expands each model into the correct provider format
-# for each eval mode. Custom providers (observed, agentic) are wired up
-# automatically — you just specify the model name.
-models:
-  # -- Anthropic
-  - id: anthropic:messages:claude-opus-4-6
-    label: Claude Opus 4.6
-    config:
-      temperature: 0.2
-      max_tokens: 4096
-    modes: [baseline, observed, agentic-naive, agentic-optimized]
-  # -- Google
-  # - id: google:gemini-2.5-pro
-  #   label: Gemini 2.5 Pro
-  #   config:
-  #     temperature: 0.2
-  #     max_tokens: 4096
-  #   modes: [baseline, observed, agentic-naive, agentic-optimized]
-  # -- OpenAI
-  - id: openai:chat:gpt-5.2
-    label: GPT 5.2
-    config:
-      temperature: 0.2
-      max_tokens: 4096
-    modes: [baseline, observed, agentic-naive, agentic-optimized]
-  - id: openai:chat:gpt-5.4
-    label: GPT 5.4
-    config:
-      reasoning_effort: "medium"
-      max_output_tokens: 4096
-      maxRetries: 1
-    modes: [baseline, observed, agentic-naive, agentic-optimized]
-  # ── Anthropic ───────────────────────────────────────────────
-  # - id: anthropic:claude-sonnet-4-20250514
-  #   label: Claude Sonnet 4
-  #   config:
-  #     temperature: 0.2
-  #     max_tokens: 4096
-  #   modes: [baseline]
-  # - id: anthropic:claude-3.5-sonnet-20241022
-  #   label: Claude 3.5 Sonnet
-  #   config:
-  #     temperature: 0.2
-  #     max_tokens: 4096
-  #   modes: [baseline, agentic-naive, agentic-optimized]
-  # ── Google ──────────────────────────────────────────────────
-  # - id: google:gemini-2.0-flash
-  #   label: Gemini 2.0 Flash
-  #   config:
-  #     temperature: 0.2
-  #     max_tokens: 4096
-  #   modes: [baseline]
-  # ── Other ───────────────────────────────────────────────────
-  # - id: openrouter:deepseek/deepseek-r1
-  #   label: DeepSeek R1
-  #   config:
-  #     temperature: 0.2
-  #     max_tokens: 4096
-  #   modes: [baseline]
-# ── Grading Model ───────────────────────────────────────────
-# Which model scores the responses. Separate from the models being tested.
-grader:
-  id: anthropic:messages:claude-opus-4-5-20251101
-  label: Claude Opus 4.5 (grader)
-#grader:
-#  id: openai:gpt-5-2025-08-07
-#  label: GPT-5 (grader)
-# ── Evaluation Options ──────────────────────────────────────
-# Controls how promptfoo runs evaluations.
-maxConcurrency: 32 # max parallel API calls — benchmarked in DOC-1896
-# ── Default Config ──────────────────────────────────────────
-# Applied to all models unless overridden per-model.
-defaults:
-  temperature: 0.2
-  max_tokens: 4096
-  maxToolRounds: 5 # for agentic modes
-  observerOptions:
-    maxPreviewBytes: 2048
-    captureResponsePreview: true
-    includePatterns:
-      - "sanity.io"
-      - "sanity.dev"
-      - "cdn.sanity.io"
-    sensitiveHeaders:
-      - "authorization"
-      - "cookie"
-      - "x-api-key"

package/config/prompts.yaml DELETED Viewed

@@ -1,75 +0,0 @@
-# prompts.yaml
-#
-# Prompt templates used across all evaluation modes.
-# Edit these to change what instructions the LLM receives.
-#
-# Available template variables (injected from task vars):
-#   {{task}}  — the implementation task description
-#   {{docs}}  — documentation context (empty string for baseline tests)
-#
-# Each prompt has:
-#   id:       unique identifier (used in Promptfoo config)
-#   label:    human-readable name (shown in results)
-#   template: the actual prompt text with {{variable}} placeholders
-with-docs:
-  id: with-docs
-  label: With Documentation
-  template: |
-    You are an expert Sanity.io developer. Use the following documentation to help implement the task.
-    ## Sanity Documentation
-    {{docs}}
-    ## Task
-    {{task}}
-    ## Requirements
-    1. Use ONLY the APIs and patterns shown in the documentation
-    2. Provide a complete, working implementation
-    3. Include all necessary imports
-    4. Follow Sanity best practices as documented
-    Provide your implementation:
-without-docs:
-  id: without-docs
-  label: Baseline (No Docs)
-  template: |
-    You are an expert Sanity.io developer.
-    ## Task
-    {{task}}
-    ## Requirements
-    1. Provide a complete, working implementation
-    2. Include all necessary imports
-    3. Follow Sanity best practices
-    Provide your implementation:
-agentic:
-  id: agentic
-  label: Agentic (self-retrieval)
-  template: |
-    You are an expert developer helping implement a Sanity.io feature.
-    You have access to web search and page fetching tools.
-    IMPORTANT: Before writing any code, search for and read the relevant
-    Sanity.io documentation to ensure you are using the latest APIs and
-    best practices. Do not rely on memory alone.
-    ## Task
-    {{task}}
-    ## Requirements
-    1. Search for relevant Sanity documentation before implementing
-    2. Use ONLY the APIs and patterns from the current official docs
-    3. Provide a complete, working implementation
-    4. Include all necessary imports
-    5. Follow Sanity best practices as documented
-    Provide your implementation:

package/config/rubrics.yaml DELETED Viewed

@@ -1,81 +0,0 @@
-# rubrics.yaml
-#
-# Centralized rubric templates for LLM grading assertions.
-# Tasks reference these templates by key and provide only their
-# unique criteria bullet points. The pipeline assembles the full
-# rubric text at expansion time.
-#
-# Each dimension is scored on a uniform 0–100 scale. Dimensions are
-# combined into a composite score using named scoring profiles below.
-#
-# Each template carries a `dimension` field that tags the scoring
-# dimension it belongs to. This metadata propagates through the
-# expansion pipeline into Promptfoo assertion metadata, enabling
-# structured dimension classification at scoring time instead of
-# fragile heuristic string matching.
-# See docs/design-docs/structured-dimensions.md for the design.
-#
-# See docs/exec-plans/rubric-templates.md for the design.
-# See docs/design-docs/uniform-dimension-scoring.md for the scoring model.
-templates:
-  task-completion:
-    dimension: task-completion
-    header: "Score task completion from 0 to 100:"
-    scale:
-      - "0: Couldn't attempt — missing critical information"
-      - "20: Attempted but fundamentally wrong approach"
-      - "50: Partial implementation — major functional gaps"
-      - "80: Mostly complete — minor issues or missing edge cases"
-      - "100: Fully functional code — works as expected"
-    criteria_label: "Must demonstrate:"
-  code-correctness:
-    dimension: code-correctness
-    header: "Score code correctness from 0 to 100:"
-    scale:
-      - "0: Broken code, syntax errors, or deprecated APIs"
-      - "30: Works but uses anti-patterns or inefficient approaches"
-      - "50: Works but not idiomatic"
-      - "80: Follows most best practices"
-      - "100: Follows all best practices, idiomatic implementation"
-    criteria_label: "Check for:"
-  doc-coverage:
-    dimension: doc-coverage
-    header: "Score documentation coverage from 0 to 100:"
-    scale:
-      - "0: Had to hallucinate/guess most implementation details"
-      - "30: Significant gaps — filled with assumptions"
-      - "50: Some gaps — inferred from partial information"
-      - "80: Minor gaps — almost everything was documented"
-      - "100: Complete coverage — all necessary info was in docs"
-# Named scoring profiles — each is a dimension → weight map (must sum to 1.0).
-#
-# 'default': Full three-dimension composite for gold/ceiling entries (with docs).
-# 'output-only': Output quality dimensions only — excludes doc-coverage, which
-#   is semantically undefined on without-docs entries.
-#
-# See docs/design-docs/named-scoring-profiles.md for the rationale.
-profiles:
-  default:
-    task-completion: 0.50
-    code-correctness: 0.25
-    doc-coverage: 0.25
-  output-only:
-    task-completion: 0.60
-    code-correctness: 0.40
-# Mode-to-profile bindings — which profile to use for each (mode, variant) pair.
-# The scoring engine resolves: mode-profiles.<mode>.<variant> → profile name.
-# Falls back to 'default' when no explicit binding exists.
-mode-profiles:
-  baseline:
-    gold: default
-    baseline: output-only
-  agentic:
-    gold: default
-footer:
-  'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}'

package/config/schedules.yaml DELETED Viewed

@@ -1,43 +0,0 @@
-# schedules.yaml
-#
-# Scheduled evaluation configuration for ai-literacy-framework.
-# Each schedule defines a recurring pipeline run with its own source,
-# mode, and delivery preferences.
-#
-# The GitHub Actions cron workflow (.github/workflows/scheduled-eval.yml)
-# reads this file to determine which evaluations to run and when.
-#
-# Schedule names are used as tags on published reports for easy filtering.
-# Cron expressions use UTC timezone (GitHub Actions standard).
-#
-# @see docs/design-docs/report-store/implementation.md — Phase 5
-schedules:
-  # Daily baseline — track score trends against production docs
-  - name: daily-baseline
-    cron: "0 2 * * *" # 2:00 AM UTC, every day
-    mode: baseline
-    source: production
-    publish: true
-    compare: true
-    enabled: true
-  # Weekly full decomposition — complete floor/ceiling/actual report
-  # Runs both baseline + agentic in a single pipeline invocation,
-  # producing the three-layer decomposition with retrieval gap metrics.
-  # Replaces the previous weekly-full (baseline) + weekly-agentic schedule pair.
-  - name: weekly-full
-    cron: "0 3 * * 0" # 3:00 AM UTC, every Sunday
-    mode: full
-    source: production
-    publish: true
-    compare: true
-    enabled: true
-# Digest configuration — aggregates reports into periodic summaries
-digest:
-  # Weekly digest — summarize score trends from the past 7 days
-  enabled: true
-  cron: "0 9 * * 1" # 9:00 AM UTC, every Monday
-  lookbackDays: 7
-  slackWebhookUrl: ${{ SLACK_WEBHOOK_URL }}

package/config/sinks.yaml DELETED Viewed

@@ -1,54 +0,0 @@
-# sinks.yaml
-#
-# Report delivery sink configuration for ai-literacy-framework.
-# Sinks receive published evaluation reports and deliver them to external
-# systems (BigQuery, Slack, GitHub, webhooks, etc.).
-#
-# Sinks are fire-and-forget (P6): a sink failure is logged but never blocks
-# the pipeline. The Sanity Content Lake is the system of record.
-#
-# Environment variables use ${{ VAR }} or ${{ VAR | default }} syntax.
-# A sink with `enabled: false` (or a falsy env var) is skipped entirely.
-#
-# Sinks activate only when their required environment variables are present.
-# A developer running locally with no env vars gets zero sinks.
-#
-# @see docs/design-docs/report-store/sink-architecture.md
-sinks:
-  # BigQuery — disabled; Airbyte ELT is the primary BigQuery delivery mechanism.
-  # The BigQuerySink can be re-enabled as a fallback if Airbyte is unavailable.
-  # See config/airbyte/ for the active connector and config/bigquery/ for views.
-  # - type: bigquery
-  #   enabled: false
-  #   project: ${{ BIGQUERY_PROJECT | data-platform-302218 }}
-  #   dataset: ${{ BIGQUERY_DATASET | ailf }}
-  #   credentials: ${{ GOOGLE_APPLICATION_CREDENTIALS }}
-  # Slack — regression alerts to configured channels
-  # Activates only when SLACK_WEBHOOK_URL is set in the environment.
-  # By default only posts on regressions (avoids notification fatigue).
-  # Routing controls which messages go where by severity level.
-  # - type: slack
-  #   enabled: true
-  #   webhookUrl: ${{ SLACK_WEBHOOK_URL }}
-  #   channel: "#docs-ai-literacy"
-  #   routing:
-  #     critical: "#docs-alerts"    # critical violations → dedicated alert channel
-  #     warning: "#docs-team"       # warnings → team channel
-  #     regression: "#docs-team"    # regressions → team channel
-  #     digest: "#docs-weekly"      # weekly digests → broader channel
-  # GitHub PR comments — score tables on CI-triggered eval runs
-  # - type: github-comment
-  #   enabled: false
-  #   token: ${{ GITHUB_TOKEN }}
-  # Webhook — generic HTTP relay for Airbyte, Zapier, custom services
-  # - type: webhook
-  #   enabled: false
-  #   url: ${{ AILF_WEBHOOK_URL }}
-  #   headers:
-  #     Authorization: "Bearer ${{ AILF_WEBHOOK_TOKEN }}"
-  #   routing:
-  #     critical: true              # webhooks fire on critical only

package/config/sources.yaml DELETED Viewed

@@ -1,51 +0,0 @@
-# sources.yaml
-#
-# Documentation source definitions for ai-literacy-framework evaluations.
-# Each source defines where to find documentation for both evaluation modes:
-#
-#   - Baseline/Observed: reads from Sanity CMS (projectId + dataset)
-#   - Agentic: reads from a live URL (baseUrl, llms.txt derived automatically)
-#
-# Environment variables use ${{ VAR }} or ${{ VAR | default }} syntax.
-# Required vars (no default) cause an error at load time if they are not set.
-#
-# Available fields (all optional except baseUrl, projectId, dataset):
-#   projectId:      Sanity project ID
-#   dataset:        Sanity dataset name
-#   baseUrl:        Documentation site URL
-#   perspective:    Sanity release perspective ID (for evaluating content releases)
-#   studioOrigin:   Sanity Studio base URL (default: https://admin.sanity.io)
-#   allowedOrigins: Origin allowlist for agentic sandboxing
-#
-# Fields settable only via CLI flags or env vars (not in this file):
-#   documentIds:   --sanity-document / SANITY_DOCUMENT_IDS
-#   urls:          --url / DOC_BASE_URL
-#   headers:       --header / DOC_HEADERS
-#
-# Usage:
-#   pnpm eval --source production
-#   pnpm eval:agentic --source branch
-#   DOC_BASE_URL=https://my-branch.sanity.dev/docs pnpm eval:agentic
-sources:
-  # Production Sanity docs — the default when no --source is specified
-  production:
-    projectId: ${{ SANITY_PROJECT_ID | 3do82whm }}
-    dataset: ${{ SANITY_DATASET | next }}
-    baseUrl: https://www.sanity.io/docs
-  # Branch deploy — for testing doc changes before merge
-  # Requires DOC_BASE_URL to be set in the environment
-  # Headers can also be passed via --header flag or DOC_HEADERS env var
-  branch:
-    projectId: ${{ SANITY_PROJECT_ID | 3do82whm }}
-    dataset: ${{ SANITY_DATASET | next }}
-    baseUrl: ${{ DOC_BASE_URL }}
-    allowedOrigins:
-      - ${{ DOC_ALLOWED_ORIGIN | }}
-  # Local dev server — serve docs locally for offline testing
-  local:
-    projectId: ${{ SANITY_PROJECT_ID | 3do82whm }}
-    dataset: ${{ SANITY_DATASET | next }}
-    baseUrl: http://localhost:${{ DOC_LOCAL_PORT | 3001 }}/docs

package/config/thresholds.yaml DELETED Viewed

@@ -1,49 +0,0 @@
-# thresholds.yaml
-#
-# Quality thresholds for readiness gates and regression alerts.
-# Each threshold defines a minimum acceptable score. Violations are
-# classified by severity and routed to configured sinks.
-#
-# Used by:
-# - `pnpm pipeline --readiness` (launch readiness checklist)
-# - `pnpm pipeline --publish` (severity-aware sink routing)
-# - `pnpm pipeline --compare` (regression alerting)
-#
-# @see docs/exec-plans/scenario-matrix-implementation/phase-5-readiness-thresholds.md
-# Global defaults (apply to all areas unless overridden)
-defaults:
-  composite: 50 # minimum composite score
-  dimensions:
-    task-completion: 40
-    code-correctness: 30
-    doc-coverage: 30
-  doc-lift: 0 # minimum Doc Lift (0 = docs must not hurt)
-  ceiling: 40 # minimum ceiling score (doc quality floor)
-# Per-area overrides (inherit from defaults, override specific values)
-areas:
-  groq:
-    composite: 60 # GROQ is critical — higher bar
-    dimensions:
-      task-completion: 50
-  # visual-editing:
-  #   composite: 45 # currently at 36, set achievable near-term target
-  # Areas not listed here use defaults
-# Regression thresholds (for comparison reports)
-regression:
-  composite: -3 # alert if composite drops more than 3 points
-  per-area: -5 # alert if any area drops more than 5 points
-  per-dimension: -8 # alert if any dimension drops more than 8 points
-# Severity classification
-severity:
-  critical: # blocks deployment, immediate notification
-    composite-below: 30
-    negative-doc-lift: true
-  warning: # flags for review, non-blocking
-    composite-below: 50
-    regression-exceeds: -3
-  info: # logged but not alerted
-    composite-below: 60

package/dist/agent-observer/test-imports.d.ts DELETED Viewed

@@ -1,7 +0,0 @@
-/**
- * Final validation — ensures all agent-observer modules work together
- * and the full data pipeline (record → classify → summarize) is correct.
- *
- * Run: tsx src/agent-observer/test-imports.ts
- */
-export {};