npm - @sanity/ailf - Versions diffs - 2.0.1 → 2.1.0 - Mend

@sanity/ailf 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (160)

package/LICENSE +21 -0
package/dist/cli.js +0 -0
package/dist/orchestration/steps/run-eval-step.js +1 -1
package/dist/pipeline/checks.d.ts +8 -3
package/dist/pipeline/checks.js +23 -3
package/package.json +25 -25
package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
package/dist/_vendor/ailf-tasks/cli.js +0 -61
package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
package/dist/_vendor/ailf-tasks/index.js +0 -16
package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
package/dist/_vendor/ailf-tasks/parser.js +0 -73
package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
package/dist/_vendor/ailf-tasks/schemas.js +0 -180
package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
package/dist/_vendor/ailf-tasks/validation.js +0 -162
package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
package/dist/adapters/task-sources/yaml-task-source.js +0 -139
package/dist/agent-observer/test-imports.d.ts +0 -7
package/dist/agent-observer/test-imports.js +0 -185
package/dist/commands/update-quality-scores.d.ts +0 -5
package/dist/commands/update-quality-scores.js +0 -20
package/dist/lib/agent-behavior-report.d.ts +0 -8
package/dist/lib/agent-behavior-report.js +0 -185
package/dist/lib/baseline.d.ts +0 -19
package/dist/lib/baseline.js +0 -153
package/dist/lib/calculate-scores.d.ts +0 -23
package/dist/lib/calculate-scores.js +0 -42
package/dist/lib/compare.d.ts +0 -18
package/dist/lib/compare.js +0 -170
package/dist/lib/coverage-audit.d.ts +0 -4
package/dist/lib/coverage-audit.js +0 -42
package/dist/lib/discovery-report.d.ts +0 -13
package/dist/lib/discovery-report.js +0 -57
package/dist/lib/fetch-docs.d.ts +0 -30
package/dist/lib/fetch-docs.js +0 -171
package/dist/lib/generate-configs.d.ts +0 -25
package/dist/lib/generate-configs.js +0 -42
package/dist/lib/grader-api.d.ts +0 -21
package/dist/lib/grader-api.js +0 -34
package/dist/lib/grader-compare.d.ts +0 -19
package/dist/lib/grader-compare.js +0 -91
package/dist/lib/grader-consistency.d.ts +0 -27
package/dist/lib/grader-consistency.js +0 -79
package/dist/lib/grader-sensitivity.d.ts +0 -19
package/dist/lib/grader-sensitivity.js +0 -75
package/dist/lib/grader-validate.d.ts +0 -19
package/dist/lib/grader-validate.js +0 -78
package/dist/lib/measure-retrieval.d.ts +0 -14
package/dist/lib/measure-retrieval.js +0 -71
package/dist/lib/pr-comment.d.ts +0 -16
package/dist/lib/pr-comment.js +0 -28
package/dist/lib/readiness-report.d.ts +0 -13
package/dist/lib/readiness-report.js +0 -108
package/dist/lib/webhook-server.d.ts +0 -11
package/dist/lib/webhook-server.js +0 -24
package/dist/lib/weekly-digest.d.ts +0 -24
package/dist/lib/weekly-digest.js +0 -148
package/dist/orchestration/env-bridge.d.ts +0 -21
package/dist/orchestration/env-bridge.js +0 -66
package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
package/dist/pipeline/compiler/task-bridge.js +0 -92
package/dist/pipeline/expand-tasks.d.ts +0 -232
package/dist/pipeline/expand-tasks.js +0 -467
package/dist/pipeline/generate-configs.d.ts +0 -92
package/dist/pipeline/generate-configs.js +0 -445
package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
package/dist/pipeline/steps/calculate-scores-step.js +0 -89
package/dist/pipeline/steps/compare-step.d.ts +0 -18
package/dist/pipeline/steps/compare-step.js +0 -90
package/dist/pipeline/steps/eval-step.d.ts +0 -53
package/dist/pipeline/steps/eval-step.js +0 -347
package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
package/dist/pipeline/steps/fetch-docs-step.js +0 -84
package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
package/dist/pipeline/steps/generate-configs-step.js +0 -98
package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
package/dist/pipeline/steps/grader-consistency-step.js +0 -74
package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
package/dist/pipeline/steps/publish-report-step.js +0 -243
package/dist/pipeline/steps/report-step.d.ts +0 -13
package/dist/pipeline/steps/report-step.js +0 -56
package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
package/dist/pipeline/steps/update-scores-step.js +0 -42
package/dist/scripts/agent-behavior-report.d.ts +0 -19
package/dist/scripts/agent-behavior-report.js +0 -315
package/dist/scripts/baseline.d.ts +0 -43
package/dist/scripts/baseline.js +0 -267
package/dist/scripts/calculate-scores.d.ts +0 -166
package/dist/scripts/calculate-scores.js +0 -1296
package/dist/scripts/compare.d.ts +0 -22
package/dist/scripts/compare.js +0 -334
package/dist/scripts/coverage-audit.d.ts +0 -44
package/dist/scripts/coverage-audit.js +0 -209
package/dist/scripts/debug-eval.d.ts +0 -19
package/dist/scripts/debug-eval.js +0 -73
package/dist/scripts/discovery-report.d.ts +0 -58
package/dist/scripts/discovery-report.js +0 -250
package/dist/scripts/fetch-docs.d.ts +0 -35
package/dist/scripts/fetch-docs.js +0 -472
package/dist/scripts/generate-configs.d.ts +0 -66
package/dist/scripts/generate-configs.js +0 -459
package/dist/scripts/grader-api.d.ts +0 -27
package/dist/scripts/grader-api.js +0 -206
package/dist/scripts/grader-compare.d.ts +0 -22
package/dist/scripts/grader-compare.js +0 -368
package/dist/scripts/grader-consistency.d.ts +0 -20
package/dist/scripts/grader-consistency.js +0 -313
package/dist/scripts/grader-sensitivity.d.ts +0 -22
package/dist/scripts/grader-sensitivity.js +0 -354
package/dist/scripts/grader-validate.d.ts +0 -19
package/dist/scripts/grader-validate.js +0 -267
package/dist/scripts/measure-retrieval.d.ts +0 -10
package/dist/scripts/measure-retrieval.js +0 -145
package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
package/dist/scripts/pipeline.d.ts +0 -76
package/dist/scripts/pipeline.js +0 -1031
package/dist/scripts/pr-comment.d.ts +0 -10
package/dist/scripts/pr-comment.js +0 -510
package/dist/scripts/readiness-report.d.ts +0 -88
package/dist/scripts/readiness-report.js +0 -342
package/dist/scripts/update-quality-scores.d.ts +0 -15
package/dist/scripts/update-quality-scores.js +0 -184
package/dist/scripts/validate-task-sources.d.ts +0 -21
package/dist/scripts/validate-task-sources.js +0 -210
package/dist/scripts/validate.d.ts +0 -13
package/dist/scripts/validate.js +0 -79
package/dist/scripts/webhook-server.d.ts +0 -26
package/dist/scripts/webhook-server.js +0 -147
package/dist/scripts/weekly-digest.d.ts +0 -24
package/dist/scripts/weekly-digest.js +0 -144
package/dist/sinks/format-slack.d.ts +0 -64
package/dist/sinks/format-slack.js +0 -306
package/dist/sinks/slack-sink.d.ts +0 -27
package/dist/sinks/slack-sink.js +0 -78
package/dist/sinks/webhook-sink.d.ts +0 -19
package/dist/sinks/webhook-sink.js +0 -50
package/tasks/.expanded.agentic.yaml +0 -280
package/tasks/.expanded.yaml +0 -565

package/tasks/.expanded.agentic.yaml DELETED

@@ -1,280 +0,0 @@
-# .expanded.agentic.yaml
-#
-# AUTO-GENERATED by compiler pipeline — do not edit directly.
-# Gold entries only (no baseline) for agentic evaluation mode.
-# Run: npx @sanity/ailf generate-configs
-- description: GROQ - Blog queries with filtering and pagination (gold)
-  vars:
-    task: |-
-      Write GROQ queries for a Sanity blog application:
-      1. Fetch all published blog posts ordered by publishedAt descending,
-         with a projection that includes: _id, title, slug (from slug.current),
-         publishedAt, excerpt, and the author's name (resolved from a reference)
-      2. Add pagination to return only the first 10 results
-      3. Fetch a single post by its slug parameter, including the full body
-         content and resolved author and category references
-      4. Fetch posts published after a specific date
-      5. Fetch posts that belong to a specific category (where categories
-         is an array of references)
-      Use @sanity/client with client.fetch() for all queries. Include
-      TypeScript types for the query results.
-    docs: file://contexts/canonical/groq-blog-queries.md
-    __featureArea: groq
-  assert:
-    - type: llm-rubric
-      value: |-
-        Score task completion from 0 to 100:
-        - 0: Couldn't attempt — missing critical information
-        - 20: Attempted but fundamentally wrong approach
-        - 50: Partial implementation — major functional gaps
-        - 80: Mostly complete — minor issues or missing edge cases
-        - 100: Fully functional code — works as expected
-        Must demonstrate:
-        - GROQ filter with _type == "post"
-        - Projection with aliased slug field ("slug": slug.current)
-        - Reference resolution with -> for author
-        - Ordering with | order(publishedAt desc)
-        - Slice/pagination syntax [0...10] or [0..9]
-        - Parameterized query with $slug for single post fetch
-        - Date filtering with dateTime() or string comparison
-        - Category filtering using references or array contains
-        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
-      provider: anthropic:messages:claude-opus-4-5-20251101
-      metadata:
-        dimension: task-completion
-        maxScore: 100
-    - type: llm-rubric
-      value: |-
-        Score code correctness from 0 to 100:
-        - 0: Broken code, syntax errors, or deprecated APIs
-        - 30: Works but uses anti-patterns or inefficient approaches
-        - 50: Works but not idiomatic
-        - 80: Follows most best practices
-        - 100: Follows all best practices, idiomatic implementation
-        Check for:
-        - Valid GROQ syntax (proper filter brackets, projection braces)
-        - Uses @sanity/client createClient + client.fetch()
-        - Correct parameter passing syntax ($param)
-        - Proper reference dereference with ->
-        - No deprecated patterns
-        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
-      provider: anthropic:messages:claude-opus-4-5-20251101
-      metadata:
-        dimension: code-correctness
-        maxScore: 100
-    - type: contains-any
-      value:
-        - client.fetch
-        - createClient
-      weight: 1
-    - type: contains-any
-      value:
-        - order(publishedAt
-        - order(_createdAt
-        - '| order('
-      weight: 1
-    - type: contains-any
-      value:
-        - '[0...10]'
-        - '[0..9]'
-        - '[0...'
-      weight: 1
-    - type: llm-rubric
-      value: |-
-        Score documentation coverage from 0 to 100:
-        - 0: Had to hallucinate/guess most implementation details
-        - 30: Significant gaps — filled with assumptions
-        - 50: Some gaps — inferred from partial information
-        - 80: Minor gaps — almost everything was documented
-        - 100: Complete coverage — all necessary info was in docs
-        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
-      provider: anthropic:messages:claude-opus-4-5-20251101
-      metadata:
-        dimension: doc-coverage
-        maxScore: 100
-- description: GROQ - Joins and reference resolution (gold)
-  vars:
-    task: |-
-      Write GROQ queries that demonstrate join patterns in Sanity:
-      1. Follow a single reference to resolve an author's full profile
-         from a post (post.author -> author document with name, bio, image)
-      2. Resolve an array of category references from a post
-         (post.categories[]-> with title and slug)
-      3. Write a reverse reference query: given an author's ID, find all
-         posts by that author using a subquery and the parent scope operator (^)
-      4. Create a nested join: for each author, include their 5 most recent
-         posts as a nested array
-      5. Use the references() function to find all documents that reference
-         a specific document ID
-      Use @sanity/client with client.fetch(). Include TypeScript types.
-    docs: file://contexts/canonical/groq-joins-references.md
-    __featureArea: groq
-  assert:
-    - type: llm-rubric
-      value: |-
-        Score task completion from 0 to 100:
-        - 0: Couldn't attempt — missing critical information
-        - 20: Attempted but fundamentally wrong approach
-        - 50: Partial implementation — major functional gaps
-        - 80: Mostly complete — minor issues or missing edge cases
-        - 100: Fully functional code — works as expected
-        Must demonstrate:
-        - Single reference follow with -> operator
-        - Array reference resolution with []->
-        - Reverse reference / subquery using *[references(^._id)]
-        - Nested join pattern with parent scope (^)
-        - The references() function
-        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
-      provider: anthropic:messages:claude-opus-4-5-20251101
-      metadata:
-        dimension: task-completion
-        maxScore: 100
-    - type: llm-rubric
-      value: |-
-        Score code correctness from 0 to 100:
-        - 0: Broken code, syntax errors, or deprecated APIs
-        - 30: Works but uses anti-patterns or inefficient approaches
-        - 50: Works but not idiomatic
-        - 80: Follows most best practices
-        - 100: Follows all best practices, idiomatic implementation
-        Check for:
-        - Correct -> dereference syntax
-        - Valid []-> array dereference
-        - Proper use of ^ parent scope operator
-        - Valid references() function usage
-        - No made-up syntax
-        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
-      provider: anthropic:messages:claude-opus-4-5-20251101
-      metadata:
-        dimension: code-correctness
-        maxScore: 100
-    - type: contains
-      value: '->'
-      weight: 1
-    - type: contains-any
-      value:
-        - references(
-        - references(^
-      weight: 1
-    - type: llm-rubric
-      value: |-
-        Score documentation coverage from 0 to 100:
-        - 0: Had to hallucinate/guess most implementation details
-        - 30: Significant gaps — filled with assumptions
-        - 50: Some gaps — inferred from partial information
-        - 80: Minor gaps — almost everything was documented
-        - 100: Complete coverage — all necessary info was in docs
-        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
-      provider: anthropic:messages:claude-opus-4-5-20251101
-      metadata:
-        dimension: doc-coverage
-        maxScore: 100
-- description: GROQ - Advanced filtering and projections (gold)
-  vars:
-    task: |-
-      Write GROQ queries demonstrating advanced filtering and projection patterns:
-      1. Use select() for conditional projections — return different fields
-         based on the document's _type (e.g., posts get excerpt, events get
-         date and venue)
-      2. Use coalesce() for fallback values — e.g., use seoTitle if it
-         exists, otherwise fall back to title
-      3. Use the match operator for full-text search in titles
-      4. Use count() to count documents matching a filter and to count
-         items within an array field
-      5. Use defined() to filter for documents that have a specific field set
-      6. Filter items within an array using [condition] syntax
-      7. Order results by multiple fields (e.g., featured status first,
-         then by publishedAt)
-      Use @sanity/client with client.fetch(). Include TypeScript types.
-    docs: file://contexts/canonical/groq-advanced-filtering.md
-    __featureArea: groq
-  assert:
-    - type: llm-rubric
-      value: |-
-        Score task completion from 0 to 100:
-        - 0: Couldn't attempt — missing critical information
-        - 20: Attempted but fundamentally wrong approach
-        - 50: Partial implementation — major functional gaps
-        - 80: Mostly complete — minor issues or missing edge cases
-        - 100: Fully functional code — works as expected
-        Must demonstrate:
-        - select() for conditional projections
-        - coalesce() for fallback values
-        - match operator for text search
-        - count() function usage
-        - defined() function for existence checks
-        - Array filtering with [condition]
-        - Multi-field ordering
-        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
-      provider: anthropic:messages:claude-opus-4-5-20251101
-      metadata:
-        dimension: task-completion
-        maxScore: 100
-    - type: llm-rubric
-      value: |-
-        Score code correctness from 0 to 100:
-        - 0: Broken code, syntax errors, or deprecated APIs
-        - 30: Works but uses anti-patterns or inefficient approaches
-        - 50: Works but not idiomatic
-        - 80: Follows most best practices
-        - 100: Follows all best practices, idiomatic implementation
-        Check for:
-        - Valid select() syntax with => arrow notation
-        - Correct coalesce() usage
-        - Proper match operator usage (on text fields)
-        - Valid count() and defined() function calls
-        - Correct array filter syntax
-        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
-      provider: anthropic:messages:claude-opus-4-5-20251101
-      metadata:
-        dimension: code-correctness
-        maxScore: 100
-    - type: contains-any
-      value:
-        - select(
-        - coalesce(
-      weight: 1
-    - type: contains-any
-      value:
-        - count(
-        - defined(
-      weight: 1
-    - type: contains-any
-      value:
-        - match
-      weight: 1
-    - type: llm-rubric
-      value: |-
-        Score documentation coverage from 0 to 100:
-        - 0: Had to hallucinate/guess most implementation details
-        - 30: Significant gaps — filled with assumptions
-        - 50: Some gaps — inferred from partial information
-        - 80: Minor gaps — almost everything was documented
-        - 100: Complete coverage — all necessary info was in docs
-        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
-      provider: anthropic:messages:claude-opus-4-5-20251101
-      metadata:
-        dimension: doc-coverage
-        maxScore: 100