npm - @sanity/ailf - Versions diffs - 4.0.6 → 4.1.0 - Mend

@sanity/ailf 4.0.6 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79)

package/bin/ailf.js +6 -1
package/dist/_vendor/ailf-core/schemas/external-providers.d.ts +136 -0
package/dist/_vendor/ailf-core/schemas/external-providers.js +136 -0
package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/index.js +2 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -3
package/dist/_vendor/ailf-core/schemas/report.d.ts +251 -0
package/dist/_vendor/ailf-core/schemas/report.js +235 -0
package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
package/dist/_vendor/ailf-core/services/index.js +1 -0
package/dist/_vendor/ailf-core/services/report-to-markdown.d.ts +38 -0
package/dist/_vendor/ailf-core/services/report-to-markdown.js +696 -0
package/dist/_vendor/ailf-core/types/api-requests.d.ts +159 -0
package/dist/_vendor/ailf-core/types/api-requests.js +27 -0
package/dist/_vendor/ailf-core/types/index.d.ts +3 -0
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +112 -0
package/dist/_vendor/ailf-core/types/pipeline-request.js +18 -0
package/dist/_vendor/ailf-core/types/repo-config.d.ts +146 -0
package/dist/_vendor/ailf-core/types/repo-config.js +18 -0
package/dist/_vendor/ailf-shared/index.d.ts +7 -5
package/dist/_vendor/ailf-shared/index.js +7 -5
package/dist/adapters/api-client/types.d.ts +2 -5
package/dist/adapters/task-sources/content-lake-task-source.d.ts +58 -1
package/dist/adapters/task-sources/content-lake-task-source.js +1 -1
package/dist/adapters/task-sources/index.d.ts +1 -1
package/dist/adapters/task-sources/index.js +1 -1
package/dist/adapters/task-sources/repo-schemas.d.ts +3 -2
package/dist/adapters/task-sources/repo-schemas.js +3 -1
package/dist/adapters/task-sources/repo-task-source.d.ts +11 -1
package/dist/adapters/task-sources/repo-task-source.js +7 -4
package/dist/adapters/task-sources/repo-validation.d.ts +6 -6
package/dist/adapters/task-sources/repo-validation.js +1 -1
package/dist/agent-observer/agentic-provider.d.ts +1 -0
package/dist/agent-observer/agentic-provider.js +43 -36
package/dist/agent-observer/config-schemas.d.ts +61 -0
package/dist/agent-observer/config-schemas.js +65 -0
package/dist/agent-observer/provider.d.ts +1 -0
package/dist/agent-observer/provider.js +19 -17
package/dist/cli.js +4 -4
package/dist/commands/validate-tasks.js +2 -2
package/dist/composition-root.d.ts +7 -0
package/dist/composition-root.js +27 -12
package/dist/index.d.ts +1 -1
package/dist/index.js +1 -1
package/dist/job-store.js +2 -2
package/dist/lib/dotenv-resolution.d.ts +21 -0
package/dist/lib/dotenv-resolution.js +30 -0
package/dist/orchestration/steps/fetch-docs-step.js +10 -30
package/dist/orchestration/steps/generate-configs-step.d.ts +8 -15
package/dist/orchestration/steps/generate-configs-step.js +26 -118
package/dist/orchestration/steps/mirror-repo-tasks-step.js +26 -3
package/dist/orchestration/steps/run-eval-step.js +21 -3
package/dist/pipeline/agent-behavior-report.d.ts +2 -8
package/dist/pipeline/cache.d.ts +2 -2
package/dist/pipeline/checks.d.ts +10 -2
package/dist/pipeline/checks.js +14 -4
package/dist/pipeline/compiler/literacy-bridge.js +2 -2
package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +2 -2
package/dist/pipeline/compiler/mode-handlers/index.d.ts +1 -1
package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +2 -2
package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +3 -3
package/dist/pipeline/compiler/promptfoo-compiler.js +7 -11
package/dist/pipeline/compiler/provider-assembler.js +33 -3
package/dist/pipeline/compiler/rubric-resolution.d.ts +2 -2
package/dist/pipeline/mirror-repo-tasks.d.ts +13 -5
package/dist/pipeline/mirror-repo-tasks.js +16 -8
package/dist/pipeline/pr-comment.d.ts +22 -9
package/dist/pipeline/pr-comment.js +52 -472
package/dist/pipeline/resolve-mappings.d.ts +8 -3
package/dist/promptfoo-providers/mock-path.d.ts +12 -0
package/dist/promptfoo-providers/mock-path.js +15 -0
package/dist/report-store.d.ts +63 -1
package/dist/report-store.js +111 -31
package/dist/sanity/client.d.ts +58 -0
package/dist/sanity/client.js +106 -0
package/package.json +8 -7
package/dist/orchestration/load-pipeline-tasks.d.ts +0 -40
package/dist/orchestration/load-pipeline-tasks.js +0 -57

package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts CHANGED Viewed

@@ -2,7 +2,7 @@
  * Shared types for the agent harness mode handler.
  */
 import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
-import type { RubricConfig } from "../../rubric-resolution.js";
+import type { RubricResolutionInput } from "../../rubric-resolution.js";
 import type { SandboxType } from "../../sandbox/sandbox-strategy.js";
 /** Options for compiling an agent harness task */
 export interface AgentHarnessCompileOptions {
@@ -11,7 +11,7 @@ export interface AgentHarnessCompileOptions {
     /** Root directory for fixture resolution */
     rootDir?: string;
     /** Rubric config (templates, weights, profiles) — loaded from rubrics config */
-    rubricConfig?: RubricConfig;
+    rubricConfig?: RubricResolutionInput;
 }
 /** Result of compiling a single agent harness task */
 export interface AgentHarnessCompileResult {

package/dist/pipeline/compiler/mode-handlers/index.d.ts CHANGED Viewed

@@ -10,6 +10,6 @@
  * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
  */
 export { buildMCPAssertions, compileMCPTask, handler as mcpServerHandler, validateMCPTask, type MCPAssertionContext, type MCPCompileOptions, type MCPCompileResult, type MCPValidationError, } from "./mcp-server/index.js";
-export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, type LiteracyCompileOptions, type LiteracyCompileResult, type LiteracyValidationError, type RubricConfig, } from "./literacy/index.js";
+export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, type LiteracyCompileOptions, type LiteracyCompileResult, type LiteracyValidationError, type RubricResolutionInput, } from "./literacy/index.js";
 export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, type KnowledgeProbeCompileOptions, type KnowledgeProbeCompileResult, type KnowledgeProbeMetadata, type KnowledgeProbeValidationError, } from "./knowledge-probe/index.js";
 export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, type AgentHarnessCompileOptions, type AgentHarnessCompileResult, type AgentHarnessValidationError, type PromptfooExtension, type SandboxConfigMeta, } from "./agent-harness/index.js";

package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts CHANGED Viewed

@@ -2,7 +2,7 @@
  * Public types for the knowledge-probe mode handler.
  */
 import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
-import type { RubricConfig } from "../../rubric-resolution.js";
+import type { RubricResolutionInput } from "../../rubric-resolution.js";
 /** Options for compiling a knowledge probe task */
 export interface KnowledgeProbeCompileOptions {
     /** Grader provider for LLM-graded assertions */
@@ -15,7 +15,7 @@ export interface KnowledgeProbeCompileOptions {
     }[];
     /** Rubric config (templates, weights, profiles) — needed to resolve
      * templated `llm-rubric` assertions to dimension metadata. */
-    rubricConfig?: RubricConfig;
+    rubricConfig?: RubricResolutionInput;
 }
 /** Result of compiling a single knowledge probe task */
 export interface KnowledgeProbeCompileResult {

package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts CHANGED Viewed

@@ -7,5 +7,5 @@ import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
 export { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
 export { validateLiteracyTask, type LiteracyValidationError, } from "./validation.js";
 export { compileLiteracyTask } from "./compiler.js";
-export type { LiteracyCompileOptions, LiteracyCompileResult, RubricConfig, } from "./types.js";
+export type { LiteracyCompileOptions, LiteracyCompileResult, RubricResolutionInput, } from "./types.js";
 export declare const handler: ModeHandler;

package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts CHANGED Viewed

@@ -2,8 +2,8 @@
  * Shared types for the literacy mode handler.
  */
 import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
-export type { RubricConfig } from "../../rubric-resolution.js";
-import type { RubricConfig } from "../../rubric-resolution.js";
+export type { RubricResolutionInput } from "../../rubric-resolution.js";
+import type { RubricResolutionInput } from "../../rubric-resolution.js";
 /** Options for compiling a literacy task */
 export interface LiteracyCompileOptions {
     /** Grader provider for LLM-graded assertions */
@@ -19,7 +19,7 @@ export interface LiteracyCompileOptions {
         config?: Record<string, unknown>;
     }[];
     /** Rubric config (templates, weights, profiles) — loaded from rubrics config */
-    rubricConfig?: RubricConfig;
+    rubricConfig?: RubricResolutionInput;
 }
 /** Result of compiling a single literacy task */
 export interface LiteracyCompileResult {

package/dist/pipeline/compiler/promptfoo-compiler.js CHANGED Viewed

@@ -11,20 +11,11 @@
  *
  * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
  */
-import { dirname, resolve as resolvePath } from "node:path";
-import { fileURLToPath } from "node:url";
 import { mapAssertions } from "./assertion-mapper.js";
 import { resolveTaskFixtures } from "./fixture-resolver.js";
 import { LiteracyVariant } from "../normalize-mode.js";
 import { resolveVariables } from "./variable-resolver.js";
-/**
- * Absolute filesystem path to the AILF mock Promptfoo provider. Resolved
- * once at module load relative to this file. Promptfoo's `file://` provider
- * loader requires an absolute path. See buildProviders for the env-var
- * gate that swaps real providers for this mock.
- */
-const __dirname = dirname(fileURLToPath(import.meta.url));
-const MOCK_PROVIDER_ABSPATH = resolvePath(__dirname, "..", "..", "promptfoo-providers", "mock-provider.cjs");
+import { MOCK_PROVIDER_ABSPATH } from "../../promptfoo-providers/mock-path.js";
 // ---------------------------------------------------------------------------
 // Public API
 // ---------------------------------------------------------------------------
@@ -158,12 +149,17 @@ function buildProviders(models, mode) {
     // never makes a live LLM call. We preserve `label` and stash the
     // original `id` in `config.originalId` so the mock provider can surface
     // model identity in its output and reports remain interpretable.
+    // `originalId ?? p.id` guards against double-swap clobbering a real id
+    // that's already been preserved on a prior pass.
     // See W0110 (M5.1) and docs/design-docs/testing-strategy.md.
     if (process.env.AILF_REPLAY_LLMS === "1") {
         return providers.map((p) => ({
             id: `file://${MOCK_PROVIDER_ABSPATH}`,
             label: p.label,
-            config: { ...p.config, originalId: p.id },
+            config: {
+                ...p.config,
+                originalId: p.config?.originalId ?? p.id,
+            },
         }));
     }
     return providers;

package/dist/pipeline/compiler/provider-assembler.js CHANGED Viewed

@@ -25,6 +25,36 @@
 import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
 import { loadConfigFile } from "./config-loader.js";
 import { modelMatchesLiteracyVariant } from "./mode-bases/literacy.js";
+import { MOCK_PROVIDER_ABSPATH } from "../../promptfoo-providers/mock-path.js";
+/**
+ * Apply the W0110 replay swap to a list of literacy provider records.
+ *
+ * When `AILF_REPLAY_LLMS=1`, every provider's `id` is rewritten to the
+ * file-based AILF mock provider so the Promptfoo subprocess never makes
+ * a live LLM call. We preserve `label` and stash the original `id` in
+ * `config.originalId` so reports remain interpretable. This mirrors the
+ * top-level `buildProviders` swap in `promptfoo-compiler.ts` — it exists
+ * here because the literacy mode runs through this assembler, not
+ * `compileToPromptfoo`, so without this hook the replay flag was a no-op
+ * for every literacy run (W0138 Slice 2 surface).
+ *
+ * `originalId` is set with `?? p.id` so a record that's already been
+ * swapped (or that pre-stashed an `originalId` for any other reason)
+ * doesn't get its real model id clobbered by the file:// path.
+ */
+function applyReplaySwap(providers) {
+    if (process.env.AILF_REPLAY_LLMS !== "1")
+        return providers;
+    return providers.map((raw) => {
+        const p = raw;
+        const config = p.config ?? {};
+        return {
+            id: `file://${MOCK_PROVIDER_ABSPATH}`,
+            label: p.label,
+            config: { ...config, originalId: config.originalId ?? p.id },
+        };
+    });
+}
 // ---------------------------------------------------------------------------
 // Public API
 // ---------------------------------------------------------------------------
@@ -40,9 +70,9 @@ export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigi
     return {
         models,
         providers: {
-            baseline: buildBaselineProviders(models),
-            agentic: buildAgenticProviders(models, source, searchMode, allowedOrigins),
-            observed: buildObservedProviders(models),
+            baseline: applyReplaySwap(buildBaselineProviders(models)),
+            agentic: applyReplaySwap(buildAgenticProviders(models, source, searchMode, allowedOrigins)),
+            observed: applyReplaySwap(buildObservedProviders(models)),
         },
     };
 }

package/dist/pipeline/compiler/rubric-resolution.d.ts CHANGED Viewed

@@ -14,7 +14,7 @@
  */
 import type { PromptfooAssertion } from "./assertion-mapper.js";
 /** Minimal rubric config needed for template resolution */
-export interface RubricConfig {
+export interface RubricResolutionInput {
     templates: Record<string, {
         criteria_label?: string;
         dimension?: string;
@@ -37,4 +37,4 @@ export declare function resolveTemplatedAssertion(assertion: {
     criteria: string[];
     template: string;
     type: string;
-}, rubricConfig: RubricConfig | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;
+}, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;

package/dist/pipeline/mirror-repo-tasks.d.ts CHANGED Viewed

@@ -15,8 +15,19 @@
 import type { SanityClient } from "@sanity/client";
 import { type LiteracyTaskDefinition, type Logger } from "../_vendor/ailf-core/index.d.ts";
 export interface MirrorOptions {
-    /** Sanity client with write access */
+    /**
+     * Sanity client targeting the AILF private dataset — used to write
+     * `ailf.task` and `ailf.featureArea` documents and to read existing
+     * mirror state. Per D0043, AILF docs live in `ailf-prod-private`.
+     */
     client: SanityClient;
+    /**
+     * Sanity client targeting the editorial dataset — used to resolve
+     * `article` slugs to document IDs for canonical-doc references. After
+     * the dataset split, the AILF client cannot see editorial documents,
+     * so this must be a separate client (or omitted to skip slug resolution).
+     */
+    editorialClient?: SanityClient;
     /** Tasks to mirror (already loaded from repo) */
     tasks: LiteracyTaskDefinition[];
     /** Git context for origin provenance */
@@ -124,10 +135,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
         _key: string;
         reason: string;
     } | {
-        doc?: {
-            _ref: string;
-            _type: string;
-        } | undefined;
+        doc?: import("../sanity/client.js").EditorialReference | undefined;
         docId?: string | undefined;
         refType: string;
         _key: string;

package/dist/pipeline/mirror-repo-tasks.js CHANGED Viewed

@@ -16,6 +16,7 @@ import { createHash } from "crypto";
 import { readFileSync } from "fs";
 import { isIdRef, isPathRef, isPerspectiveRef, isSlugRef, } from "../_vendor/ailf-core/index.js";
 import { ConsoleLogger } from "../adapters/loggers/index.js";
+import { buildEditorialReference } from "../sanity/client.js";
 // ---------------------------------------------------------------------------
 // Public API
 // ---------------------------------------------------------------------------
@@ -31,7 +32,7 @@ import { ConsoleLogger } from "../adapters/loggers/index.js";
  * 6. Upsert the ailf.task document with origin block
  */
 export async function mirrorRepoTasks(options) {
-    const { client, tasks, git, dryRun = false, logger } = options;
+    const { client, editorialClient, tasks, git, dryRun = false, logger, } = options;
     const log = logger ?? new ConsoleLogger();
     const result = {
         total: tasks.length,
@@ -44,11 +45,18 @@ export async function mirrorRepoTasks(options) {
     if (tasks.length === 0)
         return result;
     // Batch-resolve all context doc slugs (slug refs only — other ref types
-    // are stored without a resolved article reference for now)
+    // are stored without a resolved article reference for now). Slugs live on
+    // `article` documents in the editorial dataset, so this must use the
+    // editorial client. Without one, every slug ref stays unresolved.
     const allSlugs = [
         ...new Set(tasks.flatMap((t) => (t.context?.docs ?? []).filter(isSlugRef).map((d) => d.slug))),
     ];
-    const slugToDocId = await batchResolveDocSlugs(client, allSlugs);
+    const slugToDocId = editorialClient
+        ? await batchResolveDocSlugs(editorialClient, allSlugs)
+        : new Map();
+    if (!editorialClient && allSlugs.length > 0) {
+        log.warn("  ⚠️  No editorial Sanity client provided — skipping slug→article resolution");
+    }
     // Track unresolved slugs
     for (const slug of allSlugs) {
         if (!slugToDocId.has(slug)) {
@@ -363,13 +371,13 @@ export function buildMirrorDocument(task, opts) {
             // When a slug resolves to a document, store as "id" ref with
             // the resolved article reference. When unresolved, store as
             // "slug" so Studio knows the resolution strategy even if the
-            // article doesn't exist yet.
+            // article doesn't exist yet. The `doc` reference is a Cross
+            // Dataset Reference per D0043 — `ailf.task` lives in the AILF
+            // private dataset and `article` lives in the editorial dataset.
             return {
                 ...base,
                 refType: resolvedId ? "id" : "slug",
-                ...(resolvedId
-                    ? { doc: { _ref: resolvedId, _type: "reference" } }
-                    : {}),
+                ...(resolvedId ? { doc: buildEditorialReference(resolvedId) } : {}),
             };
         }
         if (isPathRef(ref)) {
@@ -380,7 +388,7 @@ export function buildMirrorDocument(task, opts) {
                 ...base,
                 refType: "id",
                 ...(ref.id
-                    ? { doc: { _ref: ref.id, _type: "reference" }, docId: ref.id }
+                    ? { doc: buildEditorialReference(ref.id), docId: ref.id }
                     : {}),
             };
         }

package/dist/pipeline/pr-comment.d.ts CHANGED Viewed

@@ -1,19 +1,32 @@
 /**
- * pipeline/pr-comment.ts — Generates a markdown PR comment from eval score-summary.json.
+ * pipeline/pr-comment.ts — Generate a markdown PR comment from
+ * `results/latest/score-summary.json` (and an optional comparison-report).
  *
- * All functions accept rootDir as a parameter — no module-level constants.
- * No process.argv parsing. No env var fallbacks.
+ * Thin wrapper around `@sanity/ailf-core`'s unified renderer (W0150).
+ * Reads the local JSON files, applies legacy-field normalization on the
+ * scores, builds a `RenderableReport` envelope (so the CLI's
+ * `--promptfoo-url` flag flows through `provenance.promptfooUrls[0]`),
+ * then delegates rendering.
  *
- * Reads: results/latest/score-summary.json
- * Writes: markdown to stdout or --output file
+ * All functions accept `rootDir` as a parameter — no module-level
+ * constants. No `process.argv` parsing. No env-var fallbacks.
  */
-/** Options for the generatePrComment() function. */
+import { type RenderableReport } from "../_vendor/ailf-core/index.d.ts";
+import type { ComparisonReport, ScoreSummary } from "./types.js";
+/** Options for the {@link generatePrComment} function. */
 export interface PrCommentOptions {
-    /** Path to write the comment (default: stdout) */
+    /** Path to write the comment (default: stdout). */
     outputPath?: string;
-    /** Promptfoo share URL to include in the comment */
+    /** Promptfoo share URL to include as the footer "view detailed results" link. */
     promptfooUrl?: string;
-    /** Root directory of the eval package */
+    /** Root directory of the eval package. */
     rootDir: string;
 }
 export declare function generatePrComment(options: PrCommentOptions): void;
+/**
+ * Adapter: build a {@link RenderableReport} from the in-memory pipeline
+ * artifacts. Exposed for the cross-renderer byte-equality contract test.
+ */
+export declare function scoreSummaryToRenderableReport(summary: ScoreSummary, comparison: ComparisonReport | undefined, options?: {
+    promptfooUrl?: string;
+}): RenderableReport;