@sanity/ailf 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/bin/ailf.js +6 -1
  2. package/dist/_vendor/ailf-core/schemas/external-providers.d.ts +136 -0
  3. package/dist/_vendor/ailf-core/schemas/external-providers.js +136 -0
  4. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  5. package/dist/_vendor/ailf-core/schemas/index.js +2 -0
  6. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -3
  7. package/dist/_vendor/ailf-core/schemas/report.d.ts +251 -0
  8. package/dist/_vendor/ailf-core/schemas/report.js +235 -0
  9. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  10. package/dist/_vendor/ailf-core/services/index.js +1 -0
  11. package/dist/_vendor/ailf-core/services/report-to-markdown.d.ts +38 -0
  12. package/dist/_vendor/ailf-core/services/report-to-markdown.js +696 -0
  13. package/dist/_vendor/ailf-core/types/api-requests.d.ts +159 -0
  14. package/dist/_vendor/ailf-core/types/api-requests.js +27 -0
  15. package/dist/_vendor/ailf-core/types/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +112 -0
  17. package/dist/_vendor/ailf-core/types/pipeline-request.js +18 -0
  18. package/dist/_vendor/ailf-core/types/repo-config.d.ts +146 -0
  19. package/dist/_vendor/ailf-core/types/repo-config.js +18 -0
  20. package/dist/_vendor/ailf-shared/index.d.ts +7 -5
  21. package/dist/_vendor/ailf-shared/index.js +7 -5
  22. package/dist/adapters/api-client/types.d.ts +2 -5
  23. package/dist/adapters/task-sources/content-lake-task-source.d.ts +58 -1
  24. package/dist/adapters/task-sources/content-lake-task-source.js +1 -1
  25. package/dist/adapters/task-sources/index.d.ts +1 -1
  26. package/dist/adapters/task-sources/index.js +1 -1
  27. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -2
  28. package/dist/adapters/task-sources/repo-schemas.js +3 -1
  29. package/dist/adapters/task-sources/repo-validation.d.ts +6 -6
  30. package/dist/adapters/task-sources/repo-validation.js +1 -1
  31. package/dist/agent-observer/agentic-provider.d.ts +1 -0
  32. package/dist/agent-observer/agentic-provider.js +43 -36
  33. package/dist/agent-observer/config-schemas.d.ts +61 -0
  34. package/dist/agent-observer/config-schemas.js +65 -0
  35. package/dist/agent-observer/provider.d.ts +1 -0
  36. package/dist/agent-observer/provider.js +19 -17
  37. package/dist/cli.js +4 -4
  38. package/dist/commands/validate-tasks.js +2 -2
  39. package/dist/composition-root.js +4 -2
  40. package/dist/index.d.ts +1 -1
  41. package/dist/index.js +1 -1
  42. package/dist/job-store.js +2 -2
  43. package/dist/lib/dotenv-resolution.d.ts +21 -0
  44. package/dist/lib/dotenv-resolution.js +30 -0
  45. package/dist/orchestration/steps/mirror-repo-tasks-step.js +14 -3
  46. package/dist/orchestration/steps/run-eval-step.js +21 -3
  47. package/dist/pipeline/agent-behavior-report.d.ts +2 -8
  48. package/dist/pipeline/cache.d.ts +2 -2
  49. package/dist/pipeline/checks.d.ts +10 -2
  50. package/dist/pipeline/checks.js +14 -4
  51. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  52. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +2 -2
  53. package/dist/pipeline/compiler/mode-handlers/index.d.ts +1 -1
  54. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +2 -2
  55. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  56. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +3 -3
  57. package/dist/pipeline/compiler/promptfoo-compiler.js +7 -11
  58. package/dist/pipeline/compiler/provider-assembler.js +33 -3
  59. package/dist/pipeline/compiler/rubric-resolution.d.ts +2 -2
  60. package/dist/pipeline/mirror-repo-tasks.d.ts +13 -5
  61. package/dist/pipeline/mirror-repo-tasks.js +16 -8
  62. package/dist/pipeline/pr-comment.d.ts +22 -9
  63. package/dist/pipeline/pr-comment.js +52 -472
  64. package/dist/pipeline/resolve-mappings.d.ts +8 -3
  65. package/dist/promptfoo-providers/mock-path.d.ts +12 -0
  66. package/dist/promptfoo-providers/mock-path.js +15 -0
  67. package/dist/report-store.d.ts +63 -1
  68. package/dist/report-store.js +111 -31
  69. package/dist/sanity/client.d.ts +58 -0
  70. package/dist/sanity/client.js +106 -0
  71. package/package.json +8 -7
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Dotenv resolution helpers shared between the CLI bootstrap
3
+ * (`packages/eval/src/cli.ts`) and any code path that needs to honor the
4
+ * same `--dotenv <path>` override (today: `pipeline/checks.ts::checkEnvironment`,
5
+ * which re-loads the active env file as part of validation).
6
+ *
7
+ * Centralizing the argv parse means future changes — validating the path
8
+ * exists before returning, supporting `--dotenv=path` form, accepting an
9
+ * env-var fallback — happen in one place instead of drifting between
10
+ * call sites.
11
+ */
12
+ import { resolve } from "node:path";
/**
 * Locate an explicit dotenv override in an argv-style array and return it
 * as an absolute path.
 *
 * Two spellings are recognized:
 *   - `--dotenv <path>` (value is the next argv entry)
 *   - `--dotenv=<path>` (value is inline, after the `=`)
 *
 * The first matching entry in `argv` decides the result; later occurrences
 * are ignored. Relative paths are resolved against the current working
 * directory via `path.resolve`.
 *
 * @param argv - Defaults to `process.argv`. Pass an explicit array in
 *   tests or in non-CLI hosts that have already shifted off the script
 *   prefix.
 * @returns The absolute path, or `undefined` when the flag is absent or
 *   carries no (or an empty) value.
 */
export function findExplicitDotenvArg(argv = process.argv) {
    const flag = "--dotenv";
    const inlinePrefix = `${flag}=`;
    // Match the bare flag exactly, or the `--dotenv=` prefix. A longer flag
    // name such as `--dotenv-file` matches neither.
    const idx = argv.findIndex((arg) => arg === flag || arg.startsWith(inlinePrefix));
    if (idx === -1)
        return undefined;
    const matched = argv[idx];
    // Bare flag: the value is the following entry (may be missing).
    // Inline form: the value is whatever follows the `=` (may be empty).
    const value = matched === flag
        ? argv[idx + 1]
        : matched.slice(inlinePrefix.length);
    if (!value)
        return undefined;
    return resolve(value);
}
@@ -12,7 +12,7 @@
12
12
  * @see packages/eval/src/pipeline/mirror-repo-tasks.ts
13
13
  * @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
- import { getSanityClient } from "../../sanity/client.js";
15
+ import { getAilfSanityClient, getSanityClient } from "../../sanity/client.js";
16
16
  import { detectGitContext, mirrorRepoTasks, } from "../../pipeline/mirror-repo-tasks.js";
17
17
  export class MirrorRepoTasksStep {
18
18
  name = "mirror-repo-tasks";
@@ -66,11 +66,22 @@ export class MirrorRepoTasksStep {
66
66
  // Detect git context (from env vars or git CLI)
67
67
  const git = await detectGitContext(ctx.config.repoTasksPath);
68
68
  ctx.logger.info(` Mirroring ${repoTasks.length} repo task(s) from ${git.repo}@${git.branch}`);
69
- // Create a client with write access
70
- const client = getSanityClient({ token });
69
+ // Two clients are required after the D0043 dataset split:
70
+ // - `client` writes ailf.task / ailf.featureArea to the AILF
71
+ // dataset and reads existing mirror state — uses the
72
+ // AILF-scoped token explicitly so writes work even when
73
+ // SANITY_API_TOKEN is editorial-read-only.
74
+ // - `editorialClient` resolves `article` slugs against the
75
+ // editorial dataset. Operators may scope the AILF token to
76
+ // AILF only (D0043 consequence #5); using it here would
77
+ // 401 on the editorial query. Let it pick up SANITY_API_TOKEN
78
+ // from the default config instead.
79
+ const client = getAilfSanityClient({ token });
80
+ const editorialClient = getSanityClient();
71
81
  // Run the mirror
72
82
  const result = await mirrorRepoTasks({
73
83
  client,
84
+ editorialClient,
74
85
  git,
75
86
  logger: ctx.logger,
76
87
  tasks: repoTasks,
@@ -111,10 +111,28 @@ export class RunEvalStep {
111
111
  // required eval modes were satisfied from the remote cache.
112
112
  state.remoteCacheHits ??= new Set();
113
113
  state.remoteCacheHits.add(this.mode);
114
- // Carry forward Promptfoo share URLs from the cached report
115
- if (remoteCacheResult.promptfooUrls?.length) {
114
+ // Carry forward the share-link backreference for THIS mode only.
115
+ // Pushing every entry from `remoteCacheResult.promptfooUrls`
116
+ // snowballs across the daily perspective cron: each cache-hit
117
+ // run inherits the cached report's full URL list (including
118
+ // other modes and any URLs the cached report had itself
119
+ // accumulated from earlier hits), then layers its own on top.
120
+ // Iterate from the tail to handle pre-fix cached reports that
121
+ // may carry multiple entries for the same mode.
122
+ const cachedUrls = remoteCacheResult.promptfooUrls;
123
+ let inherited;
124
+ if (cachedUrls) {
125
+ for (let i = cachedUrls.length - 1; i >= 0; i--) {
126
+ const entry = cachedUrls[i];
127
+ if (entry?.mode === this.mode) {
128
+ inherited = entry;
129
+ break;
130
+ }
131
+ }
132
+ }
133
+ if (inherited) {
116
134
  state.promptfooUrls ??= [];
117
- state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
135
+ state.promptfooUrls.push(inherited);
118
136
  }
119
137
  // D0040 / W0135 — restore the cached report's artifact manifest into
120
138
  // the accumulator so the new run's RunManifest advertises the cached
@@ -4,7 +4,9 @@
4
4
  * Pure analysis functions for agent behavior observation reports.
5
5
  * No I/O, no process.env, no process.argv — all data is passed in.
6
6
  */
7
+ import type { TestResult } from "../_vendor/ailf-core/index.d.ts";
7
8
  import type { AgentBehaviorSummary } from "../agent-observer/types.js";
9
+ export type { TestResult } from "../_vendor/ailf-core/index.d.ts";
8
10
  export interface PromptfooResults {
9
11
  results: TestResult[];
10
12
  }
@@ -13,14 +15,6 @@ export interface PromptfooResultsEnvelope {
13
15
  results: TestResult[];
14
16
  };
15
17
  }
16
- export interface TestResult {
17
- description: string;
18
- metadata?: Record<string, unknown>;
19
- response: {
20
- output: string;
21
- };
22
- vars: Record<string, string>;
23
- }
24
18
  export interface TaskBehavior {
25
19
  behavior: AgentBehaviorSummary;
26
20
  description: string;
@@ -28,7 +28,7 @@ export interface CacheEntry {
28
28
  timestamp: string;
29
29
  }
30
30
  /** Result of a cache lookup */
31
- export type CacheLookupResult = {
31
+ export type ManifestCacheLookupResult = {
32
32
  hit: false;
33
33
  currentHash: string;
34
34
  } | {
@@ -84,7 +84,7 @@ export declare function hashFiles(paths: string[], context?: string[]): string;
84
84
  * Optional `context` strings are included in the hash so that non-file
85
85
  * state (e.g., area/task filter flags) participates in cache key computation.
86
86
  */
87
- export declare function lookupCache(rootDir: string, step: string, context?: string[]): CacheLookupResult;
87
+ export declare function lookupCache(rootDir: string, step: string, context?: string[]): ManifestCacheLookupResult;
88
88
  /**
89
89
  * Read the cache manifest for a step.
90
90
  * Returns null if no manifest exists or it's corrupt.
@@ -18,8 +18,16 @@ export declare function checkCanonicalContextsExist(rootDir: string, taskIds: st
18
18
  export declare function checkContextsExist(rootDir: string, areas: string[]): ValidationIssue[];
19
19
  /**
20
20
  * Check that required environment variables are set.
21
- * Loads the root `.env` file first (with override, matching the dotenv CLI
22
- * `-o` flag used by other scripts), then checks for required keys.
21
+ *
22
+ * Loads the resolved `.env` file first (with override, matching the dotenv
23
+ * CLI `-o` flag used by other scripts), then checks for required keys. The
24
+ * resolution order mirrors `cli.ts`'s `resolveEnvPath()` so a `--dotenv
25
+ * <path>` argument on the parent CLI invocation isn't silently clobbered
26
+ * here. Without this, a Tier 2 test that uses `--dotenv` to override
27
+ * tenant-pointing vars (e.g. `AILF_GCS_ARTIFACT_BUCKET`,
28
+ * `GOOGLE_APPLICATION_CREDENTIALS`) gets its overrides reverted to the
29
+ * repo `.env` values when this function runs as part of the validate
30
+ * step. (W0138 Slice 2 surface — see `gcs-pipeline-replay-roundtrip.test.ts`.)
23
31
  */
24
32
  export declare function checkEnvironment(rootDir: string): ValidationIssue[];
25
33
  /**
@@ -8,6 +8,7 @@
8
8
  import { config as loadEnv } from "dotenv";
9
9
  import { existsSync, readFileSync, statSync } from "fs";
10
10
  import { join, resolve } from "path";
11
+ import { findExplicitDotenvArg } from "../lib/dotenv-resolution.js";
11
12
  import { configFileForMode } from "./eval-constants.js";
12
13
  // ---------------------------------------------------------------------------
13
14
  // Precondition: contexts exist for each feature area
@@ -80,13 +81,22 @@ export function checkContextsExist(rootDir, areas) {
80
81
  // ---------------------------------------------------------------------------
81
82
  /**
82
83
  * Check that required environment variables are set.
83
- * Loads the root `.env` file first (with override, matching the dotenv CLI
84
- * `-o` flag used by other scripts), then checks for required keys.
84
+ *
85
+ * Loads the resolved `.env` file first (with override, matching the dotenv
86
+ * CLI `-o` flag used by other scripts), then checks for required keys. The
87
+ * resolution order mirrors `cli.ts`'s `resolveEnvPath()` so a `--dotenv
88
+ * <path>` argument on the parent CLI invocation isn't silently clobbered
89
+ * here. Without this, a Tier 2 test that uses `--dotenv` to override
90
+ * tenant-pointing vars (e.g. `AILF_GCS_ARTIFACT_BUCKET`,
91
+ * `GOOGLE_APPLICATION_CREDENTIALS`) gets its overrides reverted to the
92
+ * repo `.env` values when this function runs as part of the validate
93
+ * step. (W0138 Slice 2 surface — see `gcs-pipeline-replay-roundtrip.test.ts`.)
85
94
  */
86
95
  export function checkEnvironment(rootDir) {
87
96
  const issues = [];
88
- // Load root .env so we see the same vars as dotenv -e ../../.env -o
89
- const envPath = resolve(rootDir, "..", "..", ".env");
97
+ // Load the active .env so we see the same vars as dotenv -e <path> -o.
98
+ // Resolution: explicit --dotenv arg wins, then the repo-root .env.
99
+ const envPath = findExplicitDotenvArg() ?? resolve(rootDir, "..", "..", ".env");
90
100
  if (existsSync(envPath)) {
91
101
  loadEnv({ override: true, path: envPath });
92
102
  }
@@ -46,7 +46,7 @@ import { buildTaskGraph } from "./task-graph-builder.js";
46
46
  * rules (e.g., rejecting archived tasks that slipped through).
47
47
  */
48
48
  export function compileLiteracyTasks(tasks, options) {
49
- const rubricConfig = loadRubricConfig(options.rootDir);
49
+ const rubricConfig = loadRubricResolutionInput(options.rootDir);
50
50
  const warnings = [];
51
51
  const results = [];
52
52
  let totalTests = 0;
@@ -146,7 +146,7 @@ export function compareCompilerOutputs(legacyEntries, newResult) {
146
146
  // ---------------------------------------------------------------------------
147
147
  // Rubric config loading
148
148
  // ---------------------------------------------------------------------------
149
- function loadRubricConfig(rootDir) {
149
+ function loadRubricResolutionInput(rootDir) {
150
150
  const result = tryLoadConfigFile("rubrics", rootDir);
151
151
  if (!result)
152
152
  return undefined;
@@ -2,7 +2,7 @@
2
2
  * Shared types for the agent harness mode handler.
3
3
  */
4
4
  import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
- import type { RubricConfig } from "../../rubric-resolution.js";
5
+ import type { RubricResolutionInput } from "../../rubric-resolution.js";
6
6
  import type { SandboxType } from "../../sandbox/sandbox-strategy.js";
7
7
  /** Options for compiling an agent harness task */
8
8
  export interface AgentHarnessCompileOptions {
@@ -11,7 +11,7 @@ export interface AgentHarnessCompileOptions {
11
11
  /** Root directory for fixture resolution */
12
12
  rootDir?: string;
13
13
  /** Rubric config (templates, weights, profiles) — loaded from rubrics config */
14
- rubricConfig?: RubricConfig;
14
+ rubricConfig?: RubricResolutionInput;
15
15
  }
16
16
  /** Result of compiling a single agent harness task */
17
17
  export interface AgentHarnessCompileResult {
@@ -10,6 +10,6 @@
10
10
  * @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
11
  */
12
12
  export { buildMCPAssertions, compileMCPTask, handler as mcpServerHandler, validateMCPTask, type MCPAssertionContext, type MCPCompileOptions, type MCPCompileResult, type MCPValidationError, } from "./mcp-server/index.js";
13
- export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, type LiteracyCompileOptions, type LiteracyCompileResult, type LiteracyValidationError, type RubricConfig, } from "./literacy/index.js";
13
+ export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, type LiteracyCompileOptions, type LiteracyCompileResult, type LiteracyValidationError, type RubricResolutionInput, } from "./literacy/index.js";
14
14
  export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, type KnowledgeProbeCompileOptions, type KnowledgeProbeCompileResult, type KnowledgeProbeMetadata, type KnowledgeProbeValidationError, } from "./knowledge-probe/index.js";
15
15
  export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, type AgentHarnessCompileOptions, type AgentHarnessCompileResult, type AgentHarnessValidationError, type PromptfooExtension, type SandboxConfigMeta, } from "./agent-harness/index.js";
@@ -2,7 +2,7 @@
2
2
  * Public types for the knowledge-probe mode handler.
3
3
  */
4
4
  import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
- import type { RubricConfig } from "../../rubric-resolution.js";
5
+ import type { RubricResolutionInput } from "../../rubric-resolution.js";
6
6
  /** Options for compiling a knowledge probe task */
7
7
  export interface KnowledgeProbeCompileOptions {
8
8
  /** Grader provider for LLM-graded assertions */
@@ -15,7 +15,7 @@ export interface KnowledgeProbeCompileOptions {
15
15
  }[];
16
16
  /** Rubric config (templates, weights, profiles) — needed to resolve
17
17
  * templated `llm-rubric` assertions to dimension metadata. */
18
- rubricConfig?: RubricConfig;
18
+ rubricConfig?: RubricResolutionInput;
19
19
  }
20
20
  /** Result of compiling a single knowledge probe task */
21
21
  export interface KnowledgeProbeCompileResult {
@@ -7,5 +7,5 @@ import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
7
7
  export { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
8
8
  export { validateLiteracyTask, type LiteracyValidationError, } from "./validation.js";
9
9
  export { compileLiteracyTask } from "./compiler.js";
10
- export type { LiteracyCompileOptions, LiteracyCompileResult, RubricConfig, } from "./types.js";
10
+ export type { LiteracyCompileOptions, LiteracyCompileResult, RubricResolutionInput, } from "./types.js";
11
11
  export declare const handler: ModeHandler;
@@ -2,8 +2,8 @@
2
2
  * Shared types for the literacy mode handler.
3
3
  */
4
4
  import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
- export type { RubricConfig } from "../../rubric-resolution.js";
6
- import type { RubricConfig } from "../../rubric-resolution.js";
5
+ export type { RubricResolutionInput } from "../../rubric-resolution.js";
6
+ import type { RubricResolutionInput } from "../../rubric-resolution.js";
7
7
  /** Options for compiling a literacy task */
8
8
  export interface LiteracyCompileOptions {
9
9
  /** Grader provider for LLM-graded assertions */
@@ -19,7 +19,7 @@ export interface LiteracyCompileOptions {
19
19
  config?: Record<string, unknown>;
20
20
  }[];
21
21
  /** Rubric config (templates, weights, profiles) — loaded from rubrics config */
22
- rubricConfig?: RubricConfig;
22
+ rubricConfig?: RubricResolutionInput;
23
23
  }
24
24
  /** Result of compiling a single literacy task */
25
25
  export interface LiteracyCompileResult {
@@ -11,20 +11,11 @@
11
11
  *
12
12
  * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
13
13
  */
14
- import { dirname, resolve as resolvePath } from "node:path";
15
- import { fileURLToPath } from "node:url";
16
14
  import { mapAssertions } from "./assertion-mapper.js";
17
15
  import { resolveTaskFixtures } from "./fixture-resolver.js";
18
16
  import { LiteracyVariant } from "../normalize-mode.js";
19
17
  import { resolveVariables } from "./variable-resolver.js";
20
- /**
21
- * Absolute filesystem path to the AILF mock Promptfoo provider. Resolved
22
- * once at module load relative to this file. Promptfoo's `file://` provider
23
- * loader requires an absolute path. See buildProviders for the env-var
24
- * gate that swaps real providers for this mock.
25
- */
26
- const __dirname = dirname(fileURLToPath(import.meta.url));
27
- const MOCK_PROVIDER_ABSPATH = resolvePath(__dirname, "..", "..", "promptfoo-providers", "mock-provider.cjs");
18
+ import { MOCK_PROVIDER_ABSPATH } from "../../promptfoo-providers/mock-path.js";
28
19
  // ---------------------------------------------------------------------------
29
20
  // Public API
30
21
  // ---------------------------------------------------------------------------
@@ -158,12 +149,17 @@ function buildProviders(models, mode) {
158
149
  // never makes a live LLM call. We preserve `label` and stash the
159
150
  // original `id` in `config.originalId` so the mock provider can surface
160
151
  // model identity in its output and reports remain interpretable.
152
+ // `originalId ?? p.id` guards against double-swap clobbering a real id
153
+ // that's already been preserved on a prior pass.
161
154
  // See W0110 (M5.1) and docs/design-docs/testing-strategy.md.
162
155
  if (process.env.AILF_REPLAY_LLMS === "1") {
163
156
  return providers.map((p) => ({
164
157
  id: `file://${MOCK_PROVIDER_ABSPATH}`,
165
158
  label: p.label,
166
- config: { ...p.config, originalId: p.id },
159
+ config: {
160
+ ...p.config,
161
+ originalId: p.config?.originalId ?? p.id,
162
+ },
167
163
  }));
168
164
  }
169
165
  return providers;
@@ -25,6 +25,36 @@
25
25
  import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
26
26
  import { loadConfigFile } from "./config-loader.js";
27
27
  import { modelMatchesLiteracyVariant } from "./mode-bases/literacy.js";
28
+ import { MOCK_PROVIDER_ABSPATH } from "../../promptfoo-providers/mock-path.js";
/**
 * Replay-mode rewrite for a list of provider records (W0110).
 *
 * With `AILF_REPLAY_LLMS=1` in the environment, each record is replaced by
 * a new object whose `id` points at the file-based AILF mock provider
 * (`file://` + `MOCK_PROVIDER_ABSPATH`). `label` is carried over unchanged
 * and the record's previous `id` is kept under `config.originalId`.
 *
 * An `originalId` already present on the incoming config is kept as-is,
 * so running an already-swapped record through this function again does
 * not overwrite the real model id with the mock's `file://` path.
 *
 * With any other value of `AILF_REPLAY_LLMS` (or when it is unset) the
 * input array is returned untouched.
 */
function applyReplaySwap(providers) {
    const replayActive = process.env.AILF_REPLAY_LLMS === "1";
    if (!replayActive) {
        return providers;
    }
    // Same mock target for every record, so build the id once.
    const mockProviderId = `file://${MOCK_PROVIDER_ABSPATH}`;
    return providers.map((provider) => {
        const priorConfig = provider.config ?? {};
        // Keep a previously stashed originalId; otherwise record the current id.
        const originalId = priorConfig.originalId ?? provider.id;
        return {
            id: mockProviderId,
            label: provider.label,
            config: { ...priorConfig, originalId },
        };
    });
}
28
58
  // ---------------------------------------------------------------------------
29
59
  // Public API
30
60
  // ---------------------------------------------------------------------------
@@ -40,9 +70,9 @@ export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigi
40
70
  return {
41
71
  models,
42
72
  providers: {
43
- baseline: buildBaselineProviders(models),
44
- agentic: buildAgenticProviders(models, source, searchMode, allowedOrigins),
45
- observed: buildObservedProviders(models),
73
+ baseline: applyReplaySwap(buildBaselineProviders(models)),
74
+ agentic: applyReplaySwap(buildAgenticProviders(models, source, searchMode, allowedOrigins)),
75
+ observed: applyReplaySwap(buildObservedProviders(models)),
46
76
  },
47
77
  };
48
78
  }
@@ -14,7 +14,7 @@
14
14
  */
15
15
  import type { PromptfooAssertion } from "./assertion-mapper.js";
16
16
  /** Minimal rubric config needed for template resolution */
17
- export interface RubricConfig {
17
+ export interface RubricResolutionInput {
18
18
  templates: Record<string, {
19
19
  criteria_label?: string;
20
20
  dimension?: string;
@@ -37,4 +37,4 @@ export declare function resolveTemplatedAssertion(assertion: {
37
37
  criteria: string[];
38
38
  template: string;
39
39
  type: string;
40
- }, rubricConfig: RubricConfig | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;
40
+ }, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;
@@ -15,8 +15,19 @@
15
15
  import type { SanityClient } from "@sanity/client";
16
16
  import { type LiteracyTaskDefinition, type Logger } from "../_vendor/ailf-core/index.d.ts";
17
17
  export interface MirrorOptions {
18
- /** Sanity client with write access */
18
+ /**
19
+ * Sanity client targeting the AILF private dataset — used to write
20
+ * `ailf.task` and `ailf.featureArea` documents and to read existing
21
+ * mirror state. Per D0043, AILF docs live in `ailf-prod-private`.
22
+ */
19
23
  client: SanityClient;
24
+ /**
25
+ * Sanity client targeting the editorial dataset — used to resolve
26
+ * `article` slugs to document IDs for canonical-doc references. After
27
+ * the dataset split, the AILF client cannot see editorial documents,
28
+ * so this must be a separate client (or omitted to skip slug resolution).
29
+ */
30
+ editorialClient?: SanityClient;
20
31
  /** Tasks to mirror (already loaded from repo) */
21
32
  tasks: LiteracyTaskDefinition[];
22
33
  /** Git context for origin provenance */
@@ -124,10 +135,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
124
135
  _key: string;
125
136
  reason: string;
126
137
  } | {
127
- doc?: {
128
- _ref: string;
129
- _type: string;
130
- } | undefined;
138
+ doc?: import("../sanity/client.js").EditorialReference | undefined;
131
139
  docId?: string | undefined;
132
140
  refType: string;
133
141
  _key: string;
@@ -16,6 +16,7 @@ import { createHash } from "crypto";
16
16
  import { readFileSync } from "fs";
17
17
  import { isIdRef, isPathRef, isPerspectiveRef, isSlugRef, } from "../_vendor/ailf-core/index.js";
18
18
  import { ConsoleLogger } from "../adapters/loggers/index.js";
19
+ import { buildEditorialReference } from "../sanity/client.js";
19
20
  // ---------------------------------------------------------------------------
20
21
  // Public API
21
22
  // ---------------------------------------------------------------------------
@@ -31,7 +32,7 @@ import { ConsoleLogger } from "../adapters/loggers/index.js";
31
32
  * 6. Upsert the ailf.task document with origin block
32
33
  */
33
34
  export async function mirrorRepoTasks(options) {
34
- const { client, tasks, git, dryRun = false, logger } = options;
35
+ const { client, editorialClient, tasks, git, dryRun = false, logger, } = options;
35
36
  const log = logger ?? new ConsoleLogger();
36
37
  const result = {
37
38
  total: tasks.length,
@@ -44,11 +45,18 @@ export async function mirrorRepoTasks(options) {
44
45
  if (tasks.length === 0)
45
46
  return result;
46
47
  // Batch-resolve all context doc slugs (slug refs only — other ref types
47
- // are stored without a resolved article reference for now)
48
+ // are stored without a resolved article reference for now). Slugs live on
49
+ // `article` documents in the editorial dataset, so this must use the
50
+ // editorial client. Without one, every slug ref stays unresolved.
48
51
  const allSlugs = [
49
52
  ...new Set(tasks.flatMap((t) => (t.context?.docs ?? []).filter(isSlugRef).map((d) => d.slug))),
50
53
  ];
51
- const slugToDocId = await batchResolveDocSlugs(client, allSlugs);
54
+ const slugToDocId = editorialClient
55
+ ? await batchResolveDocSlugs(editorialClient, allSlugs)
56
+ : new Map();
57
+ if (!editorialClient && allSlugs.length > 0) {
58
+ log.warn(" ⚠️ No editorial Sanity client provided — skipping slug→article resolution");
59
+ }
52
60
  // Track unresolved slugs
53
61
  for (const slug of allSlugs) {
54
62
  if (!slugToDocId.has(slug)) {
@@ -363,13 +371,13 @@ export function buildMirrorDocument(task, opts) {
363
371
  // When a slug resolves to a document, store as "id" ref with
364
372
  // the resolved article reference. When unresolved, store as
365
373
  // "slug" so Studio knows the resolution strategy even if the
366
- // article doesn't exist yet.
374
+ // article doesn't exist yet. The `doc` reference is a Cross
375
+ // Dataset Reference per D0043 — `ailf.task` lives in the AILF
376
+ // private dataset and `article` lives in the editorial dataset.
367
377
  return {
368
378
  ...base,
369
379
  refType: resolvedId ? "id" : "slug",
370
- ...(resolvedId
371
- ? { doc: { _ref: resolvedId, _type: "reference" } }
372
- : {}),
380
+ ...(resolvedId ? { doc: buildEditorialReference(resolvedId) } : {}),
373
381
  };
374
382
  }
375
383
  if (isPathRef(ref)) {
@@ -380,7 +388,7 @@ export function buildMirrorDocument(task, opts) {
380
388
  ...base,
381
389
  refType: "id",
382
390
  ...(ref.id
383
- ? { doc: { _ref: ref.id, _type: "reference" }, docId: ref.id }
391
+ ? { doc: buildEditorialReference(ref.id), docId: ref.id }
384
392
  : {}),
385
393
  };
386
394
  }
@@ -1,19 +1,32 @@
1
1
  /**
2
- * pipeline/pr-comment.ts — Generates a markdown PR comment from eval score-summary.json.
2
+ * pipeline/pr-comment.ts — Generate a markdown PR comment from
3
+ * `results/latest/score-summary.json` (and an optional comparison-report).
3
4
  *
4
- * All functions accept rootDir as a parameter — no module-level constants.
5
- * No process.argv parsing. No env var fallbacks.
5
+ * Thin wrapper around `@sanity/ailf-core`'s unified renderer (W0150).
6
+ * Reads the local JSON files, applies legacy-field normalization on the
7
+ * scores, builds a `RenderableReport` envelope (so the CLI's
8
+ * `--promptfoo-url` flag flows through `provenance.promptfooUrls[0]`),
9
+ * then delegates rendering.
6
10
  *
7
- * Reads: results/latest/score-summary.json
8
- * Writes: markdown to stdout or --output file
11
+ * All functions accept `rootDir` as a parameter — no module-level
12
+ * constants. No `process.argv` parsing. No env-var fallbacks.
9
13
  */
10
- /** Options for the generatePrComment() function. */
14
+ import { type RenderableReport } from "../_vendor/ailf-core/index.d.ts";
15
+ import type { ComparisonReport, ScoreSummary } from "./types.js";
16
+ /** Options for the {@link generatePrComment} function. */
11
17
  export interface PrCommentOptions {
12
- /** Path to write the comment (default: stdout) */
18
+ /** Path to write the comment (default: stdout). */
13
19
  outputPath?: string;
14
- /** Promptfoo share URL to include in the comment */
20
+ /** Promptfoo share URL to include as the footer "view detailed results" link. */
15
21
  promptfooUrl?: string;
16
- /** Root directory of the eval package */
22
+ /** Root directory of the eval package. */
17
23
  rootDir: string;
18
24
  }
19
25
  export declare function generatePrComment(options: PrCommentOptions): void;
26
+ /**
27
+ * Adapter: build a {@link RenderableReport} from the in-memory pipeline
28
+ * artifacts. Exposed for the cross-renderer byte-equality contract test.
29
+ */
30
+ export declare function scoreSummaryToRenderableReport(summary: ScoreSummary, comparison: ComparisonReport | undefined, options?: {
31
+ promptfooUrl?: string;
32
+ }): RenderableReport;