@sanity/ailf 4.0.6 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/bin/ailf.js +6 -1
  2. package/dist/_vendor/ailf-core/schemas/external-providers.d.ts +136 -0
  3. package/dist/_vendor/ailf-core/schemas/external-providers.js +136 -0
  4. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  5. package/dist/_vendor/ailf-core/schemas/index.js +2 -0
  6. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -3
  7. package/dist/_vendor/ailf-core/schemas/report.d.ts +251 -0
  8. package/dist/_vendor/ailf-core/schemas/report.js +235 -0
  9. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  10. package/dist/_vendor/ailf-core/services/index.js +1 -0
  11. package/dist/_vendor/ailf-core/services/report-to-markdown.d.ts +38 -0
  12. package/dist/_vendor/ailf-core/services/report-to-markdown.js +696 -0
  13. package/dist/_vendor/ailf-core/types/api-requests.d.ts +159 -0
  14. package/dist/_vendor/ailf-core/types/api-requests.js +27 -0
  15. package/dist/_vendor/ailf-core/types/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +112 -0
  17. package/dist/_vendor/ailf-core/types/pipeline-request.js +18 -0
  18. package/dist/_vendor/ailf-core/types/repo-config.d.ts +146 -0
  19. package/dist/_vendor/ailf-core/types/repo-config.js +18 -0
  20. package/dist/_vendor/ailf-shared/index.d.ts +7 -5
  21. package/dist/_vendor/ailf-shared/index.js +7 -5
  22. package/dist/adapters/api-client/types.d.ts +2 -5
  23. package/dist/adapters/task-sources/content-lake-task-source.d.ts +58 -1
  24. package/dist/adapters/task-sources/content-lake-task-source.js +1 -1
  25. package/dist/adapters/task-sources/index.d.ts +1 -1
  26. package/dist/adapters/task-sources/index.js +1 -1
  27. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -2
  28. package/dist/adapters/task-sources/repo-schemas.js +3 -1
  29. package/dist/adapters/task-sources/repo-task-source.d.ts +11 -1
  30. package/dist/adapters/task-sources/repo-task-source.js +7 -4
  31. package/dist/adapters/task-sources/repo-validation.d.ts +6 -6
  32. package/dist/adapters/task-sources/repo-validation.js +1 -1
  33. package/dist/agent-observer/agentic-provider.d.ts +1 -0
  34. package/dist/agent-observer/agentic-provider.js +43 -36
  35. package/dist/agent-observer/config-schemas.d.ts +61 -0
  36. package/dist/agent-observer/config-schemas.js +65 -0
  37. package/dist/agent-observer/provider.d.ts +1 -0
  38. package/dist/agent-observer/provider.js +19 -17
  39. package/dist/cli.js +4 -4
  40. package/dist/commands/validate-tasks.js +2 -2
  41. package/dist/composition-root.d.ts +7 -0
  42. package/dist/composition-root.js +27 -12
  43. package/dist/index.d.ts +1 -1
  44. package/dist/index.js +1 -1
  45. package/dist/job-store.js +2 -2
  46. package/dist/lib/dotenv-resolution.d.ts +21 -0
  47. package/dist/lib/dotenv-resolution.js +30 -0
  48. package/dist/orchestration/steps/fetch-docs-step.js +10 -30
  49. package/dist/orchestration/steps/generate-configs-step.d.ts +8 -15
  50. package/dist/orchestration/steps/generate-configs-step.js +26 -118
  51. package/dist/orchestration/steps/mirror-repo-tasks-step.js +26 -3
  52. package/dist/orchestration/steps/run-eval-step.js +21 -3
  53. package/dist/pipeline/agent-behavior-report.d.ts +2 -8
  54. package/dist/pipeline/cache.d.ts +2 -2
  55. package/dist/pipeline/checks.d.ts +10 -2
  56. package/dist/pipeline/checks.js +14 -4
  57. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  58. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +2 -2
  59. package/dist/pipeline/compiler/mode-handlers/index.d.ts +1 -1
  60. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +2 -2
  61. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  62. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +3 -3
  63. package/dist/pipeline/compiler/promptfoo-compiler.js +7 -11
  64. package/dist/pipeline/compiler/provider-assembler.js +33 -3
  65. package/dist/pipeline/compiler/rubric-resolution.d.ts +2 -2
  66. package/dist/pipeline/mirror-repo-tasks.d.ts +13 -5
  67. package/dist/pipeline/mirror-repo-tasks.js +16 -8
  68. package/dist/pipeline/pr-comment.d.ts +22 -9
  69. package/dist/pipeline/pr-comment.js +52 -472
  70. package/dist/pipeline/resolve-mappings.d.ts +8 -3
  71. package/dist/promptfoo-providers/mock-path.d.ts +12 -0
  72. package/dist/promptfoo-providers/mock-path.js +15 -0
  73. package/dist/report-store.d.ts +63 -1
  74. package/dist/report-store.js +111 -31
  75. package/dist/sanity/client.d.ts +58 -0
  76. package/dist/sanity/client.js +106 -0
  77. package/package.json +8 -7
  78. package/dist/orchestration/load-pipeline-tasks.d.ts +0 -40
  79. package/dist/orchestration/load-pipeline-tasks.js +0 -57
@@ -32,9 +32,10 @@ import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-ada
32
32
  import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
33
33
  import { ConsoleProgressReporter } from "./adapters/progress/console-progress-reporter.js";
34
34
  import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, } from "./adapters/task-sources/index.js";
35
+ import { resolveVendoredSubdir } from "./pipeline/compiler/config-loader.js";
35
36
  import { createAgentHarnessBase, createKnowledgeProbeBase, createLiteracyModeBase, createMcpServerModeBase, } from "./pipeline/compiler/mode-bases/index.js";
36
37
  import { createSanityLiteracyPreset } from "./pipeline/compiler/presets/index.js";
37
- import { getSanityClient } from "./sanity/client.js";
38
+ import { getAilfSanityClient } from "./sanity/client.js";
38
39
  import { ReportStore } from "./report-store.js";
39
40
  import { loadSinks } from "./sinks/index.js";
40
41
  /**
@@ -297,7 +298,12 @@ function createCache(config) {
297
298
  return local;
298
299
  return new ContentLakeCacheAdapter(local, createReportStore(config));
299
300
  }
300
- function createTaskSource(config) {
301
+ /**
302
+ * Build the `TaskSource` adapter wired by the composition root for a
303
+ * given `ResolvedConfig`. Exported for test access — composition-root
304
+ * wiring is a contract worth asserting directly.
305
+ */
306
+ export function createTaskSource(config) {
301
307
  // "repo" mode — use ONLY repo tasks, no Content Lake or YAML merge.
302
308
  // This is the correct mode for API-triggered inline-task evaluations
303
309
  // where the caller sent their own task definitions. Without this,
@@ -309,21 +315,30 @@ function createTaskSource(config) {
309
315
  }
310
316
  return new RepoTaskSource(config.repoTasksPath);
311
317
  }
312
- // Primary source — Content Lake (the only non-repo source remaining)
313
- const primary = new ContentLakeTaskSource(getSanityClient({
314
- token: process.env.AILF_REPORT_SANITY_API_TOKEN ??
315
- process.env.SANITY_API_TOKEN ??
316
- undefined,
317
- }));
318
- // If repo tasks path is set, combine primary + repo sources.
319
- // This is the "augment" mode — repo tasks extend the primary source.
318
+ // "content-lake" — Studio-authored ailf.task documents only.
319
+ // AILF documents live in the private dataset (D0043) — route through
320
+ // the AILF client factory so reads target `ailf-prod-private`.
321
+ if (config.taskSourceType === "content-lake") {
322
+ return new ContentLakeTaskSource(getAilfSanityClient({
323
+ token: process.env.AILF_REPORT_SANITY_API_TOKEN ??
324
+ process.env.SANITY_API_TOKEN ??
325
+ undefined,
326
+ }));
327
+ }
328
+ // Unset — AILF-bundled defaults from `tasks/${mode}/`, optionally
329
+ // augmented with the caller's `--repo-tasks-path` (W0146). The
330
+ // bundled directory is allowed to be missing so test rootDirs and
331
+ // modes that ship no defaults degrade gracefully to the augment
332
+ // source (or empty).
333
+ const bundledDir = resolveVendoredSubdir(config.rootDir, `tasks/${config.mode}`);
334
+ const bundled = new RepoTaskSource(bundledDir, { allowMissing: true });
320
335
  if (config.repoTasksPath) {
321
336
  return new CompositeTaskSource([
322
- primary,
337
+ bundled,
323
338
  new RepoTaskSource(config.repoTasksPath),
324
339
  ]);
325
340
  }
326
- return primary;
341
+ return bundled;
327
342
  }
328
343
  // ---------------------------------------------------------------------------
329
344
  // Layer 0: Framework built-in assertions
package/dist/index.d.ts CHANGED
@@ -38,6 +38,6 @@ export type { PricingEntry, PromptEntry, SourceEntry } from "./_vendor/ailf-core
38
38
  export { env } from "./_vendor/ailf-core/index.d.ts";
39
39
  export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./_vendor/ailf-core/index.d.ts";
40
40
  export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, type CanonicalTask, type CuratedAssertionType, type RubricTemplateName, } from "./adapters/task-sources/repo-schemas.js";
41
- export { formatValidationResult, validateCanonicalTasks, type ValidationMessage, type ValidationResult, } from "./adapters/task-sources/repo-validation.js";
41
+ export { formatRepoValidationResult, validateCanonicalTasks, type RepoValidationMessage, type RepoValidationResult, } from "./adapters/task-sources/repo-validation.js";
42
42
  export { InMemoryPluginRegistry } from "./_vendor/ailf-core/index.d.ts";
43
43
  export type { CompilationContext, ModeBase, ModeCompileResult, ModeHandler, PresetDefinition, } from "./_vendor/ailf-core/index.d.ts";
package/dist/index.js CHANGED
@@ -45,7 +45,7 @@ export { env } from "./_vendor/ailf-core/index.js";
45
45
  // Validation — for programmatic validation of task YAML
46
46
  // ---------------------------------------------------------------------------
47
47
  export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, } from "./adapters/task-sources/repo-schemas.js";
48
- export { formatValidationResult, validateCanonicalTasks, } from "./adapters/task-sources/repo-validation.js";
48
+ export { formatRepoValidationResult, validateCanonicalTasks, } from "./adapters/task-sources/repo-validation.js";
49
49
  // ---------------------------------------------------------------------------
50
50
  // Plugin extension points — for authoring custom presets, modes, and registries
51
51
  // ---------------------------------------------------------------------------
package/dist/job-store.js CHANGED
@@ -11,7 +11,7 @@
11
11
  * @see docs/design-docs/api-service-gateway.md
12
12
  * @see packages/studio/src/schema/job.ts — Sanity document schema
13
13
  */
14
- import { getSanityClient } from "./sanity/client.js";
14
+ import { getAilfSanityClient } from "./sanity/client.js";
15
15
  // ---------------------------------------------------------------------------
16
16
  // Constants
17
17
  // ---------------------------------------------------------------------------
@@ -28,7 +28,7 @@ export class JobStore {
28
28
  this.client = options.client;
29
29
  }
30
30
  else {
31
- this.client = getSanityClient({
31
+ this.client = getAilfSanityClient({
32
32
  ...(options.dataset ? { dataset: options.dataset } : {}),
33
33
  ...(options.projectId ? { projectId: options.projectId } : {}),
34
34
  ...(options.token ? { token: options.token } : {}),
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Dotenv resolution helpers shared between the CLI bootstrap
3
+ * (`packages/eval/src/cli.ts`) and any code path that needs to honor the
4
+ * same `--dotenv <path>` override (today: `pipeline/checks.ts::checkEnvironment`,
5
+ * which re-loads the active env file as part of validation).
6
+ *
7
+ * Centralizing the argv parse means future changes — validating the path
8
+ * exists before returning, supporting `--dotenv=path` form, accepting an
9
+ * env-var fallback — happen in one place instead of drifting between
10
+ * call sites.
11
+ */
12
+ /**
13
+ * Find an explicit `--dotenv <path>` argument and return its absolute,
14
+ * resolved path. Returns `undefined` when the flag is absent or has no
15
+ * following value.
16
+ *
17
+ * @param argv - Defaults to `process.argv`. Pass an explicit array in
18
+ * tests or in non-CLI hosts that have already shifted off the script
19
+ * prefix.
20
+ */
21
+ export declare function findExplicitDotenvArg(argv?: readonly string[]): string | undefined;
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Dotenv resolution helpers shared between the CLI bootstrap
3
+ * (`packages/eval/src/cli.ts`) and any code path that needs to honor the
4
+ * same `--dotenv <path>` override (today: `pipeline/checks.ts::checkEnvironment`,
5
+ * which re-loads the active env file as part of validation).
6
+ *
7
+ * Centralizing the argv parse means future changes — validating the path
8
+ * exists before returning, supporting `--dotenv=path` form, accepting an
9
+ * env-var fallback — happen in one place instead of drifting between
10
+ * call sites.
11
+ */
12
+ import { resolve } from "node:path";
13
+ /**
14
+ * Find an explicit `--dotenv <path>` argument and return its absolute,
15
+ * resolved path. Returns `undefined` when the flag is absent or has no
16
+ * following value.
17
+ *
18
+ * @param argv - Defaults to `process.argv`. Pass an explicit array in
19
+ * tests or in non-CLI hosts that have already shifted off the script
20
+ * prefix.
21
+ */
22
+ export function findExplicitDotenvArg(argv = process.argv) {
23
+ const idx = argv.indexOf("--dotenv");
24
+ if (idx === -1)
25
+ return undefined;
26
+ const value = argv[idx + 1];
27
+ if (!value)
28
+ return undefined;
29
+ return resolve(value);
30
+ }
@@ -17,7 +17,6 @@ import { emitFileContents } from "../../artifact-capture/emit-file.js";
17
17
  import { getStepInputPaths } from "../../pipeline/cache.js";
18
18
  import { buildCacheContext } from "../cache-context.js";
19
19
  import { checkCanonicalContextsExist } from "../../pipeline/checks.js";
20
- import { loadPipelineTasks } from "../load-pipeline-tasks.js";
21
20
  import { loadSource } from "../../sources.js";
22
21
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
23
22
  export class FetchDocsStep {
@@ -30,35 +29,16 @@ export class FetchDocsStep {
30
29
  return { status: "skipped", reason: "--no-fetch" };
31
30
  }
32
31
  const start = Date.now();
33
- // Load tasks — use the same source as GenerateConfigsStep to avoid
34
- // a mismatch where configs reference context files that were never
35
- // fetched.
36
- //
37
- // Adapter path: ctx.taskSource handles both content-lake and repo modes.
38
- // The composition root wires the right adapter (ContentLakeTaskSource
39
- // or RepoTaskSource) per taskSourceType. RepoTaskSource loads BOTH
40
- // .yaml and .task.ts files — necessary for external-consumer evals
41
- // that materialize inline tasks as YAML (W0148).
42
- // Filesystem path: load from .task.ts files (legacy unset path —
43
- // AILF defaults from tasks/${mode}/ + optional repoTasksPath augment).
44
- let allTasks;
45
- if (ctx.config.taskSourceType === "content-lake" ||
46
- ctx.config.taskSourceType === "repo") {
47
- const filter = {
48
- ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
49
- ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
50
- ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
51
- };
52
- allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
53
- }
54
- else {
55
- allTasks = await loadPipelineTasks({
56
- rootDir: ctx.config.rootDir,
57
- mode: ctx.config.mode,
58
- repoTasksPath: ctx.config.repoTasksPath,
59
- taskSourceType: ctx.config.taskSourceType,
60
- });
61
- }
32
+ // Load tasks via ctx.taskSource — the composition root wires the
33
+ // right adapter for every taskSourceType (W0146). FetchDocsStep and
34
+ // GenerateConfigsStep MUST go through the same adapter so configs
35
+ // reference context files that were actually fetched.
36
+ const filter = {
37
+ ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
38
+ ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
39
+ ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
40
+ };
41
+ const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
62
42
  // Bridge: narrow to literacy tasks for canonical doc access
63
43
  const literacyTasks = allTasks.filter((t) => t.mode === "literacy");
64
44
  const tasksWithDocs = literacyTasks.filter((t) => (t.context?.docs?.length ?? 0) > 0);
@@ -17,24 +17,17 @@ export declare class GenerateConfigsStep implements PipelineStep {
17
17
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
18
18
  private compileLiteracyVariants;
19
19
  private compileSingleMode;
20
- private loadTasks;
21
20
  /**
22
- * Load tasks via ctx.taskSource (the composition-root-wired adapter).
21
+ * Load tasks via ctx.taskSource — the single adapter wired by the
22
+ * composition root for every taskSourceType (W0146). FetchDocsStep
23
+ * and GenerateConfigsStep MUST go through the same adapter so configs
24
+ * reference context files that were actually fetched.
23
25
  *
24
- * Used for both `taskSourceType === "content-lake"` (ContentLakeTaskSource)
25
- * and `taskSourceType === "repo"` (RepoTaskSource). Filtering by
26
- * area/task/tag is delegated to the adapter — ContentLakeTaskSource
27
- * pushes it into the GROQ query, RepoTaskSource applies it in-memory.
26
+ * Filtering by area/task/tag is delegated to the adapter:
27
+ * ContentLakeTaskSource pushes it into the GROQ query;
28
+ * RepoTaskSource applies it in-memory.
28
29
  */
29
- private loadTasksFromAdapter;
30
- /**
31
- * Load tasks from filesystem .task.ts files.
32
- *
33
- * This is the original path used for repo-based and inline tasks.
34
- * It scans tasks/{mode}/ and optionally --repo-tasks-path.
35
- */
36
- private loadTasksFromFilesystem;
37
- private applyFilters;
30
+ private loadTasks;
38
31
  /**
39
32
  * Build a descriptive error message when no tasks match the current filters.
40
33
  * Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
@@ -208,99 +208,36 @@ export class GenerateConfigsStep {
208
208
  // ---------------------------------------------------------------------------
209
209
  // Task loading — unified for all modes
210
210
  // ---------------------------------------------------------------------------
211
- async loadTasks(ctx, mode, state) {
212
- // Adapter path — use ctx.taskSource. The composition root wires the
213
- // right adapter for each taskSourceType:
214
- // - "content-lake" → ContentLakeTaskSource (Studio-owned ailf.task docs)
215
- // - "repo" → RepoTaskSource (loads .yaml AND .task.ts from repoTasksPath)
216
- // Routing both through ctx.taskSource keeps the orchestration step
217
- // file-format-agnostic (W0148: external-consumer evals materialize
218
- // inline tasks as .yaml, which loadPipelineTasks can't read).
219
- if (ctx.config.taskSourceType === "content-lake" ||
220
- ctx.config.taskSourceType === "repo") {
221
- return this.loadTasksFromAdapter(ctx, state);
222
- }
223
- // Filesystem path — load from .task.ts files (legacy unset path:
224
- // AILF defaults from tasks/${mode}/ + optional repoTasksPath augment).
225
- return this.loadTasksFromFilesystem(ctx, mode, state);
226
- }
227
211
  /**
228
- * Load tasks via ctx.taskSource (the composition-root-wired adapter).
212
+ * Load tasks via ctx.taskSource — the single adapter wired by the
213
+ * composition root for every taskSourceType (W0146). FetchDocsStep
214
+ * and GenerateConfigsStep MUST go through the same adapter so configs
215
+ * reference context files that were actually fetched.
229
216
  *
230
- * Used for both `taskSourceType === "content-lake"` (ContentLakeTaskSource)
231
- * and `taskSourceType === "repo"` (RepoTaskSource). Filtering by
232
- * area/task/tag is delegated to the adapter — ContentLakeTaskSource
233
- * pushes it into the GROQ query, RepoTaskSource applies it in-memory.
217
+ * Filtering by area/task/tag is delegated to the adapter:
218
+ * ContentLakeTaskSource pushes it into the GROQ query;
219
+ * RepoTaskSource applies it in-memory.
234
220
  */
235
- async loadTasksFromAdapter(ctx, state) {
221
+ async loadTasks(ctx, mode, state) {
236
222
  const filter = {
237
223
  ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
238
224
  ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
239
225
  ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
240
226
  };
241
- const tasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
242
- // Capture loaded IDs for error messages (same as filesystem path)
243
- this.lastLoadedTaskIds = tasks
244
- .map((t) => t.id)
245
- .filter((id) => !!id);
246
- // Release auto-scope
247
- if (state.releaseAutoScope && !ctx.config.noAutoScope) {
248
- const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
249
- const beforeCount = tasks.length;
250
- const scoped = tasks.filter((t) => "id" in t && scopedIds.has(t.id));
251
- ctx.logger.info(` 🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
252
- return scoped;
253
- }
254
- return tasks;
255
- }
256
- /**
257
- * Load tasks from filesystem .task.ts files.
258
- *
259
- * This is the original path used for repo-based and inline tasks.
260
- * It scans tasks/{mode}/ and optionally --repo-tasks-path.
261
- */
262
- async loadTasksFromFilesystem(ctx, mode, state) {
263
- const { resolve } = await import("path");
264
- const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
265
- const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");
266
- // Discover task files from the mode-specific directory and --repo-tasks-path.
267
- // Use vendored copies in dist/ when @sanity/ailf-core isn't resolvable
268
- // (i.e., running outside the monorepo via npx).
269
- //
270
- // When taskSourceType === "repo", skip the AILF-bundled tasks/${mode}/
271
- // directory and load ONLY from repoTasksPath. Mirrors the composition-root
272
- // contract for repo-only mode (see composition-root.ts:392-405).
273
- const dirs = [];
274
- if (ctx.config.taskSourceType !== "repo") {
275
- dirs.push(resolveVendoredSubdir(ctx.config.rootDir, `tasks/${mode}`));
276
- }
277
- else if (!ctx.config.repoTasksPath) {
278
- throw new Error('taskSourceType "repo" requires repoTasksPath to be set (no AILF defaults loaded in repo-only mode)');
279
- }
280
- // Also search --repo-tasks-path (e.g., .ailf/tasks/) for repo-based tasks
281
- if (ctx.config.repoTasksPath) {
282
- const repoDir = resolve(ctx.config.repoTasksPath);
283
- if (!dirs.includes(repoDir)) {
284
- dirs.push(repoDir);
285
- }
286
- }
227
+ const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
228
+ // Mode filter — the adapter may return a mixed-mode set (e.g. a user's
229
+ // `--repo-tasks-path` containing tasks of multiple modes). Skip
230
+ // non-matching modes with a warning so unintentional misclassification
231
+ // is visible without breaking the run.
287
232
  const tasks = [];
288
233
  const skippedByMode = new Map();
289
- for (const dir of dirs) {
290
- const files = discoverTsTaskFiles(dir);
291
- for (const file of files) {
292
- const raw = await loadTsTaskFile(file);
293
- for (const t of raw.tasks) {
294
- const task = t;
295
- // Filter to matching mode (skip tasks from other modes in same dir)
296
- if (!("mode" in task) || task.mode === mode) {
297
- tasks.push(task);
298
- }
299
- else {
300
- const taskMode = task.mode ?? "unknown";
301
- skippedByMode.set(taskMode, (skippedByMode.get(taskMode) ?? 0) + 1);
302
- }
303
- }
234
+ for (const task of allTasks) {
235
+ if (!("mode" in task) || task.mode === mode) {
236
+ tasks.push(task);
237
+ }
238
+ else {
239
+ const taskMode = task.mode ?? "unknown";
240
+ skippedByMode.set(taskMode, (skippedByMode.get(taskMode) ?? 0) + 1);
304
241
  }
305
242
  }
306
243
  if (skippedByMode.size > 0) {
@@ -310,46 +247,17 @@ export class GenerateConfigsStep {
310
247
  .join(", ");
311
248
  ctx.logger.warn(` ⚠ Skipped ${total} task(s) with non-matching mode (${summary}). Current pipeline mode: ${mode}. Run with --mode <mode> to include them.`);
312
249
  }
313
- // Apply area/task/tag filters
314
- const filtered = this.applyFilters(ctx, tasks);
315
- // Release auto-scope
250
+ this.lastLoadedTaskIds = tasks
251
+ .map((t) => t.id)
252
+ .filter((id) => !!id);
316
253
  if (state.releaseAutoScope && !ctx.config.noAutoScope) {
317
254
  const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
318
- const beforeCount = filtered.length;
319
- const scoped = filtered.filter((t) => "id" in t && scopedIds.has(t.id));
255
+ const beforeCount = tasks.length;
256
+ const scoped = tasks.filter((t) => "id" in t && scopedIds.has(t.id));
320
257
  ctx.logger.info(` 🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
321
258
  return scoped;
322
259
  }
323
- return filtered;
324
- }
325
- applyFilters(ctx, tasks) {
326
- // Capture pre-filter IDs for diagnostic messages
327
- this.lastLoadedTaskIds = tasks
328
- .map((t) => t.id)
329
- .filter((id) => !!id);
330
- let result = tasks;
331
- if (ctx.config.areas?.length) {
332
- const allowed = new Set(ctx.config.areas.map((a) => a.toLowerCase()));
333
- result = result.filter((t) => {
334
- const area = t.area?.toLowerCase();
335
- return area && allowed.has(area);
336
- });
337
- }
338
- if (ctx.config.tasks?.length) {
339
- const allowed = new Set(ctx.config.tasks);
340
- result = result.filter((t) => {
341
- const id = t.id;
342
- return id && allowed.has(id);
343
- });
344
- }
345
- if (ctx.config.tags?.length) {
346
- const allowed = new Set(ctx.config.tags);
347
- result = result.filter((t) => {
348
- const tags = t.tags;
349
- return tags?.some((tag) => allowed.has(tag));
350
- });
351
- }
352
- return result;
260
+ return tasks;
353
261
  }
354
262
  /**
355
263
  * Build a descriptive error message when no tasks match the current filters.
@@ -12,7 +12,7 @@
12
12
  * @see packages/eval/src/pipeline/mirror-repo-tasks.ts
13
13
  * @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
14
14
  */
15
- import { getSanityClient } from "../../sanity/client.js";
15
+ import { getAilfSanityClient, getSanityClient } from "../../sanity/client.js";
16
16
  import { detectGitContext, mirrorRepoTasks, } from "../../pipeline/mirror-repo-tasks.js";
17
17
  export class MirrorRepoTasksStep {
18
18
  name = "mirror-repo-tasks";
@@ -28,6 +28,18 @@ export class MirrorRepoTasksStep {
28
28
  if (!ctx.config.repoTasksPath) {
29
29
  return { status: "skipped", reason: "No --repo-tasks-path provided" };
30
30
  }
31
+ // W0145 — never mirror under repo-only mode. The API gateway maps
32
+ // PipelineRequest.taskMode="inline" → taskSourceType="repo", so an
33
+ // external consumer's ephemeral inline tasks would otherwise be
34
+ // upserted into AILF's canonical Content Lake. Mirroring is only
35
+ // correct for the in-tree dogfood path (taskSourceType unset +
36
+ // repoTasksPath set, e.g. external-eval.yml).
37
+ if (ctx.config.taskSourceType === "repo") {
38
+ return {
39
+ status: "skipped",
40
+ reason: 'taskSourceType="repo" — inline tasks are not mirrored',
41
+ };
42
+ }
31
43
  // Need a write token for mirroring
32
44
  const token = process.env.AILF_REPORT_SANITY_API_TOKEN ?? process.env.SANITY_API_TOKEN;
33
45
  if (!token) {
@@ -54,11 +66,22 @@ export class MirrorRepoTasksStep {
54
66
  // Detect git context (from env vars or git CLI)
55
67
  const git = await detectGitContext(ctx.config.repoTasksPath);
56
68
  ctx.logger.info(` Mirroring ${repoTasks.length} repo task(s) from ${git.repo}@${git.branch}`);
57
- // Create a client with write access
58
- const client = getSanityClient({ token });
69
+ // Two clients are required after the D0043 dataset split:
70
+ // - `client` writes ailf.task / ailf.featureArea to the AILF
71
+ // dataset and reads existing mirror state — uses the
72
+ // AILF-scoped token explicitly so writes work even when
73
+ // SANITY_API_TOKEN is editorial-read-only.
74
+ // - `editorialClient` resolves `article` slugs against the
75
+ // editorial dataset. Operators may scope the AILF token to
76
+ // AILF only (D0043 consequence #5); using it here would
77
+ // 401 on the editorial query. Let it pick up SANITY_API_TOKEN
78
+ // from the default config instead.
79
+ const client = getAilfSanityClient({ token });
80
+ const editorialClient = getSanityClient();
59
81
  // Run the mirror
60
82
  const result = await mirrorRepoTasks({
61
83
  client,
84
+ editorialClient,
62
85
  git,
63
86
  logger: ctx.logger,
64
87
  tasks: repoTasks,
@@ -111,10 +111,28 @@ export class RunEvalStep {
111
111
  // required eval modes were satisfied from the remote cache.
112
112
  state.remoteCacheHits ??= new Set();
113
113
  state.remoteCacheHits.add(this.mode);
114
- // Carry forward Promptfoo share URLs from the cached report
115
- if (remoteCacheResult.promptfooUrls?.length) {
114
+ // Carry forward the share-link backreference for THIS mode only.
115
+ // Pushing every entry from `remoteCacheResult.promptfooUrls`
116
+ // snowballs across the daily perspective cron: each cache-hit
117
+ // run inherits the cached report's full URL list (including
118
+ // other modes and any URLs the cached report had itself
119
+ // accumulated from earlier hits), then layers its own on top.
120
+ // Iterate from the tail to handle pre-fix cached reports that
121
+ // may carry multiple entries for the same mode.
122
+ const cachedUrls = remoteCacheResult.promptfooUrls;
123
+ let inherited;
124
+ if (cachedUrls) {
125
+ for (let i = cachedUrls.length - 1; i >= 0; i--) {
126
+ const entry = cachedUrls[i];
127
+ if (entry?.mode === this.mode) {
128
+ inherited = entry;
129
+ break;
130
+ }
131
+ }
132
+ }
133
+ if (inherited) {
116
134
  state.promptfooUrls ??= [];
117
- state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
135
+ state.promptfooUrls.push(inherited);
118
136
  }
119
137
  // D0040 / W0135 — restore the cached report's artifact manifest into
120
138
  // the accumulator so the new run's RunManifest advertises the cached
@@ -4,7 +4,9 @@
4
4
  * Pure analysis functions for agent behavior observation reports.
5
5
  * No I/O, no process.env, no process.argv — all data is passed in.
6
6
  */
7
+ import type { TestResult } from "../_vendor/ailf-core/index.d.ts";
7
8
  import type { AgentBehaviorSummary } from "../agent-observer/types.js";
9
+ export type { TestResult } from "../_vendor/ailf-core/index.d.ts";
8
10
  export interface PromptfooResults {
9
11
  results: TestResult[];
10
12
  }
@@ -13,14 +15,6 @@ export interface PromptfooResultsEnvelope {
13
15
  results: TestResult[];
14
16
  };
15
17
  }
16
- export interface TestResult {
17
- description: string;
18
- metadata?: Record<string, unknown>;
19
- response: {
20
- output: string;
21
- };
22
- vars: Record<string, string>;
23
- }
24
18
  export interface TaskBehavior {
25
19
  behavior: AgentBehaviorSummary;
26
20
  description: string;
@@ -28,7 +28,7 @@ export interface CacheEntry {
28
28
  timestamp: string;
29
29
  }
30
30
  /** Result of a cache lookup */
31
- export type CacheLookupResult = {
31
+ export type ManifestCacheLookupResult = {
32
32
  hit: false;
33
33
  currentHash: string;
34
34
  } | {
@@ -84,7 +84,7 @@ export declare function hashFiles(paths: string[], context?: string[]): string;
84
84
  * Optional `context` strings are included in the hash so that non-file
85
85
  * state (e.g., area/task filter flags) participates in cache key computation.
86
86
  */
87
- export declare function lookupCache(rootDir: string, step: string, context?: string[]): CacheLookupResult;
87
+ export declare function lookupCache(rootDir: string, step: string, context?: string[]): ManifestCacheLookupResult;
88
88
  /**
89
89
  * Read the cache manifest for a step.
90
90
  * Returns null if no manifest exists or it's corrupt.
@@ -18,8 +18,16 @@ export declare function checkCanonicalContextsExist(rootDir: string, taskIds: st
18
18
  export declare function checkContextsExist(rootDir: string, areas: string[]): ValidationIssue[];
19
19
  /**
20
20
  * Check that required environment variables are set.
21
- * Loads the root `.env` file first (with override, matching the dotenv CLI
22
- * `-o` flag used by other scripts), then checks for required keys.
21
+ *
22
+ * Loads the resolved `.env` file first (with override, matching the dotenv
23
+ * CLI `-o` flag used by other scripts), then checks for required keys. The
24
+ * resolution order mirrors `cli.ts`'s `resolveEnvPath()` so a `--dotenv
25
+ * <path>` argument on the parent CLI invocation isn't silently clobbered
26
+ * here. Without this, a Tier 2 test that uses `--dotenv` to override
27
+ * tenant-pointing vars (e.g. `AILF_GCS_ARTIFACT_BUCKET`,
28
+ * `GOOGLE_APPLICATION_CREDENTIALS`) gets its overrides reverted to the
29
+ * repo `.env` values when this function runs as part of the validate
30
+ * step. (W0138 Slice 2 surface — see `gcs-pipeline-replay-roundtrip.test.ts`.)
23
31
  */
24
32
  export declare function checkEnvironment(rootDir: string): ValidationIssue[];
25
33
  /**
@@ -8,6 +8,7 @@
8
8
  import { config as loadEnv } from "dotenv";
9
9
  import { existsSync, readFileSync, statSync } from "fs";
10
10
  import { join, resolve } from "path";
11
+ import { findExplicitDotenvArg } from "../lib/dotenv-resolution.js";
11
12
  import { configFileForMode } from "./eval-constants.js";
12
13
  // ---------------------------------------------------------------------------
13
14
  // Precondition: contexts exist for each feature area
@@ -80,13 +81,22 @@ export function checkContextsExist(rootDir, areas) {
80
81
  // ---------------------------------------------------------------------------
81
82
  /**
82
83
  * Check that required environment variables are set.
83
- * Loads the root `.env` file first (with override, matching the dotenv CLI
84
- * `-o` flag used by other scripts), then checks for required keys.
84
+ *
85
+ * Loads the resolved `.env` file first (with override, matching the dotenv
86
+ * CLI `-o` flag used by other scripts), then checks for required keys. The
87
+ * resolution order mirrors `cli.ts`'s `resolveEnvPath()` so a `--dotenv
88
+ * <path>` argument on the parent CLI invocation isn't silently clobbered
89
+ * here. Without this, a Tier 2 test that uses `--dotenv` to override
90
+ * tenant-pointing vars (e.g. `AILF_GCS_ARTIFACT_BUCKET`,
91
+ * `GOOGLE_APPLICATION_CREDENTIALS`) gets its overrides reverted to the
92
+ * repo `.env` values when this function runs as part of the validate
93
+ * step. (W0138 Slice 2 surface — see `gcs-pipeline-replay-roundtrip.test.ts`.)
85
94
  */
86
95
  export function checkEnvironment(rootDir) {
87
96
  const issues = [];
88
- // Load root .env so we see the same vars as dotenv -e ../../.env -o
89
- const envPath = resolve(rootDir, "..", "..", ".env");
97
+ // Load the active .env so we see the same vars as dotenv -e <path> -o.
98
+ // Resolution: explicit --dotenv arg wins, then the repo-root .env.
99
+ const envPath = findExplicitDotenvArg() ?? resolve(rootDir, "..", "..", ".env");
90
100
  if (existsSync(envPath)) {
91
101
  loadEnv({ override: true, path: envPath });
92
102
  }
@@ -46,7 +46,7 @@ import { buildTaskGraph } from "./task-graph-builder.js";
46
46
  * rules (e.g., rejecting archived tasks that slipped through).
47
47
  */
48
48
  export function compileLiteracyTasks(tasks, options) {
49
- const rubricConfig = loadRubricConfig(options.rootDir);
49
+ const rubricConfig = loadRubricResolutionInput(options.rootDir);
50
50
  const warnings = [];
51
51
  const results = [];
52
52
  let totalTests = 0;
@@ -146,7 +146,7 @@ export function compareCompilerOutputs(legacyEntries, newResult) {
146
146
  // ---------------------------------------------------------------------------
147
147
  // Rubric config loading
148
148
  // ---------------------------------------------------------------------------
149
- function loadRubricConfig(rootDir) {
149
+ function loadRubricResolutionInput(rootDir) {
150
150
  const result = tryLoadConfigFile("rubrics", rootDir);
151
151
  if (!result)
152
152
  return undefined;