@sanity/ailf 4.0.6 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ailf.js +6 -1
- package/dist/_vendor/ailf-core/schemas/external-providers.d.ts +136 -0
- package/dist/_vendor/ailf-core/schemas/external-providers.js +136 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -3
- package/dist/_vendor/ailf-core/schemas/report.d.ts +251 -0
- package/dist/_vendor/ailf-core/schemas/report.js +235 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +1 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.d.ts +38 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +696 -0
- package/dist/_vendor/ailf-core/types/api-requests.d.ts +159 -0
- package/dist/_vendor/ailf-core/types/api-requests.js +27 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +3 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +112 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.js +18 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +146 -0
- package/dist/_vendor/ailf-core/types/repo-config.js +18 -0
- package/dist/_vendor/ailf-shared/index.d.ts +7 -5
- package/dist/_vendor/ailf-shared/index.js +7 -5
- package/dist/adapters/api-client/types.d.ts +2 -5
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +58 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +1 -1
- package/dist/adapters/task-sources/index.d.ts +1 -1
- package/dist/adapters/task-sources/index.js +1 -1
- package/dist/adapters/task-sources/repo-schemas.d.ts +3 -2
- package/dist/adapters/task-sources/repo-schemas.js +3 -1
- package/dist/adapters/task-sources/repo-task-source.d.ts +11 -1
- package/dist/adapters/task-sources/repo-task-source.js +7 -4
- package/dist/adapters/task-sources/repo-validation.d.ts +6 -6
- package/dist/adapters/task-sources/repo-validation.js +1 -1
- package/dist/agent-observer/agentic-provider.d.ts +1 -0
- package/dist/agent-observer/agentic-provider.js +43 -36
- package/dist/agent-observer/config-schemas.d.ts +61 -0
- package/dist/agent-observer/config-schemas.js +65 -0
- package/dist/agent-observer/provider.d.ts +1 -0
- package/dist/agent-observer/provider.js +19 -17
- package/dist/cli.js +4 -4
- package/dist/commands/validate-tasks.js +2 -2
- package/dist/composition-root.d.ts +7 -0
- package/dist/composition-root.js +27 -12
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/job-store.js +2 -2
- package/dist/lib/dotenv-resolution.d.ts +21 -0
- package/dist/lib/dotenv-resolution.js +30 -0
- package/dist/orchestration/steps/fetch-docs-step.js +10 -30
- package/dist/orchestration/steps/generate-configs-step.d.ts +8 -15
- package/dist/orchestration/steps/generate-configs-step.js +26 -118
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +26 -3
- package/dist/orchestration/steps/run-eval-step.js +21 -3
- package/dist/pipeline/agent-behavior-report.d.ts +2 -8
- package/dist/pipeline/cache.d.ts +2 -2
- package/dist/pipeline/checks.d.ts +10 -2
- package/dist/pipeline/checks.js +14 -4
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +3 -3
- package/dist/pipeline/compiler/promptfoo-compiler.js +7 -11
- package/dist/pipeline/compiler/provider-assembler.js +33 -3
- package/dist/pipeline/compiler/rubric-resolution.d.ts +2 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -5
- package/dist/pipeline/mirror-repo-tasks.js +16 -8
- package/dist/pipeline/pr-comment.d.ts +22 -9
- package/dist/pipeline/pr-comment.js +52 -472
- package/dist/pipeline/resolve-mappings.d.ts +8 -3
- package/dist/promptfoo-providers/mock-path.d.ts +12 -0
- package/dist/promptfoo-providers/mock-path.js +15 -0
- package/dist/report-store.d.ts +63 -1
- package/dist/report-store.js +111 -31
- package/dist/sanity/client.d.ts +58 -0
- package/dist/sanity/client.js +106 -0
- package/package.json +8 -7
- package/dist/orchestration/load-pipeline-tasks.d.ts +0 -40
- package/dist/orchestration/load-pipeline-tasks.js +0 -57
package/dist/composition-root.js
CHANGED
|
@@ -32,9 +32,10 @@ import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-ada
|
|
|
32
32
|
import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
|
|
33
33
|
import { ConsoleProgressReporter } from "./adapters/progress/console-progress-reporter.js";
|
|
34
34
|
import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, } from "./adapters/task-sources/index.js";
|
|
35
|
+
import { resolveVendoredSubdir } from "./pipeline/compiler/config-loader.js";
|
|
35
36
|
import { createAgentHarnessBase, createKnowledgeProbeBase, createLiteracyModeBase, createMcpServerModeBase, } from "./pipeline/compiler/mode-bases/index.js";
|
|
36
37
|
import { createSanityLiteracyPreset } from "./pipeline/compiler/presets/index.js";
|
|
37
|
-
import {
|
|
38
|
+
import { getAilfSanityClient } from "./sanity/client.js";
|
|
38
39
|
import { ReportStore } from "./report-store.js";
|
|
39
40
|
import { loadSinks } from "./sinks/index.js";
|
|
40
41
|
/**
|
|
@@ -297,7 +298,12 @@ function createCache(config) {
|
|
|
297
298
|
return local;
|
|
298
299
|
return new ContentLakeCacheAdapter(local, createReportStore(config));
|
|
299
300
|
}
|
|
300
|
-
|
|
301
|
+
/**
|
|
302
|
+
* Build the `TaskSource` adapter wired by the composition root for a
|
|
303
|
+
* given `ResolvedConfig`. Exported for test access — composition-root
|
|
304
|
+
* wiring is a contract worth asserting directly.
|
|
305
|
+
*/
|
|
306
|
+
export function createTaskSource(config) {
|
|
301
307
|
// "repo" mode — use ONLY repo tasks, no Content Lake or YAML merge.
|
|
302
308
|
// This is the correct mode for API-triggered inline-task evaluations
|
|
303
309
|
// where the caller sent their own task definitions. Without this,
|
|
@@ -309,21 +315,30 @@ function createTaskSource(config) {
|
|
|
309
315
|
}
|
|
310
316
|
return new RepoTaskSource(config.repoTasksPath);
|
|
311
317
|
}
|
|
312
|
-
//
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
318
|
+
// "content-lake" — Studio-authored ailf.task documents only.
|
|
319
|
+
// AILF documents live in the private dataset (D0043) — route through
|
|
320
|
+
// the AILF client factory so reads target `ailf-prod-private`.
|
|
321
|
+
if (config.taskSourceType === "content-lake") {
|
|
322
|
+
return new ContentLakeTaskSource(getAilfSanityClient({
|
|
323
|
+
token: process.env.AILF_REPORT_SANITY_API_TOKEN ??
|
|
324
|
+
process.env.SANITY_API_TOKEN ??
|
|
325
|
+
undefined,
|
|
326
|
+
}));
|
|
327
|
+
}
|
|
328
|
+
// Unset — AILF-bundled defaults from `tasks/${mode}/`, optionally
|
|
329
|
+
// augmented with the caller's `--repo-tasks-path` (W0146). The
|
|
330
|
+
// bundled directory is allowed to be missing so test rootDirs and
|
|
331
|
+
// modes that ship no defaults degrade gracefully to the augment
|
|
332
|
+
// source (or empty).
|
|
333
|
+
const bundledDir = resolveVendoredSubdir(config.rootDir, `tasks/${config.mode}`);
|
|
334
|
+
const bundled = new RepoTaskSource(bundledDir, { allowMissing: true });
|
|
320
335
|
if (config.repoTasksPath) {
|
|
321
336
|
return new CompositeTaskSource([
|
|
322
|
-
|
|
337
|
+
bundled,
|
|
323
338
|
new RepoTaskSource(config.repoTasksPath),
|
|
324
339
|
]);
|
|
325
340
|
}
|
|
326
|
-
return
|
|
341
|
+
return bundled;
|
|
327
342
|
}
|
|
328
343
|
// ---------------------------------------------------------------------------
|
|
329
344
|
// Layer 0: Framework built-in assertions
|
package/dist/index.d.ts
CHANGED
|
@@ -38,6 +38,6 @@ export type { PricingEntry, PromptEntry, SourceEntry } from "./_vendor/ailf-core
|
|
|
38
38
|
export { env } from "./_vendor/ailf-core/index.d.ts";
|
|
39
39
|
export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./_vendor/ailf-core/index.d.ts";
|
|
40
40
|
export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, type CanonicalTask, type CuratedAssertionType, type RubricTemplateName, } from "./adapters/task-sources/repo-schemas.js";
|
|
41
|
-
export {
|
|
41
|
+
export { formatRepoValidationResult, validateCanonicalTasks, type RepoValidationMessage, type RepoValidationResult, } from "./adapters/task-sources/repo-validation.js";
|
|
42
42
|
export { InMemoryPluginRegistry } from "./_vendor/ailf-core/index.d.ts";
|
|
43
43
|
export type { CompilationContext, ModeBase, ModeCompileResult, ModeHandler, PresetDefinition, } from "./_vendor/ailf-core/index.d.ts";
|
package/dist/index.js
CHANGED
|
@@ -45,7 +45,7 @@ export { env } from "./_vendor/ailf-core/index.js";
|
|
|
45
45
|
// Validation — for programmatic validation of task YAML
|
|
46
46
|
// ---------------------------------------------------------------------------
|
|
47
47
|
export { CanonicalTaskFileSchema, CanonicalTaskSchema, CURATED_ASSERTION_TYPES, detectLegacyFieldNames, parseCanonicalTaskFile, RUBRIC_TEMPLATE_NAMES, } from "./adapters/task-sources/repo-schemas.js";
|
|
48
|
-
export {
|
|
48
|
+
export { formatRepoValidationResult, validateCanonicalTasks, } from "./adapters/task-sources/repo-validation.js";
|
|
49
49
|
// ---------------------------------------------------------------------------
|
|
50
50
|
// Plugin extension points — for authoring custom presets, modes, and registries
|
|
51
51
|
// ---------------------------------------------------------------------------
|
package/dist/job-store.js
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
* @see docs/design-docs/api-service-gateway.md
|
|
12
12
|
* @see packages/studio/src/schema/job.ts — Sanity document schema
|
|
13
13
|
*/
|
|
14
|
-
import {
|
|
14
|
+
import { getAilfSanityClient } from "./sanity/client.js";
|
|
15
15
|
// ---------------------------------------------------------------------------
|
|
16
16
|
// Constants
|
|
17
17
|
// ---------------------------------------------------------------------------
|
|
@@ -28,7 +28,7 @@ export class JobStore {
|
|
|
28
28
|
this.client = options.client;
|
|
29
29
|
}
|
|
30
30
|
else {
|
|
31
|
-
this.client =
|
|
31
|
+
this.client = getAilfSanityClient({
|
|
32
32
|
...(options.dataset ? { dataset: options.dataset } : {}),
|
|
33
33
|
...(options.projectId ? { projectId: options.projectId } : {}),
|
|
34
34
|
...(options.token ? { token: options.token } : {}),
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dotenv resolution helpers shared between the CLI bootstrap
|
|
3
|
+
* (`packages/eval/src/cli.ts`) and any code path that needs to honor the
|
|
4
|
+
* same `--dotenv <path>` override (today: `pipeline/checks.ts::checkEnvironment`,
|
|
5
|
+
* which re-loads the active env file as part of validation).
|
|
6
|
+
*
|
|
7
|
+
* Centralizing the argv parse means future changes — validating the path
|
|
8
|
+
* exists before returning, supporting `--dotenv=path` form, accepting an
|
|
9
|
+
* env-var fallback — happen in one place instead of drifting between
|
|
10
|
+
* call sites.
|
|
11
|
+
*/
|
|
12
|
+
/**
|
|
13
|
+
* Find an explicit `--dotenv <path>` argument and return its absolute,
|
|
14
|
+
* resolved path. Returns `undefined` when the flag is absent or has no
|
|
15
|
+
* following value.
|
|
16
|
+
*
|
|
17
|
+
* @param argv - Defaults to `process.argv`. Pass an explicit array in
|
|
18
|
+
* tests or in non-CLI hosts that have already shifted off the script
|
|
19
|
+
* prefix.
|
|
20
|
+
*/
|
|
21
|
+
export declare function findExplicitDotenvArg(argv?: readonly string[]): string | undefined;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dotenv resolution helpers shared between the CLI bootstrap
|
|
3
|
+
* (`packages/eval/src/cli.ts`) and any code path that needs to honor the
|
|
4
|
+
* same `--dotenv <path>` override (today: `pipeline/checks.ts::checkEnvironment`,
|
|
5
|
+
* which re-loads the active env file as part of validation).
|
|
6
|
+
*
|
|
7
|
+
* Centralizing the argv parse means future changes — validating the path
|
|
8
|
+
* exists before returning, supporting `--dotenv=path` form, accepting an
|
|
9
|
+
* env-var fallback — happen in one place instead of drifting between
|
|
10
|
+
* call sites.
|
|
11
|
+
*/
|
|
12
|
+
import { resolve } from "node:path";
|
|
13
|
+
/**
 * Find an explicit dotenv override on the command line and return its
 * absolute, resolved path.
 *
 * Two spellings are recognised:
 *   - `--dotenv <path>` (flag followed by a separate value argument)
 *   - `--dotenv=<path>` (inline value)
 *
 * The space-separated form takes precedence when a bare `--dotenv` token
 * is present, so every input that resolved before still resolves to the
 * same path. The inline form is only consulted when no bare `--dotenv`
 * token exists in `argv`.
 *
 * Returns `undefined` when the flag is absent, when a bare `--dotenv` has
 * no following value, or when the inline value is empty (`--dotenv=`).
 *
 * @param argv - Defaults to `process.argv`. Pass an explicit array in
 *   tests or in non-CLI hosts that have already shifted off the script
 *   prefix.
 * @returns The path resolved against the current working directory via
 *   `node:path`'s `resolve`, or `undefined` when no usable value exists.
 */
export function findExplicitDotenvArg(argv = process.argv) {
    const flag = "--dotenv";
    const idx = argv.indexOf(flag);
    if (idx !== -1) {
        // Space-separated form: the value is whatever token follows the flag.
        const value = argv[idx + 1];
        return value ? resolve(value) : undefined;
    }
    // Inline form: `--dotenv=<path>`. First occurrence wins.
    const inlinePrefix = `${flag}=`;
    const inlineArg = argv.find((arg) => typeof arg === "string" && arg.startsWith(inlinePrefix));
    if (inlineArg === undefined)
        return undefined;
    const inlineValue = inlineArg.slice(inlinePrefix.length);
    // An empty inline value (`--dotenv=`) is treated like a missing value.
    return inlineValue ? resolve(inlineValue) : undefined;
}
|
|
@@ -17,7 +17,6 @@ import { emitFileContents } from "../../artifact-capture/emit-file.js";
|
|
|
17
17
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
18
18
|
import { buildCacheContext } from "../cache-context.js";
|
|
19
19
|
import { checkCanonicalContextsExist } from "../../pipeline/checks.js";
|
|
20
|
-
import { loadPipelineTasks } from "../load-pipeline-tasks.js";
|
|
21
20
|
import { loadSource } from "../../sources.js";
|
|
22
21
|
import { configToSourceOverrides } from "../config-to-source-overrides.js";
|
|
23
22
|
export class FetchDocsStep {
|
|
@@ -30,35 +29,16 @@ export class FetchDocsStep {
|
|
|
30
29
|
return { status: "skipped", reason: "--no-fetch" };
|
|
31
30
|
}
|
|
32
31
|
const start = Date.now();
|
|
33
|
-
// Load tasks —
|
|
34
|
-
//
|
|
35
|
-
//
|
|
36
|
-
//
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
// AILF defaults from tasks/${mode}/ + optional repoTasksPath augment).
|
|
44
|
-
let allTasks;
|
|
45
|
-
if (ctx.config.taskSourceType === "content-lake" ||
|
|
46
|
-
ctx.config.taskSourceType === "repo") {
|
|
47
|
-
const filter = {
|
|
48
|
-
...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
|
|
49
|
-
...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
|
|
50
|
-
...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
|
|
51
|
-
};
|
|
52
|
-
allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
|
|
53
|
-
}
|
|
54
|
-
else {
|
|
55
|
-
allTasks = await loadPipelineTasks({
|
|
56
|
-
rootDir: ctx.config.rootDir,
|
|
57
|
-
mode: ctx.config.mode,
|
|
58
|
-
repoTasksPath: ctx.config.repoTasksPath,
|
|
59
|
-
taskSourceType: ctx.config.taskSourceType,
|
|
60
|
-
});
|
|
61
|
-
}
|
|
32
|
+
// Load tasks via ctx.taskSource — the composition root wires the
|
|
33
|
+
// right adapter for every taskSourceType (W0146). FetchDocsStep and
|
|
34
|
+
// GenerateConfigsStep MUST go through the same adapter so configs
|
|
35
|
+
// reference context files that were actually fetched.
|
|
36
|
+
const filter = {
|
|
37
|
+
...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
|
|
38
|
+
...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
|
|
39
|
+
...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
|
|
40
|
+
};
|
|
41
|
+
const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
|
|
62
42
|
// Bridge: narrow to literacy tasks for canonical doc access
|
|
63
43
|
const literacyTasks = allTasks.filter((t) => t.mode === "literacy");
|
|
64
44
|
const tasksWithDocs = literacyTasks.filter((t) => (t.context?.docs?.length ?? 0) > 0);
|
|
@@ -17,24 +17,17 @@ export declare class GenerateConfigsStep implements PipelineStep {
|
|
|
17
17
|
execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
|
|
18
18
|
private compileLiteracyVariants;
|
|
19
19
|
private compileSingleMode;
|
|
20
|
-
private loadTasks;
|
|
21
20
|
/**
|
|
22
|
-
* Load tasks via ctx.taskSource
|
|
21
|
+
* Load tasks via ctx.taskSource — the single adapter wired by the
|
|
22
|
+
* composition root for every taskSourceType (W0146). FetchDocsStep
|
|
23
|
+
* and GenerateConfigsStep MUST go through the same adapter so configs
|
|
24
|
+
* reference context files that were actually fetched.
|
|
23
25
|
*
|
|
24
|
-
*
|
|
25
|
-
*
|
|
26
|
-
*
|
|
27
|
-
* pushes it into the GROQ query, RepoTaskSource applies it in-memory.
|
|
26
|
+
* Filtering by area/task/tag is delegated to the adapter:
|
|
27
|
+
* ContentLakeTaskSource pushes it into the GROQ query;
|
|
28
|
+
* RepoTaskSource applies it in-memory.
|
|
28
29
|
*/
|
|
29
|
-
private
|
|
30
|
-
/**
|
|
31
|
-
* Load tasks from filesystem .task.ts files.
|
|
32
|
-
*
|
|
33
|
-
* This is the original path used for repo-based and inline tasks.
|
|
34
|
-
* It scans tasks/{mode}/ and optionally --repo-tasks-path.
|
|
35
|
-
*/
|
|
36
|
-
private loadTasksFromFilesystem;
|
|
37
|
-
private applyFilters;
|
|
30
|
+
private loadTasks;
|
|
38
31
|
/**
|
|
39
32
|
* Build a descriptive error message when no tasks match the current filters.
|
|
40
33
|
* Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
|
|
@@ -208,99 +208,36 @@ export class GenerateConfigsStep {
|
|
|
208
208
|
// ---------------------------------------------------------------------------
|
|
209
209
|
// Task loading — unified for all modes
|
|
210
210
|
// ---------------------------------------------------------------------------
|
|
211
|
-
async loadTasks(ctx, mode, state) {
|
|
212
|
-
// Adapter path — use ctx.taskSource. The composition root wires the
|
|
213
|
-
// right adapter for each taskSourceType:
|
|
214
|
-
// - "content-lake" → ContentLakeTaskSource (Studio-owned ailf.task docs)
|
|
215
|
-
// - "repo" → RepoTaskSource (loads .yaml AND .task.ts from repoTasksPath)
|
|
216
|
-
// Routing both through ctx.taskSource keeps the orchestration step
|
|
217
|
-
// file-format-agnostic (W0148: external-consumer evals materialize
|
|
218
|
-
// inline tasks as .yaml, which loadPipelineTasks can't read).
|
|
219
|
-
if (ctx.config.taskSourceType === "content-lake" ||
|
|
220
|
-
ctx.config.taskSourceType === "repo") {
|
|
221
|
-
return this.loadTasksFromAdapter(ctx, state);
|
|
222
|
-
}
|
|
223
|
-
// Filesystem path — load from .task.ts files (legacy unset path:
|
|
224
|
-
// AILF defaults from tasks/${mode}/ + optional repoTasksPath augment).
|
|
225
|
-
return this.loadTasksFromFilesystem(ctx, mode, state);
|
|
226
|
-
}
|
|
227
211
|
/**
|
|
228
|
-
* Load tasks via ctx.taskSource
|
|
212
|
+
* Load tasks via ctx.taskSource — the single adapter wired by the
|
|
213
|
+
* composition root for every taskSourceType (W0146). FetchDocsStep
|
|
214
|
+
* and GenerateConfigsStep MUST go through the same adapter so configs
|
|
215
|
+
* reference context files that were actually fetched.
|
|
229
216
|
*
|
|
230
|
-
*
|
|
231
|
-
*
|
|
232
|
-
*
|
|
233
|
-
* pushes it into the GROQ query, RepoTaskSource applies it in-memory.
|
|
217
|
+
* Filtering by area/task/tag is delegated to the adapter:
|
|
218
|
+
* ContentLakeTaskSource pushes it into the GROQ query;
|
|
219
|
+
* RepoTaskSource applies it in-memory.
|
|
234
220
|
*/
|
|
235
|
-
async
|
|
221
|
+
async loadTasks(ctx, mode, state) {
|
|
236
222
|
const filter = {
|
|
237
223
|
...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
|
|
238
224
|
...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
|
|
239
225
|
...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
|
|
240
226
|
};
|
|
241
|
-
const
|
|
242
|
-
//
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
// Release auto-scope
|
|
247
|
-
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
248
|
-
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
249
|
-
const beforeCount = tasks.length;
|
|
250
|
-
const scoped = tasks.filter((t) => "id" in t && scopedIds.has(t.id));
|
|
251
|
-
ctx.logger.info(` 🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
|
|
252
|
-
return scoped;
|
|
253
|
-
}
|
|
254
|
-
return tasks;
|
|
255
|
-
}
|
|
256
|
-
/**
|
|
257
|
-
* Load tasks from filesystem .task.ts files.
|
|
258
|
-
*
|
|
259
|
-
* This is the original path used for repo-based and inline tasks.
|
|
260
|
-
* It scans tasks/{mode}/ and optionally --repo-tasks-path.
|
|
261
|
-
*/
|
|
262
|
-
async loadTasksFromFilesystem(ctx, mode, state) {
|
|
263
|
-
const { resolve } = await import("path");
|
|
264
|
-
const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
|
|
265
|
-
const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");
|
|
266
|
-
// Discover task files from the mode-specific directory and --repo-tasks-path.
|
|
267
|
-
// Use vendored copies in dist/ when @sanity/ailf-core isn't resolvable
|
|
268
|
-
// (i.e., running outside the monorepo via npx).
|
|
269
|
-
//
|
|
270
|
-
// When taskSourceType === "repo", skip the AILF-bundled tasks/${mode}/
|
|
271
|
-
// directory and load ONLY from repoTasksPath. Mirrors the composition-root
|
|
272
|
-
// contract for repo-only mode (see composition-root.ts:392-405).
|
|
273
|
-
const dirs = [];
|
|
274
|
-
if (ctx.config.taskSourceType !== "repo") {
|
|
275
|
-
dirs.push(resolveVendoredSubdir(ctx.config.rootDir, `tasks/${mode}`));
|
|
276
|
-
}
|
|
277
|
-
else if (!ctx.config.repoTasksPath) {
|
|
278
|
-
throw new Error('taskSourceType "repo" requires repoTasksPath to be set (no AILF defaults loaded in repo-only mode)');
|
|
279
|
-
}
|
|
280
|
-
// Also search --repo-tasks-path (e.g., .ailf/tasks/) for repo-based tasks
|
|
281
|
-
if (ctx.config.repoTasksPath) {
|
|
282
|
-
const repoDir = resolve(ctx.config.repoTasksPath);
|
|
283
|
-
if (!dirs.includes(repoDir)) {
|
|
284
|
-
dirs.push(repoDir);
|
|
285
|
-
}
|
|
286
|
-
}
|
|
227
|
+
const allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
|
|
228
|
+
// Mode filter — the adapter may return a mixed-mode set (e.g. a user's
|
|
229
|
+
// `--repo-tasks-path` containing tasks of multiple modes). Skip
|
|
230
|
+
// non-matching modes with a warning so unintentional misclassification
|
|
231
|
+
// is visible without breaking the run.
|
|
287
232
|
const tasks = [];
|
|
288
233
|
const skippedByMode = new Map();
|
|
289
|
-
for (const
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
if (!("mode" in task) || task.mode === mode) {
|
|
297
|
-
tasks.push(task);
|
|
298
|
-
}
|
|
299
|
-
else {
|
|
300
|
-
const taskMode = task.mode ?? "unknown";
|
|
301
|
-
skippedByMode.set(taskMode, (skippedByMode.get(taskMode) ?? 0) + 1);
|
|
302
|
-
}
|
|
303
|
-
}
|
|
234
|
+
for (const task of allTasks) {
|
|
235
|
+
if (!("mode" in task) || task.mode === mode) {
|
|
236
|
+
tasks.push(task);
|
|
237
|
+
}
|
|
238
|
+
else {
|
|
239
|
+
const taskMode = task.mode ?? "unknown";
|
|
240
|
+
skippedByMode.set(taskMode, (skippedByMode.get(taskMode) ?? 0) + 1);
|
|
304
241
|
}
|
|
305
242
|
}
|
|
306
243
|
if (skippedByMode.size > 0) {
|
|
@@ -310,46 +247,17 @@ export class GenerateConfigsStep {
|
|
|
310
247
|
.join(", ");
|
|
311
248
|
ctx.logger.warn(` ⚠ Skipped ${total} task(s) with non-matching mode (${summary}). Current pipeline mode: ${mode}. Run with --mode <mode> to include them.`);
|
|
312
249
|
}
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
250
|
+
this.lastLoadedTaskIds = tasks
|
|
251
|
+
.map((t) => t.id)
|
|
252
|
+
.filter((id) => !!id);
|
|
316
253
|
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
317
254
|
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
318
|
-
const beforeCount =
|
|
319
|
-
const scoped =
|
|
255
|
+
const beforeCount = tasks.length;
|
|
256
|
+
const scoped = tasks.filter((t) => "id" in t && scopedIds.has(t.id));
|
|
320
257
|
ctx.logger.info(` 🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
|
|
321
258
|
return scoped;
|
|
322
259
|
}
|
|
323
|
-
return
|
|
324
|
-
}
|
|
325
|
-
applyFilters(ctx, tasks) {
|
|
326
|
-
// Capture pre-filter IDs for diagnostic messages
|
|
327
|
-
this.lastLoadedTaskIds = tasks
|
|
328
|
-
.map((t) => t.id)
|
|
329
|
-
.filter((id) => !!id);
|
|
330
|
-
let result = tasks;
|
|
331
|
-
if (ctx.config.areas?.length) {
|
|
332
|
-
const allowed = new Set(ctx.config.areas.map((a) => a.toLowerCase()));
|
|
333
|
-
result = result.filter((t) => {
|
|
334
|
-
const area = t.area?.toLowerCase();
|
|
335
|
-
return area && allowed.has(area);
|
|
336
|
-
});
|
|
337
|
-
}
|
|
338
|
-
if (ctx.config.tasks?.length) {
|
|
339
|
-
const allowed = new Set(ctx.config.tasks);
|
|
340
|
-
result = result.filter((t) => {
|
|
341
|
-
const id = t.id;
|
|
342
|
-
return id && allowed.has(id);
|
|
343
|
-
});
|
|
344
|
-
}
|
|
345
|
-
if (ctx.config.tags?.length) {
|
|
346
|
-
const allowed = new Set(ctx.config.tags);
|
|
347
|
-
result = result.filter((t) => {
|
|
348
|
-
const tags = t.tags;
|
|
349
|
-
return tags?.some((tag) => allowed.has(tag));
|
|
350
|
-
});
|
|
351
|
-
}
|
|
352
|
-
return result;
|
|
260
|
+
return tasks;
|
|
353
261
|
}
|
|
354
262
|
/**
|
|
355
263
|
* Build a descriptive error message when no tasks match the current filters.
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
* @see packages/eval/src/pipeline/mirror-repo-tasks.ts
|
|
13
13
|
* @see docs/archive/exec-plans/tasks-as-content/phase-5-content-lake-mirroring.md
|
|
14
14
|
*/
|
|
15
|
-
import { getSanityClient } from "../../sanity/client.js";
|
|
15
|
+
import { getAilfSanityClient, getSanityClient } from "../../sanity/client.js";
|
|
16
16
|
import { detectGitContext, mirrorRepoTasks, } from "../../pipeline/mirror-repo-tasks.js";
|
|
17
17
|
export class MirrorRepoTasksStep {
|
|
18
18
|
name = "mirror-repo-tasks";
|
|
@@ -28,6 +28,18 @@ export class MirrorRepoTasksStep {
|
|
|
28
28
|
if (!ctx.config.repoTasksPath) {
|
|
29
29
|
return { status: "skipped", reason: "No --repo-tasks-path provided" };
|
|
30
30
|
}
|
|
31
|
+
// W0145 — never mirror under repo-only mode. The API gateway maps
|
|
32
|
+
// PipelineRequest.taskMode="inline" → taskSourceType="repo", so an
|
|
33
|
+
// external consumer's ephemeral inline tasks would otherwise be
|
|
34
|
+
// upserted into AILF's canonical Content Lake. Mirroring is only
|
|
35
|
+
// correct for the in-tree dogfood path (taskSourceType unset +
|
|
36
|
+
// repoTasksPath set, e.g. external-eval.yml).
|
|
37
|
+
if (ctx.config.taskSourceType === "repo") {
|
|
38
|
+
return {
|
|
39
|
+
status: "skipped",
|
|
40
|
+
reason: 'taskSourceType="repo" — inline tasks are not mirrored',
|
|
41
|
+
};
|
|
42
|
+
}
|
|
31
43
|
// Need a write token for mirroring
|
|
32
44
|
const token = process.env.AILF_REPORT_SANITY_API_TOKEN ?? process.env.SANITY_API_TOKEN;
|
|
33
45
|
if (!token) {
|
|
@@ -54,11 +66,22 @@ export class MirrorRepoTasksStep {
|
|
|
54
66
|
// Detect git context (from env vars or git CLI)
|
|
55
67
|
const git = await detectGitContext(ctx.config.repoTasksPath);
|
|
56
68
|
ctx.logger.info(` Mirroring ${repoTasks.length} repo task(s) from ${git.repo}@${git.branch}`);
|
|
57
|
-
//
|
|
58
|
-
|
|
69
|
+
// Two clients are required after the D0043 dataset split:
|
|
70
|
+
// - `client` writes ailf.task / ailf.featureArea to the AILF
|
|
71
|
+
// dataset and reads existing mirror state — uses the
|
|
72
|
+
// AILF-scoped token explicitly so writes work even when
|
|
73
|
+
// SANITY_API_TOKEN is editorial-read-only.
|
|
74
|
+
// - `editorialClient` resolves `article` slugs against the
|
|
75
|
+
// editorial dataset. Operators may scope the AILF token to
|
|
76
|
+
// AILF only (D0043 consequence #5); using it here would
|
|
77
|
+
// 401 on the editorial query. Let it pick up SANITY_API_TOKEN
|
|
78
|
+
// from the default config instead.
|
|
79
|
+
const client = getAilfSanityClient({ token });
|
|
80
|
+
const editorialClient = getSanityClient();
|
|
59
81
|
// Run the mirror
|
|
60
82
|
const result = await mirrorRepoTasks({
|
|
61
83
|
client,
|
|
84
|
+
editorialClient,
|
|
62
85
|
git,
|
|
63
86
|
logger: ctx.logger,
|
|
64
87
|
tasks: repoTasks,
|
|
@@ -111,10 +111,28 @@ export class RunEvalStep {
|
|
|
111
111
|
// required eval modes were satisfied from the remote cache.
|
|
112
112
|
state.remoteCacheHits ??= new Set();
|
|
113
113
|
state.remoteCacheHits.add(this.mode);
|
|
114
|
-
// Carry forward
|
|
115
|
-
|
|
114
|
+
// Carry forward the share-link backreference for THIS mode only.
|
|
115
|
+
// Pushing every entry from `remoteCacheResult.promptfooUrls`
|
|
116
|
+
// snowballs across the daily perspective cron: each cache-hit
|
|
117
|
+
// run inherits the cached report's full URL list (including
|
|
118
|
+
// other modes and any URLs the cached report had itself
|
|
119
|
+
// accumulated from earlier hits), then layers its own on top.
|
|
120
|
+
// Iterate from the tail to handle pre-fix cached reports that
|
|
121
|
+
// may carry multiple entries for the same mode.
|
|
122
|
+
const cachedUrls = remoteCacheResult.promptfooUrls;
|
|
123
|
+
let inherited;
|
|
124
|
+
if (cachedUrls) {
|
|
125
|
+
for (let i = cachedUrls.length - 1; i >= 0; i--) {
|
|
126
|
+
const entry = cachedUrls[i];
|
|
127
|
+
if (entry?.mode === this.mode) {
|
|
128
|
+
inherited = entry;
|
|
129
|
+
break;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
if (inherited) {
|
|
116
134
|
state.promptfooUrls ??= [];
|
|
117
|
-
state.promptfooUrls.push(
|
|
135
|
+
state.promptfooUrls.push(inherited);
|
|
118
136
|
}
|
|
119
137
|
// D0040 / W0135 — restore the cached report's artifact manifest into
|
|
120
138
|
// the accumulator so the new run's RunManifest advertises the cached
|
|
@@ -4,7 +4,9 @@
|
|
|
4
4
|
* Pure analysis functions for agent behavior observation reports.
|
|
5
5
|
* No I/O, no process.env, no process.argv — all data is passed in.
|
|
6
6
|
*/
|
|
7
|
+
import type { TestResult } from "../_vendor/ailf-core/index.d.ts";
|
|
7
8
|
import type { AgentBehaviorSummary } from "../agent-observer/types.js";
|
|
9
|
+
export type { TestResult } from "../_vendor/ailf-core/index.d.ts";
|
|
8
10
|
export interface PromptfooResults {
|
|
9
11
|
results: TestResult[];
|
|
10
12
|
}
|
|
@@ -13,14 +15,6 @@ export interface PromptfooResultsEnvelope {
|
|
|
13
15
|
results: TestResult[];
|
|
14
16
|
};
|
|
15
17
|
}
|
|
16
|
-
export interface TestResult {
|
|
17
|
-
description: string;
|
|
18
|
-
metadata?: Record<string, unknown>;
|
|
19
|
-
response: {
|
|
20
|
-
output: string;
|
|
21
|
-
};
|
|
22
|
-
vars: Record<string, string>;
|
|
23
|
-
}
|
|
24
18
|
export interface TaskBehavior {
|
|
25
19
|
behavior: AgentBehaviorSummary;
|
|
26
20
|
description: string;
|
package/dist/pipeline/cache.d.ts
CHANGED
|
@@ -28,7 +28,7 @@ export interface CacheEntry {
|
|
|
28
28
|
timestamp: string;
|
|
29
29
|
}
|
|
30
30
|
/** Result of a cache lookup */
|
|
31
|
-
export type
|
|
31
|
+
export type ManifestCacheLookupResult = {
|
|
32
32
|
hit: false;
|
|
33
33
|
currentHash: string;
|
|
34
34
|
} | {
|
|
@@ -84,7 +84,7 @@ export declare function hashFiles(paths: string[], context?: string[]): string;
|
|
|
84
84
|
* Optional `context` strings are included in the hash so that non-file
|
|
85
85
|
* state (e.g., area/task filter flags) participates in cache key computation.
|
|
86
86
|
*/
|
|
87
|
-
export declare function lookupCache(rootDir: string, step: string, context?: string[]):
|
|
87
|
+
export declare function lookupCache(rootDir: string, step: string, context?: string[]): ManifestCacheLookupResult;
|
|
88
88
|
/**
|
|
89
89
|
* Read the cache manifest for a step.
|
|
90
90
|
* Returns null if no manifest exists or it's corrupt.
|
|
@@ -18,8 +18,16 @@ export declare function checkCanonicalContextsExist(rootDir: string, taskIds: st
|
|
|
18
18
|
export declare function checkContextsExist(rootDir: string, areas: string[]): ValidationIssue[];
|
|
19
19
|
/**
|
|
20
20
|
* Check that required environment variables are set.
|
|
21
|
-
*
|
|
22
|
-
*
|
|
21
|
+
*
|
|
22
|
+
* Loads the resolved `.env` file first (with override, matching the dotenv
|
|
23
|
+
* CLI `-o` flag used by other scripts), then checks for required keys. The
|
|
24
|
+
* resolution order mirrors `cli.ts`'s `resolveEnvPath()` so a `--dotenv
|
|
25
|
+
* <path>` argument on the parent CLI invocation isn't silently clobbered
|
|
26
|
+
* here. Without this, a Tier 2 test that uses `--dotenv` to override
|
|
27
|
+
* tenant-pointing vars (e.g. `AILF_GCS_ARTIFACT_BUCKET`,
|
|
28
|
+
* `GOOGLE_APPLICATION_CREDENTIALS`) gets its overrides reverted to the
|
|
29
|
+
* repo `.env` values when this function runs as part of the validate
|
|
30
|
+
* step. (W0138 Slice 2 surface — see `gcs-pipeline-replay-roundtrip.test.ts`.)
|
|
23
31
|
*/
|
|
24
32
|
export declare function checkEnvironment(rootDir: string): ValidationIssue[];
|
|
25
33
|
/**
|
package/dist/pipeline/checks.js
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
import { config as loadEnv } from "dotenv";
|
|
9
9
|
import { existsSync, readFileSync, statSync } from "fs";
|
|
10
10
|
import { join, resolve } from "path";
|
|
11
|
+
import { findExplicitDotenvArg } from "../lib/dotenv-resolution.js";
|
|
11
12
|
import { configFileForMode } from "./eval-constants.js";
|
|
12
13
|
// ---------------------------------------------------------------------------
|
|
13
14
|
// Precondition: contexts exist for each feature area
|
|
@@ -80,13 +81,22 @@ export function checkContextsExist(rootDir, areas) {
|
|
|
80
81
|
// ---------------------------------------------------------------------------
|
|
81
82
|
/**
|
|
82
83
|
* Check that required environment variables are set.
|
|
83
|
-
*
|
|
84
|
-
*
|
|
84
|
+
*
|
|
85
|
+
* Loads the resolved `.env` file first (with override, matching the dotenv
|
|
86
|
+
* CLI `-o` flag used by other scripts), then checks for required keys. The
|
|
87
|
+
* resolution order mirrors `cli.ts`'s `resolveEnvPath()` so a `--dotenv
|
|
88
|
+
* <path>` argument on the parent CLI invocation isn't silently clobbered
|
|
89
|
+
* here. Without this, a Tier 2 test that uses `--dotenv` to override
|
|
90
|
+
* tenant-pointing vars (e.g. `AILF_GCS_ARTIFACT_BUCKET`,
|
|
91
|
+
* `GOOGLE_APPLICATION_CREDENTIALS`) gets its overrides reverted to the
|
|
92
|
+
* repo `.env` values when this function runs as part of the validate
|
|
93
|
+
* step. (W0138 Slice 2 surface — see `gcs-pipeline-replay-roundtrip.test.ts`.)
|
|
85
94
|
*/
|
|
86
95
|
export function checkEnvironment(rootDir) {
|
|
87
96
|
const issues = [];
|
|
88
|
-
// Load
|
|
89
|
-
|
|
97
|
+
// Load the active .env so we see the same vars as dotenv -e <path> -o.
|
|
98
|
+
// Resolution: explicit --dotenv arg wins, then the repo-root .env.
|
|
99
|
+
const envPath = findExplicitDotenvArg() ?? resolve(rootDir, "..", "..", ".env");
|
|
90
100
|
if (existsSync(envPath)) {
|
|
91
101
|
loadEnv({ override: true, path: envPath });
|
|
92
102
|
}
|
|
@@ -46,7 +46,7 @@ import { buildTaskGraph } from "./task-graph-builder.js";
|
|
|
46
46
|
* rules (e.g., rejecting archived tasks that slipped through).
|
|
47
47
|
*/
|
|
48
48
|
export function compileLiteracyTasks(tasks, options) {
|
|
49
|
-
const rubricConfig =
|
|
49
|
+
const rubricConfig = loadRubricResolutionInput(options.rootDir);
|
|
50
50
|
const warnings = [];
|
|
51
51
|
const results = [];
|
|
52
52
|
let totalTests = 0;
|
|
@@ -146,7 +146,7 @@ export function compareCompilerOutputs(legacyEntries, newResult) {
|
|
|
146
146
|
// ---------------------------------------------------------------------------
|
|
147
147
|
// Rubric config loading
|
|
148
148
|
// ---------------------------------------------------------------------------
|
|
149
|
-
function
|
|
149
|
+
function loadRubricResolutionInput(rootDir) {
|
|
150
150
|
const result = tryLoadConfigFile("rubrics", rootDir);
|
|
151
151
|
if (!result)
|
|
152
152
|
return undefined;
|