@sanity/ailf 4.0.7 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ailf.js +6 -1
- package/dist/_vendor/ailf-core/schemas/external-providers.d.ts +136 -0
- package/dist/_vendor/ailf-core/schemas/external-providers.js +136 -0
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -3
- package/dist/_vendor/ailf-core/schemas/report.d.ts +251 -0
- package/dist/_vendor/ailf-core/schemas/report.js +235 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +1 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.d.ts +38 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +696 -0
- package/dist/_vendor/ailf-core/types/api-requests.d.ts +159 -0
- package/dist/_vendor/ailf-core/types/api-requests.js +27 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +20 -3
- package/dist/_vendor/ailf-core/types/index.d.ts +4 -1
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +112 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.js +18 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +146 -0
- package/dist/_vendor/ailf-core/types/repo-config.js +18 -0
- package/dist/_vendor/ailf-shared/index.d.ts +7 -5
- package/dist/_vendor/ailf-shared/index.js +7 -5
- package/dist/adapters/api-client/types.d.ts +2 -5
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +21 -5
- package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +129 -25
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +58 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +1 -1
- package/dist/adapters/task-sources/index.d.ts +1 -1
- package/dist/adapters/task-sources/index.js +1 -1
- package/dist/adapters/task-sources/repo-schemas.d.ts +19 -2
- package/dist/adapters/task-sources/repo-schemas.js +81 -2
- package/dist/adapters/task-sources/repo-task-source.js +11 -2
- package/dist/adapters/task-sources/repo-validation.d.ts +6 -6
- package/dist/adapters/task-sources/repo-validation.js +1 -1
- package/dist/agent-observer/agentic-provider.d.ts +1 -0
- package/dist/agent-observer/agentic-provider.js +43 -36
- package/dist/agent-observer/config-schemas.d.ts +61 -0
- package/dist/agent-observer/config-schemas.js +65 -0
- package/dist/agent-observer/provider.d.ts +1 -0
- package/dist/agent-observer/provider.js +19 -17
- package/dist/cli.js +4 -4
- package/dist/commands/validate-tasks.js +10 -4
- package/dist/composition-root.js +4 -2
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/job-store.js +2 -2
- package/dist/lib/dotenv-resolution.d.ts +21 -0
- package/dist/lib/dotenv-resolution.js +30 -0
- package/dist/orchestration/steps/mirror-repo-tasks-step.js +14 -3
- package/dist/orchestration/steps/run-eval-step.js +21 -3
- package/dist/pipeline/agent-behavior-report.d.ts +2 -8
- package/dist/pipeline/cache.d.ts +2 -2
- package/dist/pipeline/checks.d.ts +10 -2
- package/dist/pipeline/checks.js +14 -4
- package/dist/pipeline/compiler/literacy-bridge.js +2 -2
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +0 -12
- package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +0 -12
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +2 -2
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +44 -5
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +3 -3
- package/dist/pipeline/compiler/promptfoo-compiler.js +7 -11
- package/dist/pipeline/compiler/provider-assembler.js +33 -3
- package/dist/pipeline/compiler/rubric-resolution.d.ts +2 -2
- package/dist/pipeline/mirror-repo-tasks.d.ts +13 -5
- package/dist/pipeline/mirror-repo-tasks.js +16 -8
- package/dist/pipeline/pr-comment.d.ts +22 -9
- package/dist/pipeline/pr-comment.js +52 -472
- package/dist/pipeline/resolve-mappings.d.ts +8 -3
- package/dist/promptfoo-providers/mock-path.d.ts +12 -0
- package/dist/promptfoo-providers/mock-path.js +15 -0
- package/dist/report-store.d.ts +63 -1
- package/dist/report-store.js +111 -31
- package/dist/sanity/client.d.ts +58 -0
- package/dist/sanity/client.js +106 -0
- package/dist/sanity/document-renderers.d.ts +68 -0
- package/dist/sanity/document-renderers.js +221 -0
- package/dist/sanity/queries.d.ts +21 -0
- package/dist/sanity/queries.js +71 -0
- package/dist/tasks/knowledge-probe/define-type-api.task.ts +2 -6
- package/dist/tasks/knowledge-probe/groq-projections.task.ts +0 -5
- package/dist/tasks/literacy/content-lake.task.ts +4 -10
- package/dist/tasks/literacy/frameworks.task.ts +2 -8
- package/dist/tasks/literacy/functions.task.ts +1 -4
- package/dist/tasks/literacy/groq.task.ts +3 -12
- package/dist/tasks/literacy/image-handling.task.ts +1 -4
- package/dist/tasks/literacy/nextjs-live.task.ts +1 -4
- package/dist/tasks/literacy/portable-text.task.ts +2 -8
- package/dist/tasks/literacy/studio-setup.task.ts +2 -8
- package/dist/tasks/literacy/visual-editing.task.ts +2 -8
- package/package.json +8 -7
- package/tasks/knowledge-probe/define-type-api.task.ts +2 -6
- package/tasks/knowledge-probe/groq-projections.task.ts +0 -5
- package/tasks/literacy/content-lake.task.ts +4 -10
- package/tasks/literacy/frameworks.task.ts +2 -8
- package/tasks/literacy/functions.task.ts +1 -4
- package/tasks/literacy/groq.task.ts +3 -12
- package/tasks/literacy/image-handling.task.ts +1 -4
- package/tasks/literacy/nextjs-live.task.ts +1 -4
- package/tasks/literacy/portable-text.task.ts +2 -8
- package/tasks/literacy/studio-setup.task.ts +2 -8
- package/tasks/literacy/visual-editing.task.ts +2 -8
|
@@ -111,10 +111,28 @@ export class RunEvalStep {
|
|
|
111
111
|
// required eval modes were satisfied from the remote cache.
|
|
112
112
|
state.remoteCacheHits ??= new Set();
|
|
113
113
|
state.remoteCacheHits.add(this.mode);
|
|
114
|
-
// Carry forward
|
|
115
|
-
|
|
114
|
+
// Carry forward the share-link backreference for THIS mode only.
|
|
115
|
+
// Pushing every entry from `remoteCacheResult.promptfooUrls`
|
|
116
|
+
// snowballs across the daily perspective cron: each cache-hit
|
|
117
|
+
// run inherits the cached report's full URL list (including
|
|
118
|
+
// other modes and any URLs the cached report had itself
|
|
119
|
+
// accumulated from earlier hits), then layers its own on top.
|
|
120
|
+
// Iterate from the tail to handle pre-fix cached reports that
|
|
121
|
+
// may carry multiple entries for the same mode.
|
|
122
|
+
const cachedUrls = remoteCacheResult.promptfooUrls;
|
|
123
|
+
let inherited;
|
|
124
|
+
if (cachedUrls) {
|
|
125
|
+
for (let i = cachedUrls.length - 1; i >= 0; i--) {
|
|
126
|
+
const entry = cachedUrls[i];
|
|
127
|
+
if (entry?.mode === this.mode) {
|
|
128
|
+
inherited = entry;
|
|
129
|
+
break;
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
if (inherited) {
|
|
116
134
|
state.promptfooUrls ??= [];
|
|
117
|
-
state.promptfooUrls.push(
|
|
135
|
+
state.promptfooUrls.push(inherited);
|
|
118
136
|
}
|
|
119
137
|
// D0040 / W0135 — restore the cached report's artifact manifest into
|
|
120
138
|
// the accumulator so the new run's RunManifest advertises the cached
|
|
@@ -4,7 +4,9 @@
|
|
|
4
4
|
* Pure analysis functions for agent behavior observation reports.
|
|
5
5
|
* No I/O, no process.env, no process.argv — all data is passed in.
|
|
6
6
|
*/
|
|
7
|
+
import type { TestResult } from "../_vendor/ailf-core/index.d.ts";
|
|
7
8
|
import type { AgentBehaviorSummary } from "../agent-observer/types.js";
|
|
9
|
+
export type { TestResult } from "../_vendor/ailf-core/index.d.ts";
|
|
8
10
|
export interface PromptfooResults {
|
|
9
11
|
results: TestResult[];
|
|
10
12
|
}
|
|
@@ -13,14 +15,6 @@ export interface PromptfooResultsEnvelope {
|
|
|
13
15
|
results: TestResult[];
|
|
14
16
|
};
|
|
15
17
|
}
|
|
16
|
-
export interface TestResult {
|
|
17
|
-
description: string;
|
|
18
|
-
metadata?: Record<string, unknown>;
|
|
19
|
-
response: {
|
|
20
|
-
output: string;
|
|
21
|
-
};
|
|
22
|
-
vars: Record<string, string>;
|
|
23
|
-
}
|
|
24
18
|
export interface TaskBehavior {
|
|
25
19
|
behavior: AgentBehaviorSummary;
|
|
26
20
|
description: string;
|
package/dist/pipeline/cache.d.ts
CHANGED
|
@@ -28,7 +28,7 @@ export interface CacheEntry {
|
|
|
28
28
|
timestamp: string;
|
|
29
29
|
}
|
|
30
30
|
/** Result of a cache lookup */
|
|
31
|
-
export type
|
|
31
|
+
export type ManifestCacheLookupResult = {
|
|
32
32
|
hit: false;
|
|
33
33
|
currentHash: string;
|
|
34
34
|
} | {
|
|
@@ -84,7 +84,7 @@ export declare function hashFiles(paths: string[], context?: string[]): string;
|
|
|
84
84
|
* Optional `context` strings are included in the hash so that non-file
|
|
85
85
|
* state (e.g., area/task filter flags) participates in cache key computation.
|
|
86
86
|
*/
|
|
87
|
-
export declare function lookupCache(rootDir: string, step: string, context?: string[]):
|
|
87
|
+
export declare function lookupCache(rootDir: string, step: string, context?: string[]): ManifestCacheLookupResult;
|
|
88
88
|
/**
|
|
89
89
|
* Read the cache manifest for a step.
|
|
90
90
|
* Returns null if no manifest exists or it's corrupt.
|
|
@@ -18,8 +18,16 @@ export declare function checkCanonicalContextsExist(rootDir: string, taskIds: st
|
|
|
18
18
|
export declare function checkContextsExist(rootDir: string, areas: string[]): ValidationIssue[];
|
|
19
19
|
/**
|
|
20
20
|
* Check that required environment variables are set.
|
|
21
|
-
*
|
|
22
|
-
*
|
|
21
|
+
*
|
|
22
|
+
* Loads the resolved `.env` file first (with override, matching the dotenv
|
|
23
|
+
* CLI `-o` flag used by other scripts), then checks for required keys. The
|
|
24
|
+
* resolution order mirrors `cli.ts`'s `resolveEnvPath()` so a `--dotenv
|
|
25
|
+
* <path>` argument on the parent CLI invocation isn't silently clobbered
|
|
26
|
+
* here. Without this, a Tier 2 test that uses `--dotenv` to override
|
|
27
|
+
* tenant-pointing vars (e.g. `AILF_GCS_ARTIFACT_BUCKET`,
|
|
28
|
+
* `GOOGLE_APPLICATION_CREDENTIALS`) gets its overrides reverted to the
|
|
29
|
+
* repo `.env` values when this function runs as part of the validate
|
|
30
|
+
* step. (W0138 Slice 2 surface — see `gcs-pipeline-replay-roundtrip.test.ts`.)
|
|
23
31
|
*/
|
|
24
32
|
export declare function checkEnvironment(rootDir: string): ValidationIssue[];
|
|
25
33
|
/**
|
package/dist/pipeline/checks.js
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
import { config as loadEnv } from "dotenv";
|
|
9
9
|
import { existsSync, readFileSync, statSync } from "fs";
|
|
10
10
|
import { join, resolve } from "path";
|
|
11
|
+
import { findExplicitDotenvArg } from "../lib/dotenv-resolution.js";
|
|
11
12
|
import { configFileForMode } from "./eval-constants.js";
|
|
12
13
|
// ---------------------------------------------------------------------------
|
|
13
14
|
// Precondition: contexts exist for each feature area
|
|
@@ -80,13 +81,22 @@ export function checkContextsExist(rootDir, areas) {
|
|
|
80
81
|
// ---------------------------------------------------------------------------
|
|
81
82
|
/**
|
|
82
83
|
* Check that required environment variables are set.
|
|
83
|
-
*
|
|
84
|
-
*
|
|
84
|
+
*
|
|
85
|
+
* Loads the resolved `.env` file first (with override, matching the dotenv
|
|
86
|
+
* CLI `-o` flag used by other scripts), then checks for required keys. The
|
|
87
|
+
* resolution order mirrors `cli.ts`'s `resolveEnvPath()` so a `--dotenv
|
|
88
|
+
* <path>` argument on the parent CLI invocation isn't silently clobbered
|
|
89
|
+
* here. Without this, a Tier 2 test that uses `--dotenv` to override
|
|
90
|
+
* tenant-pointing vars (e.g. `AILF_GCS_ARTIFACT_BUCKET`,
|
|
91
|
+
* `GOOGLE_APPLICATION_CREDENTIALS`) gets its overrides reverted to the
|
|
92
|
+
* repo `.env` values when this function runs as part of the validate
|
|
93
|
+
* step. (W0138 Slice 2 surface — see `gcs-pipeline-replay-roundtrip.test.ts`.)
|
|
85
94
|
*/
|
|
86
95
|
export function checkEnvironment(rootDir) {
|
|
87
96
|
const issues = [];
|
|
88
|
-
// Load
|
|
89
|
-
|
|
97
|
+
// Load the active .env so we see the same vars as dotenv -e <path> -o.
|
|
98
|
+
// Resolution: explicit --dotenv arg wins, then the repo-root .env.
|
|
99
|
+
const envPath = findExplicitDotenvArg() ?? resolve(rootDir, "..", "..", ".env");
|
|
90
100
|
if (existsSync(envPath)) {
|
|
91
101
|
loadEnv({ override: true, path: envPath });
|
|
92
102
|
}
|
|
@@ -46,7 +46,7 @@ import { buildTaskGraph } from "./task-graph-builder.js";
|
|
|
46
46
|
* rules (e.g., rejecting archived tasks that slipped through).
|
|
47
47
|
*/
|
|
48
48
|
export function compileLiteracyTasks(tasks, options) {
|
|
49
|
-
const rubricConfig =
|
|
49
|
+
const rubricConfig = loadRubricResolutionInput(options.rootDir);
|
|
50
50
|
const warnings = [];
|
|
51
51
|
const results = [];
|
|
52
52
|
let totalTests = 0;
|
|
@@ -146,7 +146,7 @@ export function compareCompilerOutputs(legacyEntries, newResult) {
|
|
|
146
146
|
// ---------------------------------------------------------------------------
|
|
147
147
|
// Rubric config loading
|
|
148
148
|
// ---------------------------------------------------------------------------
|
|
149
|
-
function
|
|
149
|
+
function loadRubricResolutionInput(rootDir) {
|
|
150
150
|
const result = tryLoadConfigFile("rubrics", rootDir);
|
|
151
151
|
if (!result)
|
|
152
152
|
return undefined;
|
|
@@ -30,10 +30,6 @@ export const scaffoldProjectTask = {
|
|
|
30
30
|
"2. Configure sanity.config.ts with project ID 'test-project' and dataset 'production'\n" +
|
|
31
31
|
"3. Create a 'post' schema type with title, slug, body, and author fields\n" +
|
|
32
32
|
"4. Ensure the project builds without errors",
|
|
33
|
-
vars: {
|
|
34
|
-
task: "Scaffold a Sanity Studio project with a post schema type. " +
|
|
35
|
-
"The project should build cleanly.",
|
|
36
|
-
},
|
|
37
33
|
},
|
|
38
34
|
assertions: [
|
|
39
35
|
{ type: "file-exists", value: "sanity.config.ts" },
|
|
@@ -70,10 +66,6 @@ export const modifyCodeTask = {
|
|
|
70
66
|
text: "In the existing Sanity Studio project, add a custom document action " +
|
|
71
67
|
"that logs a message before publishing. Follow the Sanity docs for " +
|
|
72
68
|
"custom document actions.",
|
|
73
|
-
vars: {
|
|
74
|
-
task: "Add a custom document action that wraps the default publish action " +
|
|
75
|
-
"and logs 'Publishing document: <title>' before executing.",
|
|
76
|
-
},
|
|
77
69
|
},
|
|
78
70
|
assertions: [
|
|
79
71
|
{ type: "file-exists", value: "actions/logPublishAction.ts" },
|
|
@@ -127,10 +119,6 @@ export const multiFileRefactorTask = {
|
|
|
127
119
|
"3. Query method calls (fetch → client.fetch with new signature)\n" +
|
|
128
120
|
"4. Mutation helpers (create/patch/delete API changes)\n" +
|
|
129
121
|
"Ensure the project compiles after migration.",
|
|
130
|
-
vars: {
|
|
131
|
-
task: "Migrate the codebase from @sanity/client v5 to v6, " +
|
|
132
|
-
"updating all files. Project must compile cleanly after migration.",
|
|
133
|
-
},
|
|
134
122
|
},
|
|
135
123
|
assertions: [
|
|
136
124
|
{
|
|
@@ -38,10 +38,6 @@ export const groqProjectionTask = {
|
|
|
38
38
|
"5. Array slicing with `[0..5]` and `[0...5]`\n" +
|
|
39
39
|
"6. Conditional projections using `select()`\n\n" +
|
|
40
40
|
"Provide working code examples for each.",
|
|
41
|
-
vars: {
|
|
42
|
-
task: "Explain GROQ projection syntax with working code examples " +
|
|
43
|
-
"covering projections, spread, dereference, slicing, and select().",
|
|
44
|
-
},
|
|
45
41
|
},
|
|
46
42
|
assertions: [
|
|
47
43
|
{ type: "contains", value: "->" },
|
|
@@ -89,10 +85,6 @@ export const defineTypeApiTask = {
|
|
|
89
85
|
"3. Why were these typed helpers introduced? What did they replace?\n" +
|
|
90
86
|
"4. Show a complete example of a document schema with various field types\n" +
|
|
91
87
|
"5. How do you add validation rules using the typed API?",
|
|
92
|
-
vars: {
|
|
93
|
-
task: "Explain Sanity's defineType/defineField schema API with examples, " +
|
|
94
|
-
"motivation, and validation rules.",
|
|
95
|
-
},
|
|
96
88
|
},
|
|
97
89
|
assertions: [
|
|
98
90
|
{ type: "contains", value: "defineType" },
|
|
@@ -142,10 +134,6 @@ export const ecosystemComparisonTask = {
|
|
|
142
134
|
"4. Developer experience and customization\n" +
|
|
143
135
|
"5. Pricing models\n" +
|
|
144
136
|
"6. When would you choose one over the other?",
|
|
145
|
-
vars: {
|
|
146
|
-
task: "Compare Sanity and Contentful across architecture, content modeling, " +
|
|
147
|
-
"querying, DX, pricing, and use case fit.",
|
|
148
|
-
},
|
|
149
137
|
},
|
|
150
138
|
assertions: [
|
|
151
139
|
{ type: "contains-any", value: ["GROQ", "groq"] },
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Shared types for the agent harness mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
-
import type {
|
|
5
|
+
import type { RubricResolutionInput } from "../../rubric-resolution.js";
|
|
6
6
|
import type { SandboxType } from "../../sandbox/sandbox-strategy.js";
|
|
7
7
|
/** Options for compiling an agent harness task */
|
|
8
8
|
export interface AgentHarnessCompileOptions {
|
|
@@ -11,7 +11,7 @@ export interface AgentHarnessCompileOptions {
|
|
|
11
11
|
/** Root directory for fixture resolution */
|
|
12
12
|
rootDir?: string;
|
|
13
13
|
/** Rubric config (templates, weights, profiles) — loaded from rubrics config */
|
|
14
|
-
rubricConfig?:
|
|
14
|
+
rubricConfig?: RubricResolutionInput;
|
|
15
15
|
}
|
|
16
16
|
/** Result of compiling a single agent harness task */
|
|
17
17
|
export interface AgentHarnessCompileResult {
|
|
@@ -10,6 +10,6 @@
|
|
|
10
10
|
* @see docs/archive/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
11
11
|
*/
|
|
12
12
|
export { buildMCPAssertions, compileMCPTask, handler as mcpServerHandler, validateMCPTask, type MCPAssertionContext, type MCPCompileOptions, type MCPCompileResult, type MCPValidationError, } from "./mcp-server/index.js";
|
|
13
|
-
export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, type LiteracyCompileOptions, type LiteracyCompileResult, type LiteracyValidationError, type
|
|
13
|
+
export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, type LiteracyCompileOptions, type LiteracyCompileResult, type LiteracyValidationError, type RubricResolutionInput, } from "./literacy/index.js";
|
|
14
14
|
export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, type KnowledgeProbeCompileOptions, type KnowledgeProbeCompileResult, type KnowledgeProbeMetadata, type KnowledgeProbeValidationError, } from "./knowledge-probe/index.js";
|
|
15
15
|
export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, type AgentHarnessCompileOptions, type AgentHarnessCompileResult, type AgentHarnessValidationError, type PromptfooExtension, type SandboxConfigMeta, } from "./agent-harness/index.js";
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Public types for the knowledge-probe mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
-
import type {
|
|
5
|
+
import type { RubricResolutionInput } from "../../rubric-resolution.js";
|
|
6
6
|
/** Options for compiling a knowledge probe task */
|
|
7
7
|
export interface KnowledgeProbeCompileOptions {
|
|
8
8
|
/** Grader provider for LLM-graded assertions */
|
|
@@ -15,7 +15,7 @@ export interface KnowledgeProbeCompileOptions {
|
|
|
15
15
|
}[];
|
|
16
16
|
/** Rubric config (templates, weights, profiles) — needed to resolve
|
|
17
17
|
* templated `llm-rubric` assertions to dimension metadata. */
|
|
18
|
-
rubricConfig?:
|
|
18
|
+
rubricConfig?: RubricResolutionInput;
|
|
19
19
|
}
|
|
20
20
|
/** Result of compiling a single knowledge probe task */
|
|
21
21
|
export interface KnowledgeProbeCompileResult {
|
|
@@ -10,6 +10,18 @@ import { LiteracyVariant, } from "../../../normalize-mode.js";
|
|
|
10
10
|
import { buildBaselineAssertions, resolveAssertions } from "./assertions.js";
|
|
11
11
|
import { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
|
|
12
12
|
import { validateLiteracyTask } from "./validation.js";
|
|
13
|
+
/**
|
|
14
|
+
* Variable keys reserved by the AILF compilers. Authoring these via
|
|
15
|
+
* `prompt.vars` is rejected by `PromptVars` at compile time and by
|
|
16
|
+
* `TaskPromptSchema` at parse time; this constant exists to defend
|
|
17
|
+
* the literacy compiler at runtime against legacy-shape `*.task.ts`
|
|
18
|
+
* files that bypass both gates.
|
|
19
|
+
*/
|
|
20
|
+
const RESERVED_PROMPT_VAR_KEYS = [
|
|
21
|
+
"task",
|
|
22
|
+
"docs",
|
|
23
|
+
"__featureArea",
|
|
24
|
+
];
|
|
13
25
|
/**
|
|
14
26
|
* Compile a literacy task into Promptfoo configuration.
|
|
15
27
|
*/
|
|
@@ -58,20 +70,47 @@ function buildPrompts(evalMode) {
|
|
|
58
70
|
// ---------------------------------------------------------------------------
|
|
59
71
|
function buildTestCases(task, evalMode, options, warnings) {
|
|
60
72
|
const tests = [];
|
|
61
|
-
|
|
73
|
+
// W0193: type-erased read of prompt.vars so we can defensively detect
|
|
74
|
+
// reserved keys on legacy-shape `*.task.ts` files (the type narrow makes
|
|
75
|
+
// `task.prompt.vars.task` `never`, but TS task files bypass both the
|
|
76
|
+
// type and the parse-time schema). YAML/inline-task paths have already
|
|
77
|
+
// been migrated by `migratePromptShape` upstream.
|
|
78
|
+
const rawVars = (task.prompt?.vars ?? {});
|
|
79
|
+
const legacyTaskBody = typeof rawVars.task === "string" ? rawVars.task : undefined;
|
|
80
|
+
const promptText = task.prompt?.text ?? legacyTaskBody ?? task.prompt?.template ?? "";
|
|
62
81
|
const contextDocs = task.context?.docs ?? [];
|
|
63
82
|
const taskArea = task.area ?? "";
|
|
64
83
|
const taskTitle = task.title;
|
|
65
|
-
|
|
84
|
+
// Strip reserved keys from the vars spread so they cannot override the
|
|
85
|
+
// canonical assignments below. `safePromptVars` carries only freeform
|
|
86
|
+
// template extras.
|
|
87
|
+
const safePromptVars = {};
|
|
88
|
+
const presentReserved = [];
|
|
89
|
+
for (const [key, value] of Object.entries(rawVars)) {
|
|
90
|
+
if (RESERVED_PROMPT_VAR_KEYS.includes(key)) {
|
|
91
|
+
presentReserved.push(key);
|
|
92
|
+
continue;
|
|
93
|
+
}
|
|
94
|
+
safePromptVars[key] = value;
|
|
95
|
+
}
|
|
96
|
+
// Single deduplicated deprecation warning per task — even when several
|
|
97
|
+
// reserved keys are present.
|
|
98
|
+
if (presentReserved.length > 0) {
|
|
99
|
+
warnings.push(`Literacy task "${task.id}": deprecated prompt.vars keys ` +
|
|
100
|
+
`(${presentReserved.join(", ")}) — use prompt.text for the prompt ` +
|
|
101
|
+
`body and context.docs for documentation references. The compiler ` +
|
|
102
|
+
`migrated them in-memory, but the task source should be updated.`);
|
|
103
|
+
}
|
|
66
104
|
const hasDocs = contextDocs.length > 0;
|
|
67
105
|
const docsVar = hasDocs ? `file://contexts/canonical/${task.id}.md` : "";
|
|
68
106
|
const assertions = resolveAssertions(task, options, warnings);
|
|
69
|
-
// Gold entry — canonical docs injected
|
|
107
|
+
// Gold entry — canonical docs injected. Spread freeform extras first so
|
|
108
|
+
// canonical keys (task / docs / __featureArea) cannot be overridden.
|
|
70
109
|
const goldVars = {
|
|
110
|
+
...safePromptVars,
|
|
71
111
|
task: promptText,
|
|
72
112
|
docs: docsVar,
|
|
73
113
|
__featureArea: taskArea,
|
|
74
|
-
...promptVars,
|
|
75
114
|
};
|
|
76
115
|
tests.push({
|
|
77
116
|
description: `${taskTitle} (gold)`,
|
|
@@ -89,10 +128,10 @@ function buildTestCases(task, evalMode, options, warnings) {
|
|
|
89
128
|
tests.push({
|
|
90
129
|
description: `${taskTitle} (baseline)`,
|
|
91
130
|
vars: {
|
|
131
|
+
...safePromptVars,
|
|
92
132
|
task: promptText,
|
|
93
133
|
docs: "",
|
|
94
134
|
__featureArea: taskArea,
|
|
95
|
-
...promptVars,
|
|
96
135
|
},
|
|
97
136
|
prompts: ["without-docs"],
|
|
98
137
|
...(baselineAssertions.length > 0
|
|
@@ -7,5 +7,5 @@ import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
|
7
7
|
export { LITERACY_PROMPT_TEMPLATES } from "./prompts.js";
|
|
8
8
|
export { validateLiteracyTask, type LiteracyValidationError, } from "./validation.js";
|
|
9
9
|
export { compileLiteracyTask } from "./compiler.js";
|
|
10
|
-
export type { LiteracyCompileOptions, LiteracyCompileResult,
|
|
10
|
+
export type { LiteracyCompileOptions, LiteracyCompileResult, RubricResolutionInput, } from "./types.js";
|
|
11
11
|
export declare const handler: ModeHandler;
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
* Shared types for the literacy mode handler.
|
|
3
3
|
*/
|
|
4
4
|
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
|
|
5
|
-
export type {
|
|
6
|
-
import type {
|
|
5
|
+
export type { RubricResolutionInput } from "../../rubric-resolution.js";
|
|
6
|
+
import type { RubricResolutionInput } from "../../rubric-resolution.js";
|
|
7
7
|
/** Options for compiling a literacy task */
|
|
8
8
|
export interface LiteracyCompileOptions {
|
|
9
9
|
/** Grader provider for LLM-graded assertions */
|
|
@@ -19,7 +19,7 @@ export interface LiteracyCompileOptions {
|
|
|
19
19
|
config?: Record<string, unknown>;
|
|
20
20
|
}[];
|
|
21
21
|
/** Rubric config (templates, weights, profiles) — loaded from rubrics config */
|
|
22
|
-
rubricConfig?:
|
|
22
|
+
rubricConfig?: RubricResolutionInput;
|
|
23
23
|
}
|
|
24
24
|
/** Result of compiling a single literacy task */
|
|
25
25
|
export interface LiteracyCompileResult {
|
|
@@ -11,20 +11,11 @@
|
|
|
11
11
|
*
|
|
12
12
|
* @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
|
|
13
13
|
*/
|
|
14
|
-
import { dirname, resolve as resolvePath } from "node:path";
|
|
15
|
-
import { fileURLToPath } from "node:url";
|
|
16
14
|
import { mapAssertions } from "./assertion-mapper.js";
|
|
17
15
|
import { resolveTaskFixtures } from "./fixture-resolver.js";
|
|
18
16
|
import { LiteracyVariant } from "../normalize-mode.js";
|
|
19
17
|
import { resolveVariables } from "./variable-resolver.js";
|
|
20
|
-
|
|
21
|
-
* Absolute filesystem path to the AILF mock Promptfoo provider. Resolved
|
|
22
|
-
* once at module load relative to this file. Promptfoo's `file://` provider
|
|
23
|
-
* loader requires an absolute path. See buildProviders for the env-var
|
|
24
|
-
* gate that swaps real providers for this mock.
|
|
25
|
-
*/
|
|
26
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
27
|
-
const MOCK_PROVIDER_ABSPATH = resolvePath(__dirname, "..", "..", "promptfoo-providers", "mock-provider.cjs");
|
|
18
|
+
import { MOCK_PROVIDER_ABSPATH } from "../../promptfoo-providers/mock-path.js";
|
|
28
19
|
// ---------------------------------------------------------------------------
|
|
29
20
|
// Public API
|
|
30
21
|
// ---------------------------------------------------------------------------
|
|
@@ -158,12 +149,17 @@ function buildProviders(models, mode) {
|
|
|
158
149
|
// never makes a live LLM call. We preserve `label` and stash the
|
|
159
150
|
// original `id` in `config.originalId` so the mock provider can surface
|
|
160
151
|
// model identity in its output and reports remain interpretable.
|
|
152
|
+
// `originalId ?? p.id` guards against double-swap clobbering a real id
|
|
153
|
+
// that's already been preserved on a prior pass.
|
|
161
154
|
// See W0110 (M5.1) and docs/design-docs/testing-strategy.md.
|
|
162
155
|
if (process.env.AILF_REPLAY_LLMS === "1") {
|
|
163
156
|
return providers.map((p) => ({
|
|
164
157
|
id: `file://${MOCK_PROVIDER_ABSPATH}`,
|
|
165
158
|
label: p.label,
|
|
166
|
-
config: {
|
|
159
|
+
config: {
|
|
160
|
+
...p.config,
|
|
161
|
+
originalId: p.config?.originalId ?? p.id,
|
|
162
|
+
},
|
|
167
163
|
}));
|
|
168
164
|
}
|
|
169
165
|
return providers;
|
|
@@ -25,6 +25,36 @@
|
|
|
25
25
|
import { extractModelName, extractProvider, mergeConfig, } from "../../_vendor/ailf-core/index.js";
|
|
26
26
|
import { loadConfigFile } from "./config-loader.js";
|
|
27
27
|
import { modelMatchesLiteracyVariant } from "./mode-bases/literacy.js";
|
|
28
|
+
import { MOCK_PROVIDER_ABSPATH } from "../../promptfoo-providers/mock-path.js";
|
|
29
|
+
/**
|
|
30
|
+
* Apply the W0110 replay swap to a list of literacy provider records.
|
|
31
|
+
*
|
|
32
|
+
* When `AILF_REPLAY_LLMS=1`, every provider's `id` is rewritten to the
|
|
33
|
+
* file-based AILF mock provider so the Promptfoo subprocess never makes
|
|
34
|
+
* a live LLM call. We preserve `label` and stash the original `id` in
|
|
35
|
+
* `config.originalId` so reports remain interpretable. This mirrors the
|
|
36
|
+
* top-level `buildProviders` swap in `promptfoo-compiler.ts` — it exists
|
|
37
|
+
* here because the literacy mode runs through this assembler, not
|
|
38
|
+
* `compileToPromptfoo`, so without this hook the replay flag was a no-op
|
|
39
|
+
* for every literacy run (W0138 Slice 2 surface).
|
|
40
|
+
*
|
|
41
|
+
* `originalId` is set with `?? p.id` so a record that's already been
|
|
42
|
+
* swapped (or that pre-stashed an `originalId` for any other reason)
|
|
43
|
+
* doesn't get its real model id clobbered by the file:// path.
|
|
44
|
+
*/
|
|
45
|
+
function applyReplaySwap(providers) {
|
|
46
|
+
if (process.env.AILF_REPLAY_LLMS !== "1")
|
|
47
|
+
return providers;
|
|
48
|
+
return providers.map((raw) => {
|
|
49
|
+
const p = raw;
|
|
50
|
+
const config = p.config ?? {};
|
|
51
|
+
return {
|
|
52
|
+
id: `file://${MOCK_PROVIDER_ABSPATH}`,
|
|
53
|
+
label: p.label,
|
|
54
|
+
config: { ...config, originalId: config.originalId ?? p.id },
|
|
55
|
+
};
|
|
56
|
+
});
|
|
57
|
+
}
|
|
28
58
|
// ---------------------------------------------------------------------------
|
|
29
59
|
// Public API
|
|
30
60
|
// ---------------------------------------------------------------------------
|
|
@@ -40,9 +70,9 @@ export function loadModelsAndProviders(rootDir, source, searchMode, allowedOrigi
|
|
|
40
70
|
return {
|
|
41
71
|
models,
|
|
42
72
|
providers: {
|
|
43
|
-
baseline: buildBaselineProviders(models),
|
|
44
|
-
agentic: buildAgenticProviders(models, source, searchMode, allowedOrigins),
|
|
45
|
-
observed: buildObservedProviders(models),
|
|
73
|
+
baseline: applyReplaySwap(buildBaselineProviders(models)),
|
|
74
|
+
agentic: applyReplaySwap(buildAgenticProviders(models, source, searchMode, allowedOrigins)),
|
|
75
|
+
observed: applyReplaySwap(buildObservedProviders(models)),
|
|
46
76
|
},
|
|
47
77
|
};
|
|
48
78
|
}
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
*/
|
|
15
15
|
import type { PromptfooAssertion } from "./assertion-mapper.js";
|
|
16
16
|
/** Minimal rubric config needed for template resolution */
|
|
17
|
-
export interface
|
|
17
|
+
export interface RubricResolutionInput {
|
|
18
18
|
templates: Record<string, {
|
|
19
19
|
criteria_label?: string;
|
|
20
20
|
dimension?: string;
|
|
@@ -37,4 +37,4 @@ export declare function resolveTemplatedAssertion(assertion: {
|
|
|
37
37
|
criteria: string[];
|
|
38
38
|
template: string;
|
|
39
39
|
type: string;
|
|
40
|
-
}, rubricConfig:
|
|
40
|
+
}, rubricConfig: RubricResolutionInput | undefined, graderProvider: string | undefined, warnings: string[]): PromptfooAssertion | null;
|
|
@@ -15,8 +15,19 @@
|
|
|
15
15
|
import type { SanityClient } from "@sanity/client";
|
|
16
16
|
import { type LiteracyTaskDefinition, type Logger } from "../_vendor/ailf-core/index.d.ts";
|
|
17
17
|
export interface MirrorOptions {
|
|
18
|
-
/**
|
|
18
|
+
/**
|
|
19
|
+
* Sanity client targeting the AILF private dataset — used to write
|
|
20
|
+
* `ailf.task` and `ailf.featureArea` documents and to read existing
|
|
21
|
+
* mirror state. Per D0043, AILF docs live in `ailf-prod-private`.
|
|
22
|
+
*/
|
|
19
23
|
client: SanityClient;
|
|
24
|
+
/**
|
|
25
|
+
* Sanity client targeting the editorial dataset — used to resolve
|
|
26
|
+
* `article` slugs to document IDs for canonical-doc references. After
|
|
27
|
+
* the dataset split, the AILF client cannot see editorial documents,
|
|
28
|
+
* so this must be a separate client (or omitted to skip slug resolution).
|
|
29
|
+
*/
|
|
30
|
+
editorialClient?: SanityClient;
|
|
20
31
|
/** Tasks to mirror (already loaded from repo) */
|
|
21
32
|
tasks: LiteracyTaskDefinition[];
|
|
22
33
|
/** Git context for origin provenance */
|
|
@@ -124,10 +135,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
|
|
|
124
135
|
_key: string;
|
|
125
136
|
reason: string;
|
|
126
137
|
} | {
|
|
127
|
-
doc?:
|
|
128
|
-
_ref: string;
|
|
129
|
-
_type: string;
|
|
130
|
-
} | undefined;
|
|
138
|
+
doc?: import("../sanity/client.js").EditorialReference | undefined;
|
|
131
139
|
docId?: string | undefined;
|
|
132
140
|
refType: string;
|
|
133
141
|
_key: string;
|
|
@@ -16,6 +16,7 @@ import { createHash } from "crypto";
|
|
|
16
16
|
import { readFileSync } from "fs";
|
|
17
17
|
import { isIdRef, isPathRef, isPerspectiveRef, isSlugRef, } from "../_vendor/ailf-core/index.js";
|
|
18
18
|
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
19
|
+
import { buildEditorialReference } from "../sanity/client.js";
|
|
19
20
|
// ---------------------------------------------------------------------------
|
|
20
21
|
// Public API
|
|
21
22
|
// ---------------------------------------------------------------------------
|
|
@@ -31,7 +32,7 @@ import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
|
31
32
|
* 6. Upsert the ailf.task document with origin block
|
|
32
33
|
*/
|
|
33
34
|
export async function mirrorRepoTasks(options) {
|
|
34
|
-
const { client, tasks, git, dryRun = false, logger } = options;
|
|
35
|
+
const { client, editorialClient, tasks, git, dryRun = false, logger, } = options;
|
|
35
36
|
const log = logger ?? new ConsoleLogger();
|
|
36
37
|
const result = {
|
|
37
38
|
total: tasks.length,
|
|
@@ -44,11 +45,18 @@ export async function mirrorRepoTasks(options) {
|
|
|
44
45
|
if (tasks.length === 0)
|
|
45
46
|
return result;
|
|
46
47
|
// Batch-resolve all context doc slugs (slug refs only — other ref types
|
|
47
|
-
// are stored without a resolved article reference for now)
|
|
48
|
+
// are stored without a resolved article reference for now). Slugs live on
|
|
49
|
+
// `article` documents in the editorial dataset, so this must use the
|
|
50
|
+
// editorial client. Without one, every slug ref stays unresolved.
|
|
48
51
|
const allSlugs = [
|
|
49
52
|
...new Set(tasks.flatMap((t) => (t.context?.docs ?? []).filter(isSlugRef).map((d) => d.slug))),
|
|
50
53
|
];
|
|
51
|
-
const slugToDocId =
|
|
54
|
+
const slugToDocId = editorialClient
|
|
55
|
+
? await batchResolveDocSlugs(editorialClient, allSlugs)
|
|
56
|
+
: new Map();
|
|
57
|
+
if (!editorialClient && allSlugs.length > 0) {
|
|
58
|
+
log.warn(" ⚠️ No editorial Sanity client provided — skipping slug→article resolution");
|
|
59
|
+
}
|
|
52
60
|
// Track unresolved slugs
|
|
53
61
|
for (const slug of allSlugs) {
|
|
54
62
|
if (!slugToDocId.has(slug)) {
|
|
@@ -363,13 +371,13 @@ export function buildMirrorDocument(task, opts) {
|
|
|
363
371
|
// When a slug resolves to a document, store as "id" ref with
|
|
364
372
|
// the resolved article reference. When unresolved, store as
|
|
365
373
|
// "slug" so Studio knows the resolution strategy even if the
|
|
366
|
-
// article doesn't exist yet.
|
|
374
|
+
// article doesn't exist yet. The `doc` reference is a Cross
|
|
375
|
+
// Dataset Reference per D0043 — `ailf.task` lives in the AILF
|
|
376
|
+
// private dataset and `article` lives in the editorial dataset.
|
|
367
377
|
return {
|
|
368
378
|
...base,
|
|
369
379
|
refType: resolvedId ? "id" : "slug",
|
|
370
|
-
...(resolvedId
|
|
371
|
-
? { doc: { _ref: resolvedId, _type: "reference" } }
|
|
372
|
-
: {}),
|
|
380
|
+
...(resolvedId ? { doc: buildEditorialReference(resolvedId) } : {}),
|
|
373
381
|
};
|
|
374
382
|
}
|
|
375
383
|
if (isPathRef(ref)) {
|
|
@@ -380,7 +388,7 @@ export function buildMirrorDocument(task, opts) {
|
|
|
380
388
|
...base,
|
|
381
389
|
refType: "id",
|
|
382
390
|
...(ref.id
|
|
383
|
-
? { doc:
|
|
391
|
+
? { doc: buildEditorialReference(ref.id), docId: ref.id }
|
|
384
392
|
: {}),
|
|
385
393
|
};
|
|
386
394
|
}
|
|
@@ -1,19 +1,32 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* pipeline/pr-comment.ts —
|
|
2
|
+
* pipeline/pr-comment.ts — Generate a markdown PR comment from
|
|
3
|
+
* `results/latest/score-summary.json` (and an optional comparison-report).
|
|
3
4
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
5
|
+
* Thin wrapper around `@sanity/ailf-core`'s unified renderer (W0150).
|
|
6
|
+
* Reads the local JSON files, applies legacy-field normalization on the
|
|
7
|
+
* scores, builds a `RenderableReport` envelope (so the CLI's
|
|
8
|
+
* `--promptfoo-url` flag flows through `provenance.promptfooUrls[0]`),
|
|
9
|
+
* then delegates rendering.
|
|
6
10
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
11
|
+
* All functions accept `rootDir` as a parameter — no module-level
|
|
12
|
+
* constants. No `process.argv` parsing. No env-var fallbacks.
|
|
9
13
|
*/
|
|
10
|
-
|
|
14
|
+
import { type RenderableReport } from "../_vendor/ailf-core/index.d.ts";
|
|
15
|
+
import type { ComparisonReport, ScoreSummary } from "./types.js";
|
|
16
|
+
/** Options for the {@link generatePrComment} function. */
|
|
11
17
|
export interface PrCommentOptions {
|
|
12
|
-
/** Path to write the comment (default: stdout) */
|
|
18
|
+
/** Path to write the comment (default: stdout). */
|
|
13
19
|
outputPath?: string;
|
|
14
|
-
/** Promptfoo share URL to include
|
|
20
|
+
/** Promptfoo share URL to include as the footer "view detailed results" link. */
|
|
15
21
|
promptfooUrl?: string;
|
|
16
|
-
/** Root directory of the eval package */
|
|
22
|
+
/** Root directory of the eval package. */
|
|
17
23
|
rootDir: string;
|
|
18
24
|
}
|
|
19
25
|
export declare function generatePrComment(options: PrCommentOptions): void;
|
|
26
|
+
/**
|
|
27
|
+
* Adapter: build a {@link RenderableReport} from the in-memory pipeline
|
|
28
|
+
* artifacts. Exposed for the cross-renderer byte-equality contract test.
|
|
29
|
+
*/
|
|
30
|
+
export declare function scoreSummaryToRenderableReport(summary: ScoreSummary, comparison: ComparisonReport | undefined, options?: {
|
|
31
|
+
promptfooUrl?: string;
|
|
32
|
+
}): RenderableReport;
|