@sanity/ailf 0.1.27 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/ports/context.d.ts +4 -0
- package/dist/_vendor/ailf-core/ports/task-source.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +26 -0
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +6 -2
- package/dist/adapters/api-client/progress.js +1 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +20 -1
- package/dist/adapters/task-sources/repo-task-source.js +7 -0
- package/dist/commands/explain-handler.js +1 -0
- package/dist/commands/pipeline-action.d.ts +1 -0
- package/dist/commands/pipeline-action.js +8 -0
- package/dist/commands/pipeline.d.ts +1 -0
- package/dist/commands/pipeline.js +7 -0
- package/dist/commands/publish.js +10 -2
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/steps/fetch-docs-step.js +3 -2
- package/dist/orchestration/steps/generate-configs-step.js +8 -3
- package/dist/orchestration/steps/publish-report-step.js +12 -2
- package/dist/orchestration/steps/run-eval-step.js +4 -2
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/dist/pipeline/plan.d.ts +1 -0
- package/dist/pipeline/plan.js +2 -1
- package/dist/pipeline/provenance.d.ts +2 -0
- package/dist/pipeline/provenance.js +5 -0
- package/dist/report-store.d.ts +20 -2
- package/dist/report-store.js +31 -7
- package/dist/webhook/eval-request-handler.d.ts +2 -0
- package/dist/webhook/eval-request-handler.js +3 -0
- package/package.json +1 -1
|
@@ -35,6 +35,8 @@ export interface ResolvedConfig {
|
|
|
35
35
|
areas?: string[];
|
|
36
36
|
/** Task ID filter */
|
|
37
37
|
tasks?: string[];
|
|
38
|
+
/** Tag filter — tasks must have at least one matching tag */
|
|
39
|
+
tags?: string[];
|
|
38
40
|
/** Changed doc slugs for impact scoping */
|
|
39
41
|
changedDocs?: string[];
|
|
40
42
|
/** Documentation source name */
|
|
@@ -89,6 +91,8 @@ export interface ResolvedConfig {
|
|
|
89
91
|
studioOriginOverride?: string;
|
|
90
92
|
/** Sanity document filter args */
|
|
91
93
|
sanityDocumentArgs?: string[];
|
|
94
|
+
/** Report ID that triggered this re-run (flows to provenance.lineage.rerunOf) */
|
|
95
|
+
sourceReportId?: string;
|
|
92
96
|
/** Disable release-aware auto-scoping (evaluate all tasks even when perspective is set) */
|
|
93
97
|
noAutoScope: boolean;
|
|
94
98
|
/** Before option for comparison */
|
|
@@ -112,6 +112,8 @@ export interface TaskDefinition {
|
|
|
112
112
|
baseline?: BaselineConfig;
|
|
113
113
|
/** Additional template variables beyond task (e.g., custom vars) */
|
|
114
114
|
extraVars?: Record<string, unknown>;
|
|
115
|
+
/** Freeform labels for filtering and organization */
|
|
116
|
+
tags?: string[];
|
|
115
117
|
}
|
|
116
118
|
/** Check if a canonical doc ref resolves by slug.
|
|
117
119
|
*
|
|
@@ -68,6 +68,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
68
68
|
"origin-only": "origin-only";
|
|
69
69
|
}>>;
|
|
70
70
|
source: z.ZodOptional<z.ZodString>;
|
|
71
|
+
sourceReportId: z.ZodOptional<z.ZodString>;
|
|
71
72
|
taskMode: z.ZodOptional<z.ZodEnum<{
|
|
72
73
|
"content-lake": "content-lake";
|
|
73
74
|
yaml: "yaml";
|
|
@@ -80,6 +80,7 @@ export const PipelineRequestSchema = z.object({
|
|
|
80
80
|
readiness: z.boolean().optional(),
|
|
81
81
|
searchMode: z.enum(["off", "open", "origin-only"]).optional(),
|
|
82
82
|
source: z.string().optional(),
|
|
83
|
+
sourceReportId: z.string().optional(),
|
|
83
84
|
taskMode: z.enum(["content-lake", "yaml", "inline"]).optional(),
|
|
84
85
|
tasks: z.array(z.string()).optional(),
|
|
85
86
|
urls: z.array(z.string().url()).optional(),
|
|
@@ -179,6 +179,8 @@ export interface FeatureScore {
|
|
|
179
179
|
export interface FilterOptions {
|
|
180
180
|
/** Feature areas to include (filename stems, e.g., ["groq", "frameworks"]) */
|
|
181
181
|
areas?: string[];
|
|
182
|
+
/** Tags to include — tasks must have at least one matching tag */
|
|
183
|
+
tags?: string[];
|
|
182
184
|
/** Specific task IDs to include (e.g., ["groq-blog-queries"]) */
|
|
183
185
|
taskIds?: string[];
|
|
184
186
|
}
|
|
@@ -1032,6 +1034,28 @@ export interface ReportAutoScope {
|
|
|
1032
1034
|
removed: number;
|
|
1033
1035
|
};
|
|
1034
1036
|
}
|
|
1037
|
+
/**
|
|
1038
|
+
* Typed relationships between reports. Each field is optional and
|
|
1039
|
+
* independent — populated only when that relationship exists.
|
|
1040
|
+
*
|
|
1041
|
+
* Stored at `provenance.lineage` in the report document.
|
|
1042
|
+
*
|
|
1043
|
+
* @see docs/design-docs/report-store/domain-model.md
|
|
1044
|
+
*/
|
|
1045
|
+
export interface ReportLineage {
|
|
1046
|
+
/**
|
|
1047
|
+
* This report was explicitly compared against another report.
|
|
1048
|
+
* Set when auto-compare selects a specific baseline or when the user
|
|
1049
|
+
* requests comparison against a named report.
|
|
1050
|
+
*/
|
|
1051
|
+
comparedAgainst?: ReportId;
|
|
1052
|
+
/**
|
|
1053
|
+
* This report was explicitly re-run from another report.
|
|
1054
|
+
* The re-run has the same EvalScope (mode, areas, perspective, etc.)
|
|
1055
|
+
* but measures the current state of docs/models/tasks.
|
|
1056
|
+
*/
|
|
1057
|
+
rerunOf?: ReportId;
|
|
1058
|
+
}
|
|
1035
1059
|
/** Full provenance metadata for an evaluation report */
|
|
1036
1060
|
export interface ReportProvenance {
|
|
1037
1061
|
/** Which feature areas were evaluated */
|
|
@@ -1055,6 +1079,8 @@ export interface ReportProvenance {
|
|
|
1055
1079
|
};
|
|
1056
1080
|
/** Grader model used for scoring */
|
|
1057
1081
|
graderModel: string;
|
|
1082
|
+
/** Typed relationships with other reports (re-run, comparison) */
|
|
1083
|
+
lineage?: ReportLineage;
|
|
1058
1084
|
/** Evaluation mode */
|
|
1059
1085
|
mode: EvalMode;
|
|
1060
1086
|
/** Models under evaluation */
|
|
@@ -167,12 +167,16 @@ function taskToInlineFormat(task) {
|
|
|
167
167
|
if (task.baseline) {
|
|
168
168
|
inline.baseline = task.baseline;
|
|
169
169
|
}
|
|
170
|
+
if (task.tags?.length) {
|
|
171
|
+
inline.tags = task.tags;
|
|
172
|
+
}
|
|
170
173
|
return inline;
|
|
171
174
|
}
|
|
172
175
|
function buildFilterOptions(config) {
|
|
173
176
|
const areas = config.areas?.length ? config.areas : undefined;
|
|
174
177
|
const taskIds = config.tasks?.length ? config.tasks : undefined;
|
|
175
|
-
|
|
178
|
+
const tags = config.tags?.length ? config.tags : undefined;
|
|
179
|
+
if (!areas && !taskIds && !tags)
|
|
176
180
|
return undefined;
|
|
177
|
-
return { areas, taskIds };
|
|
181
|
+
return { areas, taskIds, tags };
|
|
178
182
|
}
|
|
@@ -36,7 +36,7 @@ export function createProgressDisplay() {
|
|
|
36
36
|
line = `⏳ [queued] Waiting for runner... (${elapsed})`;
|
|
37
37
|
break;
|
|
38
38
|
case "running": {
|
|
39
|
-
if (job.progress) {
|
|
39
|
+
if (job.progress?.step && job.progress.current && job.progress.total) {
|
|
40
40
|
const { step, current, total } = job.progress;
|
|
41
41
|
line = `⏳ [running] Step ${current}/${total}: ${step} (${elapsed})`;
|
|
42
42
|
}
|
|
@@ -32,6 +32,7 @@ const TASKS_QUERY = /* groq */ `
|
|
|
32
32
|
&& (!defined($areas) || featureArea->areaId.current in $areas)
|
|
33
33
|
&& (!defined($taskIds) || id.current in $taskIds)
|
|
34
34
|
&& (execution.enabled != false)
|
|
35
|
+
&& (!defined($tags) || count((tags)[@ in $tags]) > 0)
|
|
35
36
|
] | order(featureArea->areaId.current asc, id.current asc) {
|
|
36
37
|
"taskId": id.current,
|
|
37
38
|
description,
|
|
@@ -51,6 +52,7 @@ const TASKS_QUERY = /* groq */ `
|
|
|
51
52
|
assert,
|
|
52
53
|
rawAssert,
|
|
53
54
|
baseline,
|
|
55
|
+
tags,
|
|
54
56
|
"referenceSolutionTitle": referenceSolution->title
|
|
55
57
|
}
|
|
56
58
|
`;
|
|
@@ -90,6 +92,7 @@ function buildGroqParams(filter) {
|
|
|
90
92
|
areas: filter?.areas && filter.areas.length > 0
|
|
91
93
|
? filter.areas.map((a) => a.toLowerCase())
|
|
92
94
|
: null,
|
|
95
|
+
tags: filter?.tags && filter.tags.length > 0 ? filter.tags : null,
|
|
93
96
|
taskIds: filter?.taskIds && filter.taskIds.length > 0 ? filter.taskIds : null,
|
|
94
97
|
};
|
|
95
98
|
}
|
|
@@ -116,6 +119,21 @@ function mapToTaskDefinition(raw) {
|
|
|
116
119
|
.map(mapCanonicalDocRef)
|
|
117
120
|
.filter((d) => d !== null);
|
|
118
121
|
const assertions = mapAssertions(raw.assert ?? []);
|
|
122
|
+
// Append raw pass-through assertions (escape hatch for arbitrary Promptfoo
|
|
123
|
+
// assertion types that aren't in the curated list). These bypass template
|
|
124
|
+
// resolution and flow directly into the expanded Promptfoo test case as
|
|
125
|
+
// value-based assertions. In baseline mode, buildBaselineAsserts() with
|
|
126
|
+
// "abbreviated" (the default) drops non-rubric assertions, so rawAssert
|
|
127
|
+
// entries only run in the gold variant — consistent with how regular
|
|
128
|
+
// value-based assertions like `contains` or `regex` behave.
|
|
129
|
+
const rawAssertions = (raw.rawAssert ?? [])
|
|
130
|
+
.filter((a) => !!a.type)
|
|
131
|
+
.map((a) => ({
|
|
132
|
+
type: a.type,
|
|
133
|
+
...(a.value !== undefined ? { value: a.value } : {}),
|
|
134
|
+
...(a.threshold !== undefined ? { threshold: a.threshold } : {}),
|
|
135
|
+
}));
|
|
136
|
+
const allAssertions = [...assertions, ...rawAssertions];
|
|
119
137
|
const baseline = raw.baseline
|
|
120
138
|
? {
|
|
121
139
|
...(raw.baseline.enabled !== undefined
|
|
@@ -129,7 +147,7 @@ function mapToTaskDefinition(raw) {
|
|
|
129
147
|
}
|
|
130
148
|
: undefined;
|
|
131
149
|
return {
|
|
132
|
-
assertions,
|
|
150
|
+
assertions: allAssertions,
|
|
133
151
|
canonicalDocs,
|
|
134
152
|
description: raw.description,
|
|
135
153
|
docCoverage: raw.docCoverage ?? false,
|
|
@@ -143,6 +161,7 @@ function mapToTaskDefinition(raw) {
|
|
|
143
161
|
referenceSolution: "",
|
|
144
162
|
taskPrompt: raw.taskPrompt,
|
|
145
163
|
...(baseline ? { baseline } : {}),
|
|
164
|
+
...(raw.tags?.length ? { tags: raw.tags } : {}),
|
|
146
165
|
};
|
|
147
166
|
}
|
|
148
167
|
/**
|
|
@@ -79,6 +79,12 @@ export class RepoTaskSource {
|
|
|
79
79
|
if (entry.execution?.enabled === false) {
|
|
80
80
|
continue;
|
|
81
81
|
}
|
|
82
|
+
// Tag filter — skip tasks that don't match any requested tag
|
|
83
|
+
if (filter?.tags &&
|
|
84
|
+
filter.tags.length > 0 &&
|
|
85
|
+
(!entry.tags || !entry.tags.some((t) => filter.tags.includes(t)))) {
|
|
86
|
+
continue;
|
|
87
|
+
}
|
|
82
88
|
definitions.push(mapToTaskDefinition(entry));
|
|
83
89
|
}
|
|
84
90
|
}
|
|
@@ -108,5 +114,6 @@ function mapToTaskDefinition(raw) {
|
|
|
108
114
|
taskPrompt: typeof task === "string" ? task : "",
|
|
109
115
|
...(raw.baseline ? { baseline: raw.baseline } : {}),
|
|
110
116
|
...(extraVars ? { extraVars } : {}),
|
|
117
|
+
...(raw.tags?.length ? { tags: raw.tags } : {}),
|
|
111
118
|
};
|
|
112
119
|
}
|
|
@@ -688,6 +688,7 @@ async function buildPipelineExplainPlan(actionCommand, rootDir) {
|
|
|
688
688
|
skipEval: raw.skipEval ?? false,
|
|
689
689
|
skipFetch: raw.skipFetch ?? false,
|
|
690
690
|
source: raw.source,
|
|
691
|
+
tag: raw.tag ?? [],
|
|
691
692
|
task: raw.task,
|
|
692
693
|
threshold: raw.threshold,
|
|
693
694
|
url: raw.url ?? [],
|
|
@@ -126,6 +126,13 @@ export function computeResolvedOptions(opts) {
|
|
|
126
126
|
// Scoping
|
|
127
127
|
const areaOption = opts.area ?? process.env.EVAL_FILTER_AREAS ?? undefined;
|
|
128
128
|
const taskOption = opts.task ?? process.env.EVAL_FILTER_TASKS ?? undefined;
|
|
129
|
+
const tagOption = opts.tag?.length
|
|
130
|
+
? opts.tag
|
|
131
|
+
: process.env.EVAL_FILTER_TAGS
|
|
132
|
+
? process.env.EVAL_FILTER_TAGS.split(",")
|
|
133
|
+
.map((s) => s.trim())
|
|
134
|
+
.filter(Boolean)
|
|
135
|
+
: undefined;
|
|
129
136
|
const changedDocsOption = opts.changedDocs ?? process.env.EVAL_CHANGED_DOCS ?? undefined;
|
|
130
137
|
// Document-driven scoping (pure — computes impactSummary without env writes)
|
|
131
138
|
let impactSummary;
|
|
@@ -237,6 +244,7 @@ export function computeResolvedOptions(opts) {
|
|
|
237
244
|
? resolve(callerCwd, opts.repoTasksPath)
|
|
238
245
|
: undefined,
|
|
239
246
|
taskOption,
|
|
247
|
+
tagOption,
|
|
240
248
|
taskSourceType: resolveTaskSourceType(opts.taskSource),
|
|
241
249
|
urlArgs,
|
|
242
250
|
};
|
|
@@ -22,6 +22,13 @@ export function createPipelineCommand() {
|
|
|
22
22
|
.option("--no-auto-scope", "Disable release-aware auto-scoping (evaluate all tasks even when a perspective is set)")
|
|
23
23
|
.option("-a, --area <areas>", "Scope to feature areas (comma-separated)")
|
|
24
24
|
.option("-t, --task <id>", "Scope to specific task ID")
|
|
25
|
+
.option("--tag <tags>", "Scope to tasks with matching tags (comma-separated, repeatable)", (val, prev) => [
|
|
26
|
+
...prev,
|
|
27
|
+
...val
|
|
28
|
+
.split(",")
|
|
29
|
+
.map((s) => s.trim())
|
|
30
|
+
.filter(Boolean),
|
|
31
|
+
], [])
|
|
25
32
|
.option("--changed-docs <slugs>", "Auto-scope to tasks affected by these document slugs")
|
|
26
33
|
.option("-j, --concurrency <n>", "Max parallel API calls during evaluation", parseInt)
|
|
27
34
|
.option("--grader-replications <n>", "Grader consistency replications", parseInt)
|
package/dist/commands/publish.js
CHANGED
|
@@ -24,7 +24,7 @@ import { fileURLToPath } from "url";
|
|
|
24
24
|
import { Command } from "commander";
|
|
25
25
|
import { createAppContext } from "../composition-root.js";
|
|
26
26
|
import { buildProvenance, } from "../pipeline/provenance.js";
|
|
27
|
-
import { generateReportId } from "../report-store.js";
|
|
27
|
+
import { generateReportId, } from "../report-store.js";
|
|
28
28
|
import { withRetry } from "../sinks/retry.js";
|
|
29
29
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
30
30
|
const ROOT = resolve(__dirname, "..", "..");
|
|
@@ -154,9 +154,17 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
154
154
|
}
|
|
155
155
|
}
|
|
156
156
|
// Auto-compare against most recent comparable baseline
|
|
157
|
-
const comparison = opts.dryRun || !store
|
|
157
|
+
const autoCompareResult = opts.dryRun || !store
|
|
158
158
|
? null
|
|
159
159
|
: await store.autoCompare(summary, provenance, now);
|
|
160
|
+
const comparison = autoCompareResult?.comparison ?? null;
|
|
161
|
+
// Record which report we compared against in lineage
|
|
162
|
+
if (autoCompareResult) {
|
|
163
|
+
provenance.lineage = {
|
|
164
|
+
...provenance.lineage,
|
|
165
|
+
comparedAgainst: autoCompareResult.baselineReportId,
|
|
166
|
+
};
|
|
167
|
+
}
|
|
160
168
|
const reportId = generateReportId();
|
|
161
169
|
const report = {
|
|
162
170
|
comparison: comparison ?? undefined,
|
|
@@ -121,12 +121,13 @@ export class FetchDocsStep {
|
|
|
121
121
|
// Helpers
|
|
122
122
|
// ---------------------------------------------------------------------------
|
|
123
123
|
function buildFilter(ctx) {
|
|
124
|
-
const { areas, tasks } = ctx.config;
|
|
125
|
-
if (!areas && !tasks)
|
|
124
|
+
const { areas, tasks, tags } = ctx.config;
|
|
125
|
+
if (!areas && !tasks && !tags)
|
|
126
126
|
return undefined;
|
|
127
127
|
return {
|
|
128
128
|
...(areas ? { areas } : {}),
|
|
129
129
|
...(tasks ? { taskIds: tasks } : {}),
|
|
130
|
+
...(tags ? { tags } : {}),
|
|
130
131
|
};
|
|
131
132
|
}
|
|
132
133
|
/**
|
|
@@ -28,8 +28,12 @@ export class GenerateConfigsStep {
|
|
|
28
28
|
// repo-based, and YAML tasks depending on which adapter is wired.
|
|
29
29
|
let tasks;
|
|
30
30
|
try {
|
|
31
|
-
const filter = ctx.config.areas || ctx.config.tasks
|
|
32
|
-
? {
|
|
31
|
+
const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
32
|
+
? {
|
|
33
|
+
areas: ctx.config.areas,
|
|
34
|
+
taskIds: ctx.config.tasks,
|
|
35
|
+
tags: ctx.config.tags,
|
|
36
|
+
}
|
|
33
37
|
: undefined;
|
|
34
38
|
tasks = await ctx.taskSource.loadTasks(filter);
|
|
35
39
|
}
|
|
@@ -54,10 +58,11 @@ export class GenerateConfigsStep {
|
|
|
54
58
|
try {
|
|
55
59
|
generateConfigs({
|
|
56
60
|
allowedOrigins: ctx.config.allowedOrigins,
|
|
57
|
-
filter: ctx.config.areas || ctx.config.tasks
|
|
61
|
+
filter: ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
58
62
|
? {
|
|
59
63
|
areas: ctx.config.areas,
|
|
60
64
|
taskIds: ctx.config.tasks,
|
|
65
|
+
tags: ctx.config.tags,
|
|
61
66
|
}
|
|
62
67
|
: undefined,
|
|
63
68
|
resolvedSource,
|
|
@@ -80,10 +80,19 @@ export class PublishReportStep {
|
|
|
80
80
|
const now = new Date().toISOString();
|
|
81
81
|
const reportId = generateReportId();
|
|
82
82
|
const durationMs = Date.now() - this.pipelineStart;
|
|
83
|
-
// Auto-compare against most recent comparable baseline
|
|
84
|
-
|
|
83
|
+
// Auto-compare against most recent comparable baseline.
|
|
84
|
+
// Returns the comparison + baseline report ID for lineage tracking.
|
|
85
|
+
const autoCompareResult = ctx.reportStore
|
|
85
86
|
? (await ctx.reportStore.autoCompare(summary, provenance, now))
|
|
86
87
|
: null;
|
|
88
|
+
const comparison = autoCompareResult?.comparison ?? null;
|
|
89
|
+
// Record which report we compared against in lineage
|
|
90
|
+
if (autoCompareResult) {
|
|
91
|
+
provenance.lineage = {
|
|
92
|
+
...provenance.lineage,
|
|
93
|
+
comparedAgainst: autoCompareResult.baselineReportId,
|
|
94
|
+
};
|
|
95
|
+
}
|
|
87
96
|
const report = {
|
|
88
97
|
comparison: comparison ?? undefined,
|
|
89
98
|
completedAt: now,
|
|
@@ -168,6 +177,7 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
|
|
|
168
177
|
rootDir: ctx.config.rootDir,
|
|
169
178
|
sanityDocumentIds,
|
|
170
179
|
source,
|
|
180
|
+
sourceReportId: ctx.config.sourceReportId,
|
|
171
181
|
taskIds,
|
|
172
182
|
};
|
|
173
183
|
}
|
|
@@ -40,10 +40,11 @@ export class RunEvalStep {
|
|
|
40
40
|
// Precondition: canonical context files exist for filtered tasks.
|
|
41
41
|
// Must apply the same area/task filter as fetch-docs so we only
|
|
42
42
|
// check contexts that were actually fetched.
|
|
43
|
-
const filter = ctx.config.areas || ctx.config.tasks
|
|
43
|
+
const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
44
44
|
? {
|
|
45
45
|
...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
|
|
46
46
|
...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
|
|
47
|
+
...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
|
|
47
48
|
}
|
|
48
49
|
: undefined;
|
|
49
50
|
let tasks = await ctx.taskSource.loadTasks(filter);
|
|
@@ -76,10 +77,11 @@ export class RunEvalStep {
|
|
|
76
77
|
if (!debug?.enabled) {
|
|
77
78
|
try {
|
|
78
79
|
evalFingerprint = computeEvalFingerprint({
|
|
79
|
-
filter: ctx.config.areas || ctx.config.tasks
|
|
80
|
+
filter: ctx.config.areas || ctx.config.tasks || ctx.config.tags
|
|
80
81
|
? {
|
|
81
82
|
areas: ctx.config.areas,
|
|
82
83
|
taskIds: ctx.config.tasks,
|
|
84
|
+
tags: ctx.config.tags,
|
|
83
85
|
}
|
|
84
86
|
: undefined,
|
|
85
87
|
graderModel: "default",
|
|
@@ -55,6 +55,7 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
55
55
|
promptfooUrl: undefined,
|
|
56
56
|
studioOriginOverride: undefined,
|
|
57
57
|
sanityDocumentArgs: undefined,
|
|
58
|
+
sourceReportId: request.sourceReportId,
|
|
58
59
|
beforeOption: undefined,
|
|
59
60
|
repoTasksPath: undefined,
|
|
60
61
|
callerGit: request.callerGit,
|
package/dist/pipeline/plan.d.ts
CHANGED
package/dist/pipeline/plan.js
CHANGED
|
@@ -117,7 +117,7 @@ export async function buildPipelinePlan(opts, rootDir) {
|
|
|
117
117
|
.filter((i) => i.severity === "error")
|
|
118
118
|
.map((i) => `[${i.source}] ${i.message}`);
|
|
119
119
|
// 2. Expand tasks with filters
|
|
120
|
-
const filter = opts.areaOption || opts.taskOption
|
|
120
|
+
const filter = opts.areaOption || opts.taskOption || opts.tagOption?.length
|
|
121
121
|
? {
|
|
122
122
|
areas: opts.areaOption
|
|
123
123
|
? opts.areaOption.split(",").map((a) => a.trim())
|
|
@@ -125,6 +125,7 @@ export async function buildPipelinePlan(opts, rootDir) {
|
|
|
125
125
|
taskIds: opts.taskOption
|
|
126
126
|
? opts.taskOption.split(",").map((t) => t.trim())
|
|
127
127
|
: undefined,
|
|
128
|
+
tags: opts.tagOption,
|
|
128
129
|
}
|
|
129
130
|
: undefined;
|
|
130
131
|
let totalTests = 0;
|
|
@@ -41,6 +41,8 @@ export interface ProvenanceInput {
|
|
|
41
41
|
promptfooUrls?: PromptfooUrlEntry[];
|
|
42
42
|
/** Path to the package root (for reading models.yaml) */
|
|
43
43
|
rootDir: string;
|
|
44
|
+
/** Report ID that triggered this re-run (becomes lineage.rerunOf) */
|
|
45
|
+
sourceReportId?: string;
|
|
44
46
|
/** Sanity document IDs targeted */
|
|
45
47
|
sanityDocumentIds?: string[];
|
|
46
48
|
/** Resolved documentation source */
|
|
@@ -35,6 +35,10 @@ export function buildProvenance(input) {
|
|
|
35
35
|
sha: input.callerGit.sha ?? "unknown",
|
|
36
36
|
}
|
|
37
37
|
: detectGitMetadata();
|
|
38
|
+
// Build lineage from explicit relationships
|
|
39
|
+
const lineage = input.sourceReportId
|
|
40
|
+
? { rerunOf: input.sourceReportId }
|
|
41
|
+
: undefined;
|
|
38
42
|
return {
|
|
39
43
|
areas: input.areas,
|
|
40
44
|
autoScope: input.autoScope,
|
|
@@ -42,6 +46,7 @@ export function buildProvenance(input) {
|
|
|
42
46
|
evalFingerprint: input.evalFingerprint,
|
|
43
47
|
git,
|
|
44
48
|
graderModel: models.grader.id,
|
|
49
|
+
lineage,
|
|
45
50
|
mode: input.mode,
|
|
46
51
|
models: models.models.map((m) => ({ id: m.id, label: m.label })),
|
|
47
52
|
promptfooUrl: input.promptfooUrl,
|
package/dist/report-store.d.ts
CHANGED
|
@@ -16,6 +16,16 @@
|
|
|
16
16
|
*/
|
|
17
17
|
import type { SanityClient } from "@sanity/client";
|
|
18
18
|
import type { ComparisonReport, ISOTimestamp, LineageQuery, Report, ReportId, ReportProvenance, ScoreSummary } from "./pipeline/types.js";
|
|
19
|
+
/**
|
|
20
|
+
* Result of an auto-comparison, bundling the ComparisonReport with the
|
|
21
|
+
* baseline report ID so the caller can record lineage (comparedAgainst).
|
|
22
|
+
*/
|
|
23
|
+
export interface AutoCompareResult {
|
|
24
|
+
/** The report ID of the baseline used for comparison */
|
|
25
|
+
baselineReportId: ReportId;
|
|
26
|
+
/** The computed comparison report */
|
|
27
|
+
comparison: ComparisonReport;
|
|
28
|
+
}
|
|
19
29
|
export interface ReportStoreOptions {
|
|
20
30
|
/** Override the Sanity client (for testing) */
|
|
21
31
|
client?: SanityClient;
|
|
@@ -33,9 +43,17 @@ export declare class ReportStore {
|
|
|
33
43
|
* Auto-compare: find the most recent comparable report and compute
|
|
34
44
|
* a ComparisonReport using the existing compare() primitive.
|
|
35
45
|
*
|
|
36
|
-
*
|
|
46
|
+
* Baseline selection order:
|
|
47
|
+
* 1. Explicit lineage — if `provenance.lineage.rerunOf` is set,
|
|
48
|
+
* compare against that specific report (deterministic re-run comparison)
|
|
49
|
+
* 2. Fuzzy matching — most recent report with same mode + source name
|
|
50
|
+
*
|
|
51
|
+
* Returns the comparison plus the baseline report ID so the caller
|
|
52
|
+
* can record `provenance.lineage.comparedAgainst`.
|
|
53
|
+
*
|
|
54
|
+
* @returns The comparison result with baseline ID, or null if no baseline found
|
|
37
55
|
*/
|
|
38
|
-
autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp): Promise<ComparisonReport | null>;
|
|
56
|
+
autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp): Promise<AutoCompareResult | null>;
|
|
39
57
|
/**
|
|
40
58
|
* Find a report by its evaluation fingerprint (cross-environment cache lookup).
|
|
41
59
|
*
|
package/dist/report-store.js
CHANGED
|
@@ -41,19 +41,43 @@ export class ReportStore {
|
|
|
41
41
|
* Auto-compare: find the most recent comparable report and compute
|
|
42
42
|
* a ComparisonReport using the existing compare() primitive.
|
|
43
43
|
*
|
|
44
|
-
*
|
|
44
|
+
* Baseline selection order:
|
|
45
|
+
* 1. Explicit lineage — if `provenance.lineage.rerunOf` is set,
|
|
46
|
+
* compare against that specific report (deterministic re-run comparison)
|
|
47
|
+
* 2. Fuzzy matching — most recent report with same mode + source name
|
|
48
|
+
*
|
|
49
|
+
* Returns the comparison plus the baseline report ID so the caller
|
|
50
|
+
* can record `provenance.lineage.comparedAgainst`.
|
|
51
|
+
*
|
|
52
|
+
* @returns The comparison result with baseline ID, or null if no baseline found
|
|
45
53
|
*/
|
|
46
54
|
async autoCompare(currentSummary, provenance, completedAt) {
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
55
|
+
// 1. Prefer explicit lineage source (deterministic re-run comparison)
|
|
56
|
+
const rerunSourceId = provenance.lineage?.rerunOf;
|
|
57
|
+
let baseline = null;
|
|
58
|
+
if (rerunSourceId) {
|
|
59
|
+
baseline = await this.read(rerunSourceId);
|
|
60
|
+
if (baseline) {
|
|
61
|
+
console.log(` 🔗 Comparing against lineage source: ${rerunSourceId}`);
|
|
62
|
+
}
|
|
63
|
+
else {
|
|
64
|
+
console.warn(` ⚠️ Lineage source ${rerunSourceId} not found, falling back to fuzzy match`);
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
// 2. Fall back to fuzzy matching
|
|
68
|
+
if (!baseline) {
|
|
69
|
+
baseline = await this.findComparableBaseline({
|
|
70
|
+
before: completedAt,
|
|
71
|
+
mode: provenance.mode,
|
|
72
|
+
source: { name: provenance.source.name },
|
|
73
|
+
});
|
|
74
|
+
}
|
|
52
75
|
if (!baseline) {
|
|
53
76
|
return null;
|
|
54
77
|
}
|
|
55
78
|
try {
|
|
56
|
-
|
|
79
|
+
const comparison = compare(baseline.summary, currentSummary);
|
|
80
|
+
return { baselineReportId: baseline.id, comparison };
|
|
57
81
|
}
|
|
58
82
|
catch (error) {
|
|
59
83
|
console.warn(` ⚠️ Auto-comparison failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
@@ -68,6 +68,8 @@ export interface EvalRequestPayload {
|
|
|
68
68
|
requestedAt: string;
|
|
69
69
|
/** User ID who requested */
|
|
70
70
|
requestedBy?: string;
|
|
71
|
+
/** Report ID that triggered this re-run (if any) */
|
|
72
|
+
sourceReportId?: string;
|
|
71
73
|
/** Request status */
|
|
72
74
|
status: string;
|
|
73
75
|
/** Publish tag */
|
|
@@ -180,6 +180,9 @@ async function dispatchGitHubEval(repo, payload, config) {
|
|
|
180
180
|
...(hasAreas ? { areas: payload.areas } : {}),
|
|
181
181
|
...(payload.debug ? { debug: true } : {}),
|
|
182
182
|
...(payload.tag ? { publishTag: payload.tag } : {}),
|
|
183
|
+
...(payload.sourceReportId
|
|
184
|
+
? { sourceReportId: payload.sourceReportId }
|
|
185
|
+
: {}),
|
|
183
186
|
},
|
|
184
187
|
event_type: "external-eval",
|
|
185
188
|
};
|