@sanity/ailf 0.1.27 → 0.1.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/ports/context.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +24 -0
- package/dist/adapters/api-client/progress.js +1 -1
- package/dist/commands/publish.js +10 -2
- package/dist/orchestration/steps/publish-report-step.js +12 -2
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/dist/pipeline/provenance.d.ts +2 -0
- package/dist/pipeline/provenance.js +5 -0
- package/dist/report-store.d.ts +20 -2
- package/dist/report-store.js +31 -7
- package/dist/webhook/eval-request-handler.d.ts +2 -0
- package/dist/webhook/eval-request-handler.js +3 -0
- package/package.json +1 -1
|
@@ -89,6 +89,8 @@ export interface ResolvedConfig {
|
|
|
89
89
|
studioOriginOverride?: string;
|
|
90
90
|
/** Sanity document filter args */
|
|
91
91
|
sanityDocumentArgs?: string[];
|
|
92
|
+
/** Report ID that triggered this re-run (flows to provenance.lineage.rerunOf) */
|
|
93
|
+
sourceReportId?: string;
|
|
92
94
|
/** Disable release-aware auto-scoping (evaluate all tasks even when perspective is set) */
|
|
93
95
|
noAutoScope: boolean;
|
|
94
96
|
/** Before option for comparison */
|
|
@@ -68,6 +68,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
|
|
|
68
68
|
"origin-only": "origin-only";
|
|
69
69
|
}>>;
|
|
70
70
|
source: z.ZodOptional<z.ZodString>;
|
|
71
|
+
sourceReportId: z.ZodOptional<z.ZodString>;
|
|
71
72
|
taskMode: z.ZodOptional<z.ZodEnum<{
|
|
72
73
|
"content-lake": "content-lake";
|
|
73
74
|
yaml: "yaml";
|
|
@@ -80,6 +80,7 @@ export const PipelineRequestSchema = z.object({
|
|
|
80
80
|
readiness: z.boolean().optional(),
|
|
81
81
|
searchMode: z.enum(["off", "open", "origin-only"]).optional(),
|
|
82
82
|
source: z.string().optional(),
|
|
83
|
+
sourceReportId: z.string().optional(),
|
|
83
84
|
taskMode: z.enum(["content-lake", "yaml", "inline"]).optional(),
|
|
84
85
|
tasks: z.array(z.string()).optional(),
|
|
85
86
|
urls: z.array(z.string().url()).optional(),
|
|
@@ -1032,6 +1032,28 @@ export interface ReportAutoScope {
|
|
|
1032
1032
|
removed: number;
|
|
1033
1033
|
};
|
|
1034
1034
|
}
|
|
1035
|
+
/**
|
|
1036
|
+
* Typed relationships between reports. Each field is optional and
|
|
1037
|
+
* independent — populated only when that relationship exists.
|
|
1038
|
+
*
|
|
1039
|
+
* Stored at `provenance.lineage` in the report document.
|
|
1040
|
+
*
|
|
1041
|
+
* @see docs/design-docs/report-store/domain-model.md
|
|
1042
|
+
*/
|
|
1043
|
+
export interface ReportLineage {
|
|
1044
|
+
/**
|
|
1045
|
+
* This report was explicitly compared against another report.
|
|
1046
|
+
* Set when auto-compare selects a specific baseline or when the user
|
|
1047
|
+
* requests comparison against a named report.
|
|
1048
|
+
*/
|
|
1049
|
+
comparedAgainst?: ReportId;
|
|
1050
|
+
/**
|
|
1051
|
+
* This report was explicitly re-run from another report.
|
|
1052
|
+
* The re-run has the same EvalScope (mode, areas, perspective, etc.)
|
|
1053
|
+
* but measures the current state of docs/models/tasks.
|
|
1054
|
+
*/
|
|
1055
|
+
rerunOf?: ReportId;
|
|
1056
|
+
}
|
|
1035
1057
|
/** Full provenance metadata for an evaluation report */
|
|
1036
1058
|
export interface ReportProvenance {
|
|
1037
1059
|
/** Which feature areas were evaluated */
|
|
@@ -1055,6 +1077,8 @@ export interface ReportProvenance {
|
|
|
1055
1077
|
};
|
|
1056
1078
|
/** Grader model used for scoring */
|
|
1057
1079
|
graderModel: string;
|
|
1080
|
+
/** Typed relationships with other reports (re-run, comparison) */
|
|
1081
|
+
lineage?: ReportLineage;
|
|
1058
1082
|
/** Evaluation mode */
|
|
1059
1083
|
mode: EvalMode;
|
|
1060
1084
|
/** Models under evaluation */
|
|
@@ -36,7 +36,7 @@ export function createProgressDisplay() {
|
|
|
36
36
|
line = `⏳ [queued] Waiting for runner... (${elapsed})`;
|
|
37
37
|
break;
|
|
38
38
|
case "running": {
|
|
39
|
-
if (job.progress) {
|
|
39
|
+
if (job.progress?.step && job.progress.current && job.progress.total) {
|
|
40
40
|
const { step, current, total } = job.progress;
|
|
41
41
|
line = `⏳ [running] Step ${current}/${total}: ${step} (${elapsed})`;
|
|
42
42
|
}
|
package/dist/commands/publish.js
CHANGED
|
@@ -24,7 +24,7 @@ import { fileURLToPath } from "url";
|
|
|
24
24
|
import { Command } from "commander";
|
|
25
25
|
import { createAppContext } from "../composition-root.js";
|
|
26
26
|
import { buildProvenance, } from "../pipeline/provenance.js";
|
|
27
|
-
import { generateReportId } from "../report-store.js";
|
|
27
|
+
import { generateReportId, } from "../report-store.js";
|
|
28
28
|
import { withRetry } from "../sinks/retry.js";
|
|
29
29
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
30
30
|
const ROOT = resolve(__dirname, "..", "..");
|
|
@@ -154,9 +154,17 @@ async function runPublishCommand(summaryPath, opts) {
|
|
|
154
154
|
}
|
|
155
155
|
}
|
|
156
156
|
// Auto-compare against most recent comparable baseline
|
|
157
|
-
const comparison = opts.dryRun || !store
|
|
157
|
+
const autoCompareResult = opts.dryRun || !store
|
|
158
158
|
? null
|
|
159
159
|
: await store.autoCompare(summary, provenance, now);
|
|
160
|
+
const comparison = autoCompareResult?.comparison ?? null;
|
|
161
|
+
// Record which report we compared against in lineage
|
|
162
|
+
if (autoCompareResult) {
|
|
163
|
+
provenance.lineage = {
|
|
164
|
+
...provenance.lineage,
|
|
165
|
+
comparedAgainst: autoCompareResult.baselineReportId,
|
|
166
|
+
};
|
|
167
|
+
}
|
|
160
168
|
const reportId = generateReportId();
|
|
161
169
|
const report = {
|
|
162
170
|
comparison: comparison ?? undefined,
|
|
@@ -80,10 +80,19 @@ export class PublishReportStep {
|
|
|
80
80
|
const now = new Date().toISOString();
|
|
81
81
|
const reportId = generateReportId();
|
|
82
82
|
const durationMs = Date.now() - this.pipelineStart;
|
|
83
|
-
// Auto-compare against most recent comparable baseline
|
|
84
|
-
|
|
83
|
+
// Auto-compare against most recent comparable baseline.
|
|
84
|
+
// Returns the comparison + baseline report ID for lineage tracking.
|
|
85
|
+
const autoCompareResult = ctx.reportStore
|
|
85
86
|
? (await ctx.reportStore.autoCompare(summary, provenance, now))
|
|
86
87
|
: null;
|
|
88
|
+
const comparison = autoCompareResult?.comparison ?? null;
|
|
89
|
+
// Record which report we compared against in lineage
|
|
90
|
+
if (autoCompareResult) {
|
|
91
|
+
provenance.lineage = {
|
|
92
|
+
...provenance.lineage,
|
|
93
|
+
comparedAgainst: autoCompareResult.baselineReportId,
|
|
94
|
+
};
|
|
95
|
+
}
|
|
87
96
|
const report = {
|
|
88
97
|
comparison: comparison ?? undefined,
|
|
89
98
|
completedAt: now,
|
|
@@ -168,6 +177,7 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
|
|
|
168
177
|
rootDir: ctx.config.rootDir,
|
|
169
178
|
sanityDocumentIds,
|
|
170
179
|
source,
|
|
180
|
+
sourceReportId: ctx.config.sourceReportId,
|
|
171
181
|
taskIds,
|
|
172
182
|
};
|
|
173
183
|
}
|
|
@@ -55,6 +55,7 @@ export function mapRequestToConfig(request, rootDir) {
|
|
|
55
55
|
promptfooUrl: undefined,
|
|
56
56
|
studioOriginOverride: undefined,
|
|
57
57
|
sanityDocumentArgs: undefined,
|
|
58
|
+
sourceReportId: request.sourceReportId,
|
|
58
59
|
beforeOption: undefined,
|
|
59
60
|
repoTasksPath: undefined,
|
|
60
61
|
callerGit: request.callerGit,
|
|
@@ -41,6 +41,8 @@ export interface ProvenanceInput {
|
|
|
41
41
|
promptfooUrls?: PromptfooUrlEntry[];
|
|
42
42
|
/** Path to the package root (for reading models.yaml) */
|
|
43
43
|
rootDir: string;
|
|
44
|
+
/** Report ID that triggered this re-run (becomes lineage.rerunOf) */
|
|
45
|
+
sourceReportId?: string;
|
|
44
46
|
/** Sanity document IDs targeted */
|
|
45
47
|
sanityDocumentIds?: string[];
|
|
46
48
|
/** Resolved documentation source */
|
|
@@ -35,6 +35,10 @@ export function buildProvenance(input) {
|
|
|
35
35
|
sha: input.callerGit.sha ?? "unknown",
|
|
36
36
|
}
|
|
37
37
|
: detectGitMetadata();
|
|
38
|
+
// Build lineage from explicit relationships
|
|
39
|
+
const lineage = input.sourceReportId
|
|
40
|
+
? { rerunOf: input.sourceReportId }
|
|
41
|
+
: undefined;
|
|
38
42
|
return {
|
|
39
43
|
areas: input.areas,
|
|
40
44
|
autoScope: input.autoScope,
|
|
@@ -42,6 +46,7 @@ export function buildProvenance(input) {
|
|
|
42
46
|
evalFingerprint: input.evalFingerprint,
|
|
43
47
|
git,
|
|
44
48
|
graderModel: models.grader.id,
|
|
49
|
+
lineage,
|
|
45
50
|
mode: input.mode,
|
|
46
51
|
models: models.models.map((m) => ({ id: m.id, label: m.label })),
|
|
47
52
|
promptfooUrl: input.promptfooUrl,
|
package/dist/report-store.d.ts
CHANGED
|
@@ -16,6 +16,16 @@
|
|
|
16
16
|
*/
|
|
17
17
|
import type { SanityClient } from "@sanity/client";
|
|
18
18
|
import type { ComparisonReport, ISOTimestamp, LineageQuery, Report, ReportId, ReportProvenance, ScoreSummary } from "./pipeline/types.js";
|
|
19
|
+
/**
|
|
20
|
+
* Result of an auto-comparison, bundling the ComparisonReport with the
|
|
21
|
+
* baseline report ID so the caller can record lineage (comparedAgainst).
|
|
22
|
+
*/
|
|
23
|
+
export interface AutoCompareResult {
|
|
24
|
+
/** The report ID of the baseline used for comparison */
|
|
25
|
+
baselineReportId: ReportId;
|
|
26
|
+
/** The computed comparison report */
|
|
27
|
+
comparison: ComparisonReport;
|
|
28
|
+
}
|
|
19
29
|
export interface ReportStoreOptions {
|
|
20
30
|
/** Override the Sanity client (for testing) */
|
|
21
31
|
client?: SanityClient;
|
|
@@ -33,9 +43,17 @@ export declare class ReportStore {
|
|
|
33
43
|
* Auto-compare: find the most recent comparable report and compute
|
|
34
44
|
* a ComparisonReport using the existing compare() primitive.
|
|
35
45
|
*
|
|
36
|
-
*
|
|
46
|
+
* Baseline selection order:
|
|
47
|
+
* 1. Explicit lineage — if `provenance.lineage.rerunOf` is set,
|
|
48
|
+
* compare against that specific report (deterministic re-run comparison)
|
|
49
|
+
* 2. Fuzzy matching — most recent report with same mode + source name
|
|
50
|
+
*
|
|
51
|
+
* Returns the comparison plus the baseline report ID so the caller
|
|
52
|
+
* can record `provenance.lineage.comparedAgainst`.
|
|
53
|
+
*
|
|
54
|
+
* @returns The comparison result with baseline ID, or null if no baseline found
|
|
37
55
|
*/
|
|
38
|
-
autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp): Promise<ComparisonReport | null>;
|
|
56
|
+
autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp): Promise<AutoCompareResult | null>;
|
|
39
57
|
/**
|
|
40
58
|
* Find a report by its evaluation fingerprint (cross-environment cache lookup).
|
|
41
59
|
*
|
package/dist/report-store.js
CHANGED
|
@@ -41,19 +41,43 @@ export class ReportStore {
|
|
|
41
41
|
* Auto-compare: find the most recent comparable report and compute
|
|
42
42
|
* a ComparisonReport using the existing compare() primitive.
|
|
43
43
|
*
|
|
44
|
-
*
|
|
44
|
+
* Baseline selection order:
|
|
45
|
+
* 1. Explicit lineage — if `provenance.lineage.rerunOf` is set,
|
|
46
|
+
* compare against that specific report (deterministic re-run comparison)
|
|
47
|
+
* 2. Fuzzy matching — most recent report with same mode + source name
|
|
48
|
+
*
|
|
49
|
+
* Returns the comparison plus the baseline report ID so the caller
|
|
50
|
+
* can record `provenance.lineage.comparedAgainst`.
|
|
51
|
+
*
|
|
52
|
+
* @returns The comparison result with baseline ID, or null if no baseline found
|
|
45
53
|
*/
|
|
46
54
|
async autoCompare(currentSummary, provenance, completedAt) {
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
55
|
+
// 1. Prefer explicit lineage source (deterministic re-run comparison)
|
|
56
|
+
const rerunSourceId = provenance.lineage?.rerunOf;
|
|
57
|
+
let baseline = null;
|
|
58
|
+
if (rerunSourceId) {
|
|
59
|
+
baseline = await this.read(rerunSourceId);
|
|
60
|
+
if (baseline) {
|
|
61
|
+
console.log(` 🔗 Comparing against lineage source: ${rerunSourceId}`);
|
|
62
|
+
}
|
|
63
|
+
else {
|
|
64
|
+
console.warn(` ⚠️ Lineage source ${rerunSourceId} not found, falling back to fuzzy match`);
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
// 2. Fall back to fuzzy matching
|
|
68
|
+
if (!baseline) {
|
|
69
|
+
baseline = await this.findComparableBaseline({
|
|
70
|
+
before: completedAt,
|
|
71
|
+
mode: provenance.mode,
|
|
72
|
+
source: { name: provenance.source.name },
|
|
73
|
+
});
|
|
74
|
+
}
|
|
52
75
|
if (!baseline) {
|
|
53
76
|
return null;
|
|
54
77
|
}
|
|
55
78
|
try {
|
|
56
|
-
|
|
79
|
+
const comparison = compare(baseline.summary, currentSummary);
|
|
80
|
+
return { baselineReportId: baseline.id, comparison };
|
|
57
81
|
}
|
|
58
82
|
catch (error) {
|
|
59
83
|
console.warn(` ⚠️ Auto-comparison failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
@@ -68,6 +68,8 @@ export interface EvalRequestPayload {
|
|
|
68
68
|
requestedAt: string;
|
|
69
69
|
/** User ID who requested */
|
|
70
70
|
requestedBy?: string;
|
|
71
|
+
/** Report ID that triggered this re-run (if any) */
|
|
72
|
+
sourceReportId?: string;
|
|
71
73
|
/** Request status */
|
|
72
74
|
status: string;
|
|
73
75
|
/** Publish tag */
|
|
@@ -180,6 +180,9 @@ async function dispatchGitHubEval(repo, payload, config) {
|
|
|
180
180
|
...(hasAreas ? { areas: payload.areas } : {}),
|
|
181
181
|
...(payload.debug ? { debug: true } : {}),
|
|
182
182
|
...(payload.tag ? { publishTag: payload.tag } : {}),
|
|
183
|
+
...(payload.sourceReportId
|
|
184
|
+
? { sourceReportId: payload.sourceReportId }
|
|
185
|
+
: {}),
|
|
183
186
|
},
|
|
184
187
|
event_type: "external-eval",
|
|
185
188
|
};
|