@sanity/ailf 2.7.1 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +72 -0
- package/dist/_vendor/ailf-core/artifact-registry.js +150 -0
- package/dist/_vendor/ailf-core/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +56 -0
- package/dist/_vendor/ailf-core/ports/artifact-writer.js +28 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +13 -3
- package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/index.js +1 -1
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +9 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +110 -68
- package/dist/_vendor/ailf-core/types/index.js +1 -1
- package/dist/_vendor/ailf-shared/index.d.ts +2 -0
- package/dist/_vendor/ailf-shared/index.js +2 -0
- package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
- package/dist/_vendor/ailf-shared/run-context.js +17 -0
- package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
- package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +39 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.js +148 -0
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +30 -0
- package/dist/artifact-capture/gcs-artifact-writer.js +119 -0
- package/dist/commands/publish.js +3 -2
- package/dist/composition-root.d.ts +3 -3
- package/dist/composition-root.js +20 -15
- package/dist/orchestration/build-step-sequence.js +6 -1
- package/dist/orchestration/steps/calculate-scores-step.js +42 -2
- package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
- package/dist/orchestration/steps/finalize-run-step.js +103 -0
- package/dist/orchestration/steps/publish-report-step.js +19 -39
- package/dist/pipeline/calculate-scores.js +13 -2
- package/dist/pipeline/provenance.d.ts +24 -44
- package/dist/pipeline/provenance.js +17 -165
- package/dist/pipeline/report-title.d.ts +2 -2
- package/dist/pipeline/run-context.d.ts +57 -0
- package/dist/pipeline/run-context.js +156 -0
- package/dist/pipeline/upload-test-outputs.d.ts +26 -0
- package/dist/pipeline/upload-test-outputs.js +34 -0
- package/dist/report-store.js +4 -2
- package/package.json +3 -3
- package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
- package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
- package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
- package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
- package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
- package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66
|
@@ -1,65 +1,45 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/provenance.ts
|
|
3
3
|
*
|
|
4
|
-
* Builds ReportProvenance from data available during a pipeline run.
|
|
4
|
+
* Builds `ReportProvenance` from data available during a pipeline run.
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
6
|
+
* `ReportProvenance extends RunContext` (D0032). This module derives
|
|
7
|
+
* RunContext via `buildRunContext()` and attaches report-specific extras
|
|
8
|
+
* (lineage, autoScope, promptfoo URLs, targetDocuments, runId). A single
|
|
9
|
+
* derivation path for RunContext forecloses drift between the run
|
|
10
|
+
* manifest (GCS) and the report provenance (Content Lake).
|
|
10
11
|
*
|
|
11
|
-
* @see
|
|
12
|
-
* @see docs/
|
|
12
|
+
* @see packages/eval/src/pipeline/run-context.ts — the shared derivation path
|
|
13
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Drift Prevention)
|
|
13
14
|
*/
|
|
14
|
-
import type {
|
|
15
|
-
import type
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
logger?: Logger;
|
|
15
|
+
import type { PromptfooUrlEntry, ReportAutoScope, ReportProvenance, RunId } from "./types.js";
|
|
16
|
+
import { type RunContextInput } from "./run-context.js";
|
|
17
|
+
/**
|
|
18
|
+
* Inputs needed to build a ReportProvenance. Extends `RunContextInput` so
|
|
19
|
+
* the RunContext derivation path is shared.
|
|
20
|
+
*/
|
|
21
|
+
export interface ProvenanceInput extends RunContextInput {
|
|
22
22
|
/** Release auto-scope metadata (when perspective evaluation was scoped) */
|
|
23
23
|
autoScope?: ReportAutoScope;
|
|
24
|
-
/**
|
|
25
|
-
* Git metadata from the *calling* repository (cross-repo evaluations).
|
|
26
|
-
* When provided, overrides CI env var detection so provenance attributes
|
|
27
|
-
* to the caller — not the AILF core repo where the workflow executes.
|
|
28
|
-
*/
|
|
29
|
-
callerGit?: {
|
|
30
|
-
branch?: string;
|
|
31
|
-
prNumber?: number;
|
|
32
|
-
repo: string;
|
|
33
|
-
sha?: string;
|
|
34
|
-
};
|
|
35
24
|
/** SHA-256 hash of the doc context files (from cache system) */
|
|
36
25
|
contextHash?: string;
|
|
37
|
-
/** Evaluation fingerprint for cross-environment cache lookup */
|
|
38
|
-
evalFingerprint?: string;
|
|
39
|
-
/** Evaluation mode */
|
|
40
|
-
mode: EvalMode;
|
|
41
26
|
/** @deprecated Use `promptfooUrls` — kept for backward compatibility */
|
|
42
27
|
promptfooUrl?: string;
|
|
43
28
|
/** Per-mode Promptfoo share URLs */
|
|
44
29
|
promptfooUrls?: PromptfooUrlEntry[];
|
|
45
|
-
/**
|
|
46
|
-
|
|
47
|
-
/** Report ID that triggered this re-run (becomes lineage.rerunOf) */
|
|
48
|
-
sourceReportId?: string;
|
|
30
|
+
/** Identity of the pipeline run that produced this report (D0032) */
|
|
31
|
+
runId: RunId;
|
|
49
32
|
/** Sanity document IDs targeted */
|
|
50
33
|
sanityDocumentIds?: string[];
|
|
51
|
-
/**
|
|
52
|
-
|
|
53
|
-
/** Specific task IDs evaluated (if scoped) */
|
|
54
|
-
taskIds?: string[];
|
|
34
|
+
/** Report ID that triggered this re-run (becomes lineage.rerunOf) */
|
|
35
|
+
sourceReportId?: string;
|
|
55
36
|
}
|
|
56
37
|
/**
|
|
57
|
-
* Build a ReportProvenance
|
|
38
|
+
* Build a ReportProvenance from pipeline context.
|
|
58
39
|
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
* - Optional metadata (context hash, Promptfoo URL)
|
|
40
|
+
* RunContext fields (mode, areas, taskIds, models, graderModel, source,
|
|
41
|
+
* evalFingerprint, trigger, git) come from `buildRunContext`. Report-
|
|
42
|
+
* specific fields (autoScope, contextHash, lineage, promptfoo*, runId,
|
|
43
|
+
* targetDocuments) are attached here.
|
|
64
44
|
*/
|
|
65
45
|
export declare function buildProvenance(input: ProvenanceInput): ReportProvenance;
|
|
@@ -1,188 +1,40 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/provenance.ts
|
|
3
3
|
*
|
|
4
|
-
* Builds ReportProvenance from data available during a pipeline run.
|
|
4
|
+
* Builds `ReportProvenance` from data available during a pipeline run.
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
6
|
+
* `ReportProvenance extends RunContext` (D0032). This module derives
|
|
7
|
+
* RunContext via `buildRunContext()` and attaches report-specific extras
|
|
8
|
+
* (lineage, autoScope, promptfoo URLs, targetDocuments, runId). A single
|
|
9
|
+
* derivation path for RunContext forecloses drift between the run
|
|
10
|
+
* manifest (GCS) and the report provenance (Content Lake).
|
|
10
11
|
*
|
|
11
|
-
* @see
|
|
12
|
-
* @see docs/
|
|
12
|
+
* @see packages/eval/src/pipeline/run-context.ts — the shared derivation path
|
|
13
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Drift Prevention)
|
|
13
14
|
*/
|
|
14
|
-
import {
|
|
15
|
-
import { tryLoadConfigFile } from "./compiler/config-loader.js";
|
|
15
|
+
import { buildRunContext } from "./run-context.js";
|
|
16
16
|
/**
|
|
17
|
-
* Build a ReportProvenance
|
|
17
|
+
* Build a ReportProvenance from pipeline context.
|
|
18
18
|
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
* - Optional metadata (context hash, Promptfoo URL)
|
|
19
|
+
* RunContext fields (mode, areas, taskIds, models, graderModel, source,
|
|
20
|
+
* evalFingerprint, trigger, git) come from `buildRunContext`. Report-
|
|
21
|
+
* specific fields (autoScope, contextHash, lineage, promptfoo*, runId,
|
|
22
|
+
* targetDocuments) are attached here.
|
|
24
23
|
*/
|
|
25
24
|
export function buildProvenance(input) {
|
|
26
|
-
const
|
|
27
|
-
const models = loadModelsConfig(input.rootDir, log);
|
|
28
|
-
log.debug("Assembling provenance input", {
|
|
29
|
-
mode: input.mode,
|
|
30
|
-
sourceName: input.source.name,
|
|
31
|
-
sourceBaseUrl: input.source.baseUrl,
|
|
32
|
-
areas: input.areas,
|
|
33
|
-
taskIds: input.taskIds,
|
|
34
|
-
hasContextHash: Boolean(input.contextHash),
|
|
35
|
-
hasEvalFingerprint: Boolean(input.evalFingerprint),
|
|
36
|
-
hasCallerGit: Boolean(input.callerGit),
|
|
37
|
-
hasSourceReportId: Boolean(input.sourceReportId),
|
|
38
|
-
modelCount: models.models.length,
|
|
39
|
-
});
|
|
40
|
-
// Cross-repo evaluations: prefer explicit caller git metadata over
|
|
41
|
-
// CI env vars (which always reflect the AILF core repo).
|
|
42
|
-
const git = input.callerGit
|
|
43
|
-
? {
|
|
44
|
-
branch: input.callerGit.branch ?? "unknown",
|
|
45
|
-
prNumber: input.callerGit.prNumber,
|
|
46
|
-
repo: input.callerGit.repo,
|
|
47
|
-
sha: input.callerGit.sha ?? "unknown",
|
|
48
|
-
}
|
|
49
|
-
: detectGitMetadata();
|
|
25
|
+
const runContext = buildRunContext(input);
|
|
50
26
|
// Build lineage from explicit relationships
|
|
51
27
|
const lineage = input.sourceReportId
|
|
52
28
|
? { rerunOf: input.sourceReportId }
|
|
53
29
|
: undefined;
|
|
54
|
-
const trigger = detectTrigger();
|
|
55
|
-
log.debug("Provenance computed", {
|
|
56
|
-
triggerType: trigger.type,
|
|
57
|
-
gitRepo: git?.repo,
|
|
58
|
-
gitBranch: git?.branch,
|
|
59
|
-
evalFingerprint: input.evalFingerprint,
|
|
60
|
-
hasLineage: Boolean(lineage),
|
|
61
|
-
});
|
|
62
|
-
// Non-literacy modes (agent-harness, mcp-server, etc.) don't use the
|
|
63
|
-
// config/models.ts model matrix — listing those models would be misleading.
|
|
64
|
-
// Only include them for literacy mode where they're the actual eval targets.
|
|
65
|
-
const evaluatedModels = input.mode === "literacy"
|
|
66
|
-
? models.models.map((m) => ({ id: m.id, label: m.label }))
|
|
67
|
-
: [];
|
|
68
30
|
return {
|
|
69
|
-
|
|
31
|
+
...runContext,
|
|
70
32
|
autoScope: input.autoScope,
|
|
71
33
|
contextHash: input.contextHash,
|
|
72
|
-
evalFingerprint: input.evalFingerprint,
|
|
73
|
-
git,
|
|
74
|
-
graderModel: models.grader.id,
|
|
75
34
|
lineage,
|
|
76
|
-
mode: input.mode,
|
|
77
|
-
models: evaluatedModels,
|
|
78
35
|
promptfooUrl: input.promptfooUrl,
|
|
79
36
|
promptfooUrls: input.promptfooUrls,
|
|
80
|
-
|
|
81
|
-
baseUrl: input.source.baseUrl,
|
|
82
|
-
dataset: input.source.dataset,
|
|
83
|
-
name: input.source.name,
|
|
84
|
-
perspective: input.source.perspective,
|
|
85
|
-
projectId: input.source.projectId,
|
|
86
|
-
},
|
|
37
|
+
runId: input.runId,
|
|
87
38
|
targetDocuments: input.sanityDocumentIds,
|
|
88
|
-
taskIds: input.taskIds,
|
|
89
|
-
trigger: detectTrigger(),
|
|
90
|
-
};
|
|
91
|
-
}
|
|
92
|
-
// ---------------------------------------------------------------------------
|
|
93
|
-
// Trigger detection
|
|
94
|
-
// ---------------------------------------------------------------------------
|
|
95
|
-
/**
|
|
96
|
-
* Extract git metadata from GitHub Actions environment variables.
|
|
97
|
-
* Returns undefined when not running in CI.
|
|
98
|
-
*/
|
|
99
|
-
function detectGitMetadata() {
|
|
100
|
-
const repo = process.env.GITHUB_REPOSITORY;
|
|
101
|
-
if (!repo)
|
|
102
|
-
return undefined;
|
|
103
|
-
const sha = process.env.GITHUB_SHA ?? "unknown";
|
|
104
|
-
const ref = process.env.GITHUB_REF ?? "";
|
|
105
|
-
// Extract branch name from ref (refs/heads/main → main)
|
|
106
|
-
const branch = ref.startsWith("refs/heads/")
|
|
107
|
-
? ref.slice("refs/heads/".length)
|
|
108
|
-
: ref.startsWith("refs/pull/")
|
|
109
|
-
? `pr-${ref.split("/")[2]}`
|
|
110
|
-
: ref;
|
|
111
|
-
// Extract PR number from GITHUB_REF (refs/pull/123/merge)
|
|
112
|
-
const prMatch = ref.match(/^refs\/pull\/(\d+)\//);
|
|
113
|
-
const prNumber = prMatch ? parseInt(prMatch[1], 10) : undefined;
|
|
114
|
-
return { branch, prNumber, repo, sha };
|
|
115
|
-
}
|
|
116
|
-
// ---------------------------------------------------------------------------
|
|
117
|
-
// Git metadata
|
|
118
|
-
// ---------------------------------------------------------------------------
|
|
119
|
-
/**
|
|
120
|
-
* Infer what triggered this evaluation from environment variables.
|
|
121
|
-
*
|
|
122
|
-
* Detection order:
|
|
123
|
-
* 1. AILF_TRIGGER_TYPE — explicit override (for custom integrations)
|
|
124
|
-
* 2. GITHUB_EVENT_NAME === "schedule" — cron-triggered
|
|
125
|
-
* 3. GITHUB_EVENT_NAME === "repository_dispatch" — cross-repo trigger
|
|
126
|
-
* 4. GITHUB_ACTIONS === "true" — CI-triggered
|
|
127
|
-
* 5. Default: manual
|
|
128
|
-
*/
|
|
129
|
-
function detectTrigger() {
|
|
130
|
-
const explicit = process.env.AILF_TRIGGER_TYPE;
|
|
131
|
-
if (explicit === "scheduled") {
|
|
132
|
-
return {
|
|
133
|
-
schedule: process.env.AILF_SCHEDULE ?? "unknown",
|
|
134
|
-
type: "scheduled",
|
|
135
|
-
};
|
|
136
|
-
}
|
|
137
|
-
if (explicit === "webhook") {
|
|
138
|
-
return {
|
|
139
|
-
documentId: process.env.AILF_WEBHOOK_DOCUMENT_ID,
|
|
140
|
-
source: process.env.AILF_WEBHOOK_SOURCE ?? "unknown",
|
|
141
|
-
type: "webhook",
|
|
142
|
-
};
|
|
143
|
-
}
|
|
144
|
-
// GitHub Actions context
|
|
145
|
-
const eventName = process.env.GITHUB_EVENT_NAME;
|
|
146
|
-
if (eventName === "schedule") {
|
|
147
|
-
return {
|
|
148
|
-
schedule: process.env.GITHUB_SCHEDULE ?? "unknown",
|
|
149
|
-
type: "scheduled",
|
|
150
|
-
};
|
|
151
|
-
}
|
|
152
|
-
if (eventName === "repository_dispatch") {
|
|
153
|
-
return {
|
|
154
|
-
callerRef: process.env.GITHUB_REF,
|
|
155
|
-
// Note: callerRepo here is a fallback. The accurate caller repo
|
|
156
|
-
// comes from callerGit (injected into the PipelineRequest payload).
|
|
157
|
-
// GITHUB_REPOSITORY_OWNER_ID is just the org ID, not owner/repo.
|
|
158
|
-
callerRepo: process.env.GITHUB_REPOSITORY_OWNER_ID ?? "unknown",
|
|
159
|
-
type: "cross-repo",
|
|
160
|
-
};
|
|
161
|
-
}
|
|
162
|
-
if (process.env.GITHUB_ACTIONS === "true") {
|
|
163
|
-
return {
|
|
164
|
-
runId: process.env.GITHUB_RUN_ID ?? "unknown",
|
|
165
|
-
type: "ci",
|
|
166
|
-
workflow: process.env.GITHUB_WORKFLOW ?? "unknown",
|
|
167
|
-
};
|
|
168
|
-
}
|
|
169
|
-
return { type: "manual" };
|
|
170
|
-
}
|
|
171
|
-
// ---------------------------------------------------------------------------
|
|
172
|
-
// Model config loading
|
|
173
|
-
// ---------------------------------------------------------------------------
|
|
174
|
-
/**
|
|
175
|
-
* Load config/models to extract model list and grader info.
|
|
176
|
-
* Falls back to a minimal config if the file can't be read.
|
|
177
|
-
*/
|
|
178
|
-
function loadModelsConfig(rootDir, log) {
|
|
179
|
-
const result = tryLoadConfigFile("models", rootDir);
|
|
180
|
-
if (result)
|
|
181
|
-
return result.data;
|
|
182
|
-
log.warn("Could not read config/models for provenance");
|
|
183
|
-
return {
|
|
184
|
-
defaults: {},
|
|
185
|
-
grader: { id: "unknown" },
|
|
186
|
-
models: [],
|
|
187
39
|
};
|
|
188
40
|
}
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* @see docs/design-docs/report-store/domain-model.md
|
|
16
16
|
* @see packages/eval/src/pipeline/provenance.ts — builds the provenance input
|
|
17
17
|
*/
|
|
18
|
-
import type { EvalMode,
|
|
18
|
+
import type { EvalMode, RunTrigger } from "./types.js";
|
|
19
19
|
/** Input required to generate a human-readable report title. */
|
|
20
20
|
export interface ReportTitleInput {
|
|
21
21
|
provenance: {
|
|
@@ -31,7 +31,7 @@ export interface ReportTitleInput {
|
|
|
31
31
|
/** Sanity document IDs targeted (when scoped to specific documents) */
|
|
32
32
|
targetDocuments?: string[];
|
|
33
33
|
/** What triggered the evaluation */
|
|
34
|
-
trigger:
|
|
34
|
+
trigger: RunTrigger;
|
|
35
35
|
};
|
|
36
36
|
/**
|
|
37
37
|
* Total number of known feature areas in the system.
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* buildRunContext — the single code path that derives `RunContext` from
|
|
3
|
+
* pipeline inputs.
|
|
4
|
+
*
|
|
5
|
+
* `RunContext` is the 9-field shape shared between `RunManifest.context`
|
|
6
|
+
* (in GCS) and `ReportProvenance` (in Content Lake, which `extends
|
|
7
|
+
* RunContext`). Routing every consumer through this function makes it
|
|
8
|
+
* structurally impossible for the two to disagree: there is no second
|
|
9
|
+
* code path to drift against.
|
|
10
|
+
*
|
|
11
|
+
* Contract test: `packages/eval/src/__tests__/run-context-parity.test.ts`
|
|
12
|
+
*
|
|
13
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
|
|
14
|
+
*/
|
|
15
|
+
import type { Logger, RunContext } from "../_vendor/ailf-core/index.d.ts";
|
|
16
|
+
import type { ResolvedSourceConfig } from "../sources.js";
|
|
17
|
+
import type { EvalMode } from "./types.js";
|
|
18
|
+
/**
|
|
19
|
+
* Inputs required to derive a `RunContext`. `ProvenanceInput` extends this
|
|
20
|
+
* so every caller of `buildProvenance` is automatically a valid input to
|
|
21
|
+
* `buildRunContext`.
|
|
22
|
+
*/
|
|
23
|
+
export interface RunContextInput {
|
|
24
|
+
/** Feature areas that were evaluated */
|
|
25
|
+
areas: string[];
|
|
26
|
+
/**
|
|
27
|
+
* Git metadata from the *calling* repository (cross-repo evaluations).
|
|
28
|
+
* When provided, overrides CI env var detection so context attributes
|
|
29
|
+
* to the caller — not the AILF core repo where the workflow executes.
|
|
30
|
+
*/
|
|
31
|
+
callerGit?: {
|
|
32
|
+
branch?: string;
|
|
33
|
+
prNumber?: number;
|
|
34
|
+
repo: string;
|
|
35
|
+
sha?: string;
|
|
36
|
+
};
|
|
37
|
+
/** Evaluation fingerprint for cross-environment cache lookup */
|
|
38
|
+
evalFingerprint?: string;
|
|
39
|
+
/** Logger instance (defaults to ConsoleLogger) */
|
|
40
|
+
logger?: Logger;
|
|
41
|
+
/** Evaluation mode */
|
|
42
|
+
mode: EvalMode;
|
|
43
|
+
/** Path to the package root (for reading config/models) */
|
|
44
|
+
rootDir: string;
|
|
45
|
+
/** Resolved documentation source */
|
|
46
|
+
source: ResolvedSourceConfig;
|
|
47
|
+
/** Specific task IDs evaluated (if scoped) */
|
|
48
|
+
taskIds?: string[];
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Derive `RunContext` from pipeline inputs. The only construction path.
|
|
52
|
+
*
|
|
53
|
+
* Both `FinalizeRunStep` (via `RunManifest.context`) and
|
|
54
|
+
* `PublishReportStep` (via `ReportProvenance`) call this function — the
|
|
55
|
+
* former directly, the latter transitively through `buildProvenance`.
|
|
56
|
+
*/
|
|
57
|
+
export declare function buildRunContext(input: RunContextInput): RunContext;
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* buildRunContext — the single code path that derives `RunContext` from
|
|
3
|
+
* pipeline inputs.
|
|
4
|
+
*
|
|
5
|
+
* `RunContext` is the 9-field shape shared between `RunManifest.context`
|
|
6
|
+
* (in GCS) and `ReportProvenance` (in Content Lake, which `extends
|
|
7
|
+
* RunContext`). Routing every consumer through this function makes it
|
|
8
|
+
* structurally impossible for the two to disagree: there is no second
|
|
9
|
+
* code path to drift against.
|
|
10
|
+
*
|
|
11
|
+
* Contract test: `packages/eval/src/__tests__/run-context-parity.test.ts`
|
|
12
|
+
*
|
|
13
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
|
|
14
|
+
*/
|
|
15
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
16
|
+
import { tryLoadConfigFile } from "./compiler/config-loader.js";
|
|
17
|
+
/**
|
|
18
|
+
* Derive `RunContext` from pipeline inputs. The only construction path.
|
|
19
|
+
*
|
|
20
|
+
* Both `FinalizeRunStep` (via `RunManifest.context`) and
|
|
21
|
+
* `PublishReportStep` (via `ReportProvenance`) call this function — the
|
|
22
|
+
* former directly, the latter transitively through `buildProvenance`.
|
|
23
|
+
*/
|
|
24
|
+
export function buildRunContext(input) {
|
|
25
|
+
const log = input.logger ?? new ConsoleLogger();
|
|
26
|
+
const models = loadModelsConfig(input.rootDir, log);
|
|
27
|
+
// Cross-repo evaluations: prefer explicit caller git metadata over
|
|
28
|
+
// CI env vars (which always reflect the AILF core repo).
|
|
29
|
+
const git = input.callerGit
|
|
30
|
+
? {
|
|
31
|
+
branch: input.callerGit.branch ?? "unknown",
|
|
32
|
+
prNumber: input.callerGit.prNumber,
|
|
33
|
+
repo: input.callerGit.repo,
|
|
34
|
+
sha: input.callerGit.sha ?? "unknown",
|
|
35
|
+
}
|
|
36
|
+
: detectGitMetadata();
|
|
37
|
+
const trigger = detectTrigger();
|
|
38
|
+
// Non-literacy modes (agent-harness, mcp-server, etc.) don't use the
|
|
39
|
+
// config/models.ts model matrix — listing those models would be
|
|
40
|
+
// misleading. Only include them for literacy mode where they're the
|
|
41
|
+
// actual eval targets.
|
|
42
|
+
const evaluatedModels = input.mode === "literacy"
|
|
43
|
+
? models.models.map((m) => ({ id: m.id, label: m.label }))
|
|
44
|
+
: [];
|
|
45
|
+
return {
|
|
46
|
+
areas: input.areas,
|
|
47
|
+
evalFingerprint: input.evalFingerprint,
|
|
48
|
+
git,
|
|
49
|
+
graderModel: models.grader.id,
|
|
50
|
+
mode: input.mode,
|
|
51
|
+
models: evaluatedModels,
|
|
52
|
+
source: {
|
|
53
|
+
baseUrl: input.source.baseUrl,
|
|
54
|
+
dataset: input.source.dataset,
|
|
55
|
+
name: input.source.name,
|
|
56
|
+
perspective: input.source.perspective,
|
|
57
|
+
projectId: input.source.projectId,
|
|
58
|
+
},
|
|
59
|
+
taskIds: input.taskIds,
|
|
60
|
+
trigger,
|
|
61
|
+
};
|
|
62
|
+
}
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
// Environment-derived context
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
/**
|
|
67
|
+
* Extract git metadata from GitHub Actions environment variables.
|
|
68
|
+
* Returns undefined when not running in CI.
|
|
69
|
+
*/
|
|
70
|
+
function detectGitMetadata() {
|
|
71
|
+
const repo = process.env.GITHUB_REPOSITORY;
|
|
72
|
+
if (!repo)
|
|
73
|
+
return undefined;
|
|
74
|
+
const sha = process.env.GITHUB_SHA ?? "unknown";
|
|
75
|
+
const ref = process.env.GITHUB_REF ?? "";
|
|
76
|
+
// Extract branch name from ref (refs/heads/main → main)
|
|
77
|
+
const branch = ref.startsWith("refs/heads/")
|
|
78
|
+
? ref.slice("refs/heads/".length)
|
|
79
|
+
: ref.startsWith("refs/pull/")
|
|
80
|
+
? `pr-${ref.split("/")[2]}`
|
|
81
|
+
: ref;
|
|
82
|
+
// Extract PR number from GITHUB_REF (refs/pull/123/merge)
|
|
83
|
+
const prMatch = ref.match(/^refs\/pull\/(\d+)\//);
|
|
84
|
+
const prNumber = prMatch ? parseInt(prMatch[1], 10) : undefined;
|
|
85
|
+
return { branch, prNumber, repo, sha };
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Infer what triggered this evaluation from environment variables.
|
|
89
|
+
*
|
|
90
|
+
* Detection order:
|
|
91
|
+
* 1. AILF_TRIGGER_TYPE — explicit override (for custom integrations)
|
|
92
|
+
* 2. GITHUB_EVENT_NAME === "schedule" — cron-triggered
|
|
93
|
+
* 3. GITHUB_EVENT_NAME === "repository_dispatch" — cross-repo trigger
|
|
94
|
+
* 4. GITHUB_ACTIONS === "true" — CI-triggered
|
|
95
|
+
* 5. Default: manual
|
|
96
|
+
*/
|
|
97
|
+
function detectTrigger() {
|
|
98
|
+
const explicit = process.env.AILF_TRIGGER_TYPE;
|
|
99
|
+
if (explicit === "scheduled") {
|
|
100
|
+
return {
|
|
101
|
+
schedule: process.env.AILF_SCHEDULE ?? "unknown",
|
|
102
|
+
type: "scheduled",
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
if (explicit === "webhook") {
|
|
106
|
+
return {
|
|
107
|
+
documentId: process.env.AILF_WEBHOOK_DOCUMENT_ID,
|
|
108
|
+
source: process.env.AILF_WEBHOOK_SOURCE ?? "unknown",
|
|
109
|
+
type: "webhook",
|
|
110
|
+
};
|
|
111
|
+
}
|
|
112
|
+
// GitHub Actions context
|
|
113
|
+
const eventName = process.env.GITHUB_EVENT_NAME;
|
|
114
|
+
if (eventName === "schedule") {
|
|
115
|
+
return {
|
|
116
|
+
schedule: process.env.GITHUB_SCHEDULE ?? "unknown",
|
|
117
|
+
type: "scheduled",
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
if (eventName === "repository_dispatch") {
|
|
121
|
+
return {
|
|
122
|
+
callerRef: process.env.GITHUB_REF,
|
|
123
|
+
// Note: callerRepo here is a fallback. The accurate caller repo
|
|
124
|
+
// comes from callerGit (injected into the PipelineRequest payload).
|
|
125
|
+
// GITHUB_REPOSITORY_OWNER_ID is just the org ID, not owner/repo.
|
|
126
|
+
callerRepo: process.env.GITHUB_REPOSITORY_OWNER_ID ?? "unknown",
|
|
127
|
+
type: "cross-repo",
|
|
128
|
+
};
|
|
129
|
+
}
|
|
130
|
+
if (process.env.GITHUB_ACTIONS === "true") {
|
|
131
|
+
return {
|
|
132
|
+
runId: process.env.GITHUB_RUN_ID ?? "unknown",
|
|
133
|
+
type: "ci",
|
|
134
|
+
workflow: process.env.GITHUB_WORKFLOW ?? "unknown",
|
|
135
|
+
};
|
|
136
|
+
}
|
|
137
|
+
return { type: "manual" };
|
|
138
|
+
}
|
|
139
|
+
// ---------------------------------------------------------------------------
|
|
140
|
+
// Model config loading
|
|
141
|
+
// ---------------------------------------------------------------------------
|
|
142
|
+
/**
|
|
143
|
+
* Load config/models to extract model list and grader info.
|
|
144
|
+
* Falls back to a minimal config if the file can't be read.
|
|
145
|
+
*/
|
|
146
|
+
function loadModelsConfig(rootDir, log) {
|
|
147
|
+
const result = tryLoadConfigFile("models", rootDir);
|
|
148
|
+
if (result)
|
|
149
|
+
return result.data;
|
|
150
|
+
log.warn("Could not read config/models for run context");
|
|
151
|
+
return {
|
|
152
|
+
defaults: {},
|
|
153
|
+
grader: { id: "unknown" },
|
|
154
|
+
models: [],
|
|
155
|
+
};
|
|
156
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* upload-test-outputs.ts — shared helper for the testOutputs artifact upload.
|
|
3
|
+
*
|
|
4
|
+
* CalculateScoresStep calls this once its score-summary.json is complete.
|
|
5
|
+
* Each {taskId, modelId} pair becomes one GCS object under
|
|
6
|
+
* `runs/{runId}/test-outputs/{taskId}--{modelId}.json` carrying the full
|
|
7
|
+
* response output and truncation flag. The returned ArtifactRef's
|
|
8
|
+
* `entries[]` catalog lists every uploaded entry so Studio can render
|
|
9
|
+
* drill-down state without a second listing call.
|
|
10
|
+
*
|
|
11
|
+
* PublishReportStep later strips responseOutput from the inline
|
|
12
|
+
* testResults[] when this upload succeeds, so the Content Lake document
|
|
13
|
+
* stays slim — the full output lives in GCS and is fetched per-entry
|
|
14
|
+
* on click.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
17
|
+
*/
|
|
18
|
+
import type { ArtifactRef, ArtifactWriter, RunId, StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
|
|
19
|
+
/**
|
|
20
|
+
* Upload testOutputs as per-entry GCS objects under
|
|
21
|
+
* `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
|
|
22
|
+
*
|
|
23
|
+
* Returns the `ArtifactRef` on success, or `null` when upload is skipped or
|
|
24
|
+
* fails (P5: non-blocking).
|
|
25
|
+
*/
|
|
26
|
+
export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[]): Promise<ArtifactRef | null>;
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* upload-test-outputs.ts — shared helper for the testOutputs artifact upload.
|
|
3
|
+
*
|
|
4
|
+
* CalculateScoresStep calls this once its score-summary.json is complete.
|
|
5
|
+
* Each {taskId, modelId} pair becomes one GCS object under
|
|
6
|
+
* `runs/{runId}/test-outputs/{taskId}--{modelId}.json` carrying the full
|
|
7
|
+
* response output and truncation flag. The returned ArtifactRef's
|
|
8
|
+
* `entries[]` catalog lists every uploaded entry so Studio can render
|
|
9
|
+
* drill-down state without a second listing call.
|
|
10
|
+
*
|
|
11
|
+
* PublishReportStep later strips responseOutput from the inline
|
|
12
|
+
* testResults[] when this upload succeeds, so the Content Lake document
|
|
13
|
+
* stays slim — the full output lives in GCS and is fetched per-entry
|
|
14
|
+
* on click.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
17
|
+
*/
|
|
18
|
+
/**
|
|
19
|
+
* Upload testOutputs as per-entry GCS objects under
|
|
20
|
+
* `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
|
|
21
|
+
*
|
|
22
|
+
* Returns the `ArtifactRef` on success, or `null` when upload is skipped or
|
|
23
|
+
* fails (P5: non-blocking).
|
|
24
|
+
*/
|
|
25
|
+
export async function uploadTestOutputs(writer, runId, testResults) {
|
|
26
|
+
const entries = testResults.map((tr) => ({
|
|
27
|
+
key: `${tr.taskId}::${tr.modelId}`,
|
|
28
|
+
data: {
|
|
29
|
+
responseOutput: tr.responseOutput ?? "",
|
|
30
|
+
responseOutputTruncated: tr.responseOutputTruncated ?? false,
|
|
31
|
+
},
|
|
32
|
+
}));
|
|
33
|
+
return writer.writePerEntry("testOutputs", runId, entries);
|
|
34
|
+
}
|
package/dist/report-store.js
CHANGED
|
@@ -211,8 +211,10 @@ export class ReportStore {
|
|
|
211
211
|
summary: {
|
|
212
212
|
...report.summary,
|
|
213
213
|
// Artifact references live inside summary in Sanity so they're
|
|
214
|
-
// projected automatically by the reportDetailQuery (
|
|
215
|
-
...(report.
|
|
214
|
+
// projected automatically by the reportDetailQuery (D0032)
|
|
215
|
+
...(report.artifactManifest
|
|
216
|
+
? { artifactManifest: report.artifactManifest }
|
|
217
|
+
: {}),
|
|
216
218
|
},
|
|
217
219
|
tag: report.tag ?? null,
|
|
218
220
|
title: report.title ?? null,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "2.7.1",
|
|
3
|
+
"version": "2.8.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -52,8 +52,8 @@
|
|
|
52
52
|
"@types/node": "^22.13.1",
|
|
53
53
|
"tsx": "^4.19.2",
|
|
54
54
|
"typescript": "^5.7.3",
|
|
55
|
-
"@sanity/ailf-
|
|
56
|
-
"@sanity/ailf-
|
|
55
|
+
"@sanity/ailf-core": "0.1.0",
|
|
56
|
+
"@sanity/ailf-shared": "0.1.0"
|
|
57
57
|
},
|
|
58
58
|
"scripts": {
|
|
59
59
|
"build": "tsc && tsx scripts/bundle-workspace-deps.ts",
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Port: ArtifactUploader — uploads report artifacts to external object storage.
|
|
3
|
-
*
|
|
4
|
-
* Separate from ArtifactCollector (which captures forensic archives).
|
|
5
|
-
* This port puts structured files at known paths so Studio can fetch
|
|
6
|
-
* them on demand via signed URLs.
|
|
7
|
-
*
|
|
8
|
-
* @see docs/design-docs/external-artifact-store.md
|
|
9
|
-
* @see docs/decisions/D0030-external-artifact-store.md
|
|
10
|
-
*/
|
|
11
|
-
import type { ArtifactRef } from "../types/index.js";
|
|
12
|
-
/**
|
|
13
|
-
* Uploads report artifacts to external storage.
|
|
14
|
-
*
|
|
15
|
-
* Implementations:
|
|
16
|
-
* - GcsReportArtifactUploader (packages/eval) — uploads to GCS
|
|
17
|
-
* - NoOpArtifactUploader (below) — returns null (no-op when GCS is not configured)
|
|
18
|
-
*/
|
|
19
|
-
export interface ArtifactUploader {
|
|
20
|
-
/**
|
|
21
|
-
* Upload a JSON artifact for a report.
|
|
22
|
-
*
|
|
23
|
-
* @param reportId - Report identifier (used as the GCS path prefix)
|
|
24
|
-
* @param fileName - File name within the report prefix (e.g., "test-outputs.json")
|
|
25
|
-
* @param data - Serializable data (will be JSON.stringify'd)
|
|
26
|
-
* @returns ArtifactRef on success, null if upload is skipped or fails
|
|
27
|
-
*/
|
|
28
|
-
upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
|
|
29
|
-
}
|
|
30
|
-
/**
|
|
31
|
-
* No-op uploader — always returns null. Used when GCS is not configured.
|
|
32
|
-
*/
|
|
33
|
-
export declare class NoOpArtifactUploader implements ArtifactUploader {
|
|
34
|
-
upload(): Promise<null>;
|
|
35
|
-
}
|