@sanity/ailf 2.7.1 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
- package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +173 -0
- package/dist/_vendor/ailf-core/artifact-registry.js +811 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -1
- package/dist/_vendor/ailf-core/index.js +3 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +95 -0
- package/dist/_vendor/ailf-core/ports/artifact-writer.js +51 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +32 -3
- package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/index.js +1 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +1 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +42 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +298 -77
- package/dist/_vendor/ailf-core/types/index.js +1 -1
- package/dist/_vendor/ailf-shared/index.d.ts +2 -0
- package/dist/_vendor/ailf-shared/index.js +2 -0
- package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
- package/dist/_vendor/ailf-shared/run-context.js +17 -0
- package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
- package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
- package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
- package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +52 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.js +199 -0
- package/dist/artifact-capture/emit-file.d.ts +28 -0
- package/dist/artifact-capture/emit-file.js +56 -0
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
- package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +22 -4
- package/dist/artifact-capture/filesystem-collector.js +48 -23
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +67 -0
- package/dist/artifact-capture/gcs-artifact-writer.js +343 -0
- package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
- package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
- package/dist/commands/explain-handler.js +4 -0
- package/dist/commands/pipeline-action.d.ts +5 -0
- package/dist/commands/pipeline-action.js +56 -5
- package/dist/commands/pipeline.d.ts +4 -0
- package/dist/commands/pipeline.js +6 -2
- package/dist/commands/publish.js +7 -3
- package/dist/composition-root.d.ts +14 -11
- package/dist/composition-root.js +90 -31
- package/dist/orchestration/build-step-sequence.js +6 -1
- package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
- package/dist/orchestration/pipeline-orchestrator.js +41 -30
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
- package/dist/orchestration/steps/calculate-scores-step.js +50 -10
- package/dist/orchestration/steps/callback-step.d.ts +1 -1
- package/dist/orchestration/steps/callback-step.js +6 -4
- package/dist/orchestration/steps/compare-step.d.ts +1 -1
- package/dist/orchestration/steps/compare-step.js +4 -2
- package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
- package/dist/orchestration/steps/discovery-report-step.js +4 -1
- package/dist/orchestration/steps/fetch-docs-step.js +9 -15
- package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
- package/dist/orchestration/steps/finalize-run-step.js +117 -0
- package/dist/orchestration/steps/gap-analysis-step.js +34 -6
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
- package/dist/orchestration/steps/generate-configs-step.js +11 -11
- package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
- package/dist/orchestration/steps/publish-report-step.js +40 -55
- package/dist/orchestration/steps/readiness-step.d.ts +1 -1
- package/dist/orchestration/steps/readiness-step.js +4 -1
- package/dist/orchestration/steps/report-step.d.ts +1 -1
- package/dist/orchestration/steps/report-step.js +6 -3
- package/dist/orchestration/steps/run-eval-step.js +14 -9
- package/dist/pipeline/calculate-scores.js +13 -2
- package/dist/pipeline/compare.d.ts +2 -2
- package/dist/pipeline/emit-eval-results.d.ts +38 -0
- package/dist/pipeline/emit-eval-results.js +100 -0
- package/dist/pipeline/provenance.d.ts +24 -44
- package/dist/pipeline/provenance.js +17 -165
- package/dist/pipeline/report-title.d.ts +2 -2
- package/dist/pipeline/run-context.d.ts +57 -0
- package/dist/pipeline/run-context.js +156 -0
- package/dist/pipeline/upload-test-outputs.d.ts +26 -0
- package/dist/pipeline/upload-test-outputs.js +34 -0
- package/dist/report-store.js +4 -2
- package/package.json +3 -3
- package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
- package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
- package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
- package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
- package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
- package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* emit-eval-results.ts — decompose the promptfoo results file into the
|
|
3
|
+
* per-entry descriptors that W0049's registry expects.
|
|
4
|
+
*
|
|
5
|
+
* Replaces the Phase-B-stopgap "route the aggregated JSON through the
|
|
6
|
+
* deprecated `evalResults` bulk descriptor" path. For each test in the
|
|
7
|
+
* promptfoo output we emit:
|
|
8
|
+
*
|
|
9
|
+
* - `rawResults` per (run, mode, task, model) — the full result
|
|
10
|
+
* - `renderedPrompts` per (run, mode, task, model) — prompt the model saw
|
|
11
|
+
* - `graderPrompts` per (run, mode, task, model, grader) — rubric text
|
|
12
|
+
* - `graderJudgments` per (run, mode, task, model, grader) — {score, reason, pass}
|
|
13
|
+
*
|
|
14
|
+
* `testOutputs` is still emitted separately by `calculate-scores-step`
|
|
15
|
+
* via `uploadTestOutputs()` (carried forward from W0048 for byte-
|
|
16
|
+
* equivalence with the original rollout).
|
|
17
|
+
*
|
|
18
|
+
* `traces` is NOT produced here — agentic trace data flows through the
|
|
19
|
+
* agent-observer, not through the promptfoo result shape. Traces
|
|
20
|
+
* emission is out of scope for this helper and lands when the observer
|
|
21
|
+
* integration migrates (follow-up; not in W0050).
|
|
22
|
+
*
|
|
23
|
+
* The "grader" axis value is the rubric dimension string produced by
|
|
24
|
+
* `classifyRubric` (e.g. "task-completion", "code-correctness"). Non-
|
|
25
|
+
* LLM-rubric component assertions (javascript, contains, etc.) don't
|
|
26
|
+
* have a natural grader identifier and are skipped — their outcomes
|
|
27
|
+
* still live inside the full `rawResults` object.
|
|
28
|
+
*/
|
|
29
|
+
import { readFileSync } from "node:fs";
|
|
30
|
+
import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Public entry point
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
/**
|
|
35
|
+
* Parse a promptfoo results file and emit the per-entry artifacts.
|
|
36
|
+
*
|
|
37
|
+
* Non-blocking: any individual emit failure warns but does not halt.
|
|
38
|
+
* File read/parse errors are caught and logged; the caller keeps going.
|
|
39
|
+
*/
|
|
40
|
+
export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
|
|
41
|
+
let raw;
|
|
42
|
+
try {
|
|
43
|
+
raw = JSON.parse(readFileSync(resultsPath, "utf-8"));
|
|
44
|
+
}
|
|
45
|
+
catch (err) {
|
|
46
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
47
|
+
console.warn(` ⚠️ emitPerEntryEvalResults: failed to read ${resultsPath} — ${message}`);
|
|
48
|
+
return;
|
|
49
|
+
}
|
|
50
|
+
// Promptfoo wraps results in either `{ results: { results: [...] } }`
|
|
51
|
+
// (older shape) or directly as `{ results: [...] }` (some adapters).
|
|
52
|
+
const wrapper = raw.results && "results" in raw.results
|
|
53
|
+
? raw.results
|
|
54
|
+
: raw;
|
|
55
|
+
const rows = wrapper?.results ?? [];
|
|
56
|
+
if (rows.length === 0) {
|
|
57
|
+
console.warn(` ⚠️ emitPerEntryEvalResults: ${resultsPath} has no results[]`);
|
|
58
|
+
return;
|
|
59
|
+
}
|
|
60
|
+
for (const result of rows) {
|
|
61
|
+
const taskId = result.testCase?.description ?? "unknown-task";
|
|
62
|
+
const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
|
|
63
|
+
const baseAssoc = {
|
|
64
|
+
run: ctx.runId,
|
|
65
|
+
mode,
|
|
66
|
+
task: taskId,
|
|
67
|
+
model: modelId,
|
|
68
|
+
};
|
|
69
|
+
// rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
|
|
70
|
+
await writer.emit("rawResults", baseAssoc, result);
|
|
71
|
+
// renderedPrompts — what the model saw + which provider it went to
|
|
72
|
+
if (result.prompt !== undefined) {
|
|
73
|
+
await writer.emit("renderedPrompts", baseAssoc, {
|
|
74
|
+
prompt: result.prompt,
|
|
75
|
+
provider: result.provider,
|
|
76
|
+
});
|
|
77
|
+
}
|
|
78
|
+
// Per-grader decomposition — only LLM-rubric assertions have a
|
|
79
|
+
// natural grader identity. Code assertions (javascript/contains/…)
|
|
80
|
+
// show up in rawResults but not as standalone graderJudgments.
|
|
81
|
+
const components = result.gradingResult?.componentResults ?? [];
|
|
82
|
+
for (const comp of components) {
|
|
83
|
+
if (comp.assertion?.type !== "llm-rubric")
|
|
84
|
+
continue;
|
|
85
|
+
const dimension = classifyRubric(comp);
|
|
86
|
+
if (!dimension)
|
|
87
|
+
continue;
|
|
88
|
+
const graderAssoc = { ...baseAssoc, grader: dimension };
|
|
89
|
+
await writer.emit("graderPrompts", graderAssoc, {
|
|
90
|
+
dimension,
|
|
91
|
+
assertion: comp.assertion,
|
|
92
|
+
});
|
|
93
|
+
await writer.emit("graderJudgments", graderAssoc, {
|
|
94
|
+
score: parseRubricScore(comp) ?? 0,
|
|
95
|
+
reason: comp.reason ?? "",
|
|
96
|
+
pass: comp.pass,
|
|
97
|
+
});
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
}
|
|
@@ -1,65 +1,45 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/provenance.ts
|
|
3
3
|
*
|
|
4
|
-
* Builds ReportProvenance from data available during a pipeline run.
|
|
4
|
+
* Builds `ReportProvenance` from data available during a pipeline run.
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
6
|
+
* `ReportProvenance extends RunContext` (D0032). This module derives
|
|
7
|
+
* RunContext via `buildRunContext()` and attaches report-specific extras
|
|
8
|
+
* (lineage, autoScope, promptfoo URLs, targetDocuments, runId). A single
|
|
9
|
+
* derivation path for RunContext forecloses drift between the run
|
|
10
|
+
* manifest (GCS) and the report provenance (Content Lake).
|
|
10
11
|
*
|
|
11
|
-
* @see
|
|
12
|
-
* @see docs/
|
|
12
|
+
* @see packages/eval/src/pipeline/run-context.ts — the shared derivation path
|
|
13
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Drift Prevention)
|
|
13
14
|
*/
|
|
14
|
-
import type {
|
|
15
|
-
import type
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
logger?: Logger;
|
|
15
|
+
import type { PromptfooUrlEntry, ReportAutoScope, ReportProvenance, RunId } from "./types.js";
|
|
16
|
+
import { type RunContextInput } from "./run-context.js";
|
|
17
|
+
/**
|
|
18
|
+
* Inputs needed to build a ReportProvenance. Extends `RunContextInput` so
|
|
19
|
+
* the RunContext derivation path is shared.
|
|
20
|
+
*/
|
|
21
|
+
export interface ProvenanceInput extends RunContextInput {
|
|
22
22
|
/** Release auto-scope metadata (when perspective evaluation was scoped) */
|
|
23
23
|
autoScope?: ReportAutoScope;
|
|
24
|
-
/**
|
|
25
|
-
* Git metadata from the *calling* repository (cross-repo evaluations).
|
|
26
|
-
* When provided, overrides CI env var detection so provenance attributes
|
|
27
|
-
* to the caller — not the AILF core repo where the workflow executes.
|
|
28
|
-
*/
|
|
29
|
-
callerGit?: {
|
|
30
|
-
branch?: string;
|
|
31
|
-
prNumber?: number;
|
|
32
|
-
repo: string;
|
|
33
|
-
sha?: string;
|
|
34
|
-
};
|
|
35
24
|
/** SHA-256 hash of the doc context files (from cache system) */
|
|
36
25
|
contextHash?: string;
|
|
37
|
-
/** Evaluation fingerprint for cross-environment cache lookup */
|
|
38
|
-
evalFingerprint?: string;
|
|
39
|
-
/** Evaluation mode */
|
|
40
|
-
mode: EvalMode;
|
|
41
26
|
/** @deprecated Use `promptfooUrls` — kept for backward compatibility */
|
|
42
27
|
promptfooUrl?: string;
|
|
43
28
|
/** Per-mode Promptfoo share URLs */
|
|
44
29
|
promptfooUrls?: PromptfooUrlEntry[];
|
|
45
|
-
/**
|
|
46
|
-
|
|
47
|
-
/** Report ID that triggered this re-run (becomes lineage.rerunOf) */
|
|
48
|
-
sourceReportId?: string;
|
|
30
|
+
/** Identity of the pipeline run that produced this report (D0032) */
|
|
31
|
+
runId: RunId;
|
|
49
32
|
/** Sanity document IDs targeted */
|
|
50
33
|
sanityDocumentIds?: string[];
|
|
51
|
-
/**
|
|
52
|
-
|
|
53
|
-
/** Specific task IDs evaluated (if scoped) */
|
|
54
|
-
taskIds?: string[];
|
|
34
|
+
/** Report ID that triggered this re-run (becomes lineage.rerunOf) */
|
|
35
|
+
sourceReportId?: string;
|
|
55
36
|
}
|
|
56
37
|
/**
|
|
57
|
-
* Build a ReportProvenance
|
|
38
|
+
* Build a ReportProvenance from pipeline context.
|
|
58
39
|
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
* - Optional metadata (context hash, Promptfoo URL)
|
|
40
|
+
* RunContext fields (mode, areas, taskIds, models, graderModel, source,
|
|
41
|
+
* evalFingerprint, trigger, git) come from `buildRunContext`. Report-
|
|
42
|
+
* specific fields (autoScope, contextHash, lineage, promptfoo*, runId,
|
|
43
|
+
* targetDocuments) are attached here.
|
|
64
44
|
*/
|
|
65
45
|
export declare function buildProvenance(input: ProvenanceInput): ReportProvenance;
|
|
@@ -1,188 +1,40 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* pipeline/provenance.ts
|
|
3
3
|
*
|
|
4
|
-
* Builds ReportProvenance from data available during a pipeline run.
|
|
4
|
+
* Builds `ReportProvenance` from data available during a pipeline run.
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
6
|
+
* `ReportProvenance extends RunContext` (D0032). This module derives
|
|
7
|
+
* RunContext via `buildRunContext()` and attaches report-specific extras
|
|
8
|
+
* (lineage, autoScope, promptfoo URLs, targetDocuments, runId). A single
|
|
9
|
+
* derivation path for RunContext forecloses drift between the run
|
|
10
|
+
* manifest (GCS) and the report provenance (Content Lake).
|
|
10
11
|
*
|
|
11
|
-
* @see
|
|
12
|
-
* @see docs/
|
|
12
|
+
* @see packages/eval/src/pipeline/run-context.ts — the shared derivation path
|
|
13
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Drift Prevention)
|
|
13
14
|
*/
|
|
14
|
-
import {
|
|
15
|
-
import { tryLoadConfigFile } from "./compiler/config-loader.js";
|
|
15
|
+
import { buildRunContext } from "./run-context.js";
|
|
16
16
|
/**
|
|
17
|
-
* Build a ReportProvenance
|
|
17
|
+
* Build a ReportProvenance from pipeline context.
|
|
18
18
|
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
* - Optional metadata (context hash, Promptfoo URL)
|
|
19
|
+
* RunContext fields (mode, areas, taskIds, models, graderModel, source,
|
|
20
|
+
* evalFingerprint, trigger, git) come from `buildRunContext`. Report-
|
|
21
|
+
* specific fields (autoScope, contextHash, lineage, promptfoo*, runId,
|
|
22
|
+
* targetDocuments) are attached here.
|
|
24
23
|
*/
|
|
25
24
|
export function buildProvenance(input) {
|
|
26
|
-
const
|
|
27
|
-
const models = loadModelsConfig(input.rootDir, log);
|
|
28
|
-
log.debug("Assembling provenance input", {
|
|
29
|
-
mode: input.mode,
|
|
30
|
-
sourceName: input.source.name,
|
|
31
|
-
sourceBaseUrl: input.source.baseUrl,
|
|
32
|
-
areas: input.areas,
|
|
33
|
-
taskIds: input.taskIds,
|
|
34
|
-
hasContextHash: Boolean(input.contextHash),
|
|
35
|
-
hasEvalFingerprint: Boolean(input.evalFingerprint),
|
|
36
|
-
hasCallerGit: Boolean(input.callerGit),
|
|
37
|
-
hasSourceReportId: Boolean(input.sourceReportId),
|
|
38
|
-
modelCount: models.models.length,
|
|
39
|
-
});
|
|
40
|
-
// Cross-repo evaluations: prefer explicit caller git metadata over
|
|
41
|
-
// CI env vars (which always reflect the AILF core repo).
|
|
42
|
-
const git = input.callerGit
|
|
43
|
-
? {
|
|
44
|
-
branch: input.callerGit.branch ?? "unknown",
|
|
45
|
-
prNumber: input.callerGit.prNumber,
|
|
46
|
-
repo: input.callerGit.repo,
|
|
47
|
-
sha: input.callerGit.sha ?? "unknown",
|
|
48
|
-
}
|
|
49
|
-
: detectGitMetadata();
|
|
25
|
+
const runContext = buildRunContext(input);
|
|
50
26
|
// Build lineage from explicit relationships
|
|
51
27
|
const lineage = input.sourceReportId
|
|
52
28
|
? { rerunOf: input.sourceReportId }
|
|
53
29
|
: undefined;
|
|
54
|
-
const trigger = detectTrigger();
|
|
55
|
-
log.debug("Provenance computed", {
|
|
56
|
-
triggerType: trigger.type,
|
|
57
|
-
gitRepo: git?.repo,
|
|
58
|
-
gitBranch: git?.branch,
|
|
59
|
-
evalFingerprint: input.evalFingerprint,
|
|
60
|
-
hasLineage: Boolean(lineage),
|
|
61
|
-
});
|
|
62
|
-
// Non-literacy modes (agent-harness, mcp-server, etc.) don't use the
|
|
63
|
-
// config/models.ts model matrix — listing those models would be misleading.
|
|
64
|
-
// Only include them for literacy mode where they're the actual eval targets.
|
|
65
|
-
const evaluatedModels = input.mode === "literacy"
|
|
66
|
-
? models.models.map((m) => ({ id: m.id, label: m.label }))
|
|
67
|
-
: [];
|
|
68
30
|
return {
|
|
69
|
-
|
|
31
|
+
...runContext,
|
|
70
32
|
autoScope: input.autoScope,
|
|
71
33
|
contextHash: input.contextHash,
|
|
72
|
-
evalFingerprint: input.evalFingerprint,
|
|
73
|
-
git,
|
|
74
|
-
graderModel: models.grader.id,
|
|
75
34
|
lineage,
|
|
76
|
-
mode: input.mode,
|
|
77
|
-
models: evaluatedModels,
|
|
78
35
|
promptfooUrl: input.promptfooUrl,
|
|
79
36
|
promptfooUrls: input.promptfooUrls,
|
|
80
|
-
|
|
81
|
-
baseUrl: input.source.baseUrl,
|
|
82
|
-
dataset: input.source.dataset,
|
|
83
|
-
name: input.source.name,
|
|
84
|
-
perspective: input.source.perspective,
|
|
85
|
-
projectId: input.source.projectId,
|
|
86
|
-
},
|
|
37
|
+
runId: input.runId,
|
|
87
38
|
targetDocuments: input.sanityDocumentIds,
|
|
88
|
-
taskIds: input.taskIds,
|
|
89
|
-
trigger: detectTrigger(),
|
|
90
|
-
};
|
|
91
|
-
}
|
|
92
|
-
// ---------------------------------------------------------------------------
|
|
93
|
-
// Trigger detection
|
|
94
|
-
// ---------------------------------------------------------------------------
|
|
95
|
-
/**
|
|
96
|
-
* Extract git metadata from GitHub Actions environment variables.
|
|
97
|
-
* Returns undefined when not running in CI.
|
|
98
|
-
*/
|
|
99
|
-
function detectGitMetadata() {
|
|
100
|
-
const repo = process.env.GITHUB_REPOSITORY;
|
|
101
|
-
if (!repo)
|
|
102
|
-
return undefined;
|
|
103
|
-
const sha = process.env.GITHUB_SHA ?? "unknown";
|
|
104
|
-
const ref = process.env.GITHUB_REF ?? "";
|
|
105
|
-
// Extract branch name from ref (refs/heads/main → main)
|
|
106
|
-
const branch = ref.startsWith("refs/heads/")
|
|
107
|
-
? ref.slice("refs/heads/".length)
|
|
108
|
-
: ref.startsWith("refs/pull/")
|
|
109
|
-
? `pr-${ref.split("/")[2]}`
|
|
110
|
-
: ref;
|
|
111
|
-
// Extract PR number from GITHUB_REF (refs/pull/123/merge)
|
|
112
|
-
const prMatch = ref.match(/^refs\/pull\/(\d+)\//);
|
|
113
|
-
const prNumber = prMatch ? parseInt(prMatch[1], 10) : undefined;
|
|
114
|
-
return { branch, prNumber, repo, sha };
|
|
115
|
-
}
|
|
116
|
-
// ---------------------------------------------------------------------------
|
|
117
|
-
// Git metadata
|
|
118
|
-
// ---------------------------------------------------------------------------
|
|
119
|
-
/**
|
|
120
|
-
* Infer what triggered this evaluation from environment variables.
|
|
121
|
-
*
|
|
122
|
-
* Detection order:
|
|
123
|
-
* 1. AILF_TRIGGER_TYPE — explicit override (for custom integrations)
|
|
124
|
-
* 2. GITHUB_EVENT_NAME === "schedule" — cron-triggered
|
|
125
|
-
* 3. GITHUB_EVENT_NAME === "repository_dispatch" — cross-repo trigger
|
|
126
|
-
* 4. GITHUB_ACTIONS === "true" — CI-triggered
|
|
127
|
-
* 5. Default: manual
|
|
128
|
-
*/
|
|
129
|
-
function detectTrigger() {
|
|
130
|
-
const explicit = process.env.AILF_TRIGGER_TYPE;
|
|
131
|
-
if (explicit === "scheduled") {
|
|
132
|
-
return {
|
|
133
|
-
schedule: process.env.AILF_SCHEDULE ?? "unknown",
|
|
134
|
-
type: "scheduled",
|
|
135
|
-
};
|
|
136
|
-
}
|
|
137
|
-
if (explicit === "webhook") {
|
|
138
|
-
return {
|
|
139
|
-
documentId: process.env.AILF_WEBHOOK_DOCUMENT_ID,
|
|
140
|
-
source: process.env.AILF_WEBHOOK_SOURCE ?? "unknown",
|
|
141
|
-
type: "webhook",
|
|
142
|
-
};
|
|
143
|
-
}
|
|
144
|
-
// GitHub Actions context
|
|
145
|
-
const eventName = process.env.GITHUB_EVENT_NAME;
|
|
146
|
-
if (eventName === "schedule") {
|
|
147
|
-
return {
|
|
148
|
-
schedule: process.env.GITHUB_SCHEDULE ?? "unknown",
|
|
149
|
-
type: "scheduled",
|
|
150
|
-
};
|
|
151
|
-
}
|
|
152
|
-
if (eventName === "repository_dispatch") {
|
|
153
|
-
return {
|
|
154
|
-
callerRef: process.env.GITHUB_REF,
|
|
155
|
-
// Note: callerRepo here is a fallback. The accurate caller repo
|
|
156
|
-
// comes from callerGit (injected into the PipelineRequest payload).
|
|
157
|
-
// GITHUB_REPOSITORY_OWNER_ID is just the org ID, not owner/repo.
|
|
158
|
-
callerRepo: process.env.GITHUB_REPOSITORY_OWNER_ID ?? "unknown",
|
|
159
|
-
type: "cross-repo",
|
|
160
|
-
};
|
|
161
|
-
}
|
|
162
|
-
if (process.env.GITHUB_ACTIONS === "true") {
|
|
163
|
-
return {
|
|
164
|
-
runId: process.env.GITHUB_RUN_ID ?? "unknown",
|
|
165
|
-
type: "ci",
|
|
166
|
-
workflow: process.env.GITHUB_WORKFLOW ?? "unknown",
|
|
167
|
-
};
|
|
168
|
-
}
|
|
169
|
-
return { type: "manual" };
|
|
170
|
-
}
|
|
171
|
-
// ---------------------------------------------------------------------------
|
|
172
|
-
// Model config loading
|
|
173
|
-
// ---------------------------------------------------------------------------
|
|
174
|
-
/**
|
|
175
|
-
* Load config/models to extract model list and grader info.
|
|
176
|
-
* Falls back to a minimal config if the file can't be read.
|
|
177
|
-
*/
|
|
178
|
-
function loadModelsConfig(rootDir, log) {
|
|
179
|
-
const result = tryLoadConfigFile("models", rootDir);
|
|
180
|
-
if (result)
|
|
181
|
-
return result.data;
|
|
182
|
-
log.warn("Could not read config/models for provenance");
|
|
183
|
-
return {
|
|
184
|
-
defaults: {},
|
|
185
|
-
grader: { id: "unknown" },
|
|
186
|
-
models: [],
|
|
187
39
|
};
|
|
188
40
|
}
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* @see docs/design-docs/report-store/domain-model.md
|
|
16
16
|
* @see packages/eval/src/pipeline/provenance.ts — builds the provenance input
|
|
17
17
|
*/
|
|
18
|
-
import type { EvalMode,
|
|
18
|
+
import type { EvalMode, RunTrigger } from "./types.js";
|
|
19
19
|
/** Input required to generate a human-readable report title. */
|
|
20
20
|
export interface ReportTitleInput {
|
|
21
21
|
provenance: {
|
|
@@ -31,7 +31,7 @@ export interface ReportTitleInput {
|
|
|
31
31
|
/** Sanity document IDs targeted (when scoped to specific documents) */
|
|
32
32
|
targetDocuments?: string[];
|
|
33
33
|
/** What triggered the evaluation */
|
|
34
|
-
trigger:
|
|
34
|
+
trigger: RunTrigger;
|
|
35
35
|
};
|
|
36
36
|
/**
|
|
37
37
|
* Total number of known feature areas in the system.
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* buildRunContext — the single code path that derives `RunContext` from
|
|
3
|
+
* pipeline inputs.
|
|
4
|
+
*
|
|
5
|
+
* `RunContext` is the 9-field shape shared between `RunManifest.context`
|
|
6
|
+
* (in GCS) and `ReportProvenance` (in Content Lake, which `extends
|
|
7
|
+
* RunContext`). Routing every consumer through this function makes it
|
|
8
|
+
* structurally impossible for the two to disagree: there is no second
|
|
9
|
+
* code path to drift against.
|
|
10
|
+
*
|
|
11
|
+
* Contract test: `packages/eval/src/__tests__/run-context-parity.test.ts`
|
|
12
|
+
*
|
|
13
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
|
|
14
|
+
*/
|
|
15
|
+
import type { Logger, RunContext } from "../_vendor/ailf-core/index.d.ts";
|
|
16
|
+
import type { ResolvedSourceConfig } from "../sources.js";
|
|
17
|
+
import type { EvalMode } from "./types.js";
|
|
18
|
+
/**
 * The pipeline inputs from which `buildRunContext` derives a `RunContext`.
 *
 * `ProvenanceInput` extends this interface, so anything accepted by
 * `buildProvenance` can be handed to `buildRunContext` unchanged.
 */
export interface RunContextInput {
    /** Feature areas covered by the evaluation. */
    areas: string[];
    /**
     * Git metadata describing the *calling* repository, used for cross-repo
     * evaluations. When present it takes precedence over metadata detected
     * from CI environment variables, so the context is attributed to the
     * caller rather than to the AILF core repo running the workflow.
     */
    callerGit?: {
        branch?: string;
        prNumber?: number;
        repo: string;
        sha?: string;
    };
    /** Fingerprint of the evaluation, used for cross-environment cache lookup. */
    evalFingerprint?: string;
    /** Logger to report through; a ConsoleLogger is used when omitted. */
    logger?: Logger;
    /** Which evaluation mode the run used. */
    mode: EvalMode;
    /** Package root directory, from which config/models is read. */
    rootDir: string;
    /** The resolved documentation source. */
    source: ResolvedSourceConfig;
    /** Task IDs the run was scoped to, when it was scoped. */
    taskIds?: string[];
}
|
|
50
|
+
/**
 * Build the `RunContext` for a run from its pipeline inputs. This is meant
 * to be the sole place a `RunContext` is constructed.
 *
 * `FinalizeRunStep` calls it directly (for `RunManifest.context`), while
 * `PublishReportStep` reaches it indirectly through `buildProvenance`
 * (for `ReportProvenance`).
 */
export declare function buildRunContext(input: RunContextInput): RunContext;
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* buildRunContext — the single code path that derives `RunContext` from
|
|
3
|
+
* pipeline inputs.
|
|
4
|
+
*
|
|
5
|
+
* `RunContext` is the 9-field shape shared between `RunManifest.context`
|
|
6
|
+
* (in GCS) and `ReportProvenance` (in Content Lake, which `extends
|
|
7
|
+
* RunContext`). Routing every consumer through this function makes it
|
|
8
|
+
* structurally impossible for the two to disagree: there is no second
|
|
9
|
+
* code path to drift against.
|
|
10
|
+
*
|
|
11
|
+
* Contract test: `packages/eval/src/__tests__/run-context-parity.test.ts`
|
|
12
|
+
*
|
|
13
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
|
|
14
|
+
*/
|
|
15
|
+
import { ConsoleLogger } from "../adapters/loggers/index.js";
|
|
16
|
+
import { tryLoadConfigFile } from "./compiler/config-loader.js";
|
|
17
|
+
/**
 * Build the `RunContext` for a run from its pipeline inputs. This is meant
 * to be the sole place a `RunContext` is constructed.
 *
 * `FinalizeRunStep` calls it directly (for `RunManifest.context`), while
 * `PublishReportStep` reaches it indirectly through `buildProvenance`
 * (for `ReportProvenance`).
 */
export function buildRunContext(input) {
    const logger = input.logger ?? new ConsoleLogger();
    const modelsConfig = loadModelsConfig(input.rootDir, logger);
    // Explicit caller git metadata wins for cross-repo evaluations; the CI
    // environment variables only ever describe the AILF core repo.
    const { callerGit } = input;
    let git;
    if (callerGit) {
        git = {
            branch: callerGit.branch ?? "unknown",
            prNumber: callerGit.prNumber,
            repo: callerGit.repo,
            sha: callerGit.sha ?? "unknown",
        };
    }
    else {
        git = detectGitMetadata();
    }
    const trigger = detectTrigger();
    // The config/models.ts matrix is only the real set of eval targets in
    // literacy mode. Other modes (agent-harness, mcp-server, etc.) don't use
    // it, so reporting those models would mislead — leave the list empty.
    let evaluatedModels = [];
    if (input.mode === "literacy") {
        evaluatedModels = modelsConfig.models.map(({ id, label }) => ({ id, label }));
    }
    const { source } = input;
    return {
        areas: input.areas,
        evalFingerprint: input.evalFingerprint,
        git,
        graderModel: modelsConfig.grader.id,
        mode: input.mode,
        models: evaluatedModels,
        source: {
            baseUrl: source.baseUrl,
            dataset: source.dataset,
            name: source.name,
            perspective: source.perspective,
            projectId: source.projectId,
        },
        taskIds: input.taskIds,
        trigger,
    };
}
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
// Environment-derived context
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
/**
 * Read git metadata out of the GitHub Actions environment.
 *
 * Uses `GITHUB_REPOSITORY`, `GITHUB_SHA` and `GITHUB_REF`. When
 * `GITHUB_REPOSITORY` is unset or empty the function returns `undefined`.
 *
 * @returns `{ branch, prNumber, repo, sha }`, or `undefined` without a repo.
 */
function detectGitMetadata() {
    const { GITHUB_REPOSITORY: repo, GITHUB_SHA: envSha, GITHUB_REF: envRef } = process.env;
    if (!repo) {
        return undefined;
    }
    const ref = envRef ?? "";
    const sha = envSha ?? "unknown";
    // Branch label: refs/heads/main → main, refs/pull/123/merge → pr-123,
    // anything else is passed through untouched.
    const headsPrefix = "refs/heads/";
    let branch = ref;
    if (ref.startsWith(headsPrefix)) {
        branch = ref.slice(headsPrefix.length);
    }
    else if (ref.startsWith("refs/pull/")) {
        branch = `pr-${ref.split("/")[2]}`;
    }
    // PR number is only available for refs shaped like refs/pull/123/merge.
    const pullMatch = /^refs\/pull\/(\d+)\//.exec(ref);
    let prNumber;
    if (pullMatch) {
        prNumber = parseInt(pullMatch[1], 10);
    }
    return { branch, prNumber, repo, sha };
}
|
|
87
|
+
/**
 * Work out what started this evaluation by inspecting environment variables.
 *
 * Checks are applied in this order; the first match wins:
 * 1. AILF_TRIGGER_TYPE of "scheduled" or "webhook" — explicit override
 *    (any other value falls through to the checks below)
 * 2. GITHUB_EVENT_NAME === "schedule" — cron-triggered
 * 3. GITHUB_EVENT_NAME === "repository_dispatch" — cross-repo trigger
 * 4. GITHUB_ACTIONS === "true" — CI-triggered
 * 5. Otherwise: manual
 *
 * @returns A trigger descriptor whose `type` identifies the matched case.
 */
function detectTrigger() {
    const env = process.env;
    switch (env.AILF_TRIGGER_TYPE) {
        case "scheduled":
            return {
                schedule: env.AILF_SCHEDULE ?? "unknown",
                type: "scheduled",
            };
        case "webhook":
            return {
                documentId: env.AILF_WEBHOOK_DOCUMENT_ID,
                source: env.AILF_WEBHOOK_SOURCE ?? "unknown",
                type: "webhook",
            };
        default:
            break;
    }
    // No recognised override — fall back to the GitHub Actions context.
    switch (env.GITHUB_EVENT_NAME) {
        case "schedule":
            return {
                schedule: env.GITHUB_SCHEDULE ?? "unknown",
                type: "scheduled",
            };
        case "repository_dispatch":
            return {
                callerRef: env.GITHUB_REF,
                // callerRepo is only a fallback here: the accurate caller
                // repo arrives via callerGit (injected into the
                // PipelineRequest payload). GITHUB_REPOSITORY_OWNER_ID is
                // merely the org ID, not owner/repo.
                callerRepo: env.GITHUB_REPOSITORY_OWNER_ID ?? "unknown",
                type: "cross-repo",
            };
        default:
            break;
    }
    if (env.GITHUB_ACTIONS === "true") {
        return {
            runId: env.GITHUB_RUN_ID ?? "unknown",
            type: "ci",
            workflow: env.GITHUB_WORKFLOW ?? "unknown",
        };
    }
    return { type: "manual" };
}
|
|
139
|
+
// ---------------------------------------------------------------------------
|
|
140
|
+
// Model config loading
|
|
141
|
+
// ---------------------------------------------------------------------------
|
|
142
|
+
/**
 * Read the "models" config file to obtain the model list and grader info.
 *
 * If `tryLoadConfigFile` yields nothing, a warning is logged and a minimal
 * placeholder config (no models, grader id "unknown") is returned instead.
 *
 * @param rootDir - Directory passed to `tryLoadConfigFile` as the lookup root.
 * @param log - Logger used for the fallback warning.
 * @returns The loaded config data, or the placeholder config.
 */
function loadModelsConfig(rootDir, log) {
    const loaded = tryLoadConfigFile("models", rootDir);
    if (!loaded) {
        log.warn("Could not read config/models for run context");
        return {
            defaults: {},
            grader: { id: "unknown" },
            models: [],
        };
    }
    return loaded.data;
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* upload-test-outputs.ts — shared helper for the testOutputs artifact upload.
|
|
3
|
+
*
|
|
4
|
+
* CalculateScoresStep calls this once its score-summary.json is complete.
|
|
5
|
+
* Each {taskId, modelId} pair becomes one GCS object under
|
|
6
|
+
* `runs/{runId}/test-outputs/{taskId}--{modelId}.json` carrying the full
|
|
7
|
+
* response output and truncation flag. The returned ArtifactRef's
|
|
8
|
+
* `entries[]` catalog lists every uploaded entry so Studio can render
|
|
9
|
+
* drill-down state without a second listing call.
|
|
10
|
+
*
|
|
11
|
+
* PublishReportStep later strips responseOutput from the inline
|
|
12
|
+
* testResults[] when this upload succeeds, so the Content Lake document
|
|
13
|
+
* stays slim — the full output lives in GCS and is fetched per-entry
|
|
14
|
+
* on click.
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
17
|
+
*/
|
|
18
|
+
import type { ArtifactRef, ArtifactWriter, RunId, StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
|
|
19
|
+
/**
 * Upload the run's testOutputs as individual GCS objects beneath
 * `runs/{runId}/test-outputs/` — one object for each `{taskId}::{modelId}`
 * pair.
 *
 * Non-blocking (P5): resolves to the `ArtifactRef` when the upload succeeds
 * and to `null` when the upload is skipped or fails.
 */
export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[]): Promise<ArtifactRef | null>;
|