@sanity/ailf 2.7.1 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
  2. package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +173 -0
  4. package/dist/_vendor/ailf-core/artifact-registry.js +811 -0
  5. package/dist/_vendor/ailf-core/index.d.ts +3 -1
  6. package/dist/_vendor/ailf-core/index.js +3 -1
  7. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
  8. package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +95 -0
  9. package/dist/_vendor/ailf-core/ports/artifact-writer.js +51 -0
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +32 -3
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -1
  13. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
  14. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  15. package/dist/_vendor/ailf-core/services/index.js +1 -0
  16. package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
  17. package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
  18. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +42 -0
  19. package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
  20. package/dist/_vendor/ailf-core/types/index.d.ts +298 -77
  21. package/dist/_vendor/ailf-core/types/index.js +1 -1
  22. package/dist/_vendor/ailf-shared/index.d.ts +2 -0
  23. package/dist/_vendor/ailf-shared/index.js +2 -0
  24. package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
  25. package/dist/_vendor/ailf-shared/run-context.js +17 -0
  26. package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
  27. package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
  28. package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
  29. package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
  30. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +52 -0
  31. package/dist/artifact-capture/api-gateway-artifact-writer.js +199 -0
  32. package/dist/artifact-capture/emit-file.d.ts +28 -0
  33. package/dist/artifact-capture/emit-file.js +56 -0
  34. package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
  35. package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
  36. package/dist/artifact-capture/filesystem-collector.d.ts +22 -4
  37. package/dist/artifact-capture/filesystem-collector.js +48 -23
  38. package/dist/artifact-capture/gcs-artifact-writer.d.ts +67 -0
  39. package/dist/artifact-capture/gcs-artifact-writer.js +343 -0
  40. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
  41. package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
  42. package/dist/commands/explain-handler.js +4 -0
  43. package/dist/commands/pipeline-action.d.ts +5 -0
  44. package/dist/commands/pipeline-action.js +56 -5
  45. package/dist/commands/pipeline.d.ts +4 -0
  46. package/dist/commands/pipeline.js +6 -2
  47. package/dist/commands/publish.js +7 -3
  48. package/dist/composition-root.d.ts +14 -11
  49. package/dist/composition-root.js +90 -31
  50. package/dist/orchestration/build-step-sequence.js +6 -1
  51. package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
  52. package/dist/orchestration/pipeline-orchestrator.js +41 -30
  53. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
  54. package/dist/orchestration/steps/calculate-scores-step.js +50 -10
  55. package/dist/orchestration/steps/callback-step.d.ts +1 -1
  56. package/dist/orchestration/steps/callback-step.js +6 -4
  57. package/dist/orchestration/steps/compare-step.d.ts +1 -1
  58. package/dist/orchestration/steps/compare-step.js +4 -2
  59. package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
  60. package/dist/orchestration/steps/discovery-report-step.js +4 -1
  61. package/dist/orchestration/steps/fetch-docs-step.js +9 -15
  62. package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
  63. package/dist/orchestration/steps/finalize-run-step.js +117 -0
  64. package/dist/orchestration/steps/gap-analysis-step.js +34 -6
  65. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
  66. package/dist/orchestration/steps/generate-configs-step.js +11 -11
  67. package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
  68. package/dist/orchestration/steps/publish-report-step.js +40 -55
  69. package/dist/orchestration/steps/readiness-step.d.ts +1 -1
  70. package/dist/orchestration/steps/readiness-step.js +4 -1
  71. package/dist/orchestration/steps/report-step.d.ts +1 -1
  72. package/dist/orchestration/steps/report-step.js +6 -3
  73. package/dist/orchestration/steps/run-eval-step.js +14 -9
  74. package/dist/pipeline/calculate-scores.js +13 -2
  75. package/dist/pipeline/compare.d.ts +2 -2
  76. package/dist/pipeline/emit-eval-results.d.ts +38 -0
  77. package/dist/pipeline/emit-eval-results.js +100 -0
  78. package/dist/pipeline/provenance.d.ts +24 -44
  79. package/dist/pipeline/provenance.js +17 -165
  80. package/dist/pipeline/report-title.d.ts +2 -2
  81. package/dist/pipeline/run-context.d.ts +57 -0
  82. package/dist/pipeline/run-context.js +156 -0
  83. package/dist/pipeline/upload-test-outputs.d.ts +26 -0
  84. package/dist/pipeline/upload-test-outputs.js +34 -0
  85. package/dist/report-store.js +4 -2
  86. package/package.json +3 -3
  87. package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
  88. package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
  89. package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
  90. package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
  91. package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
  92. package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66
@@ -0,0 +1,100 @@
1
+ /**
2
+ * emit-eval-results.ts — decompose the promptfoo results file into the
3
+ * per-entry descriptors that W0049's registry expects.
4
+ *
5
+ * Replaces the Phase-B-stopgap "route the aggregated JSON through the
6
+ * deprecated `evalResults` bulk descriptor" path. For each test in the
7
+ * promptfoo output we emit:
8
+ *
9
+ * - `rawResults` per (run, mode, task, model) — the full result
10
+ * - `renderedPrompts` per (run, mode, task, model) — prompt the model saw
11
+ * - `graderPrompts` per (run, mode, task, model, grader) — rubric text
12
+ * - `graderJudgments` per (run, mode, task, model, grader) — {score, reason, pass}
13
+ *
14
+ * `testOutputs` is still emitted separately by `calculate-scores-step`
15
+ * via `uploadTestOutputs()` (carried forward from W0048 for byte-
16
+ * equivalence with the original rollout).
17
+ *
18
+ * `traces` is NOT produced here — agentic trace data flows through the
19
+ * agent-observer, not through the promptfoo result shape. Traces
20
+ * emission is out of scope for this helper and lands when the observer
21
+ * integration migrates (follow-up; not in W0050).
22
+ *
23
+ * The "grader" axis value is the rubric dimension string produced by
24
+ * `classifyRubric` (e.g. "task-completion", "code-correctness"). Non-
25
+ * LLM-rubric component assertions (javascript, contains, etc.) don't
26
+ * have a natural grader identifier and are skipped — their outcomes
27
+ * still live inside the full `rawResults` object.
28
+ */
29
+ import { readFileSync } from "node:fs";
30
+ import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
31
+ // ---------------------------------------------------------------------------
32
+ // Public entry point
33
+ // ---------------------------------------------------------------------------
34
+ /**
35
+ * Parse a promptfoo results file and emit the per-entry artifacts.
36
+ *
37
+ * Non-blocking: any individual emit failure warns but does not halt.
38
+ * File read/parse errors are caught and logged; the caller keeps going.
39
+ */
40
+ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
41
+ let raw;
42
+ try {
43
+ raw = JSON.parse(readFileSync(resultsPath, "utf-8"));
44
+ }
45
+ catch (err) {
46
+ const message = err instanceof Error ? err.message : String(err);
47
+ console.warn(` ⚠️ emitPerEntryEvalResults: failed to read ${resultsPath} — ${message}`);
48
+ return;
49
+ }
50
+ // Promptfoo wraps results in either `{ results: { results: [...] } }`
51
+ // (older shape) or directly as `{ results: [...] }` (some adapters).
52
+ const wrapper = raw.results && "results" in raw.results
53
+ ? raw.results
54
+ : raw;
55
+ const rows = wrapper?.results ?? [];
56
+ if (rows.length === 0) {
57
+ console.warn(` ⚠️ emitPerEntryEvalResults: ${resultsPath} has no results[]`);
58
+ return;
59
+ }
60
+ for (const result of rows) {
61
+ const taskId = result.testCase?.description ?? "unknown-task";
62
+ const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
63
+ const baseAssoc = {
64
+ run: ctx.runId,
65
+ mode,
66
+ task: taskId,
67
+ model: modelId,
68
+ };
69
+ // rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
70
+ await writer.emit("rawResults", baseAssoc, result);
71
+ // renderedPrompts — what the model saw + which provider it went to
72
+ if (result.prompt !== undefined) {
73
+ await writer.emit("renderedPrompts", baseAssoc, {
74
+ prompt: result.prompt,
75
+ provider: result.provider,
76
+ });
77
+ }
78
+ // Per-grader decomposition — only LLM-rubric assertions have a
79
+ // natural grader identity. Code assertions (javascript/contains/…)
80
+ // show up in rawResults but not as standalone graderJudgments.
81
+ const components = result.gradingResult?.componentResults ?? [];
82
+ for (const comp of components) {
83
+ if (comp.assertion?.type !== "llm-rubric")
84
+ continue;
85
+ const dimension = classifyRubric(comp);
86
+ if (!dimension)
87
+ continue;
88
+ const graderAssoc = { ...baseAssoc, grader: dimension };
89
+ await writer.emit("graderPrompts", graderAssoc, {
90
+ dimension,
91
+ assertion: comp.assertion,
92
+ });
93
+ await writer.emit("graderJudgments", graderAssoc, {
94
+ score: parseRubricScore(comp) ?? 0,
95
+ reason: comp.reason ?? "",
96
+ pass: comp.pass,
97
+ });
98
+ }
99
+ }
100
+ }
@@ -1,65 +1,45 @@
1
1
  /**
2
2
  * pipeline/provenance.ts
3
3
  *
4
- * Builds ReportProvenance from data available during a pipeline run.
4
+ * Builds `ReportProvenance` from data available during a pipeline run.
5
5
  *
6
- * Provenance captures what produced an evaluation report: which models,
7
- * which source, which mode, what triggered it, git metadata, etc.
8
- * Most of this data already flows through the pipeline — this module
9
- * just captures what would otherwise be ephemeral.
6
+ * `ReportProvenance extends RunContext` (D0032). This module derives
7
+ * RunContext via `buildRunContext()` and attaches report-specific extras
8
+ * (lineage, autoScope, promptfoo URLs, targetDocuments, runId). A single
9
+ * derivation path for RunContext forecloses drift between the run
10
+ * manifest (GCS) and the report provenance (Content Lake).
10
11
  *
11
- * @see docs/design-docs/report-store/domain-model.md
12
- * @see docs/design-docs/report-store/architecture.md Provenance collection
12
+ * @see packages/eval/src/pipeline/run-context.ts — the shared derivation path
13
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
13
14
  */
14
- import type { Logger } from "../_vendor/ailf-core/index.d.ts";
15
- import type { ResolvedSourceConfig } from "../sources.js";
16
- import type { EvalMode, PromptfooUrlEntry, ReportAutoScope, ReportProvenance } from "./types.js";
17
- export interface ProvenanceInput {
18
- /** Feature areas that were evaluated */
19
- areas: string[];
20
- /** Logger instance (defaults to ConsoleLogger) */
21
- logger?: Logger;
15
+ import type { PromptfooUrlEntry, ReportAutoScope, ReportProvenance, RunId } from "./types.js";
16
+ import { type RunContextInput } from "./run-context.js";
17
+ /**
18
+ * Inputs needed to build a ReportProvenance. Extends `RunContextInput` so
19
+ * the RunContext derivation path is shared.
20
+ */
21
+ export interface ProvenanceInput extends RunContextInput {
22
22
  /** Release auto-scope metadata (when perspective evaluation was scoped) */
23
23
  autoScope?: ReportAutoScope;
24
- /**
25
- * Git metadata from the *calling* repository (cross-repo evaluations).
26
- * When provided, overrides CI env var detection so provenance attributes
27
- * to the caller — not the AILF core repo where the workflow executes.
28
- */
29
- callerGit?: {
30
- branch?: string;
31
- prNumber?: number;
32
- repo: string;
33
- sha?: string;
34
- };
35
24
  /** SHA-256 hash of the doc context files (from cache system) */
36
25
  contextHash?: string;
37
- /** Evaluation fingerprint for cross-environment cache lookup */
38
- evalFingerprint?: string;
39
- /** Evaluation mode */
40
- mode: EvalMode;
41
26
  /** @deprecated Use `promptfooUrls` — kept for backward compatibility */
42
27
  promptfooUrl?: string;
43
28
  /** Per-mode Promptfoo share URLs */
44
29
  promptfooUrls?: PromptfooUrlEntry[];
45
- /** Path to the package root (for reading config/models) */
46
- rootDir: string;
47
- /** Report ID that triggered this re-run (becomes lineage.rerunOf) */
48
- sourceReportId?: string;
30
+ /** Identity of the pipeline run that produced this report (D0032) */
31
+ runId: RunId;
49
32
  /** Sanity document IDs targeted */
50
33
  sanityDocumentIds?: string[];
51
- /** Resolved documentation source */
52
- source: ResolvedSourceConfig;
53
- /** Specific task IDs evaluated (if scoped) */
54
- taskIds?: string[];
34
+ /** Report ID that triggered this re-run (becomes lineage.rerunOf) */
35
+ sourceReportId?: string;
55
36
  }
56
37
  /**
57
- * Build a ReportProvenance object from pipeline context.
38
+ * Build a ReportProvenance from pipeline context.
58
39
  *
59
- * Assembles provenance from:
60
- * - Pipeline options (mode, source, areas, tasks)
61
- * - config/models.ts (model list, grader)
62
- * - Environment variables (CI metadata, trigger detection)
63
- * - Optional metadata (context hash, Promptfoo URL)
40
+ * RunContext fields (mode, areas, taskIds, models, graderModel, source,
41
+ * evalFingerprint, trigger, git) come from `buildRunContext`. Report-
42
+ * specific fields (autoScope, contextHash, lineage, promptfoo*, runId,
43
+ * targetDocuments) are attached here.
64
44
  */
65
45
  export declare function buildProvenance(input: ProvenanceInput): ReportProvenance;
@@ -1,188 +1,40 @@
1
1
  /**
2
2
  * pipeline/provenance.ts
3
3
  *
4
- * Builds ReportProvenance from data available during a pipeline run.
4
+ * Builds `ReportProvenance` from data available during a pipeline run.
5
5
  *
6
- * Provenance captures what produced an evaluation report: which models,
7
- * which source, which mode, what triggered it, git metadata, etc.
8
- * Most of this data already flows through the pipeline — this module
9
- * just captures what would otherwise be ephemeral.
6
+ * `ReportProvenance extends RunContext` (D0032). This module derives
7
+ * RunContext via `buildRunContext()` and attaches report-specific extras
8
+ * (lineage, autoScope, promptfoo URLs, targetDocuments, runId). A single
9
+ * derivation path for RunContext forecloses drift between the run
10
+ * manifest (GCS) and the report provenance (Content Lake).
10
11
  *
11
- * @see docs/design-docs/report-store/domain-model.md
12
- * @see docs/design-docs/report-store/architecture.md Provenance collection
12
+ * @see packages/eval/src/pipeline/run-context.ts — the shared derivation path
13
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
13
14
  */
14
- import { ConsoleLogger } from "../adapters/loggers/index.js";
15
- import { tryLoadConfigFile } from "./compiler/config-loader.js";
15
+ import { buildRunContext } from "./run-context.js";
16
16
  /**
17
- * Build a ReportProvenance object from pipeline context.
17
+ * Build a ReportProvenance from pipeline context.
18
18
  *
19
- * Assembles provenance from:
20
- * - Pipeline options (mode, source, areas, tasks)
21
- * - config/models.ts (model list, grader)
22
- * - Environment variables (CI metadata, trigger detection)
23
- * - Optional metadata (context hash, Promptfoo URL)
19
+ * RunContext fields (mode, areas, taskIds, models, graderModel, source,
20
+ * evalFingerprint, trigger, git) come from `buildRunContext`. Report-
21
+ * specific fields (autoScope, contextHash, lineage, promptfoo*, runId,
22
+ * targetDocuments) are attached here.
24
23
  */
25
24
  export function buildProvenance(input) {
26
- const log = input.logger ?? new ConsoleLogger();
27
- const models = loadModelsConfig(input.rootDir, log);
28
- log.debug("Assembling provenance input", {
29
- mode: input.mode,
30
- sourceName: input.source.name,
31
- sourceBaseUrl: input.source.baseUrl,
32
- areas: input.areas,
33
- taskIds: input.taskIds,
34
- hasContextHash: Boolean(input.contextHash),
35
- hasEvalFingerprint: Boolean(input.evalFingerprint),
36
- hasCallerGit: Boolean(input.callerGit),
37
- hasSourceReportId: Boolean(input.sourceReportId),
38
- modelCount: models.models.length,
39
- });
40
- // Cross-repo evaluations: prefer explicit caller git metadata over
41
- // CI env vars (which always reflect the AILF core repo).
42
- const git = input.callerGit
43
- ? {
44
- branch: input.callerGit.branch ?? "unknown",
45
- prNumber: input.callerGit.prNumber,
46
- repo: input.callerGit.repo,
47
- sha: input.callerGit.sha ?? "unknown",
48
- }
49
- : detectGitMetadata();
25
+ const runContext = buildRunContext(input);
50
26
  // Build lineage from explicit relationships
51
27
  const lineage = input.sourceReportId
52
28
  ? { rerunOf: input.sourceReportId }
53
29
  : undefined;
54
- const trigger = detectTrigger();
55
- log.debug("Provenance computed", {
56
- triggerType: trigger.type,
57
- gitRepo: git?.repo,
58
- gitBranch: git?.branch,
59
- evalFingerprint: input.evalFingerprint,
60
- hasLineage: Boolean(lineage),
61
- });
62
- // Non-literacy modes (agent-harness, mcp-server, etc.) don't use the
63
- // config/models.ts model matrix — listing those models would be misleading.
64
- // Only include them for literacy mode where they're the actual eval targets.
65
- const evaluatedModels = input.mode === "literacy"
66
- ? models.models.map((m) => ({ id: m.id, label: m.label }))
67
- : [];
68
30
  return {
69
- areas: input.areas,
31
+ ...runContext,
70
32
  autoScope: input.autoScope,
71
33
  contextHash: input.contextHash,
72
- evalFingerprint: input.evalFingerprint,
73
- git,
74
- graderModel: models.grader.id,
75
34
  lineage,
76
- mode: input.mode,
77
- models: evaluatedModels,
78
35
  promptfooUrl: input.promptfooUrl,
79
36
  promptfooUrls: input.promptfooUrls,
80
- source: {
81
- baseUrl: input.source.baseUrl,
82
- dataset: input.source.dataset,
83
- name: input.source.name,
84
- perspective: input.source.perspective,
85
- projectId: input.source.projectId,
86
- },
37
+ runId: input.runId,
87
38
  targetDocuments: input.sanityDocumentIds,
88
- taskIds: input.taskIds,
89
- trigger: detectTrigger(),
90
- };
91
- }
92
- // ---------------------------------------------------------------------------
93
- // Trigger detection
94
- // ---------------------------------------------------------------------------
95
- /**
96
- * Extract git metadata from GitHub Actions environment variables.
97
- * Returns undefined when not running in CI.
98
- */
99
- function detectGitMetadata() {
100
- const repo = process.env.GITHUB_REPOSITORY;
101
- if (!repo)
102
- return undefined;
103
- const sha = process.env.GITHUB_SHA ?? "unknown";
104
- const ref = process.env.GITHUB_REF ?? "";
105
- // Extract branch name from ref (refs/heads/main → main)
106
- const branch = ref.startsWith("refs/heads/")
107
- ? ref.slice("refs/heads/".length)
108
- : ref.startsWith("refs/pull/")
109
- ? `pr-${ref.split("/")[2]}`
110
- : ref;
111
- // Extract PR number from GITHUB_REF (refs/pull/123/merge)
112
- const prMatch = ref.match(/^refs\/pull\/(\d+)\//);
113
- const prNumber = prMatch ? parseInt(prMatch[1], 10) : undefined;
114
- return { branch, prNumber, repo, sha };
115
- }
116
- // ---------------------------------------------------------------------------
117
- // Git metadata
118
- // ---------------------------------------------------------------------------
119
- /**
120
- * Infer what triggered this evaluation from environment variables.
121
- *
122
- * Detection order:
123
- * 1. AILF_TRIGGER_TYPE — explicit override (for custom integrations)
124
- * 2. GITHUB_EVENT_NAME === "schedule" — cron-triggered
125
- * 3. GITHUB_EVENT_NAME === "repository_dispatch" — cross-repo trigger
126
- * 4. GITHUB_ACTIONS === "true" — CI-triggered
127
- * 5. Default: manual
128
- */
129
- function detectTrigger() {
130
- const explicit = process.env.AILF_TRIGGER_TYPE;
131
- if (explicit === "scheduled") {
132
- return {
133
- schedule: process.env.AILF_SCHEDULE ?? "unknown",
134
- type: "scheduled",
135
- };
136
- }
137
- if (explicit === "webhook") {
138
- return {
139
- documentId: process.env.AILF_WEBHOOK_DOCUMENT_ID,
140
- source: process.env.AILF_WEBHOOK_SOURCE ?? "unknown",
141
- type: "webhook",
142
- };
143
- }
144
- // GitHub Actions context
145
- const eventName = process.env.GITHUB_EVENT_NAME;
146
- if (eventName === "schedule") {
147
- return {
148
- schedule: process.env.GITHUB_SCHEDULE ?? "unknown",
149
- type: "scheduled",
150
- };
151
- }
152
- if (eventName === "repository_dispatch") {
153
- return {
154
- callerRef: process.env.GITHUB_REF,
155
- // Note: callerRepo here is a fallback. The accurate caller repo
156
- // comes from callerGit (injected into the PipelineRequest payload).
157
- // GITHUB_REPOSITORY_OWNER_ID is just the org ID, not owner/repo.
158
- callerRepo: process.env.GITHUB_REPOSITORY_OWNER_ID ?? "unknown",
159
- type: "cross-repo",
160
- };
161
- }
162
- if (process.env.GITHUB_ACTIONS === "true") {
163
- return {
164
- runId: process.env.GITHUB_RUN_ID ?? "unknown",
165
- type: "ci",
166
- workflow: process.env.GITHUB_WORKFLOW ?? "unknown",
167
- };
168
- }
169
- return { type: "manual" };
170
- }
171
- // ---------------------------------------------------------------------------
172
- // Model config loading
173
- // ---------------------------------------------------------------------------
174
- /**
175
- * Load config/models to extract model list and grader info.
176
- * Falls back to a minimal config if the file can't be read.
177
- */
178
- function loadModelsConfig(rootDir, log) {
179
- const result = tryLoadConfigFile("models", rootDir);
180
- if (result)
181
- return result.data;
182
- log.warn("Could not read config/models for provenance");
183
- return {
184
- defaults: {},
185
- grader: { id: "unknown" },
186
- models: [],
187
39
  };
188
40
  }
@@ -15,7 +15,7 @@
15
15
  * @see docs/design-docs/report-store/domain-model.md
16
16
  * @see packages/eval/src/pipeline/provenance.ts — builds the provenance input
17
17
  */
18
- import type { EvalMode, ReportTrigger } from "./types.js";
18
+ import type { EvalMode, RunTrigger } from "./types.js";
19
19
  /** Input required to generate a human-readable report title. */
20
20
  export interface ReportTitleInput {
21
21
  provenance: {
@@ -31,7 +31,7 @@ export interface ReportTitleInput {
31
31
  /** Sanity document IDs targeted (when scoped to specific documents) */
32
32
  targetDocuments?: string[];
33
33
  /** What triggered the evaluation */
34
- trigger: ReportTrigger;
34
+ trigger: RunTrigger;
35
35
  };
36
36
  /**
37
37
  * Total number of known feature areas in the system.
@@ -0,0 +1,57 @@
1
+ /**
2
+ * buildRunContext — the single code path that derives `RunContext` from
3
+ * pipeline inputs.
4
+ *
5
+ * `RunContext` is the 9-field shape shared between `RunManifest.context`
6
+ * (in GCS) and `ReportProvenance` (in Content Lake, which `extends
7
+ * RunContext`). Routing every consumer through this function makes it
8
+ * structurally impossible for the two to disagree: there is no second
9
+ * code path to drift against.
10
+ *
11
+ * Contract test: `packages/eval/src/__tests__/run-context-parity.test.ts`
12
+ *
13
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
14
+ */
15
+ import type { Logger, RunContext } from "../_vendor/ailf-core/index.d.ts";
16
+ import type { ResolvedSourceConfig } from "../sources.js";
17
+ import type { EvalMode } from "./types.js";
18
+ /**
19
+ * Inputs required to derive a `RunContext`. `ProvenanceInput` extends this
20
+ * so every caller of `buildProvenance` is automatically a valid input to
21
+ * `buildRunContext`.
22
+ */
23
+ export interface RunContextInput {
24
+ /** Feature areas that were evaluated */
25
+ areas: string[];
26
+ /**
27
+ * Git metadata from the *calling* repository (cross-repo evaluations).
28
+ * When provided, overrides CI env var detection so context attributes
29
+ * to the caller — not the AILF core repo where the workflow executes.
30
+ */
31
+ callerGit?: {
32
+ branch?: string;
33
+ prNumber?: number;
34
+ repo: string;
35
+ sha?: string;
36
+ };
37
+ /** Evaluation fingerprint for cross-environment cache lookup */
38
+ evalFingerprint?: string;
39
+ /** Logger instance (defaults to ConsoleLogger) */
40
+ logger?: Logger;
41
+ /** Evaluation mode */
42
+ mode: EvalMode;
43
+ /** Path to the package root (for reading config/models) */
44
+ rootDir: string;
45
+ /** Resolved documentation source */
46
+ source: ResolvedSourceConfig;
47
+ /** Specific task IDs evaluated (if scoped) */
48
+ taskIds?: string[];
49
+ }
50
+ /**
51
+ * Derive `RunContext` from pipeline inputs. The only construction path.
52
+ *
53
+ * Both `FinalizeRunStep` (via `RunManifest.context`) and
54
+ * `PublishReportStep` (via `ReportProvenance`) call this function — the
55
+ * former directly, the latter transitively through `buildProvenance`.
56
+ */
57
+ export declare function buildRunContext(input: RunContextInput): RunContext;
@@ -0,0 +1,156 @@
1
+ /**
2
+ * buildRunContext — the single code path that derives `RunContext` from
3
+ * pipeline inputs.
4
+ *
5
+ * `RunContext` is the 9-field shape shared between `RunManifest.context`
6
+ * (in GCS) and `ReportProvenance` (in Content Lake, which `extends
7
+ * RunContext`). Routing every consumer through this function makes it
8
+ * structurally impossible for the two to disagree: there is no second
9
+ * code path to drift against.
10
+ *
11
+ * Contract test: `packages/eval/src/__tests__/run-context-parity.test.ts`
12
+ *
13
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md (§ Move 5 — Drift Prevention)
14
+ */
15
+ import { ConsoleLogger } from "../adapters/loggers/index.js";
16
+ import { tryLoadConfigFile } from "./compiler/config-loader.js";
17
+ /**
18
+ * Derive `RunContext` from pipeline inputs. The only construction path.
19
+ *
20
+ * Both `FinalizeRunStep` (via `RunManifest.context`) and
21
+ * `PublishReportStep` (via `ReportProvenance`) call this function — the
22
+ * former directly, the latter transitively through `buildProvenance`.
23
+ */
24
+ export function buildRunContext(input) {
25
+ const log = input.logger ?? new ConsoleLogger();
26
+ const models = loadModelsConfig(input.rootDir, log);
27
+ // Cross-repo evaluations: prefer explicit caller git metadata over
28
+ // CI env vars (which always reflect the AILF core repo).
29
+ const git = input.callerGit
30
+ ? {
31
+ branch: input.callerGit.branch ?? "unknown",
32
+ prNumber: input.callerGit.prNumber,
33
+ repo: input.callerGit.repo,
34
+ sha: input.callerGit.sha ?? "unknown",
35
+ }
36
+ : detectGitMetadata();
37
+ const trigger = detectTrigger();
38
+ // Non-literacy modes (agent-harness, mcp-server, etc.) don't use the
39
+ // config/models.ts model matrix — listing those models would be
40
+ // misleading. Only include them for literacy mode where they're the
41
+ // actual eval targets.
42
+ const evaluatedModels = input.mode === "literacy"
43
+ ? models.models.map((m) => ({ id: m.id, label: m.label }))
44
+ : [];
45
+ return {
46
+ areas: input.areas,
47
+ evalFingerprint: input.evalFingerprint,
48
+ git,
49
+ graderModel: models.grader.id,
50
+ mode: input.mode,
51
+ models: evaluatedModels,
52
+ source: {
53
+ baseUrl: input.source.baseUrl,
54
+ dataset: input.source.dataset,
55
+ name: input.source.name,
56
+ perspective: input.source.perspective,
57
+ projectId: input.source.projectId,
58
+ },
59
+ taskIds: input.taskIds,
60
+ trigger,
61
+ };
62
+ }
63
+ // ---------------------------------------------------------------------------
64
+ // Environment-derived context
65
+ // ---------------------------------------------------------------------------
66
+ /**
67
+ * Extract git metadata from GitHub Actions environment variables.
68
+ * Returns undefined when not running in CI.
69
+ */
70
+ function detectGitMetadata() {
71
+ const repo = process.env.GITHUB_REPOSITORY;
72
+ if (!repo)
73
+ return undefined;
74
+ const sha = process.env.GITHUB_SHA ?? "unknown";
75
+ const ref = process.env.GITHUB_REF ?? "";
76
+ // Extract branch name from ref (refs/heads/main → main)
77
+ const branch = ref.startsWith("refs/heads/")
78
+ ? ref.slice("refs/heads/".length)
79
+ : ref.startsWith("refs/pull/")
80
+ ? `pr-${ref.split("/")[2]}`
81
+ : ref;
82
+ // Extract PR number from GITHUB_REF (refs/pull/123/merge)
83
+ const prMatch = ref.match(/^refs\/pull\/(\d+)\//);
84
+ const prNumber = prMatch ? parseInt(prMatch[1], 10) : undefined;
85
+ return { branch, prNumber, repo, sha };
86
+ }
87
+ /**
88
+ * Infer what triggered this evaluation from environment variables.
89
+ *
90
+ * Detection order:
91
+ * 1. AILF_TRIGGER_TYPE — explicit override (for custom integrations)
92
+ * 2. GITHUB_EVENT_NAME === "schedule" — cron-triggered
93
+ * 3. GITHUB_EVENT_NAME === "repository_dispatch" — cross-repo trigger
94
+ * 4. GITHUB_ACTIONS === "true" — CI-triggered
95
+ * 5. Default: manual
96
+ */
97
+ function detectTrigger() {
98
+ const explicit = process.env.AILF_TRIGGER_TYPE;
99
+ if (explicit === "scheduled") {
100
+ return {
101
+ schedule: process.env.AILF_SCHEDULE ?? "unknown",
102
+ type: "scheduled",
103
+ };
104
+ }
105
+ if (explicit === "webhook") {
106
+ return {
107
+ documentId: process.env.AILF_WEBHOOK_DOCUMENT_ID,
108
+ source: process.env.AILF_WEBHOOK_SOURCE ?? "unknown",
109
+ type: "webhook",
110
+ };
111
+ }
112
+ // GitHub Actions context
113
+ const eventName = process.env.GITHUB_EVENT_NAME;
114
+ if (eventName === "schedule") {
115
+ return {
116
+ schedule: process.env.GITHUB_SCHEDULE ?? "unknown",
117
+ type: "scheduled",
118
+ };
119
+ }
120
+ if (eventName === "repository_dispatch") {
121
+ return {
122
+ callerRef: process.env.GITHUB_REF,
123
+ // Note: callerRepo here is a fallback. The accurate caller repo
124
+ // comes from callerGit (injected into the PipelineRequest payload).
125
+ // GITHUB_REPOSITORY_OWNER_ID is just the org ID, not owner/repo.
126
+ callerRepo: process.env.GITHUB_REPOSITORY_OWNER_ID ?? "unknown",
127
+ type: "cross-repo",
128
+ };
129
+ }
130
+ if (process.env.GITHUB_ACTIONS === "true") {
131
+ return {
132
+ runId: process.env.GITHUB_RUN_ID ?? "unknown",
133
+ type: "ci",
134
+ workflow: process.env.GITHUB_WORKFLOW ?? "unknown",
135
+ };
136
+ }
137
+ return { type: "manual" };
138
+ }
139
+ // ---------------------------------------------------------------------------
140
+ // Model config loading
141
+ // ---------------------------------------------------------------------------
142
+ /**
143
+ * Load config/models to extract model list and grader info.
144
+ * Falls back to a minimal config if the file can't be read.
145
+ */
146
+ function loadModelsConfig(rootDir, log) {
147
+ const result = tryLoadConfigFile("models", rootDir);
148
+ if (result)
149
+ return result.data;
150
+ log.warn("Could not read config/models for run context");
151
+ return {
152
+ defaults: {},
153
+ grader: { id: "unknown" },
154
+ models: [],
155
+ };
156
+ }
@@ -0,0 +1,26 @@
1
+ /**
2
+ * upload-test-outputs.ts — shared helper for the testOutputs artifact upload.
3
+ *
4
+ * CalculateScoresStep calls this once its score-summary.json is complete.
5
+ * Each {taskId, modelId} pair becomes one GCS object under
6
+ * `runs/{runId}/test-outputs/{taskId}--{modelId}.json` carrying the full
7
+ * response output and truncation flag. The returned ArtifactRef's
8
+ * `entries[]` catalog lists every uploaded entry so Studio can render
9
+ * drill-down state without a second listing call.
10
+ *
11
+ * PublishReportStep later strips responseOutput from the inline
12
+ * testResults[] when this upload succeeds, so the Content Lake document
13
+ * stays slim — the full output lives in GCS and is fetched per-entry
14
+ * on click.
15
+ *
16
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
+ */
18
+ import type { ArtifactRef, ArtifactWriter, RunId, StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
19
+ /**
20
+ * Upload testOutputs as per-entry GCS objects under
21
+ * `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
22
+ *
23
+ * Returns the `ArtifactRef` on success, or `null` when upload is skipped or
24
+ * fails (P5: non-blocking).
25
+ */
26
+ export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[]): Promise<ArtifactRef | null>;