@tangle-network/agent-eval 0.30.0 → 0.31.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/CHANGELOG.md +79 -0
  2. package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
  3. package/dist/benchmarks/index.d.ts +3 -3
  4. package/dist/builder-eval/index.d.ts +3 -3
  5. package/dist/builder-eval/index.js +2 -2
  6. package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
  7. package/dist/{chunk-SZSBQUIJ.js → chunk-B73G44OH.js} +3 -3
  8. package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
  9. package/dist/{chunk-RUI6SIHY.js → chunk-DTEJNZYK.js} +5 -4
  10. package/dist/chunk-DTEJNZYK.js.map +1 -0
  11. package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
  12. package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
  13. package/dist/chunk-HIO4UIS5.js.map +1 -0
  14. package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
  15. package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
  16. package/dist/chunk-QYJT52YW.js.map +1 -0
  17. package/dist/{chunk-PALJO75S.js → chunk-S4Y5VXMS.js} +2 -2
  18. package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
  19. package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
  20. package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
  21. package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
  22. package/dist/{chunk-NLMNWKVM.js → chunk-ZN2CMQIW.js} +54 -2
  23. package/dist/chunk-ZN2CMQIW.js.map +1 -0
  24. package/dist/cli.js +3 -3
  25. package/dist/{control-rJhEDdpy.d.ts → control-p2ns7elI.d.ts} +5 -5
  26. package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
  27. package/dist/control.d.ts +8 -8
  28. package/dist/control.js +3 -3
  29. package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
  30. package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  31. package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
  32. package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
  33. package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
  34. package/dist/governance/index.d.ts +4 -4
  35. package/dist/{index--fVrWDiR.d.ts → index-BTqhGHJT.d.ts} +1 -1
  36. package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
  37. package/dist/index.d.ts +108 -38
  38. package/dist/index.js +159 -14
  39. package/dist/index.js.map +1 -1
  40. package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
  41. package/dist/knowledge/index.d.ts +3 -3
  42. package/dist/meta-eval/index.d.ts +4 -4
  43. package/dist/openapi.json +1 -1
  44. package/dist/optimization.d.ts +11 -11
  45. package/dist/optimization.js +8 -8
  46. package/dist/pipelines/index.d.ts +6 -6
  47. package/dist/pipelines/index.js +3 -3
  48. package/dist/prm/index.d.ts +4 -4
  49. package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
  50. package/dist/{release-report-PWhGlpfO.d.ts → release-report-DLWbBPtH.d.ts} +3 -3
  51. package/dist/reporting.d.ts +8 -8
  52. package/dist/reporting.js +4 -4
  53. package/dist/{researcher-ClDX3KZx.d.ts → researcher-BRHa5Jxo.d.ts} +12 -6
  54. package/dist/rl.d.ts +10 -10
  55. package/dist/rl.js +6 -6
  56. package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
  57. package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-CMHypZ_M.d.ts} +1 -1
  58. package/dist/{run-record-CqzahIbx.d.ts → run-record-BfX5y68A.d.ts} +43 -2
  59. package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  60. package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-D7AQS7eB.d.ts} +2 -2
  61. package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
  62. package/dist/traces.d.ts +533 -10
  63. package/dist/traces.js +14 -300
  64. package/dist/traces.js.map +1 -1
  65. package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
  66. package/dist/wire/index.d.ts +6 -6
  67. package/dist/wire/index.js +3 -3
  68. package/package.json +12 -21
  69. package/dist/chunk-NG236HPC.js.map +0 -1
  70. package/dist/chunk-NLMNWKVM.js.map +0 -1
  71. package/dist/chunk-RUI6SIHY.js.map +0 -1
  72. package/dist/chunk-UW4NOOZI.js.map +0 -1
  73. package/dist/replay-BX5Fm8en.d.ts +0 -529
  74. /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
  75. /package/dist/{chunk-SZSBQUIJ.js.map → chunk-B73G44OH.js.map} +0 -0
  76. /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
  77. /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
  78. /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
  79. /package/dist/{chunk-PALJO75S.js.map → chunk-S4Y5VXMS.js.map} +0 -0
  80. /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
  81. /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
  82. /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
  83. /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
package/CHANGELOG.md CHANGED
@@ -1,5 +1,84 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.31.1 — 2026-05-20
4
+
5
+ ### Republish of 0.31.0 — dist drift fix
6
+
7
+ The `v0.31.0` tag's npm tarball shipped a stale `dist/` — `JudgeScoresRecord`
8
+ was missing from `dist/index.d.ts` and the `recordOutcome.judgeScores`
9
+ propagation never made it into `dist/index.js`, even though the source on
10
+ the tagged commit had both. Consumers that bumped to `^0.31.0` got a
11
+ typecheck failure on `RunOutcome.judgeScores` (since the type wasn't
12
+ re-exported) and a silent drop on the wire (since the campaign runner
13
+ didn't carry the field through).
14
+
15
+ Cause: a build artifact picked up by the publish workflow predated the
16
+ source merge. The retag forces a clean `pnpm build` and republish; this
17
+ patch carries no source change beyond the version bump.
18
+
19
+ Verified after this tag: `dist/index.d.ts` contains `JudgeScoresRecord`,
20
+ `dist/index.js` propagates `outcome.judgeScores` end-to-end via
21
+ `recordOutcome.judgeScores`, and a downstream `pnpm install
22
+ @tangle-network/agent-eval@0.31.1` type-checks cleanly against the shape
23
+ documented in 0.31.0.
24
+
25
+ ## 0.31.0 — 2026-05-20
26
+
27
+ ### `JudgeScoresRecord` on `RunRecord.outcome` — substrate-blessed ensemble shape
28
+
29
+ Multi-judge consumers (forge-chat in agent-builder, and four sibling
30
+ product agents on the same trajectory) compute per-judge per-dimension
31
+ scores per cell, then collapse to a single composite for the gate. The
32
+ substrate's `RunOutcome` only had a slot for the composite plus a free
33
+ `raw: Record<string, number>` bag. Consumers were either dropping the
34
+ breakdown on the floor or smuggling it through stringly-typed `raw`
35
+ keys like `judge_kimi_helpfulness` — neither survives a corpus-IRR run
36
+ (0.27.2's `corpusInterRaterAgreement` expects structured per-judge
37
+ per-dim records, not parsed strings).
38
+
39
+ This release ships the typed slot so every product agent speaks the
40
+ same shape, and the inter-rater primitives consume it without a
41
+ per-consumer adapter.
42
+
43
+ ### Added
44
+
45
+ - **`JudgeScoresRecord`** (`src/run-record.ts`) — `perJudge[judgeId][dim]`
46
+ is the canonical store; `perDimMean` and `composite` are precomputed
47
+ projections so reporters and IRR primitives don't repeat the
48
+ aggregation; `failedJudges?: string[]` records dead-judge ids
49
+ explicitly (no inferring partial-failure from missing keys);
50
+ `notes?: string` carries panel prose.
51
+ - **`RunOutcome.judgeScores?: JudgeScoresRecord`** — optional.
52
+ Single-judge or scalar-only runs leave it unset; ensemble runs populate it.
53
+ - **`CampaignRunOutcome.judgeScores?: JudgeScoresRecord`** — runners
54
+ return it on the per-cell outcome; `runEvalCampaign` threads it onto
55
+ the resulting `RunRecord.outcome.judgeScores` without coercion.
56
+
57
+ ### Validator extended
58
+
59
+ `validateRunRecord` validates `outcome.judgeScores` when present.
60
+ Every `perJudge[judge][dim]`, every `perDimMean[dim]`, and the
61
+ `composite` must be finite numbers — the NaN-as-silent-zero bug class
62
+ banned by `CLAUDE.md` cannot pass the boundary. `failedJudges` must be
63
+ an array of non-empty strings; `notes` must be a string. Round-trip
64
+ tested in `tests/run-record.test.ts`.
65
+
66
+ ### Fail-loud contract
67
+
68
+ A judge that throws lands in `failedJudges` by id, not a silent zero
69
+ in `perJudge`. The composite is computed over surviving judges only;
70
+ the partial-failure signal is preserved through to the gate.
71
+ `tests/eval-campaign.test.ts` covers the four shapes (full, partial,
72
+ missing, with notes) plus an explicit fail-loud case where one judge
73
+ throws and the run record carries `failedJudges: ['glm-5.1@...']`.
74
+
75
+ ### Consumer contract
76
+
77
+ `tests/consumer-contract.test.ts` pins `JudgeScoresRecord` as a
78
+ type-level export at the root entry. The 0.30.0 surface is preserved —
79
+ the new field is additive on `RunOutcome` and the new type is a new
80
+ export, so existing consumers stay green.
81
+
3
82
  ## 0.29.0 — 2026-05-19
4
83
 
5
84
  ### Analyst kinds + cross-run findings context
@@ -1,4 +1,4 @@
1
- import { T as TraceStore } from './store-BP5be6s7.js';
1
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
2
2
 
3
3
  /**
4
4
  * Tool-use metrics — derived purely from trace data.
@@ -1,3 +1,3 @@
1
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index--fVrWDiR.js';
2
- import '../run-record-CqzahIbx.js';
3
- import '../errors-BZ9sTdz7.js';
1
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-BTqhGHJT.js';
2
+ import '../run-record-BfX5y68A.js';
3
+ import '../errors-mje_cKOs.js';
@@ -1,6 +1,6 @@
1
- import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-BJ54PDan.js';
2
- import { T as TraceEmitter } from '../emitter-BqjeOvJh.js';
3
- import { T as TraceStore, R as Run } from '../store-BP5be6s7.js';
1
+ import { S as SandboxDriver, H as HarnessConfig, a as SandboxHarnessResult, T as TestGradedScenario, b as TestGradedRunResult } from '../test-graded-scenario-B2kWEdh9.js';
2
+ import { T as TraceEmitter } from '../emitter-DP_cSSiw.js';
3
+ import { T as TraceStore, R as Run } from '../store-Db2Bv8Cf.js';
4
4
 
5
5
  /**
6
6
  * BuilderSession — ties a builder-of-builders workflow together.
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  SandboxHarness,
3
3
  runTestGradedScenario
4
- } from "../chunk-QHF6EQKK.js";
4
+ } from "../chunk-YTMXBHFM.js";
5
5
  import {
6
6
  judgeSpans
7
7
  } from "../chunk-47X6LRCE.js";
@@ -9,7 +9,7 @@ import "../chunk-5BKGXME7.js";
9
9
  import {
10
10
  TraceEmitter
11
11
  } from "../chunk-TVVP3ZZQ.js";
12
- import "../chunk-NG236HPC.js";
12
+ import "../chunk-QYJT52YW.js";
13
13
  import "../chunk-PZ5AY32C.js";
14
14
 
15
15
  // src/builder-eval/builder-session.ts
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  ValidationError
3
- } from "./chunk-NG236HPC.js";
3
+ } from "./chunk-QYJT52YW.js";
4
4
 
5
5
  // src/judge-calibration.ts
6
6
  function calibrateJudge(golden, candidate) {
@@ -719,4 +719,4 @@ export {
719
719
  corpusInterRaterAgreement,
720
720
  corpusInterRaterAgreementFromJudgeScores
721
721
  };
722
- //# sourceMappingURL=chunk-R5UQJNKC.js.map
722
+ //# sourceMappingURL=chunk-4L3WJXQJ.js.map
@@ -1,10 +1,10 @@
1
1
  import {
2
2
  validateRunRecord
3
- } from "./chunk-NLMNWKVM.js";
3
+ } from "./chunk-ZN2CMQIW.js";
4
4
  import {
5
5
  pairedBootstrap,
6
6
  pairedWilcoxon
7
- } from "./chunk-5AKPEK5L.js";
7
+ } from "./chunk-CXJOVDJR.js";
8
8
 
9
9
  // src/feedback-trajectory.ts
10
10
  var DEFAULT_SPLIT_POLICY = {
@@ -1409,4 +1409,4 @@ export {
1409
1409
  CallbackResearcher,
1410
1410
  NoopResearcher
1411
1411
  };
1412
- //# sourceMappingURL=chunk-SZSBQUIJ.js.map
1412
+ //# sourceMappingURL=chunk-B73G44OH.js.map
@@ -2,7 +2,7 @@ import {
2
2
  cohensD,
3
3
  confidenceInterval,
4
4
  wilcoxonSignedRank
5
- } from "./chunk-R5UQJNKC.js";
5
+ } from "./chunk-4L3WJXQJ.js";
6
6
  import {
7
7
  canonicalize,
8
8
  hashJson
@@ -1047,4 +1047,4 @@ export {
1047
1047
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
1048
1048
  researchReport
1049
1049
  };
1050
- //# sourceMappingURL=chunk-5AKPEK5L.js.map
1050
+ //# sourceMappingURL=chunk-CXJOVDJR.js.map
@@ -1,13 +1,13 @@
1
1
  import {
2
2
  assertLlmRoute
3
- } from "./chunk-4S4BM3QQ.js";
3
+ } from "./chunk-M6RZ5LJN.js";
4
4
  import {
5
5
  researchReport
6
- } from "./chunk-5AKPEK5L.js";
6
+ } from "./chunk-CXJOVDJR.js";
7
7
  import {
8
8
  RunIntegrityError,
9
9
  assertRunCaptured
10
- } from "./chunk-KTGTIOFD.js";
10
+ } from "./chunk-UBPIXOC4.js";
11
11
  import {
12
12
  FileSystemRawProviderSink
13
13
  } from "./chunk-PC4UYEBM.js";
@@ -202,6 +202,7 @@ async function runEvalCampaign(opts) {
202
202
  };
203
203
  if (splitTag === "holdout") recordOutcome.holdoutScore = outcome.score;
204
204
  else recordOutcome.searchScore = outcome.score;
205
+ if (outcome.judgeScores !== void 0) recordOutcome.judgeScores = outcome.judgeScores;
205
206
  const record = {
206
207
  runId,
207
208
  experimentId: opts.campaignId,
@@ -284,4 +285,4 @@ function defaultRunId(params) {
284
285
  export {
285
286
  runEvalCampaign
286
287
  };
287
- //# sourceMappingURL=chunk-RUI6SIHY.js.map
288
+ //# sourceMappingURL=chunk-DTEJNZYK.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/eval-campaign.ts"],"sourcesContent":["/**\n * EvalCampaign — opinionated matrix runner that wires the four\n * capture-integrity directives by construction.\n *\n * The canonical benchmark shape — matrix runner → for each\n * (variant, scenario, seed) → start a TraceEmitter → call LLMs → end the\n * run → analyze — has a bug class at the integration boundary: raw\n * events not captured, route silently wrong, integrity not asserted,\n * analyst never run. The directives in `SKILL.md § Capture integrity`\n * are the mitigations.\n *\n * `EvalCampaign` is the structural fix — consumers don't wire the\n * integrity surface themselves; the campaign owns it. Specifically:\n *\n * - calls `assertLlmRoute` once at preflight before any work runs\n * - constructs a per-run `TraceStore` and `RawProviderSink` via factories\n * - constructs the `TraceEmitter` with `onRunComplete: [analyst hook]`\n * - hands the runner an `LlmClientOptions` pre-wired with the sink and\n * trace context — the runner can't accidentally call an LLM without\n * capturing the raw HTTP envelope\n * - calls `assertRunCaptured` after every `endRun` and routes failures\n * through a configurable policy (`throw` / `mark_failed` / `log`)\n * - assembles per-run `RunRecord`s and runs `researchReport` at the end\n * so the campaign artifact is launch-decision-grade by default\n * - embeds the campaign fingerprint (a SHA-256 over the canonicalised\n * run set) and optional `preregistrationHash` in the report\n *\n * The runner contract is intentionally narrow: produce a `CampaignRunOutcome`\n * given a fully-wired `CampaignRunContext`. Everything orchestration-shaped\n * lives in the campaign. 
This is the inversion-of-control point — consumers\n * stop writing matrix runners and start writing scenario-runners.\n *\n * Out of scope for v1 (tracked in `docs/research-report-methodology.md`):\n *\n * - Distributed/cluster execution (concurrency is local async)\n * - Adaptive sampling / sequential interim looks\n * - Resume from partial state across crashes\n * - LLM-call retry beyond what `LlmClient` already does\n */\n\nimport { assertLlmRoute, type LlmClientOptions, type LlmRouteRequirements } from './llm-client'\nimport { canonicalize, hashJson } from './pre-registration'\nimport type {\n JudgeScoresRecord,\n RunJudgeMetadata,\n RunOutcome,\n RunRecord,\n RunSplitTag,\n RunTokenUsage,\n} from './run-record'\nimport { type ResearchReport, type ResearchReportOptions, researchReport } from './summary-report'\nimport type { RunCompleteHook } from './trace/emitter'\nimport { TraceEmitter } from './trace/emitter'\nimport {\n assertRunCaptured,\n RunIntegrityError,\n type RunIntegrityExpectations,\n type RunIntegrityReport,\n} from './trace/integrity'\nimport { FileSystemRawProviderSink, type RawProviderSink } from './trace/raw-provider-sink'\nimport type { TraceStore } from './trace/store'\n\n// ── Public types ─────────────────────────────────────────────────────────\n\nexport interface CampaignVariant<V> {\n id: string\n payload: V\n}\n\nexport interface CampaignScenario {\n scenarioId: string\n /** Free-form metadata propagated to runs and reports. */\n tags?: Record<string, string>\n}\n\nexport interface CampaignRunContext<V> {\n /** Stable run id. The campaign generates this; the runner does not. */\n runId: string\n /** Logical experiment id (campaignId by default; overridable per-run via opts). 
*/\n experimentId: string\n variant: V\n variantId: string\n scenarioId: string\n scenarioTags: Record<string, string>\n seed: number\n splitTag: RunSplitTag\n /**\n * The TraceEmitter for this run, with `onRunComplete` hooks pre-wired\n * (analyst auto-execution if configured, plus integrity check). The\n * runner MUST call `emitter.startRun` before doing any work and either\n * `emitter.endRun` or `emitter.abortRun` before returning.\n */\n emitter: TraceEmitter\n store: TraceStore\n rawSink: RawProviderSink\n /**\n * Pre-wired LLM client options — `rawSink` and `traceContext` are populated\n * so any `callLlm(req, ctx.llmOpts)` automatically captures raw HTTP. The\n * runner can spread additional fields if needed.\n */\n llmOpts: LlmClientOptions\n}\n\nexport interface CampaignRunOutcome {\n /** Did the run pass? Mirrors `RunOutcome.pass` semantics. */\n pass: boolean\n /** Score for the run on its split. Maps to `searchScore` or `holdoutScore`. */\n score: number\n /** Mandatory cost in USD. Use 0 + raw.cost_unknown=1 only if truly unknown. */\n costUsd: number\n tokenUsage: RunTokenUsage\n /** Snapshot model id (e.g. `claude-sonnet-4-6@2025-04-15`). */\n model: string\n /** sha256 of the effective prompt sent to the model. */\n promptHash: string\n /** sha256 of the effective config (model, temperature, tools, judges, splits). */\n configHash: string\n /** Optional extra numeric metrics to land in `outcome.raw`. */\n raw?: Record<string, number>\n /** Optional failure-taxonomy tag if the run failed. */\n failureMode?: string\n /** Optional judge metadata when a judge was used. 
*/\n judgeMetadata?: RunJudgeMetadata\n /**\n * Optional per-judge / per-dim breakdown for ensemble-judged runs.\n * Propagated to `outcome.judgeScores` on the resulting `RunRecord`.\n * Single-judge or scalar-only runs leave this unset.\n */\n judgeScores?: JudgeScoresRecord\n}\n\nexport type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>\n\nexport type CampaignIntegrityPolicy = 'throw' | 'mark_failed' | 'log'\n\nexport interface EvalCampaignOptions<V> {\n /**\n * Stable id for the campaign. Used as the default `experimentId` on\n * every run, and folded into the campaign fingerprint.\n */\n campaignId: string\n variants: CampaignVariant<V>[]\n scenarios: CampaignScenario[]\n /** Default `[0, 1, 2]`. */\n seeds?: number[]\n /** Default `'holdout'` — the split that anchors a launch decision. */\n splitTag?: RunSplitTag\n /** Git SHA the campaign is run against. Mandatory; `RunRecord` rejects unset. */\n commitSha: string\n /**\n * LLM client config. Augmented per-run with `rawSink` and `traceContext`\n * before being passed to the runner. The campaign asserts this config\n * matches `routeRequirements` once at preflight.\n */\n llmOpts: LlmClientOptions\n /**\n * Default `{ requireExplicitBaseUrl: true, requireAuth: true }` — fail\n * loud if the campaign would silently fall back to the public router or\n * run unauthenticated. Override with an empty object to disable.\n */\n routeRequirements?: LlmRouteRequirements\n /**\n * Per-run TraceStore factory. Common shape: a fresh store per run keyed\n * on `runId`. Implementations that share a store across the campaign\n * are valid — the campaign only writes through `emitter`.\n */\n storeFactory: (params: CampaignFactoryParams) => TraceStore\n /**\n * Per-run RawProviderSink factory. Defaults to `FileSystemRawProviderSink`\n * rooted at `${workDir}/raw-events/${runId}` if `workDir` is supplied;\n * otherwise required. 
Forensic capture is non-negotiable in a campaign\n * run — pass `NoopRawProviderSink` explicitly if you want to opt out.\n */\n rawSinkFactory?: (params: CampaignFactoryParams) => RawProviderSink\n /**\n * Filesystem root for default `rawSinkFactory`. Ignored if\n * `rawSinkFactory` is supplied.\n */\n workDir?: string\n /**\n * Extra `onRunComplete` hooks the campaign appends (after its own\n * integrity-check hook). Pass `traceAnalystOnRunComplete(...)` here.\n */\n onRunComplete?: RunCompleteHook[]\n /**\n * Per-run integrity expectations. Defaults to:\n * `{ llmSpansMin: 1, requireRawCoverageOfLlmSpans: true, requireOutcome: true }`.\n * Override (e.g. `{ llmSpansMin: 0 }`) for runs that don't call LLMs.\n */\n integrity?: RunIntegrityExpectations\n /** Behaviour when integrity fails. Default `'mark_failed'`. */\n onIntegrityFailure?: CampaignIntegrityPolicy\n /**\n * Per-run runner. Receives a fully-wired context; produces an outcome\n * the campaign converts into a `RunRecord`.\n */\n runner: CampaignRunner<V>\n /**\n * If set, the campaign computes `researchReport` at the end. `comparator`\n * is a `variantId`. Other fields are forwarded verbatim.\n */\n report?: { comparator?: string } & Omit<\n ResearchReportOptions,\n 'comparator' | 'preregistrationHash' | 'generatedAt'\n >\n /**\n * Hash of a signed `HypothesisManifest` (see `pre-registration.ts`).\n * Embedded in the campaign fingerprint and the research report.\n */\n preregistrationHash?: string\n /** Local concurrency. Default `1` (sequential). */\n concurrency?: number\n /**\n * Override the time source. Tests pass a mock to make wallMs deterministic.\n */\n now?: () => number\n /** Override the runId generator. Tests pin this. 
*/\n runId?: (params: CampaignFactoryParams) => string\n}\n\nexport interface CampaignFactoryParams {\n campaignId: string\n runId: string\n variantId: string\n scenarioId: string\n seed: number\n}\n\nexport interface FailedRun {\n runId: string\n variantId: string\n scenarioId: string\n seed: number\n reason: string\n error?: string\n}\n\nexport interface EvalCampaignResult {\n campaignId: string\n /** SHA-256 over canonicalised `(variantIds, scenarioIds, seeds, comparator, splitTag, baseUrl, provider, preregistrationHash)`. */\n campaignFingerprint: string\n preregistrationHash: string | null\n /** Successful runs only. Failed runs land in `failedRuns`. */\n runs: RunRecord[]\n /** Integrity reports for every successful run. */\n integrityReports: RunIntegrityReport[]\n failedRuns: FailedRun[]\n /** Computed when `report` is set on options. */\n report?: ResearchReport\n startedAt: string\n endedAt: string\n}\n\n// ── Implementation ───────────────────────────────────────────────────────\n\nconst DEFAULT_INTEGRITY: RunIntegrityExpectations = {\n llmSpansMin: 1,\n requireRawCoverageOfLlmSpans: true,\n requireOutcome: true,\n}\n\nconst DEFAULT_ROUTE: LlmRouteRequirements = {\n requireExplicitBaseUrl: true,\n requireAuth: true,\n}\n\nexport async function runEvalCampaign<V>(\n opts: EvalCampaignOptions<V>,\n): Promise<EvalCampaignResult> {\n // ── Preflight ──────────────────────────────────────────────────────\n assertLlmRoute(opts.llmOpts, opts.routeRequirements ?? 
DEFAULT_ROUTE)\n\n if (opts.variants.length === 0) {\n throw new Error('runEvalCampaign: variants must be non-empty.')\n }\n if (opts.scenarios.length === 0) {\n throw new Error('runEvalCampaign: scenarios must be non-empty.')\n }\n const variantIds = new Set<string>()\n for (const v of opts.variants) {\n if (variantIds.has(v.id)) {\n throw new Error(`runEvalCampaign: duplicate variant id \"${v.id}\".`)\n }\n variantIds.add(v.id)\n }\n const scenarioIds = new Set<string>()\n for (const s of opts.scenarios) {\n if (scenarioIds.has(s.scenarioId)) {\n throw new Error(`runEvalCampaign: duplicate scenarioId \"${s.scenarioId}\".`)\n }\n scenarioIds.add(s.scenarioId)\n }\n if (opts.report?.comparator && !variantIds.has(opts.report.comparator)) {\n throw new Error(\n `runEvalCampaign: report.comparator \"${opts.report.comparator}\" is not a configured variantId.`,\n )\n }\n if (!opts.commitSha) {\n throw new Error('runEvalCampaign: commitSha is required (every RunRecord needs it).')\n }\n\n const seeds = opts.seeds ?? [0, 1, 2]\n const splitTag: RunSplitTag = opts.splitTag ?? 'holdout'\n const concurrency = Math.max(1, opts.concurrency ?? 1)\n const integrity = { ...DEFAULT_INTEGRITY, ...(opts.integrity ?? {}) }\n const onIntegrityFailure: CampaignIntegrityPolicy = opts.onIntegrityFailure ?? 'mark_failed'\n const now = opts.now ?? (() => Date.now())\n const baseUrl = (opts.llmOpts.baseUrl ?? '').replace(/\\/+$/, '')\n const provider = opts.llmOpts.provider ?? null\n const preregistrationHash = opts.preregistrationHash ?? null\n\n const rawSinkFactory = opts.rawSinkFactory ?? 
defaultRawSinkFactory(opts.workDir)\n\n // ── Fingerprint ────────────────────────────────────────────────────\n const campaignFingerprint = await hashJson(\n canonicalize({\n campaignId: opts.campaignId,\n variants: opts.variants.map((v) => v.id).sort(),\n scenarios: opts.scenarios.map((s) => s.scenarioId).sort(),\n seeds: [...seeds].sort((a, b) => a - b),\n splitTag,\n comparator: opts.report?.comparator ?? null,\n baseUrl,\n provider,\n preregistrationHash,\n }),\n )\n\n // ── Plan the matrix ────────────────────────────────────────────────\n type Cell = { variant: CampaignVariant<V>; scenario: CampaignScenario; seed: number }\n const cells: Cell[] = []\n for (const variant of opts.variants) {\n for (const scenario of opts.scenarios) {\n for (const seed of seeds) {\n cells.push({ variant, scenario, seed })\n }\n }\n }\n\n const startedAt = new Date(now()).toISOString()\n const runs: RunRecord[] = []\n const integrityReports: RunIntegrityReport[] = []\n const failedRuns: FailedRun[] = []\n\n // ── Execute (bounded-concurrency worker pool) ──────────────────────\n let cursor = 0\n async function worker(): Promise<void> {\n while (true) {\n const i = cursor++\n if (i >= cells.length) return\n const cell = cells[i]!\n try {\n const result = await runOneCell(cell)\n runs.push(result.record)\n integrityReports.push(result.integrity)\n } catch (err) {\n if (err instanceof CellExecutionError) {\n failedRuns.push(err.failed)\n if (err.integrity) integrityReports.push(err.integrity)\n } else {\n // Genuine bug — not a runner failure, not an integrity failure.\n // Surface it; don't silently mask.\n throw err\n }\n }\n }\n }\n\n async function runOneCell(\n cell: Cell,\n ): Promise<{ record: RunRecord; integrity: RunIntegrityReport }> {\n const runId = (opts.runId ?? 
defaultRunId)({\n campaignId: opts.campaignId,\n runId: '', // unused by default generator\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n })\n const factoryParams: CampaignFactoryParams = {\n campaignId: opts.campaignId,\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n }\n const store = opts.storeFactory(factoryParams)\n const rawSink = rawSinkFactory(factoryParams)\n\n const emitter = new TraceEmitter(store, {\n runId,\n now: opts.now,\n onRunComplete: opts.onRunComplete,\n })\n\n const llmOpts: LlmClientOptions = {\n ...opts.llmOpts,\n rawSink,\n traceContext: { runId },\n }\n\n const ctx: CampaignRunContext<V> = {\n runId,\n experimentId: opts.campaignId,\n variant: cell.variant.payload,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n scenarioTags: cell.scenario.tags ?? {},\n seed: cell.seed,\n splitTag,\n emitter,\n store,\n rawSink,\n llmOpts,\n }\n\n const wallStart = now()\n let outcome: CampaignRunOutcome\n try {\n outcome = await opts.runner(ctx)\n } catch (err) {\n const message = err instanceof Error ? 
err.message : String(err)\n // The runner threw mid-execution; give it a chance to have aborted.\n try {\n await emitter.abortRun(message)\n } catch {\n // Already aborted/ended; ignore.\n }\n throw new CellExecutionError({\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n reason: 'runner_threw',\n error: message,\n })\n }\n const wallMs = now() - wallStart\n\n const integrityReport = await assertRunCaptured(store, runId, { ...integrity, rawSink })\n if (!integrityReport.ok) {\n switch (onIntegrityFailure) {\n case 'throw':\n throw new RunIntegrityError(integrityReport)\n case 'mark_failed':\n throw new CellExecutionError(\n {\n runId,\n variantId: cell.variant.id,\n scenarioId: cell.scenario.scenarioId,\n seed: cell.seed,\n reason: 'integrity_failed',\n error: integrityReport.issues.map((i) => i.code).join(', '),\n },\n integrityReport,\n )\n case 'log':\n // Caller wants the run admitted with a flagged report; fall through.\n break\n }\n }\n\n const recordOutcome: RunOutcome = {\n raw: outcome.raw ?? 
{},\n }\n if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score\n else recordOutcome.searchScore = outcome.score\n if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores\n\n const record: RunRecord = {\n runId,\n experimentId: opts.campaignId,\n candidateId: cell.variant.id,\n seed: cell.seed,\n model: outcome.model,\n promptHash: outcome.promptHash,\n configHash: outcome.configHash,\n commitSha: opts.commitSha,\n wallMs,\n costUsd: outcome.costUsd,\n tokenUsage: outcome.tokenUsage,\n judgeMetadata: outcome.judgeMetadata,\n outcome: recordOutcome,\n failureMode: outcome.failureMode,\n splitTag,\n scenarioId: cell.scenario.scenarioId,\n }\n return { record, integrity: integrityReport }\n }\n\n const workers = Array.from({ length: Math.min(concurrency, cells.length) }, () => worker())\n await Promise.all(workers)\n\n // ── Optional research report ───────────────────────────────────────\n let report: ResearchReport | undefined\n if (opts.report) {\n const reportOpts: ResearchReportOptions = {\n ...opts.report,\n comparator: opts.report.comparator,\n split: splitTag === 'dev' ? 'search' : splitTag,\n generatedAt: new Date(now()).toISOString(),\n preregistrationHash: preregistrationHash ?? 
undefined,\n }\n report = await researchReport(runs, reportOpts)\n }\n\n const endedAt = new Date(now()).toISOString()\n\n return {\n campaignId: opts.campaignId,\n campaignFingerprint,\n preregistrationHash,\n runs,\n integrityReports,\n failedRuns,\n report,\n startedAt,\n endedAt,\n }\n}\n\n// ── Internal ─────────────────────────────────────────────────────────────\n\nclass CellExecutionError extends Error {\n readonly failed: FailedRun\n readonly integrity?: RunIntegrityReport\n constructor(failed: FailedRun, integrity?: RunIntegrityReport) {\n super(`cell ${failed.variantId}/${failed.scenarioId}@${failed.seed} failed: ${failed.reason}`)\n this.failed = failed\n this.integrity = integrity\n }\n}\n\nfunction defaultRawSinkFactory(workDir: string | undefined) {\n return (params: CampaignFactoryParams): RawProviderSink => {\n if (!workDir) {\n throw new Error(\n 'runEvalCampaign: rawSinkFactory not supplied and workDir not set. Pass either to enable raw provider capture, or pass `new NoopRawProviderSink()` via rawSinkFactory to opt out explicitly.',\n )\n }\n return new FileSystemRawProviderSink({\n dir: `${workDir}/raw-events/${params.runId}`,\n })\n }\n}\n\nfunction defaultRunId(params: CampaignFactoryParams): string {\n // Stable across re-runs: fingerprint of (campaignId, variantId, scenarioId, seed).\n // Caller can override via opts.runId for non-deterministic IDs.\n const base = `${params.campaignId}::${params.variantId}::${params.scenarioId}::${params.seed}`\n // Lightweight hex: we don't need crypto-grade here, just stability + uniqueness.\n let h1 = 0x811c9dc5\n let h2 = 0x12345678\n for (let i = 0; i < base.length; i++) {\n const c = base.charCodeAt(i)\n h1 = Math.imul(h1 ^ c, 0x01000193) >>> 0\n h2 = Math.imul(h2 ^ c, 0x9e3779b1) >>> 0\n }\n return `run-${h1.toString(16).padStart(8, '0')}${h2.toString(16).padStart(8, 
'0')}`\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;AA+PA,IAAM,oBAA8C;AAAA,EAClD,aAAa;AAAA,EACb,8BAA8B;AAAA,EAC9B,gBAAgB;AAClB;AAEA,IAAM,gBAAsC;AAAA,EAC1C,wBAAwB;AAAA,EACxB,aAAa;AACf;AAEA,eAAsB,gBACpB,MAC6B;AAE7B,iBAAe,KAAK,SAAS,KAAK,qBAAqB,aAAa;AAEpE,MAAI,KAAK,SAAS,WAAW,GAAG;AAC9B,UAAM,IAAI,MAAM,8CAA8C;AAAA,EAChE;AACA,MAAI,KAAK,UAAU,WAAW,GAAG;AAC/B,UAAM,IAAI,MAAM,+CAA+C;AAAA,EACjE;AACA,QAAM,aAAa,oBAAI,IAAY;AACnC,aAAW,KAAK,KAAK,UAAU;AAC7B,QAAI,WAAW,IAAI,EAAE,EAAE,GAAG;AACxB,YAAM,IAAI,MAAM,0CAA0C,EAAE,EAAE,IAAI;AAAA,IACpE;AACA,eAAW,IAAI,EAAE,EAAE;AAAA,EACrB;AACA,QAAM,cAAc,oBAAI,IAAY;AACpC,aAAW,KAAK,KAAK,WAAW;AAC9B,QAAI,YAAY,IAAI,EAAE,UAAU,GAAG;AACjC,YAAM,IAAI,MAAM,0CAA0C,EAAE,UAAU,IAAI;AAAA,IAC5E;AACA,gBAAY,IAAI,EAAE,UAAU;AAAA,EAC9B;AACA,MAAI,KAAK,QAAQ,cAAc,CAAC,WAAW,IAAI,KAAK,OAAO,UAAU,GAAG;AACtE,UAAM,IAAI;AAAA,MACR,uCAAuC,KAAK,OAAO,UAAU;AAAA,IAC/D;AAAA,EACF;AACA,MAAI,CAAC,KAAK,WAAW;AACnB,UAAM,IAAI,MAAM,oEAAoE;AAAA,EACtF;AAEA,QAAM,QAAQ,KAAK,SAAS,CAAC,GAAG,GAAG,CAAC;AACpC,QAAM,WAAwB,KAAK,YAAY;AAC/C,QAAM,cAAc,KAAK,IAAI,GAAG,KAAK,eAAe,CAAC;AACrD,QAAM,YAAY,EAAE,GAAG,mBAAmB,GAAI,KAAK,aAAa,CAAC,EAAG;AACpE,QAAM,qBAA8C,KAAK,sBAAsB;AAC/E,QAAM,MAAM,KAAK,QAAQ,MAAM,KAAK,IAAI;AACxC,QAAM,WAAW,KAAK,QAAQ,WAAW,IAAI,QAAQ,QAAQ,EAAE;AAC/D,QAAM,WAAW,KAAK,QAAQ,YAAY;AAC1C,QAAM,sBAAsB,KAAK,uBAAuB;AAExD,QAAM,iBAAiB,KAAK,kBAAkB,sBAAsB,KAAK,OAAO;AAGhF,QAAM,sBAAsB,MAAM;AAAA,IAChC,aAAa;AAAA,MACX,YAAY,KAAK;AAAA,MACjB,UAAU,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,EAAE,EAAE,KAAK;AAAA,MAC9C,WAAW,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,UAAU,EAAE,KAAK;AAAA,MACxD,OAAO,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAAA,MACtC;AAAA,MACA,YAAY,KAAK,QAAQ,cAAc;AAAA,MACvC;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AAIA,QAAM,QAAgB,CAAC;AACvB,aAAW,WAAW,KAAK,UAAU;AACnC,eAAW,YAAY,KAAK,WAAW;AACrC,iBAAW,QAAQ,OAAO;AACxB,cAAM,KAAK,EAAE,SAAS,UAAU,KAAK,CAAC;AAAA,MACxC;AAAA,IACF;AAAA,EACF;AAEA,QAAM,YAAY,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAC9C,QAAM,OAAoB,CAAC;AAC3B,QAAM,mBAAyC,CAAC;AAChD,QAAM,aAA0B,CAAC;AAGjC,MAAI,SAAS;AACb,iBAAe,SAAwB;AACrC,W
AAO,MAAM;AACX,YAAM,IAAI;AACV,UAAI,KAAK,MAAM,OAAQ;AACvB,YAAM,OAAO,MAAM,CAAC;AACpB,UAAI;AACF,cAAM,SAAS,MAAM,WAAW,IAAI;AACpC,aAAK,KAAK,OAAO,MAAM;AACvB,yBAAiB,KAAK,OAAO,SAAS;AAAA,MACxC,SAAS,KAAK;AACZ,YAAI,eAAe,oBAAoB;AACrC,qBAAW,KAAK,IAAI,MAAM;AAC1B,cAAI,IAAI,UAAW,kBAAiB,KAAK,IAAI,SAAS;AAAA,QACxD,OAAO;AAGL,gBAAM;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,iBAAe,WACb,MAC+D;AAC/D,UAAM,SAAS,KAAK,SAAS,cAAc;AAAA,MACzC,YAAY,KAAK;AAAA,MACjB,OAAO;AAAA;AAAA,MACP,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,MAAM,KAAK;AAAA,IACb,CAAC;AACD,UAAM,gBAAuC;AAAA,MAC3C,YAAY,KAAK;AAAA,MACjB;AAAA,MACA,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,MAAM,KAAK;AAAA,IACb;AACA,UAAM,QAAQ,KAAK,aAAa,aAAa;AAC7C,UAAM,UAAU,eAAe,aAAa;AAE5C,UAAM,UAAU,IAAI,aAAa,OAAO;AAAA,MACtC;AAAA,MACA,KAAK,KAAK;AAAA,MACV,eAAe,KAAK;AAAA,IACtB,CAAC;AAED,UAAM,UAA4B;AAAA,MAChC,GAAG,KAAK;AAAA,MACR;AAAA,MACA,cAAc,EAAE,MAAM;AAAA,IACxB;AAEA,UAAM,MAA6B;AAAA,MACjC;AAAA,MACA,cAAc,KAAK;AAAA,MACnB,SAAS,KAAK,QAAQ;AAAA,MACtB,WAAW,KAAK,QAAQ;AAAA,MACxB,YAAY,KAAK,SAAS;AAAA,MAC1B,cAAc,KAAK,SAAS,QAAQ,CAAC;AAAA,MACrC,MAAM,KAAK;AAAA,MACX;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AAEA,UAAM,YAAY,IAAI;AACtB,QAAI;AACJ,QAAI;AACF,gBAAU,MAAM,KAAK,OAAO,GAAG;AAAA,IACjC,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAE/D,UAAI;AACF,cAAM,QAAQ,SAAS,OAAO;AAAA,MAChC,QAAQ;AAAA,MAER;AACA,YAAM,IAAI,mBAAmB;AAAA,QAC3B;AAAA,QACA,WAAW,KAAK,QAAQ;AAAA,QACxB,YAAY,KAAK,SAAS;AAAA,QAC1B,MAAM,KAAK;AAAA,QACX,QAAQ;AAAA,QACR,OAAO;AAAA,MACT,CAAC;AAAA,IACH;AACA,UAAM,SAAS,IAAI,IAAI;AAEvB,UAAM,kBAAkB,MAAM,kBAAkB,OAAO,OAAO,EAAE,GAAG,WAAW,QAAQ,CAAC;AACvF,QAAI,CAAC,gBAAgB,IAAI;AACvB,cAAQ,oBAAoB;AAAA,QAC1B,KAAK;AACH,gBAAM,IAAI,kBAAkB,eAAe;AAAA,QAC7C,KAAK;AACH,gBAAM,IAAI;AAAA,YACR;AAAA,cACE;AAAA,cACA,WAAW,KAAK,QAAQ;AAAA,cACxB,YAAY,KAAK,SAAS;AAAA,cAC1B,MAAM,KAAK;AAAA,cACX,QAAQ;AAAA,cACR,OAAO,gBAAgB,OAAO,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,IAAI;AAAA,YAC5D;AAAA,YACA;AAAA,UACF;AAAA,QACF,KAAK;AAEH;AAAA,MACJ;AAAA,IACF;AAEA,UAAM,gBAA4B;AAAA,MAChC,KAAK,QAAQ,OAA
O,CAAC;AAAA,IACvB;AACA,QAAI,aAAa,UAAW,eAAc,eAAe,QAAQ;AAAA,QAC5D,eAAc,cAAc,QAAQ;AACzC,QAAI,QAAQ,gBAAgB,OAAW,eAAc,cAAc,QAAQ;AAE3E,UAAM,SAAoB;AAAA,MACxB;AAAA,MACA,cAAc,KAAK;AAAA,MACnB,aAAa,KAAK,QAAQ;AAAA,MAC1B,MAAM,KAAK;AAAA,MACX,OAAO,QAAQ;AAAA,MACf,YAAY,QAAQ;AAAA,MACpB,YAAY,QAAQ;AAAA,MACpB,WAAW,KAAK;AAAA,MAChB;AAAA,MACA,SAAS,QAAQ;AAAA,MACjB,YAAY,QAAQ;AAAA,MACpB,eAAe,QAAQ;AAAA,MACvB,SAAS;AAAA,MACT,aAAa,QAAQ;AAAA,MACrB;AAAA,MACA,YAAY,KAAK,SAAS;AAAA,IAC5B;AACA,WAAO,EAAE,QAAQ,WAAW,gBAAgB;AAAA,EAC9C;AAEA,QAAM,UAAU,MAAM,KAAK,EAAE,QAAQ,KAAK,IAAI,aAAa,MAAM,MAAM,EAAE,GAAG,MAAM,OAAO,CAAC;AAC1F,QAAM,QAAQ,IAAI,OAAO;AAGzB,MAAI;AACJ,MAAI,KAAK,QAAQ;AACf,UAAM,aAAoC;AAAA,MACxC,GAAG,KAAK;AAAA,MACR,YAAY,KAAK,OAAO;AAAA,MACxB,OAAO,aAAa,QAAQ,WAAW;AAAA,MACvC,aAAa,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAAA,MACzC,qBAAqB,uBAAuB;AAAA,IAC9C;AACA,aAAS,MAAM,eAAe,MAAM,UAAU;AAAA,EAChD;AAEA,QAAM,UAAU,IAAI,KAAK,IAAI,CAAC,EAAE,YAAY;AAE5C,SAAO;AAAA,IACL,YAAY,KAAK;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAIA,IAAM,qBAAN,cAAiC,MAAM;AAAA,EAC5B;AAAA,EACA;AAAA,EACT,YAAY,QAAmB,WAAgC;AAC7D,UAAM,QAAQ,OAAO,SAAS,IAAI,OAAO,UAAU,IAAI,OAAO,IAAI,YAAY,OAAO,MAAM,EAAE;AAC7F,SAAK,SAAS;AACd,SAAK,YAAY;AAAA,EACnB;AACF;AAEA,SAAS,sBAAsB,SAA6B;AAC1D,SAAO,CAAC,WAAmD;AACzD,QAAI,CAAC,SAAS;AACZ,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AACA,WAAO,IAAI,0BAA0B;AAAA,MACnC,KAAK,GAAG,OAAO,eAAe,OAAO,KAAK;AAAA,IAC5C,CAAC;AAAA,EACH;AACF;AAEA,SAAS,aAAa,QAAuC;AAG3D,QAAM,OAAO,GAAG,OAAO,UAAU,KAAK,OAAO,SAAS,KAAK,OAAO,UAAU,KAAK,OAAO,IAAI;AAE5F,MAAI,KAAK;AACT,MAAI,KAAK;AACT,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,WAAW,CAAC;AAC3B,SAAK,KAAK,KAAK,KAAK,GAAG,QAAU,MAAM;AACvC,SAAK,KAAK,KAAK,KAAK,GAAG,UAAU,MAAM;AAAA,EACzC;AACA,SAAO,OAAO,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC,GAAG,GAAG,SAAS,EAAE,EAAE,SAAS,GAAG,GAAG,CAAC;AACnF;","names":[]}
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  cohensD
3
- } from "./chunk-R5UQJNKC.js";
3
+ } from "./chunk-4L3WJXQJ.js";
4
4
  import {
5
5
  argHash,
6
6
  groupBy,
@@ -615,4 +615,4 @@ export {
615
615
  iqr,
616
616
  welchsTTest
617
617
  };
618
- //# sourceMappingURL=chunk-K33INZHH.js.map
618
+ //# sourceMappingURL=chunk-GVQT44CS.js.map
@@ -5,7 +5,7 @@ import {
5
5
  import {
6
6
  NotFoundError,
7
7
  ReplayError
8
- } from "./chunk-NG236HPC.js";
8
+ } from "./chunk-QYJT52YW.js";
9
9
 
10
10
  // src/trace-analyst/prompts.ts
11
11
  var TRACE_ANALYST_ACTOR_DESCRIPTION = `You answer questions about an OTLP-shaped JSONL trace dataset using the trace tools provided in the \`traces\` namespace.
@@ -986,6 +986,302 @@ function normalizeRecordArray(value) {
986
986
  );
987
987
  }
988
988
 
989
+ // src/trace-analyst/hook.ts
990
var DEFAULT_QUESTION = "Summarise what happened in this run. Surface any failure modes, surprising findings, or evidence that the run's verdict is wrong.";
/**
 * Builds a run-complete hook that asks the trace analyst about a finished run.
 *
 * The returned async callback, given the run context `ctx`:
 *  - does nothing when `opts.shouldRun` is provided and returns a falsy value;
 *  - appends an `analyst-skip-<runId>` log event and stops when
 *    `opts.analyze.source` is undefined;
 *  - otherwise calls `analyzeTraces` with `opts.question` (falling back to
 *    `DEFAULT_QUESTION`) and the `opts.analyze` options;
 *  - passes the result to `opts.save` when provided;
 *  - appends an `analyst-gate-<runId>` log event carrying `result.findings`
 *    when `opts.gateOn` is provided and returns a falsy value. A failed gate
 *    is only logged here; nothing is thrown.
 *
 * @param opts Hook options: `analyze` (required), plus optional `question`,
 *   `shouldRun`, `save` and `gateOn`.
 * @returns An async function `(ctx) => Promise<void>`.
 */
function traceAnalystOnRunComplete(opts) {
  // Every event written by this hook is a "log" event on the run's store.
  const appendHookLog = (ctx, eventId, payload) => ctx.store.appendEvent({
    eventId,
    runId: ctx.runId,
    kind: "log",
    timestamp: Date.now(),
    payload
  });
  return async (ctx) => {
    const wantsRun = !opts.shouldRun || opts.shouldRun(ctx);
    if (!wantsRun) return;
    const { source } = opts.analyze;
    if (source === undefined) {
      await appendHookLog(ctx, `analyst-skip-${ctx.runId}`, {
        source: "trace_analyst_hook",
        reason: "no source configured"
      });
      return;
    }
    const analystInput = { question: opts.question ?? DEFAULT_QUESTION };
    const analystOptions = { ...opts.analyze, source };
    const result = await analyzeTraces(analystInput, analystOptions);
    if (opts.save) await opts.save(result, ctx);
    const gatePassed = !opts.gateOn || opts.gateOn(result, ctx);
    if (gatePassed) return;
    await appendHookLog(ctx, `analyst-gate-${ctx.runId}`, {
      source: "trace_analyst_hook",
      reason: "analyst_gate_failed",
      findings: result.findings
    });
  };
}
1025
+
1026
+ // src/trace-analyst/insights.ts
1027
/**
 * Generic benchmark/task vocabulary that is never treated as a domain keyword.
 * Entries are lower-case because tokens are lower-cased before the lookup.
 */
var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
  "and",
  "advanced",
  "app",
  "build",
  "create",
  "easy",
  "expert",
  "extreme",
  "for",
  "from",
  "hard",
  "implementation",
  "integrate",
  "medium",
  "project",
  "task",
  "the",
  "this",
  "with",
  "workflow"
]);
/**
 * Splits free text into lower-cased candidate domain words.
 *
 * A token starts with an ASCII letter and continues with letters, digits or
 * `.`, `+`, `#`, `-`, so names such as `node.js`, `c++` and `next-auth` stay
 * in one piece. Trailing `.` and `-` characters are stripped afterwards:
 * without that, the final word of a sentence ("... integration.") kept its
 * punctuation, slipped past the stop-word set and was counted separately from
 * the bare word.
 *
 * @param value Text to tokenize.
 * @returns Tokens in order of appearance (duplicates kept), each at least
 *   three characters long and not contained in `DOMAIN_STOP_WORDS`.
 */
function tokenizeDomainWords(value) {
  const words = [];
  for (const match of value.matchAll(/[A-Za-z][A-Za-z0-9.+#-]{2,}/g)) {
    // `+` and `#` are kept on purpose (c++, c#-style names); only sentence
    // punctuation and dangling hyphens are removed.
    const word = match[0].toLowerCase().replace(/[.-]+$/, "");
    // Re-check the three-character minimum because trimming can shorten a token.
    if (word.length >= 3 && !DOMAIN_STOP_WORDS.has(word)) words.push(word);
  }
  return words;
}
1052
/**
 * Infers up to 18 domain keywords for a benchmark suite.
 *
 * Words are collected from the suite name and collection id plus every task's
 * id, name, prompt, difficulty, tags and gaps. A word is kept when it occurs
 * at least twice in that text or when it appears in the suite name or
 * collection id. Results are ordered by descending frequency, ties broken
 * with `localeCompare`.
 *
 * @param suite Suite with `name`, optional `collectionId` and a `tasks` array.
 * @returns Keyword strings, most frequent first.
 */
function inferDomainKeywords(suite) {
  const collectionId = suite.collectionId ?? "";
  const suiteWords = new Set(tokenizeDomainWords(`${suite.name} ${collectionId}`));

  // Gather every text fragment that may carry domain vocabulary.
  const fragments = [suite.name, collectionId];
  for (const task of suite.tasks) {
    fragments.push(
      task.id,
      task.name,
      task.prompt ?? "",
      task.difficulty ?? "",
      ...(task.tags ?? []),
      ...(task.gaps ?? [])
    );
  }

  const frequency = /* @__PURE__ */ new Map();
  tokenizeDomainWords(fragments.join(" ")).forEach((word) => {
    frequency.set(word, (frequency.get(word) ?? 0) + 1);
  });

  const byCountThenWord = (left, right) => right[1] - left[1] || left[0].localeCompare(right[0]);
  const ranked = Array.from(frequency.entries())
    .filter(([word, count]) => count >= 2 || suiteWords.has(word))
    .sort(byCountThenWord);
  return ranked.map(([word]) => word).slice(0, 18);
}
1070
/**
 * Builds a case-insensitive pattern that finds any of the given keywords as a
 * standalone term, i.e. not directly preceded or followed by an ASCII letter
 * or digit.
 *
 * Keywords shorter than three characters are ignored and duplicates are
 * collapsed. Alternatives are ordered longest-first so that a keyword which
 * extends another one (`node.js` vs `node`) is the one reported by
 * `exec`/`match`; in input order the shorter keyword would shadow it. Whether
 * a string matches at all (`test`) does not depend on that order.
 *
 * @param keywords Keyword strings; regex metacharacters are escaped.
 * @returns A `RegExp` for the keywords, or a generic integration-vocabulary
 *   pattern when no usable keyword remains.
 */
function domainEvidencePattern(keywords) {
  const usable = [...new Set(keywords.filter((keyword) => keyword.length >= 3))];
  if (usable.length === 0) {
    return /(?<![A-Za-z0-9])(?:sdk|api|css|dns|xml|provider|client|service|integration|webhook|transaction|auth|oauth|graphql|rest)(?![A-Za-z0-9])/i;
  }
  // `usable` is a private copy, so the in-place sort cannot affect the caller.
  const alternatives = usable
    .sort((a, b) => b.length - a.length)
    .map((keyword) => keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  return new RegExp(`(?<![A-Za-z0-9])(?:${alternatives.join("|")})(?![A-Za-z0-9])`, "i");
}
1074
/**
 * Produces a one-sentence description of what a suite covers.
 *
 * The sentence names the task count followed by, in order of preference:
 * the (up to eight) most frequent task tags, the distinct task difficulties,
 * or the literal fallback "the selected benchmark scope".
 *
 * @param suite Suite whose `tasks` may carry `tags` and `difficulty`.
 * @returns Sentence of the form "<n> implementation task(s) across <scope>.".
 */
function describeTraceInsightScope(suite) {
  const taskCount = suite.tasks.length;
  const taskLabel = taskCount === 1 ? "1 implementation task" : `${taskCount} implementation tasks`;

  const tagCounts = /* @__PURE__ */ new Map();
  for (const task of suite.tasks) {
    for (const tag of task.tags ?? []) {
      const seenSoFar = tagCounts.get(tag) ?? 0;
      tagCounts.set(tag, seenSoFar + 1);
    }
  }

  if (tagCounts.size > 0) {
    // Most frequent tags first; alphabetical order breaks ties.
    const byCountThenTag = (left, right) => right[1] - left[1] || left[0].localeCompare(right[0]);
    const topTags = Array.from(tagCounts.entries())
      .sort(byCountThenTag)
      .slice(0, 8)
      .map(([tag]) => tag);
    return `${taskLabel} across ${topTags.join(", ")}.`;
  }

  // No tags anywhere: describe the scope through the distinct difficulties.
  const distinctDifficulties = /* @__PURE__ */ new Set();
  for (const task of suite.tasks) {
    if (task.difficulty) distinctDifficulties.add(task.difficulty);
  }
  const difficulties = Array.from(distinctDifficulties).join(", ");
  return `${taskLabel} across ${difficulties || "the selected benchmark scope"}.`;
}
1089
/**
 * Plans the investigation questions for a trace-insight analysis.
 *
 * Five core questions are always returned. A "reviewer-lift" question is
 * added when any task gap mentions shot/review/retry/continue, and an
 * "optimization-targets" question is added when any task has an outcome
 * other than "satisfied".
 *
 * @param input Object carrying `suite.tasks`.
 * @returns A new array of `{ id, question, why }` entries.
 */
function planTraceInsightQuestions(input) {
  const { tasks } = input.suite;
  const sawFailure = tasks.some((task) => Boolean(task.outcome) && task.outcome !== "satisfied");
  const multiShotHint = /shot|review|retry|continue/i;
  const sawMultipleShots = tasks.some(
    (task) => (task.gaps ?? []).some((gap) => multiShotHint.test(gap))
  );

  const coreQuestions = [
    {
      id: "execution-path",
      question: "What did the worker actually do before the first meaningful implementation edit?",
      why: "Separates grounded execution from polished but shallow output."
    },
    {
      id: "research-grounding",
      question: "Did the worker inspect docs, source, examples, or package references before committing to an implementation path?",
      why: "Identifies whether failures came from weak retrieval, weak examples, or premature coding."
    },
    {
      id: "domain-proof",
      question: "Which tasks produced executable domain proof versus UI copy, placeholders, or inferred behavior?",
      why: "Keeps product-quality claims tied to concrete evidence."
    },
    {
      id: "root-cause",
      question: "For each major failure cluster, is the likely root cause prompt/scaffold, docs/examples, SDK/API ergonomics, evaluator, runtime, or model behavior?",
      why: "Turns trace observations into actionable ownership."
    },
    {
      id: "evidence-quality",
      question: "Which external-facing claims are directly supported by trace ids, span ids, verifier findings, reviewer notes, or generated code?",
      why: "Prevents unsupported customer-report conclusions."
    }
  ];

  // Conditional questions keep their fixed order: reviewer lift, then optimization.
  const reviewerLift = sawMultipleShots ? [
    {
      id: "reviewer-lift",
      question: "Where did reviewer feedback improve score, stall, or regress across shots?",
      why: "Shows whether the driver loop is learning or merely repeating work."
    }
  ] : [];
  const optimizationTargets = sawFailure ? [
    {
      id: "optimization-targets",
      question: "Which prompt, evaluator, scaffold, or workflow changes should feed the next GEPA/autoresearch optimization run?",
      why: "Connects benchmark evidence to the optimization loop."
    }
  ] : [];

  return [...coreQuestions, ...reviewerLift, ...optimizationTargets];
}
1137
/**
 * Assembles everything the trace-insight prompt and readiness score need.
 *
 * @param input Object with `suite` and optional `findings`, `agent`, `totals`.
 * @returns Context holding the suite, its scope sentence, inferred keywords,
 *   planned questions and the default analyst panel. Missing `findings`
 *   become `[]`; missing `agent`/`totals` become `null`.
 */
function buildTraceInsightContext(input) {
  const { suite } = input;
  const scope = describeTraceInsightScope(suite);
  const keywords = inferDomainKeywords(suite);
  const questions = planTraceInsightQuestions(input);
  const panel = defaultTraceInsightPanel();
  return {
    suite,
    scope,
    keywords,
    questions,
    panel,
    findings: input.findings ?? [],
    agent: input.agent ?? null,
    totals: input.totals ?? null
  };
}
1149
/**
 * Scores how ready a trace-insight context is to back a report.
 *
 * Four gates are evaluated. Each failed gate subtracts a penalty by severity
 * (critical 35, high 20, medium 10, anything else 5) from 100; the score is
 * the remainder divided by 100, clamped to [0, 1]. Grades: `>= 0.9`
 * "external-ready", `>= 0.7` "internal-review", otherwise "raw-analysis".
 *
 * A task counts as failed when it has an outcome other than "satisfied".
 * The "gap-evidence" gate requires at least half of the *failed* tasks to
 * carry gaps. It previously compared the number of all tasks with gaps
 * against the number of failed tasks, so gaps recorded on satisfied tasks
 * could pass the gate even when no failed task had any gap evidence.
 *
 * @param context Context with `suite.tasks`, `findings`, `keywords`, `panel`
 *   and `questions` (the shape produced by `buildTraceInsightContext`).
 * @returns `{ score, grade, gates }`, where each gate has `id`, `label`,
 *   `passed`, `severity` and `detail`.
 */
function scoreTraceInsightReadiness(context) {
  const hasGaps = (task) => (task.gaps ?? []).length > 0;
  const failedTasks = context.suite.tasks.filter(
    (task) => task.outcome && task.outcome !== "satisfied"
  );
  const findingTaskIds = new Set(context.findings.flatMap((finding) => finding.taskIds));
  const failedTasksWithFindings = failedTasks.filter((task) => findingTaskIds.has(task.id));
  const tasksWithGaps = context.suite.tasks.filter(hasGaps);
  // Numerator and denominator of the gap gate must describe the same population.
  const failedTasksWithGaps = failedTasks.filter(hasGaps);
  const gates = [
    {
      id: "domain-context",
      label: "Domain context inferred",
      passed: context.keywords.length > 0,
      severity: "high",
      detail: context.keywords.length > 0 ? `${context.keywords.length} domain terms inferred: ${context.keywords.slice(0, 8).join(", ")}` : "No domain terms were inferred from suite, tasks, prompts, tags, or gaps."
    },
    {
      id: "panel-coverage",
      label: "Analyst panel planned",
      passed: context.panel.length >= 4 && context.questions.length >= 5,
      severity: "high",
      detail: `${context.panel.length} panel roles and ${context.questions.length} investigation questions planned.`
    },
    {
      id: "failure-coverage",
      label: "Failures mapped to findings",
      passed: failedTasks.length === 0 || failedTasksWithFindings.length / failedTasks.length >= 0.5,
      severity: "critical",
      detail: failedTasks.length === 0 ? "No failed tasks in suite." : `${failedTasksWithFindings.length}/${failedTasks.length} failed tasks appear in finding clusters.`
    },
    {
      id: "gap-evidence",
      label: "Task gaps captured",
      passed: failedTasks.length === 0 || failedTasksWithGaps.length / failedTasks.length >= 0.5,
      severity: "medium",
      detail: failedTasks.length === 0 ? `${tasksWithGaps.length} tasks include explicit evaluator or analyst gaps.` : `${failedTasksWithGaps.length}/${failedTasks.length} failed tasks include explicit evaluator or analyst gaps.`
    }
  ];
  const penalty = gates.reduce((sum, gate) => {
    if (gate.passed) return sum;
    switch (gate.severity) {
      case "critical":
        return sum + 35;
      case "high":
        return sum + 20;
      case "medium":
        return sum + 10;
      default:
        return sum + 5;
    }
  }, 0);
  const score = Math.max(0, Math.min(1, 1 - penalty / 100));
  return {
    score,
    grade: score >= 0.9 ? "external-ready" : score >= 0.7 ? "internal-review" : "raw-analysis",
    gates
  };
}
1200
/**
 * Returns the default four-role analyst panel.
 *
 * @returns A new array on every call; each role has `id`, `name` and
 *   `responsibility`, in the order forensics, root cause, optimization,
 *   external evidence.
 */
function defaultTraceInsightPanel() {
  // [id, name, responsibility] per role, expanded into objects below.
  const roles = [
    [
      "trace-forensics",
      "Trace Forensics",
      "Reconstruct what the worker did in order, including research, edits, reviewer interventions, verifier feedback, and stop reason."
    ],
    [
      "root-cause",
      "Root Cause",
      "Map failures to prompt/scaffold, docs/examples, SDK/API/product ergonomics, evaluator, runtime, or model behavior."
    ],
    [
      "optimization",
      "Optimization",
      "Identify prompt, reviewer, evaluator, scaffold, and GEPA/autoresearch changes that should be tested next."
    ],
    [
      "external-evidence",
      "External Evidence",
      "Separate customer-safe claims from internal harness findings and reject conclusions without task, trace, span, code, reviewer, or verifier evidence."
    ]
  ];
  return roles.map(([id, name, responsibility]) => ({ id, name, responsibility }));
}
1224
/**
 * Renders the analyst prompt for a benchmark run.
 *
 * The prompt lists the planned investigation questions and panel roles, the
 * required output sections, an inspection budget, and a JSON run summary
 * (suite name, scope, inferred keywords, agent, totals, condensed findings
 * and every task whose outcome is set and not "satisfied").
 *
 * @param input Same input as `buildTraceInsightContext`, plus optional
 *   `maxRepresentativeTraces` (defaults to 6) used in the budget section.
 * @returns The prompt text.
 */
function buildTraceInsightPrompt(input) {
  const context = buildTraceInsightContext(input);
  const maxRepresentativeTraces = input.maxRepresentativeTraces ?? 6;

  const investigationPlan = context.questions
    .map((item, index) => `${index + 1}. ${item.question} (${item.why})`)
    .join("\n");
  const panelRoster = context.panel
    .map((role) => `- ${role.name}: ${role.responsibility}`)
    .join("\n");

  // Findings are condensed to counts and classes; failed tasks keep their gaps.
  const findingSummaries = context.findings.map((finding) => ({
    kind: finding.kind,
    severity: finding.severity,
    taskCount: finding.taskIds.length,
    proposedFixClass: finding.proposedFixClass
  }));
  const failureSummaries = input.suite.tasks
    .filter((task) => task.outcome && task.outcome !== "satisfied")
    .map((task) => ({
      task: task.id,
      difficulty: task.difficulty,
      outcome: task.outcome,
      score: task.score,
      gaps: task.gaps ?? []
    }));
  const runSummary = JSON.stringify(
    {
      suite: input.suite.name,
      scope: context.scope,
      inferredKeywords: context.keywords,
      agent: context.agent,
      totals: context.totals,
      findings: findingSummaries,
      failures: failureSummaries
    },
    null,
    2
  );

  return `Analyze this benchmark run and produce evidence-backed trace intelligence.

Audience:
- internal AI/product leadership
- possible customer-facing report for ${input.suite.name}

Investigation plan:
${investigationPlan}

Analyst panel:
${panelRoster}

If the task branches are independent, use subagents for the panel roles above and aggregate their findings. Do not run a panel role unless its answer will change the final report.

Required output:
1. Executive verdict: what this run proves and does not prove.
2. The investigation questions you answered and the evidence used.
3. Failure taxonomy: agent prompting, evaluator/harness, docs/examples, SDK/API/product integration, infra.
4. Evidence-backed examples with trace ids/task ids and concrete verifier findings.
5. Highest-ROI fixes for the benchmark harness, prompt/GEPA optimization, and customer-facing product/docs surface.
6. What is safe for an external report versus what must stay internal.
7. One rerun plan that would validate lift after optimization.

Budget:
- Inspect the dataset overview, the failure summary, and at most ${maxRepresentativeTraces} representative traces.
- Prefer traces named in the failure summary over broad exploration.
- Do not do exhaustive trace sweeps.
- Return the final report as soon as the taxonomy and examples are supported.

Run summary:
${runSummary}

Use the trace tools. Do not invent facts. Cite task ids. Separate customer-facing claims from internal harness/model findings.`;
}
1284
+
989
1285
  // src/trace/store.ts
990
1286
  var InMemoryTraceStore = class {
991
1287
  runs = /* @__PURE__ */ new Map();
@@ -1545,6 +1841,16 @@ export {
1545
1841
  buildTraceAnalystTools,
1546
1842
  traceAnalystFunctionGroup,
1547
1843
  analyzeTraces,
1844
+ traceAnalystOnRunComplete,
1845
+ tokenizeDomainWords,
1846
+ inferDomainKeywords,
1847
+ domainEvidencePattern,
1848
+ describeTraceInsightScope,
1849
+ planTraceInsightQuestions,
1850
+ buildTraceInsightContext,
1851
+ scoreTraceInsightReadiness,
1852
+ defaultTraceInsightPanel,
1853
+ buildTraceInsightPrompt,
1548
1854
  InMemoryTraceStore,
1549
1855
  FileSystemTraceStore,
1550
1856
  OTEL_AGENT_EVAL_SCOPE,
@@ -1558,4 +1864,4 @@ export {
1558
1864
  createReplayFetch,
1559
1865
  iterateRawCalls
1560
1866
  };
1561
- //# sourceMappingURL=chunk-UW4NOOZI.js.map
1867
+ //# sourceMappingURL=chunk-HIO4UIS5.js.map