@tangle-network/agent-eval 0.59.1 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/CHANGELOG.md +21 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/http.js +1 -1
  4. package/dist/adapters/langchain.d.ts +1 -1
  5. package/dist/adapters/langchain.js +1 -1
  6. package/dist/adapters/otel.d.ts +5 -5
  7. package/dist/adapters/otel.js +1 -1
  8. package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
  9. package/dist/benchmarks/index.d.ts +3 -3
  10. package/dist/benchmarks/index.js +2 -2
  11. package/dist/builder-eval/index.js +3 -3
  12. package/dist/campaign/index.d.ts +153 -9
  13. package/dist/campaign/index.js +229 -23
  14. package/dist/campaign/index.js.map +1 -1
  15. package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
  16. package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
  17. package/dist/chunk-3BFEG2F6.js.map +1 -0
  18. package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
  19. package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
  20. package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
  21. package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
  22. package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
  23. package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
  24. package/dist/{chunk-N4SBKEPJ.js → chunk-GMXHLSLL.js} +107 -2
  25. package/dist/chunk-GMXHLSLL.js.map +1 -0
  26. package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
  27. package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
  28. package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
  29. package/dist/{chunk-74Y2EMNH.js → chunk-OLULBECP.js} +18 -6
  30. package/dist/chunk-OLULBECP.js.map +1 -0
  31. package/dist/chunk-PQV2TKC3.js +27 -0
  32. package/dist/chunk-PQV2TKC3.js.map +1 -0
  33. package/dist/chunk-PZ5AY32C.js +10 -0
  34. package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
  35. package/dist/chunk-SHTXZ4O2.js +113 -0
  36. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  37. package/dist/{chunk-JB4UWIM6.js → chunk-SUGME4OT.js} +266 -15
  38. package/dist/chunk-SUGME4OT.js.map +1 -0
  39. package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
  40. package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
  41. package/dist/cli.js +4 -4
  42. package/dist/contract/index.d.ts +48 -16
  43. package/dist/contract/index.js +59 -19
  44. package/dist/contract/index.js.map +1 -1
  45. package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
  46. package/dist/control.d.ts +5 -5
  47. package/dist/control.js +4 -4
  48. package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
  49. package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
  50. package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
  51. package/dist/governance/index.d.ts +3 -3
  52. package/dist/governance/index.js +1 -1
  53. package/dist/hosted/index.d.ts +5 -5
  54. package/dist/hosted/index.js +1 -1
  55. package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
  56. package/dist/{index-D2nT6_KT.d.ts → index-D9dwa00f.d.ts} +2 -2
  57. package/dist/index.d.ts +24 -132
  58. package/dist/index.js +23 -36
  59. package/dist/index.js.map +1 -1
  60. package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
  61. package/dist/knowledge/index.js +1 -1
  62. package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
  63. package/dist/matrix/index.js +1 -1
  64. package/dist/meta-eval/index.d.ts +3 -3
  65. package/dist/meta-eval/index.js +1 -1
  66. package/dist/multishot/index.js +1 -1
  67. package/dist/openapi.json +1 -1
  68. package/dist/pipelines/index.js +4 -4
  69. package/dist/prm/index.js +1 -1
  70. package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-D0WeCXt1.d.ts} +208 -6
  71. package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
  72. package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
  73. package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
  74. package/dist/reporting.d.ts +6 -6
  75. package/dist/reporting.js +5 -5
  76. package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
  77. package/dist/rl.d.ts +9 -9
  78. package/dist/rl.js +8 -8
  79. package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
  80. package/dist/run-campaign-HXPJAUZ3.js +10 -0
  81. package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
  82. package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
  83. package/dist/telemetry/file.js +1 -1
  84. package/dist/telemetry/index.js +1 -1
  85. package/dist/traces.d.ts +2 -2
  86. package/dist/traces.js +4 -4
  87. package/dist/{types-BgrxOJSf.d.ts → types-Beb6KPqZ.d.ts} +52 -4
  88. package/dist/wire/index.d.ts +3 -3
  89. package/dist/wire/index.js +4 -4
  90. package/package.json +1 -1
  91. package/dist/chunk-74Y2EMNH.js.map +0 -1
  92. package/dist/chunk-JB4UWIM6.js.map +0 -1
  93. package/dist/chunk-N4SBKEPJ.js.map +0 -1
  94. package/dist/chunk-NSBPE2FW.js +0 -17
  95. package/dist/chunk-QYJT52YW.js.map +0 -1
  96. package/dist/chunk-ZWEQJIM6.js +0 -220
  97. package/dist/chunk-ZWEQJIM6.js.map +0 -1
  98. package/dist/run-campaign-ZURVWMMI.js +0 -10
  99. /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
  100. /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
  101. /package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
  102. /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
  103. /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
  104. /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
  105. /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
  106. /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
  107. /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
  108. /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
  109. /package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
  110. /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
  111. /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
  112. /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
  113. /package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0
package/CHANGELOG.md CHANGED
@@ -4,6 +4,27 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
4
4
 
5
5
  ---
6
6
 
7
+ ## [0.61.0] — 2026-05-30 — `runProfileMatrix` (profile × scenario × persona matrix with integrity by construction)
8
+
9
+ ### Added
10
+
11
+ - **`runProfileMatrix({ profiles, scenarios, dispatch, judges, reps, integrity, personaOf })`** (`@tangle-network/agent-eval/campaign`) — the keystone that lets a consumer express a multi-profile × scenario/persona eval as **one** call instead of a hand-rolled `eval:*` script. Fans `profiles` (axis 3) over the scenario/persona corpus (axis 1), runs `runCampaign` per profile (reusing its seeds / reps / bootstrap CIs / resumability / `LabeledScenarioStore` capture flywheel), maps every cell to a validated `RunRecord` carrying real `tokenUsage`, and runs **`assertRealBackend` by construction** (`integrity: 'assert' | 'warn' | 'off'`, default `assert`). Returns `{ records, byProfile, byScenario, byPersona, integrity, campaigns }`.
12
+ - **`ProfileMatrixError`** — thrown at preflight (before any LLM spend) when a profile's model lacks a snapshot version or the profile/scenario lists are empty.
13
+
14
+ ### Fixed / closed gap
15
+
16
+ - **Token usage is now captured by `runCampaign`** — `CampaignCostMeter` gains `observeTokens(usage)` + `tokens()`, and `CampaignCellResult` gains `tokenUsage`. Previously a campaign cell carried `costUsd` but no token counts, so `assertRealBackend`/`summarizeBackendIntegrity` (which key on `tokenUsage`) could not run on a `CampaignResult`. This closes the integrity gap for **every** campaign consumer, not just `runProfileMatrix`.
17
+
18
+ ### Why this matters
19
+
20
+ A fleet eval-surface audit found every consumer hand-rolls the same matrix→dispatch→`RunRecord`→integrity bridge as a bespoke script, because no primitive produced integrity-checkable `RunRecord`s from a profile matrix. `runProfileMatrix` is that bridge, once — so the adoption skills can mandate "one matrix harness, axes as flags" with a real primitive behind it. Consumers wrap their runtime (e.g. agent-runtime `runLoop` + `reportLoopUsage`) in `dispatch`; the integrity guard then sees real LLM activity.
21
+
22
+ ### Notes
23
+
24
+ Purely additive surface (the `CampaignCostMeter` additions are new optional methods). 7 new tests under `tests/campaign/run-profile-matrix.test.ts` — the keystone being the **stub→throws** regression (a zero-token dispatch fails the matrix loudly instead of reporting a clean 0/N leaderboard). Full suite 1527/1527 green.
25
+
26
+ ---
27
+
7
28
  ## [0.53.0] — 2026-05-27 — prior-period comparison ("did my last change help?")
8
29
 
9
30
  ### Added
@@ -1,4 +1,4 @@
1
- import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-BgrxOJSf.js';
1
+ import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-Beb6KPqZ.js';
2
2
 
3
3
  /**
4
4
  * # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.
@@ -1,4 +1,4 @@
1
- import "../chunk-NSBPE2FW.js";
1
+ import "../chunk-PZ5AY32C.js";
2
2
 
3
3
  // src/adapters/http.ts
4
4
  function resolveAuth(auth) {
@@ -1,4 +1,4 @@
1
- import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-BgrxOJSf.js';
1
+ import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-Beb6KPqZ.js';
2
2
 
3
3
  /**
4
4
  * # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain
@@ -1,4 +1,4 @@
1
- import "../chunk-NSBPE2FW.js";
1
+ import "../chunk-PZ5AY32C.js";
2
2
 
3
3
  // src/adapters/langchain.ts
4
4
  function langchainDispatch(opts) {
@@ -1,8 +1,8 @@
1
- import { T as TraceSpanEvent, H as HostedClient } from '../index-D2nT6_KT.js';
2
- import '../types-BgrxOJSf.js';
3
- import '../summary-report-DLxh4yWk.js';
4
- import '../run-record-etiCMsUq.js';
5
- import '../errors-mje_cKOs.js';
1
+ import { T as TraceSpanEvent, H as HostedClient } from '../index-D9dwa00f.js';
2
+ import '../types-Beb6KPqZ.js';
3
+ import '../summary-report-BQvXpvaR.js';
4
+ import '../run-record-DgUVo5pw.js';
5
+ import '../errors-Dwqw-T_m.js';
6
6
  import '../schema-m0gsnbt3.js';
7
7
  import '../failure-cluster-CL7IVgkJ.js';
8
8
  import '../store-CKUAgsJz.js';
@@ -1,4 +1,4 @@
1
- import "../chunk-NSBPE2FW.js";
1
+ import "../chunk-PZ5AY32C.js";
2
2
 
3
3
  // src/adapters/otel.ts
4
4
  var OTEL_STATUS_UNSET = 0;
@@ -0,0 +1,114 @@
1
+ import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
2
+ import { R as RunRecord } from './run-record-DgUVo5pw.js';
3
+
4
+ /**
5
+ * Backend-integrity guard: distinguish "agent failed" from "eval ran against
6
+ * a stub / unconfigured backend." Without this guard a canonical eval can
7
+ * silently report `0/N passed` and look like an agent-quality problem when
8
+ * the LLM was never actually called — the failure mode we just hit running
9
+ * the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104
10
+ * char strings; gtm/creative defaulted to a cli-bridge that wasn't running).
11
+ *
12
+ * The shape:
13
+ *
14
+ * const report = summarizeBackendIntegrity(records)
15
+ * assertRealBackend(records) // throws BackendIntegrityError if 100% stub
16
+ *
17
+ * A record is "stub-mode" if its `tokenUsage.input === 0 && tokenUsage.output === 0`.
18
+ * (`costUsd` alone is unreliable — some backends successfully call LLMs but
19
+ * don't propagate pricing, producing real tokens with $0 cost.)
20
+ *
21
+ * Verdicts:
22
+ * - `real` — every record has nonzero token usage
23
+ * - `stub` — every record is stub-mode (eval ran blind)
24
+ * - `mixed` — some records real, some stub (partial backend failure;
25
+ * often the 429-cascade or auth-half-failed case)
26
+ */
27
+
28
+ interface BackendIntegrityReport {
29
+ /** Total records inspected. */
30
+ totalRecords: number;
31
+ /** Records with input=0 AND output=0 (a stub fingerprint). */
32
+ stubRecords: number;
33
+ /** Records with nonzero token usage (real LLM activity). */
34
+ realRecords: number;
35
+ /** Records where output>0 but costUsd=0 (real LLM, broken cost ledger). */
36
+ uncostedRecords: number;
37
+ /** Sum of input tokens across all records. */
38
+ totalInputTokens: number;
39
+ /** Sum of output tokens across all records. */
40
+ totalOutputTokens: number;
41
+ /** Sum of costUsd across all records. */
42
+ totalCostUsd: number;
43
+ /** Worst-case integrity verdict. */
44
+ verdict: 'real' | 'mixed' | 'stub';
45
+ /** Human-readable diagnosis suitable for terminal output. */
46
+ diagnosis: string;
47
+ }
48
+ /**
49
+ * Error thrown when an integrity assertion fails. Caller can pattern-match
50
+ * by `code === 'AGENT_EVAL_BACKEND_STUB'` to differentiate from other
51
+ * errors.
52
+ */
53
+ declare class BackendIntegrityError extends AgentEvalError {
54
+ readonly report: BackendIntegrityReport;
55
+ constructor(message: string, report: BackendIntegrityReport);
56
+ }
57
+ /**
58
+ * Inspect a batch of RunRecords and return an integrity report. Pure
59
+ * function — no I/O, no logging. The caller decides what to do with the
60
+ * verdict (print warning, throw, gate CI, etc.).
61
+ */
62
+ declare function summarizeBackendIntegrity(records: ReadonlyArray<RunRecord>): BackendIntegrityReport;
63
+ /**
64
+ * Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record
65
+ * shows zero LLM activity. Strict callers can pass `{ allowMixed: false }`
66
+ * to also reject mixed verdicts (recommended for CI gates).
67
+ *
68
+ * Real backends pass through silently.
69
+ */
70
+ declare function assertRealBackend(records: ReadonlyArray<RunRecord>, opts?: {
71
+ allowMixed?: boolean;
72
+ }): BackendIntegrityReport;
73
+
74
+ /**
75
+ * @stable
76
+ *
77
+ * AgentProfile — the eval harness's unit of variation.
78
+ *
79
+ * A profile pins everything that changes agent behaviour for a benchmark
80
+ * cell: the model, the active skills, the prompt version, the available
81
+ * tools. Vary the profile — swap a model, add a skill — and re-run the suite
82
+ * to benchmark the change. The scorecard keys a cell on
83
+ * `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
84
+ * inside the profile, and two profiles with the same model but different
85
+ * skills are different cells.
86
+ *
87
+ * `agentProfileHash` is the profile's behaviour identity. Two profiles that
88
+ * produce the same agent behaviour share a hash (and a scorecard cell);
89
+ * reordering `skills` or `tools` does not change it; the human-facing `id`
90
+ * label does not affect it.
91
+ */
92
+ interface AgentProfile {
93
+ /** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */
94
+ id: string;
95
+ /** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. */
96
+ model: string;
97
+ /** Skill ids/versions active in this profile — the primary behaviour lever. */
98
+ skills?: string[];
99
+ /** Prompt version identifier. */
100
+ promptVersion?: string;
101
+ /** Tool ids available to the agent. */
102
+ tools?: string[];
103
+ /** Any other behaviour-bearing knobs that should fingerprint into the hash. */
104
+ metadata?: Record<string, string | number | boolean>;
105
+ }
106
+ /**
107
+ * Deterministic behaviour identity of a profile — a sha256 over the
108
+ * behaviour-bearing fields. `skills` and `tools` are order-insensitive; the
109
+ * `id` label is excluded. Throws on a profile with no `model` — an unkeyable
110
+ * profile must fail loud rather than collapse into a blank-model cell.
111
+ */
112
+ declare function agentProfileHash(profile: AgentProfile): string;
113
+
114
+ export { type AgentProfile as A, type BackendIntegrityReport as B, BackendIntegrityError as a, agentProfileHash as b, assertRealBackend as c, summarizeBackendIntegrity as s };
@@ -1,4 +1,4 @@
1
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-wlaiph9Y.js';
2
- import '../run-record-etiCMsUq.js';
3
- import '../errors-mje_cKOs.js';
1
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-Bvk35ils.js';
2
+ import '../run-record-DgUVo5pw.js';
3
+ import '../errors-Dwqw-T_m.js';
4
4
  import '../schema-m0gsnbt3.js';
@@ -2,8 +2,8 @@ import {
2
2
  BENCHMARK_SPLIT_SEED,
3
3
  deterministicSplit,
4
4
  routing_exports
5
- } from "../chunk-MHQPVHXU.js";
6
- import "../chunk-NSBPE2FW.js";
5
+ } from "../chunk-6QDKWHLS.js";
6
+ import "../chunk-PZ5AY32C.js";
7
7
  export {
8
8
  BENCHMARK_SPLIT_SEED,
9
9
  deterministicSplit,
@@ -1,7 +1,7 @@
1
1
  import {
2
2
  SandboxHarness,
3
3
  runTestGradedScenario
4
- } from "../chunk-YTMXBHFM.js";
4
+ } from "../chunk-T375SUOZ.js";
5
5
  import {
6
6
  judgeSpans
7
7
  } from "../chunk-47X6LRCE.js";
@@ -9,8 +9,8 @@ import "../chunk-5BKGXME7.js";
9
9
  import {
10
10
  TraceEmitter
11
11
  } from "../chunk-TVVP3ZZQ.js";
12
- import "../chunk-QYJT52YW.js";
13
- import "../chunk-NSBPE2FW.js";
12
+ import "../chunk-3BFEG2F6.js";
13
+ import "../chunk-PZ5AY32C.js";
14
14
 
15
15
  // src/builder-eval/builder-session.ts
16
16
  var BuilderSession = class {
@@ -1,14 +1,20 @@
1
- export { C as CampaignStorage, D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverConstraints, a as GepaDriverOptions, H as HeldOutGateOptions, O as OpenAutoPrOptions, b as OpenAutoPrResult, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, e as RunImprovementLoopResult, f as RunOptimizationOptions, g as RunOptimizationResult, h as composeGate, i as countSentenceEdits, j as defaultProductionGate, k as evolutionaryDriver, l as extractH2Sections, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, q as openAutoPr, r as runCampaign, s as runEval, t as runImprovementLoop, u as runOptimization, v as surfaceHash } from '../run-improvement-loop-BhfdjrMY.js';
2
- import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, f as LabelTrust, C as CodeSurface } from '../types-BgrxOJSf.js';
3
- export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, b as DispatchContext, D as DispatchFn, G as Gate, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, I as ImprovementDriver, r as JudgeAggregate, a as JudgeConfig, s as JudgeDimension, J as JudgeScore, t as LabeledScenarioSource, M as MutableSurface, u as Mutator, O as OptimizerConfig, P as ProposeContext, R as RedactionStatus, S as Scenario, v as ScenarioAggregate, w as SessionScript, T as TraceSpan, x as labelTrustRank } from '../types-BgrxOJSf.js';
4
- import '../llm-client-BXVRUZyX.js';
5
- import '../errors-mje_cKOs.js';
1
+ import { C as CampaignStorage } from '../provenance-D0WeCXt1.js';
2
+ export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, E as EmitLoopProvenanceArgs, a as EmitLoopProvenanceResult, b as EvolutionaryDriverOptions, G as GepaDriverConstraints, c as GepaDriverOptions, H as HeldOutGateOptions, L as LoopProvenanceBackend, d as LoopProvenanceCandidate, e as LoopProvenanceRecord, O as OpenAutoPrOptions, f as OpenAutoPrResult, R as RunCampaignOptions, g as RunEvalOptions, h as RunImprovementLoopOptions, i as RunImprovementLoopResult, j as RunOptimizationOptions, k as RunOptimizationResult, l as buildLoopProvenanceRecord, m as composeGate, n as countSentenceEdits, o as defaultProductionGate, p as defaultRenderDiff, q as emitLoopProvenance, r as evolutionaryDriver, s as extractH2Sections, t as fsCampaignStorage, u as gepaDriver, v as heldOutGate, w as inMemoryCampaignStorage, x as loopProvenanceSpans, y as openAutoPr, z as provenanceRecordPath, A as provenanceSpansPath, F as runCampaign, I as runEval, J as runImprovementLoop, K as runOptimization, M as surfaceContentHash, N as surfaceHash } from '../provenance-D0WeCXt1.js';
3
+ import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, f as LabelTrust, S as Scenario, b as DispatchContext, a as JudgeConfig, g as LabeledScenarioSource, C as CampaignResult, h as CodeSurface } from '../types-Beb6KPqZ.js';
4
+ export { i as CampaignAggregates, j as CampaignArtifactWriter, k as CampaignCellResult, l as CampaignCostMeter, m as CampaignTokenUsage, n as CampaignTraceWriter, D as DispatchFn, G as Gate, o as GateContext, p as GateDecision, q as GateResult, r as GenerationCandidate, s as GenerationRecord, I as ImprovementDriver, t as JudgeAggregate, u as JudgeDimension, J as JudgeScore, M as MutableSurface, v as Mutator, O as OptimizerConfig, P as ProposeContext, w as ProposedCandidate, R as RedactionStatus, x as ScenarioAggregate, y as SessionScript, T as TraceSpan, z as isProposedCandidate, A as labelTrustRank } from '../types-Beb6KPqZ.js';
5
+ import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-9J9hxdm2.js';
6
+ import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
7
+ import { a as RunSplitTag, R as RunRecord } from '../run-record-DgUVo5pw.js';
8
+ import '../llm-client-DbjLfz-K.js';
6
9
  import '../raw-provider-sink-C46HDghv.js';
7
- import '../red-team-CrC5MZYd.js';
8
- import '../dataset-BlwAtYYf.js';
10
+ import '../red-team-DW9Ca_tj.js';
11
+ import '../dataset-B2kL-fSM.js';
9
12
  import '../store-CKUAgsJz.js';
10
13
  import '../schema-m0gsnbt3.js';
11
- import '../run-record-etiCMsUq.js';
14
+ import '../index-D9dwa00f.js';
15
+ import '../summary-report-BQvXpvaR.js';
16
+ import '../failure-cluster-CL7IVgkJ.js';
17
+ import '../judge-calibration-DilmB3Ml.js';
12
18
 
13
19
  /**
14
20
  * @experimental
@@ -71,6 +77,144 @@ declare class FsLabeledScenarioStore implements LabeledScenarioStore {
71
77
  private pathForSource;
72
78
  }
73
79
 
80
+ /**
81
+ * @experimental
82
+ *
83
+ * `runProfileMatrix` — the missing keystone between `runAgentMatrix` and the
84
+ * backend-integrity guard.
85
+ *
86
+ * The gap it closes: `runAgentMatrix` is a topology-opaque scheduler whose
87
+ * cells return a bare `{ output, verdict, costUsd }` — no `tokenUsage`, not a
88
+ * `RunRecord`. `assertRealBackend` / `summarizeBackendIntegrity` key on
89
+ * `RunRecord.tokenUsage`, so they cannot run on a raw matrix result. Every
90
+ * consumer therefore hand-writes the same bridge: fan a profile × scenario
91
+ * cartesian, call dispatch, fabricate a `RunRecord` with token usage, thread it
92
+ * back, run the integrity guard. That hand-rolled bridge is exactly the pile of
93
+ * bespoke `eval:*` scripts the adoption skills keep trying (and failing) to
94
+ * forbid.
95
+ *
96
+ * `runProfileMatrix` IS that bridge, once:
97
+ *
98
+ * - axis 3 (PROFILE) = `profiles: AgentProfile[]`
99
+ * - axis 1 (PERSONA/SCENARIO) = `scenarios: Scenario[]` (each scenario carries
100
+ * its persona; `personaOf` groups them for the `byPersona` pivot)
101
+ * - the scoring axis = `judges`
102
+ *
103
+ * It runs `runCampaign` once per profile (reusing its seeds, reps, bootstrap
104
+ * CIs, resumability, and the `LabeledScenarioStore` capture flywheel), maps
105
+ * every cell to a validated `RunRecord` carrying the real `tokenUsage` the
106
+ * dispatch reported via `ctx.cost.observeTokens`, and runs `assertRealBackend`
107
+ * BY CONSTRUCTION before returning — so a stub-backend run fails loudly instead
108
+ * of reporting a clean 0/N leaderboard.
109
+ *
110
+ * Dispatch contract: a dispatch that calls an LLM MUST report usage via
111
+ * `ctx.cost.observeTokens({ input, output })` (and cost via `ctx.cost.observe`).
112
+ * A dispatch that reports zero tokens is indistinguishable from a stub and the
113
+ * integrity guard treats it as one.
114
+ */
115
+
116
+ /** Thrown when the matrix is misconfigured (no profiles, a profile whose model
117
+ * lacks a snapshot version, etc.). Distinct from `BackendIntegrityError`,
118
+ * which signals a stub backend at run time. */
119
+ declare class ProfileMatrixError extends AgentEvalError {
120
+ constructor(message: string);
121
+ }
122
+ /** Dispatch for one cell: render `profile` against `scenario`, returning the
123
+ * artifact the judges score. Report LLM usage via `ctx.cost.observeTokens`
124
+ * and `ctx.cost.observe` — the integrity guard depends on it. */
125
+ type ProfileDispatchFn<TScenario extends Scenario, TArtifact> = (profile: AgentProfile, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
126
+ interface RunProfileMatrixOptions<TScenario extends Scenario, TArtifact> {
127
+ /** Axis 3 — the agent-under-test configurations. Each is one column. */
128
+ profiles: AgentProfile[];
129
+ /** Axis 1 — the persona/scenario corpus, run against every profile. */
130
+ scenarios: TScenario[];
131
+ /** Renders one (profile, scenario) cell. */
132
+ dispatch: ProfileDispatchFn<TScenario, TArtifact>;
133
+ /** The scoring axis. */
134
+ judges?: JudgeConfig<TArtifact, TScenario>[];
135
+ /** Where each profile's campaign writes artifacts/traces. One subdir per
136
+ * profile. */
137
+ runDir: string;
138
+ /** Git SHA the harness ran from — stamped onto every RunRecord (mandatory
139
+ * for paper-grade records). */
140
+ commitSha: string;
141
+ /** Logical experiment id shared across the whole matrix so the promotion
142
+ * gate can pair profiles on matched scenarios. Default: a hash of the
143
+ * profile + scenario ids. */
144
+ experimentId?: string;
145
+ /** Which split these runs belong to. Default `'search'`. */
146
+ splitTag?: RunSplitTag;
147
+ /** Replicates per (profile, scenario) cell for CI bands. Default 1. */
148
+ reps?: number;
149
+ /** Campaign seed (per profile). Default 42. */
150
+ seed?: number;
151
+ /**
152
+ * Backend-integrity posture, enforced AFTER the matrix completes:
153
+ * - `'assert'` (default) — throw `BackendIntegrityError` if the run was a
154
+ * stub (and, with `allowMixed:false`, if it was mixed).
155
+ * - `'warn'` — log the verdict but never throw.
156
+ * - `'off'` — skip the guard entirely (only for offline/replay analysis).
157
+ */
158
+ integrity?: 'assert' | 'warn' | 'off';
159
+ /** Forwarded to `assertRealBackend`. Default true (tolerate partial 429
160
+ * cascades); set false for strict CI gates. */
161
+ allowMixed?: boolean;
162
+ /** Max concurrent cells WITHIN each profile's campaign. Default 2.
163
+ * Profiles run sequentially so the cost ceiling is honored deterministically. */
164
+ maxConcurrency?: number;
165
+ /** Cumulative USD cap per profile campaign. */
166
+ costCeiling?: number;
167
+ /** Capture flywheel — forwarded to each campaign. */
168
+ labeledStore?: LabeledScenarioStore | 'off';
169
+ captureSource?: LabeledScenarioSource;
170
+ /** Storage backend. Default `fsCampaignStorage`. Pass
171
+ * `inMemoryCampaignStorage()` for edge/CF-Worker/test runs. */
172
+ storage?: CampaignStorage;
173
+ /** Test seam — override the wall clock. */
174
+ now?: () => Date;
175
+ /** Optional persona key per scenario — drives the `byPersona` pivot. When
176
+ * unset, `byPersona` is omitted. */
177
+ personaOf?: (scenario: TScenario) => string;
178
+ /** Validate every produced RunRecord with `validateRunRecord` (fail-loud).
179
+ * Default true — catches bad model snapshots and non-finite judge dims at
180
+ * the boundary instead of letting them poison downstream analysis. */
181
+ validate?: boolean;
182
+ }
183
+ interface ProfileSummary {
184
+ profileId: string;
185
+ profileHash: string;
186
+ model: string;
187
+ /** RunRecords produced for this profile (= scenarios × reps). */
188
+ records: number;
189
+ /** Mean composite across this profile's records. */
190
+ meanComposite: number;
191
+ totalCostUsd: number;
192
+ /** Per-profile integrity verdict — surfaces a single profile that ran stub
193
+ * even when the matrix as a whole looks real. */
194
+ integrity: BackendIntegrityReport;
195
+ }
196
+ interface ScenarioRollup {
197
+ meanComposite: number;
198
+ n: number;
199
+ }
200
+ interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {
201
+ matrixId: string;
202
+ experimentId: string;
203
+ /** One RunRecord per (profile, scenario, rep) cell — the integrity-checked,
204
+ * paper-grade output. Feed straight into `analyzeRuns`, `HeldOutGate`,
205
+ * scorecards, the hosted wire format. */
206
+ records: RunRecord[];
207
+ byProfile: Record<string, ProfileSummary>;
208
+ byScenario: Record<string, ScenarioRollup>;
209
+ /** Present only when `personaOf` was supplied. */
210
+ byPersona?: Record<string, ScenarioRollup>;
211
+ /** Whole-matrix integrity report (the one `integrity:'assert'` enforces). */
212
+ integrity: BackendIntegrityReport;
213
+ /** The raw per-profile campaign results, keyed by profile id. */
214
+ campaigns: Record<string, CampaignResult<TArtifact, TScenario>>;
215
+ }
216
+ declare function runProfileMatrix<TScenario extends Scenario, TArtifact>(opts: RunProfileMatrixOptions<TScenario, TArtifact>): Promise<RunProfileMatrixResult<TArtifact, TScenario>>;
217
+
74
218
  /**
75
219
  * @experimental
76
220
  *
@@ -126,4 +270,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
126
270
  * as a ref under the adapter's worktree dir. */
127
271
  declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
128
272
 
129
- export { CodeSurface, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type Worktree, type WorktreeAdapter, WorktreeAdapterError, gitWorktreeAdapter, resolveWorktreePath };
273
+ export { CampaignResult, CampaignStorage, CodeSurface, DispatchContext, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, JudgeConfig, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type RunProfileMatrixOptions, type RunProfileMatrixResult, Scenario, type ScenarioRollup, type Worktree, type WorktreeAdapter, WorktreeAdapterError, gitWorktreeAdapter, resolveWorktreePath, runProfileMatrix };