npm - @tangle-network/agent-eval - Versions diffs - 0.60.0 → 0.61.0 - Mend

@tangle-network/agent-eval 0.60.0 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89) hide show

package/CHANGELOG.md +21 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +5 -5
package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
package/dist/benchmarks/index.d.ts +3 -3
package/dist/builder-eval/index.js +2 -2
package/dist/campaign/index.d.ts +151 -11
package/dist/campaign/index.js +211 -10
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
package/dist/chunk-3BFEG2F6.js.map +1 -0
package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
package/dist/{chunk-GBHRUAOF.js → chunk-GMXHLSLL.js} +2 -2
package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
package/dist/{chunk-NOPYCRNG.js → chunk-OLULBECP.js} +13 -2
package/dist/chunk-OLULBECP.js.map +1 -0
package/dist/chunk-PQV2TKC3.js +27 -0
package/dist/chunk-PQV2TKC3.js.map +1 -0
package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
package/dist/{chunk-LBSXXH56.js → chunk-SUGME4OT.js} +5 -5
package/dist/chunk-SUGME4OT.js.map +1 -0
package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
package/dist/cli.js +3 -3
package/dist/contract/index.d.ts +13 -13
package/dist/contract/index.js +7 -7
package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
package/dist/control.d.ts +5 -5
package/dist/control.js +3 -3
package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
package/dist/governance/index.d.ts +3 -3
package/dist/hosted/index.d.ts +5 -5
package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
package/dist/{index-BIkvdkSU.d.ts → index-D9dwa00f.d.ts} +2 -2
package/dist/index.d.ts +24 -132
package/dist/index.js +16 -29
package/dist/index.js.map +1 -1
package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
package/dist/meta-eval/index.d.ts +3 -3
package/dist/openapi.json +1 -1
package/dist/pipelines/index.js +3 -3
package/dist/{provenance-BM8vmMBa.d.ts → provenance-D0WeCXt1.d.ts} +5 -5
package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
package/dist/reporting.d.ts +6 -6
package/dist/reporting.js +4 -4
package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
package/dist/rl.d.ts +9 -9
package/dist/rl.js +7 -7
package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
package/dist/run-campaign-HXPJAUZ3.js +10 -0
package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
package/dist/traces.d.ts +2 -2
package/dist/traces.js +3 -3
package/dist/{types-VCIXx_yo.d.ts → types-Beb6KPqZ.d.ts} +21 -1
package/dist/wire/index.d.ts +3 -3
package/dist/wire/index.js +3 -3
package/package.json +12 -25
package/dist/chunk-LBSXXH56.js.map +0 -1
package/dist/chunk-NOPYCRNG.js.map +0 -1
package/dist/chunk-QYJT52YW.js.map +0 -1
package/dist/run-campaign-5XENUKRF.js +0 -10
/package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
/package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
/package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
/package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
/package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
/package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
/package/dist/{chunk-GBHRUAOF.js.map → chunk-GMXHLSLL.js.map} +0 -0
/package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
/package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
/package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
/package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
/package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
/package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
/package/dist/{run-campaign-5XENUKRF.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0

package/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,27 @@ All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-
 ---
+## [0.61.0] — 2026-05-30 — `runProfileMatrix` (profile × scenario × persona matrix with integrity by construction)
+### Added
+- **`runProfileMatrix({ profiles, scenarios, dispatch, judges, reps, integrity, personaOf })`** (`@tangle-network/agent-eval/campaign`) — the keystone that lets a consumer express a multi-profile × scenario/persona eval as **one** call instead of a hand-rolled `eval:*` script. Fans `profiles` (axis 3) over the scenario/persona corpus (axis 1), runs `runCampaign` per profile (reusing its seeds / reps / bootstrap CIs / resumability / `LabeledScenarioStore` capture flywheel), maps every cell to a validated `RunRecord` carrying real `tokenUsage`, and runs **`assertRealBackend` by construction** (`integrity: 'assert' | 'warn' | 'off'`, default `assert`). Returns `{ records, byProfile, byScenario, byPersona, integrity, campaigns }`.
+- **`ProfileMatrixError`** — thrown at preflight (before any LLM spend) when a profile's model lacks a snapshot version or the profile/scenario lists are empty.
+### Fixed / closed gap
+- **Token usage is now captured by `runCampaign`** — `CampaignCostMeter` gains `observeTokens(usage)` + `tokens()`, and `CampaignCellResult` gains `tokenUsage`. Previously a campaign cell carried `costUsd` but no token counts, so `assertRealBackend`/`summarizeBackendIntegrity` (which key on `tokenUsage`) could not run on a `CampaignResult`. This closes the integrity gap for **every** campaign consumer, not just `runProfileMatrix`.
+### Why this matters
+A fleet eval-surface audit found every consumer hand-rolls the same matrix→dispatch→`RunRecord`→integrity bridge as a bespoke script, because no primitive produced integrity-checkable `RunRecord`s from a profile matrix. `runProfileMatrix` is that bridge, once — so the adoption skills can mandate "one matrix harness, axes as flags" with a real primitive behind it. Consumers wrap their runtime (e.g. agent-runtime `runLoop` + `reportLoopUsage`) in `dispatch`; the integrity guard then sees real LLM activity.
+### Notes
+Pure additive surface (the `CampaignCostMeter` additions are new optional methods). 7 new tests under `tests/campaign/run-profile-matrix.test.ts` — the keystone being the **stub→throws** regression (a zero-token dispatch fails the matrix loudly instead of reporting a clean 0/N leaderboard). Full suite 1527/1527 green.
+---
 ## [0.53.0] — 2026-05-27 — prior-period comparison ("did my last change help?")
 ### Added

package/dist/adapters/http.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-VCIXx_yo.js';
+import { S as Scenario, D as DispatchFn, b as DispatchContext } from '../types-Beb6KPqZ.js';
 /**
  * # `@tangle-network/agent-eval/adapters/http` — distributed Dispatch over HTTP.

package/dist/adapters/langchain.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-VCIXx_yo.js';
+import { S as Scenario, J as JudgeScore, D as DispatchFn, a as JudgeConfig } from '../types-Beb6KPqZ.js';
 /**
  * # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain

package/dist/adapters/otel.d.ts CHANGED Viewed

@@ -1,8 +1,8 @@
-import { T as TraceSpanEvent, H as HostedClient } from '../index-BIkvdkSU.js';
-import '../types-VCIXx_yo.js';
-import '../summary-report-DLxh4yWk.js';
-import '../run-record-etiCMsUq.js';
-import '../errors-mje_cKOs.js';
+import { T as TraceSpanEvent, H as HostedClient } from '../index-D9dwa00f.js';
+import '../types-Beb6KPqZ.js';
+import '../summary-report-BQvXpvaR.js';
+import '../run-record-DgUVo5pw.js';
+import '../errors-Dwqw-T_m.js';
 import '../schema-m0gsnbt3.js';
 import '../failure-cluster-CL7IVgkJ.js';
 import '../store-CKUAgsJz.js';

package/dist/agent-profile-9J9hxdm2.d.ts ADDED Viewed

@@ -0,0 +1,114 @@
+import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
+import { R as RunRecord } from './run-record-DgUVo5pw.js';
+/**
+ * Backend-integrity guard: distinguish "agent failed" from "eval ran against
+ * a stub / unconfigured backend." Without this guard a canonical eval can
+ * silently report `0/N passed` and look like an agent-quality problem when
+ * the LLM was never actually called — the failure mode we just hit running
+ * the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104
+ * char strings; gtm/creative defaulted to a cli-bridge that wasn't running).
+ *
+ * The shape:
+ *
+ *   const report = summarizeBackendIntegrity(records)
+ *   assertRealBackend(records)   // throws BackendIntegrityError if 100% stub
+ *
+ * A record is "stub-mode" if its `tokenUsage.input === 0 && tokenUsage.output === 0`.
+ * (`costUsd` alone is unreliable — some backends successfully call LLMs but
+ *  don't propagate pricing, producing real tokens with $0 cost.)
+ *
+ * Verdicts:
+ *   - `real`   — every record has nonzero token usage (no stub records)
+ *   - `stub`   — every record is stub-mode (eval ran blind)
+ *   - `mixed`  — some records real, some stub (partial backend failure;
+ *                often the 429-cascade or auth-half-failed case)
+ */
+interface BackendIntegrityReport {
+    /** Total records inspected. */
+    totalRecords: number;
+    /** Records with input=0 AND output=0 (a stub fingerprint). */
+    stubRecords: number;
+    /** Records with nonzero token usage (real LLM activity). */
+    realRecords: number;
+    /** Records where output>0 but costUsd=0 (real LLM, broken cost ledger). */
+    uncostedRecords: number;
+    /** Sum of input tokens across all records. */
+    totalInputTokens: number;
+    /** Sum of output tokens across all records. */
+    totalOutputTokens: number;
+    /** Sum of costUsd across all records. */
+    totalCostUsd: number;
+    /** Worst-case integrity verdict. */
+    verdict: 'real' | 'mixed' | 'stub';
+    /** Human-readable diagnosis suitable for terminal output. */
+    diagnosis: string;
+}
+/**
+ * Error thrown when an integrity assertion fails. Caller can pattern-match
+ * by `code === 'AGENT_EVAL_BACKEND_STUB'` to differentiate from other
+ * errors.
+ */
+declare class BackendIntegrityError extends AgentEvalError {
+    readonly report: BackendIntegrityReport;
+    constructor(message: string, report: BackendIntegrityReport);
+}
+/**
+ * Inspect a batch of RunRecords and return an integrity report. Pure
+ * function — no I/O, no logging. The caller decides what to do with the
+ * verdict (print warning, throw, gate CI, etc.).
+ */
+declare function summarizeBackendIntegrity(records: ReadonlyArray<RunRecord>): BackendIntegrityReport;
+/**
+ * Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record
+ * shows zero LLM activity. Strict callers can pass `{ allowMixed: false }`
+ * to also reject mixed verdicts (recommended for CI gates).
+ *
+ * Real backends pass through silently.
+ */
+declare function assertRealBackend(records: ReadonlyArray<RunRecord>, opts?: {
+    allowMixed?: boolean;
+}): BackendIntegrityReport;
+/**
+ * @stable
+ *
+ * AgentProfile — the eval harness's unit of variation.
+ *
+ * A profile pins everything that changes agent behaviour for a benchmark
+ * cell: the model, the active skills, the prompt version, the available
+ * tools. Vary the profile — swap a model, add a skill — and re-run the suite
+ * to benchmark the change. The scorecard keys a cell on
+ * `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
+ * inside the profile, and two profiles with the same model but different
+ * skills are different cells.
+ *
+ * `agentProfileHash` is the profile's behaviour identity. Two profiles that
+ * produce the same agent behaviour share a hash (and a scorecard cell);
+ * reordering `skills` or `tools` does not change it; the human-facing `id`
+ * label does not affect it.
+ */
+interface AgentProfile {
+    /** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */
+    id: string;
+    /** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. */
+    model: string;
+    /** Skill ids/versions active in this profile — the primary behaviour lever. */
+    skills?: string[];
+    /** Prompt version identifier. */
+    promptVersion?: string;
+    /** Tool ids available to the agent. */
+    tools?: string[];
+    /** Any other behaviour-bearing knobs that should fingerprint into the hash. */
+    metadata?: Record<string, string | number | boolean>;
+}
+/**
+ * Deterministic behaviour identity of a profile — a sha256 over the
+ * behaviour-bearing fields. `skills` and `tools` are order-insensitive; the
+ * `id` label is excluded. Throws on a profile with no `model` — an unkeyable
+ * profile must fail loud rather than collapse into a blank-model cell.
+ */
+declare function agentProfileHash(profile: AgentProfile): string;
+export { type AgentProfile as A, type BackendIntegrityReport as B, BackendIntegrityError as a, agentProfileHash as b, assertRealBackend as c, summarizeBackendIntegrity as s };

package/dist/benchmarks/index.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-wlaiph9Y.js';
-import '../run-record-etiCMsUq.js';
-import '../errors-mje_cKOs.js';
+export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-Bvk35ils.js';
+import '../run-record-DgUVo5pw.js';
+import '../errors-Dwqw-T_m.js';
 import '../schema-m0gsnbt3.js';

package/dist/builder-eval/index.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import {
   SandboxHarness,
   runTestGradedScenario
-} from "../chunk-YTMXBHFM.js";
+} from "../chunk-T375SUOZ.js";
 import {
   judgeSpans
 } from "../chunk-47X6LRCE.js";
@@ -9,7 +9,7 @@ import "../chunk-5BKGXME7.js";
 import {
   TraceEmitter
 } from "../chunk-TVVP3ZZQ.js";
-import "../chunk-QYJT52YW.js";
+import "../chunk-3BFEG2F6.js";
 import "../chunk-PZ5AY32C.js";
 // src/builder-eval/builder-session.ts

package/dist/campaign/index.d.ts CHANGED Viewed

@@ -1,16 +1,18 @@
-export { B as BuildLoopProvenanceArgs, C as CampaignStorage, D as DefaultProductionGateOptions, E as EmitLoopProvenanceArgs, a as EmitLoopProvenanceResult, b as EvolutionaryDriverOptions, G as GepaDriverConstraints, c as GepaDriverOptions, H as HeldOutGateOptions, L as LoopProvenanceBackend, d as LoopProvenanceCandidate, e as LoopProvenanceRecord, O as OpenAutoPrOptions, f as OpenAutoPrResult, R as RunCampaignOptions, g as RunEvalOptions, h as RunImprovementLoopOptions, i as RunImprovementLoopResult, j as RunOptimizationOptions, k as RunOptimizationResult, l as buildLoopProvenanceRecord, m as composeGate, n as countSentenceEdits, o as defaultProductionGate, p as defaultRenderDiff, q as emitLoopProvenance, r as evolutionaryDriver, s as extractH2Sections, t as fsCampaignStorage, u as gepaDriver, v as heldOutGate, w as inMemoryCampaignStorage, x as loopProvenanceSpans, y as openAutoPr, z as provenanceRecordPath, A as provenanceSpansPath, F as runCampaign, I as runEval, J as runImprovementLoop, K as runOptimization, M as surfaceContentHash, N as surfaceHash } from '../provenance-BM8vmMBa.js';
-import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, f as LabelTrust, C as CodeSurface } from '../types-VCIXx_yo.js';
-export { g as CampaignAggregates, h as CampaignArtifactWriter, i as CampaignCellResult, j as CampaignCostMeter, k as CampaignResult, l as CampaignTraceWriter, b as DispatchContext, D as DispatchFn, G as Gate, m as GateContext, n as GateDecision, o as GateResult, p as GenerationCandidate, q as GenerationRecord, I as ImprovementDriver, r as JudgeAggregate, a as JudgeConfig, s as JudgeDimension, J as JudgeScore, t as LabeledScenarioSource, M as MutableSurface, u as Mutator, O as OptimizerConfig, P as ProposeContext, v as ProposedCandidate, R as RedactionStatus, S as Scenario, w as ScenarioAggregate, x as SessionScript, T as TraceSpan, y as isProposedCandidate, z as labelTrustRank } from '../types-VCIXx_yo.js';
-import '../llm-client-BXVRUZyX.js';
-import '../errors-mje_cKOs.js';
+import { C as CampaignStorage } from '../provenance-D0WeCXt1.js';
+export { B as BuildLoopProvenanceArgs, D as DefaultProductionGateOptions, E as EmitLoopProvenanceArgs, a as EmitLoopProvenanceResult, b as EvolutionaryDriverOptions, G as GepaDriverConstraints, c as GepaDriverOptions, H as HeldOutGateOptions, L as LoopProvenanceBackend, d as LoopProvenanceCandidate, e as LoopProvenanceRecord, O as OpenAutoPrOptions, f as OpenAutoPrResult, R as RunCampaignOptions, g as RunEvalOptions, h as RunImprovementLoopOptions, i as RunImprovementLoopResult, j as RunOptimizationOptions, k as RunOptimizationResult, l as buildLoopProvenanceRecord, m as composeGate, n as countSentenceEdits, o as defaultProductionGate, p as defaultRenderDiff, q as emitLoopProvenance, r as evolutionaryDriver, s as extractH2Sections, t as fsCampaignStorage, u as gepaDriver, v as heldOutGate, w as inMemoryCampaignStorage, x as loopProvenanceSpans, y as openAutoPr, z as provenanceRecordPath, A as provenanceSpansPath, F as runCampaign, I as runEval, J as runImprovementLoop, K as runOptimization, M as surfaceContentHash, N as surfaceHash } from '../provenance-D0WeCXt1.js';
+import { L as LabeledScenarioStore, c as LabeledScenarioWrite, d as LabeledScenarioSampleArgs, e as LabeledScenarioRecord, f as LabelTrust, S as Scenario, b as DispatchContext, a as JudgeConfig, g as LabeledScenarioSource, C as CampaignResult, h as CodeSurface } from '../types-Beb6KPqZ.js';
+export { i as CampaignAggregates, j as CampaignArtifactWriter, k as CampaignCellResult, l as CampaignCostMeter, m as CampaignTokenUsage, n as CampaignTraceWriter, D as DispatchFn, G as Gate, o as GateContext, p as GateDecision, q as GateResult, r as GenerationCandidate, s as GenerationRecord, I as ImprovementDriver, t as JudgeAggregate, u as JudgeDimension, J as JudgeScore, M as MutableSurface, v as Mutator, O as OptimizerConfig, P as ProposeContext, w as ProposedCandidate, R as RedactionStatus, x as ScenarioAggregate, y as SessionScript, T as TraceSpan, z as isProposedCandidate, A as labelTrustRank } from '../types-Beb6KPqZ.js';
+import { A as AgentProfile, B as BackendIntegrityReport } from '../agent-profile-9J9hxdm2.js';
+import { A as AgentEvalError } from '../errors-Dwqw-T_m.js';
+import { a as RunSplitTag, R as RunRecord } from '../run-record-DgUVo5pw.js';
+import '../llm-client-DbjLfz-K.js';
 import '../raw-provider-sink-C46HDghv.js';
-import '../red-team-CrC5MZYd.js';
-import '../dataset-BlwAtYYf.js';
+import '../red-team-DW9Ca_tj.js';
+import '../dataset-B2kL-fSM.js';
 import '../store-CKUAgsJz.js';
 import '../schema-m0gsnbt3.js';
-import '../run-record-etiCMsUq.js';
-import '../index-BIkvdkSU.js';
-import '../summary-report-DLxh4yWk.js';
+import '../index-D9dwa00f.js';
+import '../summary-report-BQvXpvaR.js';
 import '../failure-cluster-CL7IVgkJ.js';
 import '../judge-calibration-DilmB3Ml.js';
@@ -75,6 +77,144 @@ declare class FsLabeledScenarioStore implements LabeledScenarioStore {
     private pathForSource;
 }
+/**
+ * @experimental
+ *
+ * `runProfileMatrix` — the missing keystone between `runAgentMatrix` and the
+ * backend-integrity guard.
+ *
+ * The gap it closes: `runAgentMatrix` is a topology-opaque scheduler whose
+ * cells return a bare `{ output, verdict, costUsd }` — no `tokenUsage`, not a
+ * `RunRecord`. `assertRealBackend` / `summarizeBackendIntegrity` key on
+ * `RunRecord.tokenUsage`, so they cannot run on a raw matrix result. Every
+ * consumer therefore hand-writes the same bridge: fan a profile × scenario
+ * cartesian, call dispatch, fabricate a `RunRecord` with token usage, thread it
+ * back, run the integrity guard. That hand-rolled bridge is exactly the pile of
+ * bespoke `eval:*` scripts the adoption skills keep trying (and failing) to
+ * forbid.
+ *
+ * `runProfileMatrix` IS that bridge, once:
+ *
+ *   - axis 3 (PROFILE) = `profiles: AgentProfile[]`
+ *   - axis 1 (PERSONA/SCENARIO) = `scenarios: Scenario[]` (each scenario carries
+ *     its persona; `personaOf` groups them for the `byPersona` pivot)
+ *   - the scoring axis = `judges`
+ *
+ * It runs `runCampaign` once per profile (reusing its seeds, reps, bootstrap
+ * CIs, resumability, and the `LabeledScenarioStore` capture flywheel), maps
+ * every cell to a validated `RunRecord` carrying the real `tokenUsage` the
+ * dispatch reported via `ctx.cost.observeTokens`, and runs `assertRealBackend`
+ * BY CONSTRUCTION before returning — so a stub-backend run fails loudly instead
+ * of reporting a clean 0/N leaderboard.
+ *
+ * Dispatch contract: a dispatch that calls an LLM MUST report usage via
+ * `ctx.cost.observeTokens({ input, output })` (and cost via `ctx.cost.observe`).
+ * A dispatch that reports zero tokens is indistinguishable from a stub and the
+ * integrity guard treats it as one.
+ */
+/** Thrown when the matrix is misconfigured (no profiles, a profile whose model
+ *  lacks a snapshot version, etc.). Distinct from `BackendIntegrityError`,
+ *  which signals a stub backend at run time. */
+declare class ProfileMatrixError extends AgentEvalError {
+    constructor(message: string);
+}
+/** Dispatch for one cell: render `profile` against `scenario`, returning the
+ *  artifact the judges score. Report LLM usage via `ctx.cost.observeTokens`
+ *  and `ctx.cost.observe` — the integrity guard depends on it. */
+type ProfileDispatchFn<TScenario extends Scenario, TArtifact> = (profile: AgentProfile, scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
+interface RunProfileMatrixOptions<TScenario extends Scenario, TArtifact> {
+    /** Axis 3 — the agent-under-test configurations. Each is one column. */
+    profiles: AgentProfile[];
+    /** Axis 1 — the persona/scenario corpus, run against every profile. */
+    scenarios: TScenario[];
+    /** Renders one (profile, scenario) cell. */
+    dispatch: ProfileDispatchFn<TScenario, TArtifact>;
+    /** The scoring axis. */
+    judges?: JudgeConfig<TArtifact, TScenario>[];
+    /** Where each profile's campaign writes artifacts/traces. One subdir per
+     *  profile. */
+    runDir: string;
+    /** Git SHA the harness ran from — stamped onto every RunRecord (mandatory
+     *  for paper-grade records). */
+    commitSha: string;
+    /** Logical experiment id shared across the whole matrix so the promotion
+     *  gate can pair profiles on matched scenarios. Default: a hash of the
+     *  profile + scenario ids. */
+    experimentId?: string;
+    /** Which split these runs belong to. Default `'search'`. */
+    splitTag?: RunSplitTag;
+    /** Replicates per (profile, scenario) cell for CI bands. Default 1. */
+    reps?: number;
+    /** Campaign seed (per profile). Default 42. */
+    seed?: number;
+    /**
+     * Backend-integrity posture, enforced AFTER the matrix completes:
+     *   - `'assert'` (default) — throw `BackendIntegrityError` if the run was a
+     *     stub (and, with `allowMixed:false`, if it was mixed).
+     *   - `'warn'` — log the verdict but never throw.
+     *   - `'off'` — skip the guard entirely (only for offline/replay analysis).
+     */
+    integrity?: 'assert' | 'warn' | 'off';
+    /** Forwarded to `assertRealBackend`. Default true (tolerate partial 429
+     *  cascades); set false for strict CI gates. */
+    allowMixed?: boolean;
+    /** Max concurrent cells WITHIN each profile's campaign. Default 2.
+     *  Profiles run sequentially so the cost ceiling is honored deterministically. */
+    maxConcurrency?: number;
+    /** Cumulative USD cap per profile campaign. */
+    costCeiling?: number;
+    /** Capture flywheel — forwarded to each campaign. */
+    labeledStore?: LabeledScenarioStore | 'off';
+    captureSource?: LabeledScenarioSource;
+    /** Storage backend. Default `fsCampaignStorage`. Pass
+     *  `inMemoryCampaignStorage()` for edge/CF-Worker/test runs. */
+    storage?: CampaignStorage;
+    /** Test seam — override the wall clock. */
+    now?: () => Date;
+    /** Optional persona key per scenario — drives the `byPersona` pivot. When
+     *  unset, `byPersona` is omitted. */
+    personaOf?: (scenario: TScenario) => string;
+    /** Validate every produced RunRecord with `validateRunRecord` (fail-loud).
+     *  Default true — catches bad model snapshots and non-finite judge dims at
+     *  the boundary instead of letting them poison downstream analysis. */
+    validate?: boolean;
+}
+interface ProfileSummary {
+    profileId: string;
+    profileHash: string;
+    model: string;
+    /** RunRecords produced for this profile (= scenarios × reps). */
+    records: number;
+    /** Mean composite across this profile's records. */
+    meanComposite: number;
+    totalCostUsd: number;
+    /** Per-profile integrity verdict — surfaces a single profile that ran stub
+     *  even when the matrix as a whole looks real. */
+    integrity: BackendIntegrityReport;
+}
+interface ScenarioRollup {
+    meanComposite: number;
+    n: number;
+}
+interface RunProfileMatrixResult<TArtifact, TScenario extends Scenario> {
+    matrixId: string;
+    experimentId: string;
+    /** One RunRecord per (profile, scenario, rep) cell — the integrity-checked,
+     *  paper-grade output. Feed straight into `analyzeRuns`, `HeldOutGate`,
+     *  scorecards, the hosted wire format. */
+    records: RunRecord[];
+    byProfile: Record<string, ProfileSummary>;
+    byScenario: Record<string, ScenarioRollup>;
+    /** Present only when `personaOf` was supplied. */
+    byPersona?: Record<string, ScenarioRollup>;
+    /** Whole-matrix integrity report (the one `integrity:'assert'` enforces). */
+    integrity: BackendIntegrityReport;
+    /** The raw per-profile campaign results, keyed by profile id. */
+    campaigns: Record<string, CampaignResult<TArtifact, TScenario>>;
+}
+declare function runProfileMatrix<TScenario extends Scenario, TArtifact>(opts: RunProfileMatrixOptions<TScenario, TArtifact>): Promise<RunProfileMatrixResult<TArtifact, TScenario>>;
 /**
  * @experimental
  *
@@ -130,4 +270,4 @@ declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAd
  *  as a ref under the adapter's worktree dir. */
 declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
-export { CodeSurface, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type Worktree, type WorktreeAdapter, WorktreeAdapterError, gitWorktreeAdapter, resolveWorktreePath };
+export { CampaignResult, CampaignStorage, CodeSurface, DispatchContext, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type GitWorktreeAdapterOptions, JudgeConfig, LabelTrust, LabeledScenarioRecord, LabeledScenarioSampleArgs, LabeledScenarioSource, LabeledScenarioStore, LabeledScenarioStoreError, LabeledScenarioWrite, type ProfileDispatchFn, ProfileMatrixError, type ProfileSummary, type RunProfileMatrixOptions, type RunProfileMatrixResult, Scenario, type ScenarioRollup, type Worktree, type WorktreeAdapter, WorktreeAdapterError, gitWorktreeAdapter, resolveWorktreePath, runProfileMatrix };

package/dist/campaign/index.js CHANGED Viewed

@@ -20,19 +20,31 @@ import {
   runOptimization,
   surfaceContentHash,
   surfaceHash
-} from "../chunk-LBSXXH56.js";
+} from "../chunk-SUGME4OT.js";
 import {
   fsCampaignStorage,
   inMemoryCampaignStorage,
   runCampaign
-} from "../chunk-NOPYCRNG.js";
-import "../chunk-GBHRUAOF.js";
+} from "../chunk-OLULBECP.js";
+import {
+  agentProfileHash
+} from "../chunk-PQV2TKC3.js";
+import {
+  assertRealBackend,
+  summarizeBackendIntegrity
+} from "../chunk-GMXHLSLL.js";
 import "../chunk-YV7J7X5N.js";
-import "../chunk-S3SDD56V.js";
+import {
+  validateRunRecord
+} from "../chunk-F3SRAAZO.js";
+import "../chunk-ITBRCT73.js";
 import "../chunk-GGE4NNQT.js";
-import "../chunk-VXNVVBZO.js";
+import "../chunk-VSMTAMNK.js";
+import "../chunk-IHDHUN2X.js";
 import "../chunk-PC4UYEBM.js";
-import "../chunk-QYJT52YW.js";
+import {
+  AgentEvalError
+} from "../chunk-3BFEG2F6.js";
 import "../chunk-PZ5AY32C.js";
 // src/campaign/labeled-store/fs-adapter.ts
@@ -246,10 +258,197 @@ function appendLine(path, line) {
   }
 }
+// src/campaign/presets/run-profile-matrix.ts
+import { createHash as createHash2 } from "crypto";
+import { join as join2 } from "path";
+/**
+ * Error type thrown by the profile-matrix preset in this file (empty
+ * `profiles` / `scenarios`, or a profile that fails the up-front record probe).
+ *
+ * Forwards the fixed string "profile_matrix" as the first argument of the
+ * AgentEvalError constructor (presumably the error code; confirm against
+ * AgentEvalError) followed by the human-readable message.
+ */
+class ProfileMatrixError extends AgentEvalError {
+  /** @param {string} detail Human-readable description of the failure. */
+  constructor(detail) {
+    super("profile_matrix", detail);
+  }
+}
+/**
+ * Replace every character outside `[a-zA-Z0-9_-]` with an underscore.
+ * Used below to derive a per-profile run directory name and dispatch name.
+ *
+ * @param {string} id Arbitrary identifier.
+ * @returns {string} The identifier with each disallowed character turned into "_".
+ */
+function sanitize(id) {
+  const disallowed = /[^a-zA-Z0-9_-]/g;
+  const safe = id.replace(disallowed, "_");
+  return safe;
+}
+/**
+ * SHA-256 hex digest of the JSON serialization of `input`.
+ * The digest depends on `JSON.stringify` output, so object key order matters.
+ *
+ * @param {unknown} input JSON-serializable value.
+ * @returns {string} 64-character lowercase hex digest.
+ */
+function sha(input) {
+  const serialized = JSON.stringify(input);
+  const hasher = createHash2("sha256");
+  hasher.update(serialized);
+  return hasher.digest("hex");
+}
+/**
+ * Arithmetic mean of a list of numbers.
+ *
+ * @param {number[]} xs Values to average.
+ * @returns {number} The average, or 0 for an empty list.
+ */
+function mean(xs) {
+  if (xs.length === 0) return 0;
+  let total = 0;
+  xs.forEach((x) => {
+    total += x;
+  });
+  return total / xs.length;
+}
+/**
+ * Average the `composite` value across every judge score on a cell.
+ *
+ * @param {{ judgeScores: Record<string, { composite: number }> }} cell
+ * @returns {number} Mean composite, or 0 when the cell has no judge scores.
+ */
+function cellComposite(cell) {
+  const perJudgeComposites = [];
+  for (const judgeScore of Object.values(cell.judgeScores)) {
+    perJudgeComposites.push(judgeScore.composite);
+  }
+  if (perJudgeComposites.length === 0) return 0;
+  return mean(perJudgeComposites);
+}
+/**
+ * Convert one campaign cell into a run record for the given profile.
+ *
+ * The cell's composite (mean of per-judge composites) is stored as
+ * `holdoutScore` when `splitTag === "holdout"` and as `searchScore` otherwise.
+ * `outcome.raw` carries the composite plus one `"<judge>.<dimension>"` entry per
+ * judged dimension. When at least one judge scored the cell, `outcome.judgeScores`
+ * is added with the per-judge dimensions, the per-dimension mean across judges,
+ * the composite, and any judge notes joined with " | ".
+ *
+ * @param {object} args
+ * @param {object} args.cell Campaign cell (reads judgeScores, cellId, seed,
+ *   durationMs, costUsd, tokenUsage, scenarioId, error).
+ * @param {object} args.profile Profile (reads id, model).
+ * @param {string} args.profileHash Stored as `promptHash`.
+ * @param {string} args.configHash
+ * @param {string} args.experimentId
+ * @param {string} args.splitTag
+ * @param {string} [args.commitSha]
+ * @param {string} args.matrixId Prefix of the generated `runId`.
+ * @returns {object} The run record; `failureMode` is present only when `cell.error` is truthy.
+ */
+function buildRunRecord(args) {
+  const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } = args;
+  const composite = cellComposite(cell);
+  const raw = { composite };
+  const judgeEntries = Object.entries(cell.judgeScores);
+  // Judge and dimension names are caller-defined. Accumulating into a plain
+  // object breaks on names that shadow Object.prototype members: "constructor"
+  // or "toString" made `.push` throw, and "__proto__" rewrote the prototype.
+  // A Map plus Object.fromEntries always creates own data properties.
+  const dimAccum = /* @__PURE__ */ new Map();
+  const notes = [];
+  for (const [judgeName, js] of judgeEntries) {
+    for (const [dim, value] of Object.entries(js.dimensions)) {
+      // Keys always contain a ".", so they cannot collide with prototype members.
+      raw[`${judgeName}.${dim}`] = value;
+      const bucket = dimAccum.get(dim);
+      if (bucket) bucket.push(value);
+      else dimAccum.set(dim, [value]);
+    }
+    if (js.notes) notes.push(`${judgeName}: ${js.notes}`);
+  }
+  const perJudge = Object.fromEntries(
+    judgeEntries.map(([judgeName, js]) => [judgeName, { ...js.dimensions }])
+  );
+  const perDimMean = Object.fromEntries(
+    [...dimAccum].map(([dim, values]) => [dim, mean(values)])
+  );
+  const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
+  if (Object.keys(perJudge).length > 0) {
+    outcome.judgeScores = {
+      perJudge,
+      perDimMean,
+      composite,
+      ...notes.length > 0 ? { notes: notes.join(" | ") } : {}
+    };
+  }
+  return {
+    runId: `${matrixId}:${profile.id}:${cell.cellId}`,
+    experimentId,
+    candidateId: profile.id,
+    seed: cell.seed,
+    model: profile.model,
+    promptHash: profileHash,
+    configHash,
+    commitSha,
+    wallMs: cell.durationMs,
+    costUsd: cell.costUsd,
+    tokenUsage: cell.tokenUsage,
+    outcome,
+    splitTag,
+    scenarioId: cell.scenarioId,
+    ...cell.error ? { failureMode: cell.error } : {}
+  };
+}
+/**
+ * Run every scenario against every profile and collect one run record per
+ * campaign cell.
+ *
+ * Flow:
+ *  1. Reject empty `profiles` / `scenarios` and duplicate profile ids.
+ *  2. Probe each profile by validating a zero-valued placeholder record, so an
+ *     unrecordable profile fails before any campaign is started.
+ *  3. For each profile, in order and one at a time, run `runCampaign` with a
+ *     dispatch bound to that profile and a run directory of
+ *     `join(opts.runDir, sanitize(profile.id))`, then turn each cell into a
+ *     record via `buildRunRecord`.
+ *  4. Summarize backend integrity and build per-scenario / per-persona rollups.
+ *
+ * Defaults applied here: `splitTag` "search", `seed` 42, `validate` true,
+ * `integrity` "assert", `allowMixed` true. When `experimentId` is omitted it is
+ * derived from a hash of the profile ids and scenario ids.
+ *
+ * NOTE(review): distinct ids that sanitize to the same string (e.g. "a/b" and
+ * "a_b") still map to the same run directory. That is not rejected here because
+ * whether it is harmful depends on runCampaign/storage, which this file does
+ * not show.
+ *
+ * @param {object} opts Matrix options (profiles, scenarios, dispatch, runDir,
+ *   plus the optional fields forwarded to `runCampaign`).
+ * @returns {Promise<object>} `{ matrixId, experimentId, records, byProfile,
+ *   byScenario, byPersona, integrity, campaigns }`; `byPersona` is undefined
+ *   unless `opts.personaOf` is provided.
+ * @throws {ProfileMatrixError} On empty inputs, duplicate profile ids, or a
+ *   profile whose placeholder record fails validation.
+ */
+async function runProfileMatrix(opts) {
+  if (opts.profiles.length === 0) throw new ProfileMatrixError("profiles must not be empty");
+  if (opts.scenarios.length === 0) throw new ProfileMatrixError("scenarios must not be empty");
+  // Results are keyed by profile id (campaigns, byProfile, runId, runDir), so a
+  // repeated id would silently overwrite the earlier profile's results.
+  const seenProfileIds = /* @__PURE__ */ new Set();
+  for (const { id } of opts.profiles) {
+    if (seenProfileIds.has(id)) throw new ProfileMatrixError(`duplicate profile id '${id}'`);
+    seenProfileIds.add(id);
+  }
+  const splitTag = opts.splitTag ?? "search";
+  const seed = opts.seed ?? 42;
+  const validate = opts.validate ?? true;
+  const integrityMode = opts.integrity ?? "assert";
+  const profileIds = opts.profiles.map((p) => p.id);
+  const experimentId = opts.experimentId ?? `pm_${sha({ profileIds, scenarios: opts.scenarios.map((s) => s.id) }).slice(0, 16)}`;
+  const matrixId = `mtx_${sha({ experimentId, profileIds, seed, splitTag }).slice(0, 16)}`;
+  // Hash each profile once; the value is reused when building real records.
+  const profileHashes = /* @__PURE__ */ new Map();
+  for (const profile of opts.profiles) {
+    const profileHash = agentProfileHash(profile);
+    profileHashes.set(profile.id, profileHash);
+    try {
+      // Placeholder record: only the profile-derived fields are meaningful.
+      validateRunRecord({
+        runId: `${matrixId}:${profile.id}:probe`,
+        experimentId,
+        candidateId: profile.id,
+        seed,
+        model: profile.model,
+        promptHash: profileHash,
+        configHash: profileHash,
+        commitSha: opts.commitSha,
+        wallMs: 0,
+        costUsd: 0,
+        tokenUsage: { input: 0, output: 0 },
+        outcome: splitTag === "holdout" ? { holdoutScore: 0, raw: {} } : { searchScore: 0, raw: {} },
+        splitTag
+      });
+    } catch (err) {
+      throw new ProfileMatrixError(
+        `profile '${profile.id}' is not recordable: ${err instanceof Error ? err.message : String(err)}`
+      );
+    }
+  }
+  const records = [];
+  const campaigns = {};
+  const byProfile = {};
+  for (const profile of opts.profiles) {
+    const profileHash = profileHashes.get(profile.id);
+    const configHash = sha({
+      profile: profileHash,
+      judges: (opts.judges ?? []).map((j) => j.name),
+      seed,
+      splitTag
+    });
+    const dispatch = (scenario, ctx) => opts.dispatch(profile, scenario, ctx);
+    // Give the bound dispatch a per-profile function name.
+    Object.defineProperty(dispatch, "name", { value: `profile_${sanitize(profile.id)}` });
+    const campaign = await runCampaign({
+      scenarios: opts.scenarios,
+      dispatch,
+      judges: opts.judges,
+      seed,
+      reps: opts.reps,
+      maxConcurrency: opts.maxConcurrency,
+      costCeiling: opts.costCeiling,
+      labeledStore: opts.labeledStore,
+      captureSource: opts.captureSource,
+      storage: opts.storage,
+      now: opts.now,
+      runDir: join2(opts.runDir, sanitize(profile.id))
+    });
+    campaigns[profile.id] = campaign;
+    const profileRecords = [];
+    for (const cell of campaign.cells) {
+      const record = buildRunRecord({
+        cell,
+        profile,
+        profileHash,
+        configHash,
+        experimentId,
+        splitTag,
+        commitSha: opts.commitSha,
+        matrixId
+      });
+      if (validate) validateRunRecord(record);
+      profileRecords.push(record);
+      records.push(record);
+    }
+    byProfile[profile.id] = {
+      profileId: profile.id,
+      profileHash,
+      model: profile.model,
+      records: profileRecords.length,
+      meanComposite: mean(profileRecords.map(compositeOf)),
+      totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),
+      integrity: summarizeBackendIntegrity(profileRecords)
+    };
+  }
+  const integrity = summarizeBackendIntegrity(records);
+  if (integrityMode === "assert") {
+    assertRealBackend(records, { allowMixed: opts.allowMixed ?? true });
+  } else if (integrityMode === "warn" && integrity.verdict !== "real") {
+    console.warn(
+      `[runProfileMatrix] backend integrity: ${integrity.verdict} \u2014 ${integrity.diagnosis}`
+    );
+  }
+  const byScenario = rollup(records, (r) => r.scenarioId);
+  const byPersona = opts.personaOf ? rollupByPersona(records, opts.scenarios, opts.personaOf) : void 0;
+  return { matrixId, experimentId, records, byProfile, byScenario, byPersona, integrity, campaigns };
+}
+/**
+ * Pick the score that represents a record: `holdoutScore` when it is set,
+ * otherwise `searchScore`, otherwise 0. "Set" means neither null nor
+ * undefined, so a score of 0 is respected.
+ *
+ * @param {{ outcome: { holdoutScore?: number, searchScore?: number } }} r
+ * @returns {number}
+ */
+function compositeOf(r) {
+  const { outcome } = r;
+  const holdout = outcome.holdoutScore;
+  if (holdout !== null && holdout !== void 0) return holdout;
+  const search = outcome.searchScore;
+  if (search !== null && search !== void 0) return search;
+  return 0;
+}
+/**
+ * Group records by `keyOf(record)` and report, per group, the mean of
+ * `compositeOf(record)` and the number of records.
+ *
+ * Records whose key is `undefined` are left out.
+ *
+ * @param {object[]} records Run records.
+ * @param {(record: object) => (string | undefined)} keyOf Group key selector.
+ * @returns {Record<string, { meanComposite: number, n: number }>}
+ */
+function rollup(records, keyOf) {
+  const groups = /* @__PURE__ */ new Map();
+  for (const r of records) {
+    const key = keyOf(r);
+    if (key === void 0) continue;
+    const bucket = groups.get(key);
+    if (bucket) bucket.push(compositeOf(r));
+    else groups.set(key, [compositeOf(r)]);
+  }
+  // Keys are caller-defined ids. Object.fromEntries defines own data properties,
+  // so a key such as "__proto__" becomes a normal entry; `out[key] = ...` would
+  // instead replace the result's prototype and drop the group.
+  return Object.fromEntries(
+    [...groups].map(([key, xs]) => [key, { meanComposite: mean(xs), n: xs.length }])
+  );
+}
+/**
+ * Roll records up by persona instead of by scenario.
+ *
+ * `personaOf` is evaluated once per scenario to build a scenario-id -> persona
+ * lookup; each record is then grouped under the persona of its `scenarioId`.
+ * Records with a falsy `scenarioId`, or one that maps to no persona, get an
+ * undefined key and are handled by `rollup` accordingly.
+ *
+ * @param {object[]} records Run records.
+ * @param {{ id: string }[]} scenarios Scenarios the records were produced from.
+ * @param {(scenario: object) => (string | undefined)} personaOf Persona selector.
+ * @returns {object} Whatever `rollup` returns for the persona grouping.
+ */
+function rollupByPersona(records, scenarios, personaOf) {
+  const personaByScenarioId = /* @__PURE__ */ new Map(
+    scenarios.map((scenario) => [scenario.id, personaOf(scenario)])
+  );
+  const personaFor = (record) => {
+    if (!record.scenarioId) return void 0;
+    return personaByScenarioId.get(record.scenarioId);
+  };
+  return rollup(records, personaFor);
+}
 // src/campaign/worktree/index.ts
 import { execFileSync } from "child_process";
 import { existsSync as existsSync2 } from "fs";
-import { basename, isAbsolute, join as join2 } from "path";
+import { basename, isAbsolute, join as join3 } from "path";
 var WorktreeAdapterError = class extends Error {
   constructor(message, cause) {
     super(message);
@@ -271,13 +470,13 @@ function slug(label) {
 }
 function gitWorktreeAdapter(opts) {
   const git = opts.git ?? defaultGit;
-  const worktreeDir = opts.worktreeDir ?? join2(opts.repoRoot, ".worktrees");
+  const worktreeDir = opts.worktreeDir ?? join3(opts.repoRoot, ".worktrees");
   const branchPrefix = opts.branchPrefix ?? "improve";
   return {
     async create({ baseRef, label }) {
       const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
       const branch = `${branchPrefix}/${id}`;
-      const path = join2(worktreeDir, id);
+      const path = join3(worktreeDir, id);
       git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
       return { path, branch, baseRef };
     },
@@ -302,12 +501,13 @@ function gitWorktreeAdapter(opts) {
 }
 function resolveWorktreePath(surface, worktreeDir) {
   if (isAbsolute(surface.worktreeRef) && existsSync2(surface.worktreeRef)) return surface.worktreeRef;
-  if (worktreeDir) return join2(worktreeDir, basename(surface.worktreeRef));
+  if (worktreeDir) return join3(worktreeDir, basename(surface.worktreeRef));
   return surface.worktreeRef;
 }
 export {
   FsLabeledScenarioStore,
   LabeledScenarioStoreError,
+  ProfileMatrixError,
   WorktreeAdapterError,
   buildLoopProvenanceRecord,
   composeGate,
@@ -333,6 +533,7 @@ export {
   runEval,
   runImprovementLoop,
   runOptimization,
+  runProfileMatrix,
   surfaceContentHash,
   surfaceHash
 };